Index: head/bin/sh/miscbltin.c =================================================================== --- head/bin/sh/miscbltin.c (revision 296161) +++ head/bin/sh/miscbltin.c (revision 296162) @@ -1,529 +1,532 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Kenneth Almquist. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint #if 0 static char sccsid[] = "@(#)miscbltin.c 8.4 (Berkeley) 5/4/95"; #endif #endif /* not lint */ #include __FBSDID("$FreeBSD$"); /* * Miscellaneous builtins. */ #include #include #include #include #include #include #include #include #include #include "shell.h" #include "options.h" #include "var.h" #include "output.h" #include "memalloc.h" #include "error.h" #include "mystring.h" #include "syntax.h" #include "trap.h" #undef eflag int readcmd(int, char **); int umaskcmd(int, char **); int ulimitcmd(int, char **); /* * The read builtin. The -r option causes backslashes to be treated like * ordinary characters. * * This uses unbuffered input, which may be avoidable in some cases. 
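 *
 * Illustrative usage (editor's sketch, not part of the original source):
 *
 *	read -p 'name: ' -r -t 30 first rest
 *
 * prints the prompt when stdin is a tty (-p), keeps backslashes literal
 * (-r), and gives up after 30 seconds (-t); the select() path below then
 * returns 128 plus SIGALRM, or 128 plus the pending signal's number.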
* * Note that if IFS=' :' then read x y should work so that: * 'a b' x='a', y='b' * ' a b ' x='a', y='b' * ':b' x='', y='b' * ':' x='', y='' * '::' x='', y='' * ': :' x='', y='' * ':::' x='', y='::' * ':b c:' x='', y='b c:' */ int readcmd(int argc __unused, char **argv __unused) { char **ap; int backslash; char c; int rflag; char *prompt; const char *ifs; char *p; int startword; int status; int i; int is_ifs; int saveall = 0; ptrdiff_t lastnonifs, lastnonifsws; struct timeval tv; char *tvptr; fd_set ifds; ssize_t nread; int sig; rflag = 0; prompt = NULL; tv.tv_sec = -1; tv.tv_usec = 0; while ((i = nextopt("erp:t:")) != '\0') { switch(i) { case 'p': prompt = shoptarg; break; case 'e': break; case 'r': rflag = 1; break; case 't': tv.tv_sec = strtol(shoptarg, &tvptr, 0); if (tvptr == shoptarg) error("timeout value"); switch(*tvptr) { case 0: case 's': break; case 'h': tv.tv_sec *= 60; /* FALLTHROUGH */ case 'm': tv.tv_sec *= 60; break; default: error("timeout unit"); } break; } } if (prompt && isatty(0)) { out2str(prompt); flushall(); } if (*(ap = argptr) == NULL) error("arg count"); if ((ifs = bltinlookup("IFS", 1)) == NULL) ifs = " \t\n"; if (tv.tv_sec >= 0) { /* * Wait for something to become available. */ FD_ZERO(&ifds); FD_SET(0, &ifds); status = select(1, &ifds, NULL, NULL, &tv); /* * If there's nothing ready, return an error. */ if (status <= 0) { sig = pendingsig; return (128 + (sig != 0 ? sig : SIGALRM)); } } status = 0; startword = 2; backslash = 0; STARTSTACKSTR(p); lastnonifs = lastnonifsws = -1; for (;;) { nread = read(STDIN_FILENO, &c, 1); if (nread == -1) { if (errno == EINTR) { sig = pendingsig; if (sig == 0) continue; status = 128 + sig; break; } warning("read error: %s", strerror(errno)); status = 2; break; } else if (nread != 1) { status = 1; break; } if (c == '\0') continue; CHECKSTRSPACE(1, p); if (backslash) { backslash = 0; if (c != '\n') { startword = 0; lastnonifs = lastnonifsws = p - stackblock(); USTPUTC(c, p); } continue; } if (!rflag && c == '\\') { backslash++; continue; } if (c == '\n') break; if (strchr(ifs, c)) is_ifs = strchr(" \t\n", c) ? 1 : 2; else is_ifs = 0; if (startword != 0) { if (is_ifs == 1) { /* Ignore leading IFS whitespace */ if (saveall) USTPUTC(c, p); continue; } if (is_ifs == 2 && startword == 1) { /* Only one non-whitespace IFS per word */ startword = 2; if (saveall) { lastnonifsws = p - stackblock(); USTPUTC(c, p); } continue; } } if (is_ifs == 0) { /* append this character to the current variable */ startword = 0; if (saveall) /* Not just a spare terminator */ saveall++; lastnonifs = lastnonifsws = p - stackblock(); USTPUTC(c, p); continue; } /* end of variable... 
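 * (an IFS character was read: the accumulated field is assigned to the
 * next variable below, unless only the last variable remains, in which
 * case it must absorb the rest of the line)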
*/ startword = is_ifs; if (ap[1] == NULL) { /* Last variable needs all IFS chars */ saveall++; if (is_ifs == 2) lastnonifsws = p - stackblock(); USTPUTC(c, p); continue; } STACKSTRNUL(p); setvar(*ap, stackblock(), 0); ap++; STARTSTACKSTR(p); lastnonifs = lastnonifsws = -1; } STACKSTRNUL(p); /* * Remove trailing IFS chars: always remove whitespace, don't remove * non-whitespace unless it was naked */ if (saveall <= 1) lastnonifsws = lastnonifs; stackblock()[lastnonifsws + 1] = '\0'; setvar(*ap, stackblock(), 0); /* Set any remaining args to "" */ while (*++ap != NULL) setvar(*ap, "", 0); return status; } int umaskcmd(int argc __unused, char **argv __unused) { char *ap; int mask; int i; int symbolic_mode = 0; while ((i = nextopt("S")) != '\0') { symbolic_mode = 1; } INTOFF; mask = umask(0); umask(mask); INTON; if ((ap = *argptr) == NULL) { if (symbolic_mode) { char u[4], g[4], o[4]; i = 0; if ((mask & S_IRUSR) == 0) u[i++] = 'r'; if ((mask & S_IWUSR) == 0) u[i++] = 'w'; if ((mask & S_IXUSR) == 0) u[i++] = 'x'; u[i] = '\0'; i = 0; if ((mask & S_IRGRP) == 0) g[i++] = 'r'; if ((mask & S_IWGRP) == 0) g[i++] = 'w'; if ((mask & S_IXGRP) == 0) g[i++] = 'x'; g[i] = '\0'; i = 0; if ((mask & S_IROTH) == 0) o[i++] = 'r'; if ((mask & S_IWOTH) == 0) o[i++] = 'w'; if ((mask & S_IXOTH) == 0) o[i++] = 'x'; o[i] = '\0'; out1fmt("u=%s,g=%s,o=%s\n", u, g, o); } else { out1fmt("%.4o\n", mask); } } else { if (is_digit(*ap)) { mask = 0; do { if (*ap >= '8' || *ap < '0') error("Illegal number: %s", *argptr); mask = (mask << 3) + (*ap - '0'); } while (*++ap != '\0'); umask(mask); } else { void *set; INTOFF; if ((set = setmode (ap)) == 0) error("Illegal number: %s", ap); mask = getmode (set, ~mask & 0777); umask(~mask & 0777); free(set); INTON; } } return 0; } /* * ulimit builtin * * This code, originally by Doug Gwyn, Doug Kingston, Eric Gisin, and * Michael Rendell was ripped from pdksh 5.0.8 and hacked for use with * ash by J.T. Conklin. * * Public domain. 
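 *
 * Illustrative usage of this builtin (editor's sketch, not original text):
 *
 *	ulimit -a	# list every resource with its flag and units
 *	ulimit -H -n	# print the hard limit on open files
 *	ulimit -n 1024	# with no -H/-S, set both soft and hard limits
 *
 * The option letters map to the limits[] table below; this revision also
 * adds an RLIMIT_UMTXP ("umtxp") row there for the process-shared pthread
 * objects introduced by this commit.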
*/ struct limits { const char *name; const char *units; int cmd; int factor; /* multiply by to get rlim_{cur,max} values */ char option; }; static const struct limits limits[] = { #ifdef RLIMIT_CPU { "cpu time", "seconds", RLIMIT_CPU, 1, 't' }, #endif #ifdef RLIMIT_FSIZE { "file size", "512-blocks", RLIMIT_FSIZE, 512, 'f' }, #endif #ifdef RLIMIT_DATA { "data seg size", "kbytes", RLIMIT_DATA, 1024, 'd' }, #endif #ifdef RLIMIT_STACK { "stack size", "kbytes", RLIMIT_STACK, 1024, 's' }, #endif #ifdef RLIMIT_CORE { "core file size", "512-blocks", RLIMIT_CORE, 512, 'c' }, #endif #ifdef RLIMIT_RSS { "max memory size", "kbytes", RLIMIT_RSS, 1024, 'm' }, #endif #ifdef RLIMIT_MEMLOCK { "locked memory", "kbytes", RLIMIT_MEMLOCK, 1024, 'l' }, #endif #ifdef RLIMIT_NPROC { "max user processes", (char *)0, RLIMIT_NPROC, 1, 'u' }, #endif #ifdef RLIMIT_NOFILE { "open files", (char *)0, RLIMIT_NOFILE, 1, 'n' }, #endif #ifdef RLIMIT_VMEM { "virtual mem size", "kbytes", RLIMIT_VMEM, 1024, 'v' }, #endif #ifdef RLIMIT_SWAP { "swap limit", "kbytes", RLIMIT_SWAP, 1024, 'w' }, #endif #ifdef RLIMIT_SBSIZE { "sbsize", "bytes", RLIMIT_SBSIZE, 1, 'b' }, #endif #ifdef RLIMIT_NPTS { "pseudo-terminals", (char *)0, RLIMIT_NPTS, 1, 'p' }, #endif #ifdef RLIMIT_KQUEUES { "kqueues", (char *)0, RLIMIT_KQUEUES, 1, 'k' }, #endif +#ifdef RLIMIT_UMTXP + { "umtxp", (char *)0, RLIMIT_UMTXP, 1, 'o' }, +#endif { (char *) 0, (char *)0, 0, 0, '\0' } }; enum limithow { SOFT = 0x1, HARD = 0x2 }; static void printlimit(enum limithow how, const struct rlimit *limit, const struct limits *l) { rlim_t val = 0; if (how & SOFT) val = limit->rlim_cur; else if (how & HARD) val = limit->rlim_max; if (val == RLIM_INFINITY) out1str("unlimited\n"); else { val /= l->factor; out1fmt("%jd\n", (intmax_t)val); } } int ulimitcmd(int argc __unused, char **argv __unused) { rlim_t val = 0; enum limithow how = SOFT | HARD; const struct limits *l; int set, all = 0; int optc, what; struct rlimit limit; what = 'f'; while ((optc = nextopt("HSatfdsmcnuvlbpwk")) != '\0') switch (optc) { case 'H': how = HARD; break; case 'S': how = SOFT; break; case 'a': all = 1; break; default: what = optc; } for (l = limits; l->name && l->option != what; l++) ; if (!l->name) error("internal error (%c)", what); set = *argptr ? 
1 : 0; if (set) { char *p = *argptr; if (all || argptr[1]) error("too many arguments"); if (strcmp(p, "unlimited") == 0) val = RLIM_INFINITY; else { char *end; uintmax_t uval; if (*p < '0' || *p > '9') error("bad number"); errno = 0; uval = strtoumax(p, &end, 10); if (errno != 0 || *end != '\0') error("bad number"); if (uval > UINTMAX_MAX / l->factor) error("bad number"); uval *= l->factor; val = (rlim_t)uval; if (val < 0 || (uintmax_t)val != uval || val == RLIM_INFINITY) error("bad number"); } } if (all) { for (l = limits; l->name; l++) { char optbuf[40]; if (getrlimit(l->cmd, &limit) < 0) error("can't get limit: %s", strerror(errno)); if (l->units) snprintf(optbuf, sizeof(optbuf), "(%s, -%c) ", l->units, l->option); else snprintf(optbuf, sizeof(optbuf), "(-%c) ", l->option); out1fmt("%-18s %18s ", l->name, optbuf); printlimit(how, &limit, l); } return 0; } if (getrlimit(l->cmd, &limit) < 0) error("can't get limit: %s", strerror(errno)); if (set) { if (how & SOFT) limit.rlim_cur = val; if (how & HARD) limit.rlim_max = val; if (setrlimit(l->cmd, &limit) < 0) error("bad limit: %s", strerror(errno)); } else printlimit(how, &limit, l); return 0; } Index: head/include/pthread.h =================================================================== --- head/include/pthread.h (revision 296161) +++ head/include/pthread.h (revision 296162) @@ -1,339 +1,339 @@ /* * Copyright (c) 1993, 1994 by Chris Provenzano, proven@mit.edu * Copyright (c) 1995-1998 by John Birrell * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Chris Provenzano. * 4. The name of Chris Provenzano may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY CHRIS PROVENZANO ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL CHRIS PROVENZANO BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _PTHREAD_H_ #define _PTHREAD_H_ /* * Header files. */ #include #include #include #include #include #include #include /* * Run-time invariant values: */ #define PTHREAD_DESTRUCTOR_ITERATIONS 4 #define PTHREAD_KEYS_MAX 256 #define PTHREAD_STACK_MIN __MINSIGSTKSZ #define PTHREAD_THREADS_MAX __ULONG_MAX #define PTHREAD_BARRIER_SERIAL_THREAD -1 /* * Flags for threads and thread attributes. 
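 *
 * (Editor's aside, not original header text: as of this revision libthr
 * honors the PTHREAD_PROCESS_SHARED value defined further below, so a
 * sequence such as
 *
 *	pthread_barrierattr_t ba;
 *	pthread_barrierattr_init(&ba);
 *	pthread_barrierattr_setpshared(&ba, PTHREAD_PROCESS_SHARED);
 *
 * now succeeds where it previously returned EINVAL; see the
 * thr_barrierattr.c hunk later in this diff.)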
*/ #define PTHREAD_DETACHED 0x1 #define PTHREAD_SCOPE_SYSTEM 0x2 #define PTHREAD_INHERIT_SCHED 0x4 #define PTHREAD_NOFLOAT 0x8 #define PTHREAD_CREATE_DETACHED PTHREAD_DETACHED #define PTHREAD_CREATE_JOINABLE 0 #define PTHREAD_SCOPE_PROCESS 0 #define PTHREAD_EXPLICIT_SCHED 0 /* - * Flags for read/write lock attributes + * Values for process shared/private attributes. */ #define PTHREAD_PROCESS_PRIVATE 0 #define PTHREAD_PROCESS_SHARED 1 /* * Flags for cancelling threads */ #define PTHREAD_CANCEL_ENABLE 0 #define PTHREAD_CANCEL_DISABLE 1 #define PTHREAD_CANCEL_DEFERRED 0 #define PTHREAD_CANCEL_ASYNCHRONOUS 2 #define PTHREAD_CANCELED ((void *) 1) /* * Flags for once initialization. */ #define PTHREAD_NEEDS_INIT 0 #define PTHREAD_DONE_INIT 1 /* * Static once initialization values. */ #define PTHREAD_ONCE_INIT { PTHREAD_NEEDS_INIT, NULL } /* * Static initialization values. */ #define PTHREAD_MUTEX_INITIALIZER NULL #define PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP ((pthread_mutex_t)1) #define PTHREAD_COND_INITIALIZER NULL #define PTHREAD_RWLOCK_INITIALIZER NULL /* * Default attribute arguments (draft 4, deprecated). */ #ifndef PTHREAD_KERNEL #define pthread_condattr_default NULL #define pthread_mutexattr_default NULL #define pthread_attr_default NULL #endif #define PTHREAD_PRIO_NONE 0 #define PTHREAD_PRIO_INHERIT 1 #define PTHREAD_PRIO_PROTECT 2 /* * Mutex types (Single UNIX Specification, Version 2, 1997). * * Note that a mutex attribute with one of the following types: * * PTHREAD_MUTEX_NORMAL * PTHREAD_MUTEX_RECURSIVE * * will deviate from POSIX specified semantics. */ enum pthread_mutextype { PTHREAD_MUTEX_ERRORCHECK = 1, /* Default POSIX mutex */ PTHREAD_MUTEX_RECURSIVE = 2, /* Recursive mutex */ PTHREAD_MUTEX_NORMAL = 3, /* No error checking */ PTHREAD_MUTEX_ADAPTIVE_NP = 4, /* Adaptive mutex, spins briefly before blocking on lock */ PTHREAD_MUTEX_TYPE_MAX }; #define PTHREAD_MUTEX_DEFAULT PTHREAD_MUTEX_ERRORCHECK struct _pthread_cleanup_info { __uintptr_t pthread_cleanup_pad[8]; }; /* * Thread function prototype definitions: */ __BEGIN_DECLS int pthread_atfork(void (*)(void), void (*)(void), void (*)(void)); int pthread_attr_destroy(pthread_attr_t *) __nonnull(1); int pthread_attr_getstack(const pthread_attr_t * __restrict, void ** __restrict, size_t * __restrict) __nonnull_all; int pthread_attr_getstacksize(const pthread_attr_t *, size_t *) __nonnull_all; int pthread_attr_getguardsize(const pthread_attr_t *, size_t *); int pthread_attr_getstackaddr(const pthread_attr_t *, void **); int pthread_attr_getdetachstate(const pthread_attr_t *, int *) __nonnull_all; int pthread_attr_init(pthread_attr_t *) __nonnull(1); int pthread_attr_setstacksize(pthread_attr_t *, size_t) __nonnull(1); int pthread_attr_setguardsize(pthread_attr_t *, size_t) __nonnull(1); int pthread_attr_setstack(pthread_attr_t *, void *, size_t) __nonnull(1); int pthread_attr_setstackaddr(pthread_attr_t *, void *); int pthread_attr_setdetachstate(pthread_attr_t *, int) __nonnull(1); int pthread_barrier_destroy(pthread_barrier_t *); int pthread_barrier_init(pthread_barrier_t *, const pthread_barrierattr_t *, unsigned); int pthread_barrier_wait(pthread_barrier_t *); int pthread_barrierattr_destroy(pthread_barrierattr_t *); int pthread_barrierattr_getpshared(const pthread_barrierattr_t *, int *); int pthread_barrierattr_init(pthread_barrierattr_t *) __nonnull(1); int pthread_barrierattr_setpshared(pthread_barrierattr_t *, int); #define pthread_cleanup_push(cleanup_routine, cleanup_arg) \ { \ struct _pthread_cleanup_info 
__cleanup_info__; \ __pthread_cleanup_push_imp(cleanup_routine, cleanup_arg,\ &__cleanup_info__); \ { #define pthread_cleanup_pop(execute) \ (void)0; \ } \ __pthread_cleanup_pop_imp(execute); \ } int pthread_condattr_destroy(pthread_condattr_t *) __nonnull(1); int pthread_condattr_getclock(const pthread_condattr_t *, clockid_t *) __nonnull_all; int pthread_condattr_getpshared(const pthread_condattr_t *, int *) __nonnull_all; int pthread_condattr_init(pthread_condattr_t *) __nonnull(1); int pthread_condattr_setclock(pthread_condattr_t *, clockid_t) __nonnull(1); int pthread_condattr_setpshared(pthread_condattr_t *, int) __nonnull(1); int pthread_cond_broadcast(pthread_cond_t *) __nonnull(1); int pthread_cond_destroy(pthread_cond_t *) __nonnull(1); int pthread_cond_init(pthread_cond_t *, const pthread_condattr_t *) __nonnull(1); int pthread_cond_signal(pthread_cond_t *) __nonnull(1); int pthread_cond_timedwait(pthread_cond_t *, pthread_mutex_t *__mutex, const struct timespec *) __nonnull_all __requires_exclusive(*__mutex); int pthread_cond_wait(pthread_cond_t *, pthread_mutex_t *__mutex) __nonnull_all __requires_exclusive(*__mutex); int pthread_create(pthread_t *, const pthread_attr_t *, void *(*) (void *), void *) __nonnull(1) __nonnull(3); int pthread_detach(pthread_t); int pthread_equal(pthread_t, pthread_t); void pthread_exit(void *) __dead2; void *pthread_getspecific(pthread_key_t); int pthread_getcpuclockid(pthread_t, clockid_t *) __nonnull(2); int pthread_join(pthread_t, void **); int pthread_key_create(pthread_key_t *, void (*) (void *)) __nonnull(1); int pthread_key_delete(pthread_key_t); int pthread_mutexattr_init(pthread_mutexattr_t *) __nonnull(1); int pthread_mutexattr_destroy(pthread_mutexattr_t *) __nonnull(1); int pthread_mutexattr_getpshared(const pthread_mutexattr_t *, int *) __nonnull_all; int pthread_mutexattr_gettype(pthread_mutexattr_t *, int *) __nonnull_all; int pthread_mutexattr_settype(pthread_mutexattr_t *, int) __nonnull(1); int pthread_mutexattr_setpshared(pthread_mutexattr_t *, int) __nonnull(1); int pthread_mutex_destroy(pthread_mutex_t *__mutex) __nonnull(1) __requires_unlocked(*__mutex); int pthread_mutex_init(pthread_mutex_t *__mutex, const pthread_mutexattr_t *) __nonnull(1) __requires_unlocked(*__mutex); int pthread_mutex_lock(pthread_mutex_t *__mutex) __nonnull(1) __locks_exclusive(*__mutex); int pthread_mutex_trylock(pthread_mutex_t *__mutex) __nonnull(1) __trylocks_exclusive(0, *__mutex); int pthread_mutex_timedlock(pthread_mutex_t *__mutex, const struct timespec *) __nonnull_all __trylocks_exclusive(0, *__mutex); int pthread_mutex_unlock(pthread_mutex_t *__mutex) __nonnull(1) __unlocks(*__mutex); int pthread_once(pthread_once_t *, void (*) (void)) __nonnull_all; int pthread_rwlock_destroy(pthread_rwlock_t *__rwlock) __nonnull(1) __requires_unlocked(*__rwlock); int pthread_rwlock_init(pthread_rwlock_t *__rwlock, const pthread_rwlockattr_t *) __nonnull(1) __requires_unlocked(*__rwlock); int pthread_rwlock_rdlock(pthread_rwlock_t *__rwlock) __nonnull(1) __locks_shared(*__rwlock); int pthread_rwlock_timedrdlock(pthread_rwlock_t *__rwlock, const struct timespec *) __nonnull_all __trylocks_shared(0, *__rwlock); int pthread_rwlock_timedwrlock(pthread_rwlock_t *__rwlock, const struct timespec *) __nonnull_all __trylocks_exclusive(0, *__rwlock); int pthread_rwlock_tryrdlock(pthread_rwlock_t *__rwlock) __nonnull(1) __trylocks_shared(0, *__rwlock); int pthread_rwlock_trywrlock(pthread_rwlock_t *__rwlock) __nonnull(1) __trylocks_exclusive(0, *__rwlock); int 
pthread_rwlock_unlock(pthread_rwlock_t *__rwlock) __nonnull(1) __unlocks(*__rwlock); int pthread_rwlock_wrlock(pthread_rwlock_t *__rwlock) __nonnull(1) __locks_exclusive(*__rwlock); int pthread_rwlockattr_destroy(pthread_rwlockattr_t *) __nonnull(1); int pthread_rwlockattr_getkind_np(const pthread_rwlockattr_t *, int *); int pthread_rwlockattr_getpshared(const pthread_rwlockattr_t *, int *) __nonnull_all; int pthread_rwlockattr_init(pthread_rwlockattr_t *) __nonnull(1); int pthread_rwlockattr_setkind_np(pthread_rwlockattr_t *, int); int pthread_rwlockattr_setpshared(pthread_rwlockattr_t *, int) __nonnull(1); pthread_t pthread_self(void); int pthread_setspecific(pthread_key_t, const void *); int pthread_spin_init(pthread_spinlock_t *__spin, int) __requires_unlocked(*__spin); int pthread_spin_destroy(pthread_spinlock_t *__spin) __requires_unlocked(*__spin); int pthread_spin_lock(pthread_spinlock_t *__spin) __locks_exclusive(*__spin); int pthread_spin_trylock(pthread_spinlock_t *__spin) __trylocks_exclusive(0, *__spin); int pthread_spin_unlock(pthread_spinlock_t *__spin) __unlocks(*__spin); int pthread_cancel(pthread_t); int pthread_setcancelstate(int, int *); int pthread_setcanceltype(int, int *); void pthread_testcancel(void); #if __BSD_VISIBLE int pthread_getprio(pthread_t); int pthread_setprio(pthread_t, int); void pthread_yield(void); #endif int pthread_mutexattr_getprioceiling(pthread_mutexattr_t *, int *); int pthread_mutexattr_setprioceiling(pthread_mutexattr_t *, int); int pthread_mutex_getprioceiling(pthread_mutex_t *, int *); int pthread_mutex_setprioceiling(pthread_mutex_t *, int, int *); int pthread_mutexattr_getprotocol(pthread_mutexattr_t *, int *); int pthread_mutexattr_setprotocol(pthread_mutexattr_t *, int); int pthread_attr_getinheritsched(const pthread_attr_t *, int *); int pthread_attr_getschedparam(const pthread_attr_t *, struct sched_param *) __nonnull_all; int pthread_attr_getschedpolicy(const pthread_attr_t *, int *) __nonnull_all; int pthread_attr_getscope(const pthread_attr_t *, int *) __nonnull_all; int pthread_attr_setinheritsched(pthread_attr_t *, int); int pthread_attr_setschedparam(pthread_attr_t *, const struct sched_param *) __nonnull(1) __nonnull(2); int pthread_attr_setschedpolicy(pthread_attr_t *, int) __nonnull(1); int pthread_attr_setscope(pthread_attr_t *, int) __nonnull(1); int pthread_getschedparam(pthread_t pthread, int *, struct sched_param *) __nonnull(2) __nonnull(3); int pthread_setschedparam(pthread_t, int, const struct sched_param *) __nonnull(3); #if __XSI_VISIBLE int pthread_getconcurrency(void); int pthread_setconcurrency(int); #endif void __pthread_cleanup_push_imp(void (*)(void *), void *, struct _pthread_cleanup_info *); void __pthread_cleanup_pop_imp(int); __END_DECLS #endif Index: head/include/unistd.h =================================================================== --- head/include/unistd.h (revision 296161) +++ head/include/unistd.h (revision 296162) @@ -1,592 +1,592 @@ /*- * Copyright (c) 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)unistd.h 8.12 (Berkeley) 4/27/95 * $FreeBSD$ */ #ifndef _UNISTD_H_ #define _UNISTD_H_ #include #include /* XXX adds too much pollution. */ #include #include #include #ifndef _GID_T_DECLARED typedef __gid_t gid_t; #define _GID_T_DECLARED #endif #ifndef _OFF_T_DECLARED typedef __off_t off_t; #define _OFF_T_DECLARED #endif #ifndef _PID_T_DECLARED typedef __pid_t pid_t; #define _PID_T_DECLARED #endif #ifndef _SIZE_T_DECLARED typedef __size_t size_t; #define _SIZE_T_DECLARED #endif #ifndef _SSIZE_T_DECLARED typedef __ssize_t ssize_t; #define _SSIZE_T_DECLARED #endif #ifndef _UID_T_DECLARED typedef __uid_t uid_t; #define _UID_T_DECLARED #endif #ifndef _USECONDS_T_DECLARED typedef __useconds_t useconds_t; #define _USECONDS_T_DECLARED #endif #define STDIN_FILENO 0 /* standard input file descriptor */ #define STDOUT_FILENO 1 /* standard output file descriptor */ #define STDERR_FILENO 2 /* standard error file descriptor */ #if __XSI_VISIBLE || __POSIX_VISIBLE >= 200112 #define F_ULOCK 0 /* unlock locked section */ #define F_LOCK 1 /* lock a section for exclusive use */ #define F_TLOCK 2 /* test and lock a section for exclusive use */ #define F_TEST 3 /* test a section for locks by other procs */ #endif /* * POSIX options and option groups we unconditionally do or don't * implement. This list includes those options which are exclusively * implemented (or not) in user mode. Please keep this list in * alphabetical order. * * Anything which is defined as zero below **must** have an * implementation for the corresponding sysconf() which is able to * determine conclusively whether or not the feature is supported. * Anything which is defined as other than -1 below **must** have * complete headers, types, and function declarations as specified by * the POSIX standard; however, if the relevant sysconf() function * returns -1, the functions may be stubbed out. 
*/ #define _POSIX_BARRIERS 200112L #define _POSIX_CPUTIME 200112L #define _POSIX_READER_WRITER_LOCKS 200112L #define _POSIX_REGEXP 1 #define _POSIX_SHELL 1 #define _POSIX_SPAWN 200112L #define _POSIX_SPIN_LOCKS 200112L #define _POSIX_THREAD_ATTR_STACKADDR 200112L #define _POSIX_THREAD_ATTR_STACKSIZE 200112L #define _POSIX_THREAD_CPUTIME 200112L #define _POSIX_THREAD_PRIO_INHERIT 200112L #define _POSIX_THREAD_PRIO_PROTECT 200112L #define _POSIX_THREAD_PRIORITY_SCHEDULING 200112L -#define _POSIX_THREAD_PROCESS_SHARED -1 +#define _POSIX_THREAD_PROCESS_SHARED 200112L #define _POSIX_THREAD_SAFE_FUNCTIONS -1 #define _POSIX_THREAD_SPORADIC_SERVER -1 #define _POSIX_THREADS 200112L #define _POSIX_TRACE -1 #define _POSIX_TRACE_EVENT_FILTER -1 #define _POSIX_TRACE_INHERIT -1 #define _POSIX_TRACE_LOG -1 #define _POSIX2_C_BIND 200112L /* mandatory */ #define _POSIX2_C_DEV -1 /* need c99 utility */ #define _POSIX2_CHAR_TERM 1 #define _POSIX2_FORT_DEV -1 /* need fort77 utility */ #define _POSIX2_FORT_RUN 200112L #define _POSIX2_LOCALEDEF -1 #define _POSIX2_PBS -1 #define _POSIX2_PBS_ACCOUNTING -1 #define _POSIX2_PBS_CHECKPOINT -1 #define _POSIX2_PBS_LOCATE -1 #define _POSIX2_PBS_MESSAGE -1 #define _POSIX2_PBS_TRACK -1 #define _POSIX2_SW_DEV -1 /* XXX ??? */ #define _POSIX2_UPE 200112L #define _V6_ILP32_OFF32 -1 #define _V6_ILP32_OFFBIG 0 #define _V6_LP64_OFF64 0 #define _V6_LPBIG_OFFBIG -1 #if __XSI_VISIBLE #define _XOPEN_CRYPT -1 /* XXX ??? */ #define _XOPEN_ENH_I18N -1 /* mandatory in XSI */ #define _XOPEN_LEGACY -1 #define _XOPEN_REALTIME -1 #define _XOPEN_REALTIME_THREADS -1 #define _XOPEN_UNIX -1 #endif /* Define the POSIX.2 version we target for compliance. */ #define _POSIX2_VERSION 199212L /* * POSIX-style system configuration variable accessors (for the * sysconf function). The kernel does not directly implement the * sysconf() interface; rather, a C library stub translates references * to sysconf() into calls to sysctl() using a giant switch statement. * Those that are marked `user' are implemented entirely in the C * library and never query the kernel. pathconf() is implemented * directly by the kernel so those are not defined here. 
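 *
 * (Editor's note: _POSIX_THREAD_PROCESS_SHARED flips from -1 to 200112L
 * in this revision. A program can confirm the option at run time before
 * relying on process-shared pthread objects, e.g.
 *
 *	long ps = sysconf(_SC_THREAD_PROCESS_SHARED);
 *	int have_pshared = ps > 0;
 *
 * using the _SC_THREAD_PROCESS_SHARED key defined below.)
 */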
*/ #define _SC_ARG_MAX 1 #define _SC_CHILD_MAX 2 #define _SC_CLK_TCK 3 #define _SC_NGROUPS_MAX 4 #define _SC_OPEN_MAX 5 #define _SC_JOB_CONTROL 6 #define _SC_SAVED_IDS 7 #define _SC_VERSION 8 #define _SC_BC_BASE_MAX 9 /* user */ #define _SC_BC_DIM_MAX 10 /* user */ #define _SC_BC_SCALE_MAX 11 /* user */ #define _SC_BC_STRING_MAX 12 /* user */ #define _SC_COLL_WEIGHTS_MAX 13 /* user */ #define _SC_EXPR_NEST_MAX 14 /* user */ #define _SC_LINE_MAX 15 /* user */ #define _SC_RE_DUP_MAX 16 /* user */ #define _SC_2_VERSION 17 /* user */ #define _SC_2_C_BIND 18 /* user */ #define _SC_2_C_DEV 19 /* user */ #define _SC_2_CHAR_TERM 20 /* user */ #define _SC_2_FORT_DEV 21 /* user */ #define _SC_2_FORT_RUN 22 /* user */ #define _SC_2_LOCALEDEF 23 /* user */ #define _SC_2_SW_DEV 24 /* user */ #define _SC_2_UPE 25 /* user */ #define _SC_STREAM_MAX 26 /* user */ #define _SC_TZNAME_MAX 27 /* user */ #if __POSIX_VISIBLE >= 199309 #define _SC_ASYNCHRONOUS_IO 28 #define _SC_MAPPED_FILES 29 #define _SC_MEMLOCK 30 #define _SC_MEMLOCK_RANGE 31 #define _SC_MEMORY_PROTECTION 32 #define _SC_MESSAGE_PASSING 33 #define _SC_PRIORITIZED_IO 34 #define _SC_PRIORITY_SCHEDULING 35 #define _SC_REALTIME_SIGNALS 36 #define _SC_SEMAPHORES 37 #define _SC_FSYNC 38 #define _SC_SHARED_MEMORY_OBJECTS 39 #define _SC_SYNCHRONIZED_IO 40 #define _SC_TIMERS 41 #define _SC_AIO_LISTIO_MAX 42 #define _SC_AIO_MAX 43 #define _SC_AIO_PRIO_DELTA_MAX 44 #define _SC_DELAYTIMER_MAX 45 #define _SC_MQ_OPEN_MAX 46 #define _SC_PAGESIZE 47 #define _SC_RTSIG_MAX 48 #define _SC_SEM_NSEMS_MAX 49 #define _SC_SEM_VALUE_MAX 50 #define _SC_SIGQUEUE_MAX 51 #define _SC_TIMER_MAX 52 #endif #if __POSIX_VISIBLE >= 200112 #define _SC_2_PBS 59 /* user */ #define _SC_2_PBS_ACCOUNTING 60 /* user */ #define _SC_2_PBS_CHECKPOINT 61 /* user */ #define _SC_2_PBS_LOCATE 62 /* user */ #define _SC_2_PBS_MESSAGE 63 /* user */ #define _SC_2_PBS_TRACK 64 /* user */ #define _SC_ADVISORY_INFO 65 #define _SC_BARRIERS 66 /* user */ #define _SC_CLOCK_SELECTION 67 #define _SC_CPUTIME 68 #define _SC_FILE_LOCKING 69 #define _SC_GETGR_R_SIZE_MAX 70 /* user */ #define _SC_GETPW_R_SIZE_MAX 71 /* user */ #define _SC_HOST_NAME_MAX 72 #define _SC_LOGIN_NAME_MAX 73 #define _SC_MONOTONIC_CLOCK 74 #define _SC_MQ_PRIO_MAX 75 #define _SC_READER_WRITER_LOCKS 76 /* user */ #define _SC_REGEXP 77 /* user */ #define _SC_SHELL 78 /* user */ #define _SC_SPAWN 79 /* user */ #define _SC_SPIN_LOCKS 80 /* user */ #define _SC_SPORADIC_SERVER 81 #define _SC_THREAD_ATTR_STACKADDR 82 /* user */ #define _SC_THREAD_ATTR_STACKSIZE 83 /* user */ #define _SC_THREAD_CPUTIME 84 /* user */ #define _SC_THREAD_DESTRUCTOR_ITERATIONS 85 /* user */ #define _SC_THREAD_KEYS_MAX 86 /* user */ #define _SC_THREAD_PRIO_INHERIT 87 /* user */ #define _SC_THREAD_PRIO_PROTECT 88 /* user */ #define _SC_THREAD_PRIORITY_SCHEDULING 89 /* user */ #define _SC_THREAD_PROCESS_SHARED 90 /* user */ #define _SC_THREAD_SAFE_FUNCTIONS 91 /* user */ #define _SC_THREAD_SPORADIC_SERVER 92 /* user */ #define _SC_THREAD_STACK_MIN 93 /* user */ #define _SC_THREAD_THREADS_MAX 94 /* user */ #define _SC_TIMEOUTS 95 /* user */ #define _SC_THREADS 96 /* user */ #define _SC_TRACE 97 /* user */ #define _SC_TRACE_EVENT_FILTER 98 /* user */ #define _SC_TRACE_INHERIT 99 /* user */ #define _SC_TRACE_LOG 100 /* user */ #define _SC_TTY_NAME_MAX 101 /* user */ #define _SC_TYPED_MEMORY_OBJECTS 102 #define _SC_V6_ILP32_OFF32 103 /* user */ #define _SC_V6_ILP32_OFFBIG 104 /* user */ #define _SC_V6_LP64_OFF64 105 /* user */ #define _SC_V6_LPBIG_OFFBIG 106 /* user */ 
#define _SC_IPV6 118 #define _SC_RAW_SOCKETS 119 #define _SC_SYMLOOP_MAX 120 #endif #if __XSI_VISIBLE #define _SC_ATEXIT_MAX 107 /* user */ #define _SC_IOV_MAX 56 #define _SC_PAGE_SIZE _SC_PAGESIZE #define _SC_XOPEN_CRYPT 108 /* user */ #define _SC_XOPEN_ENH_I18N 109 /* user */ #define _SC_XOPEN_LEGACY 110 /* user */ #define _SC_XOPEN_REALTIME 111 #define _SC_XOPEN_REALTIME_THREADS 112 #define _SC_XOPEN_SHM 113 #define _SC_XOPEN_STREAMS 114 #define _SC_XOPEN_UNIX 115 #define _SC_XOPEN_VERSION 116 #define _SC_XOPEN_XCU_VERSION 117 /* user */ #endif #if __BSD_VISIBLE #define _SC_NPROCESSORS_CONF 57 #define _SC_NPROCESSORS_ONLN 58 #define _SC_CPUSET_SIZE 122 #endif /* Extensions found in Solaris and Linux. */ #define _SC_PHYS_PAGES 121 /* Keys for the confstr(3) function. */ #if __POSIX_VISIBLE >= 199209 #define _CS_PATH 1 /* default value of PATH */ #endif #if __POSIX_VISIBLE >= 200112 #define _CS_POSIX_V6_ILP32_OFF32_CFLAGS 2 #define _CS_POSIX_V6_ILP32_OFF32_LDFLAGS 3 #define _CS_POSIX_V6_ILP32_OFF32_LIBS 4 #define _CS_POSIX_V6_ILP32_OFFBIG_CFLAGS 5 #define _CS_POSIX_V6_ILP32_OFFBIG_LDFLAGS 6 #define _CS_POSIX_V6_ILP32_OFFBIG_LIBS 7 #define _CS_POSIX_V6_LP64_OFF64_CFLAGS 8 #define _CS_POSIX_V6_LP64_OFF64_LDFLAGS 9 #define _CS_POSIX_V6_LP64_OFF64_LIBS 10 #define _CS_POSIX_V6_LPBIG_OFFBIG_CFLAGS 11 #define _CS_POSIX_V6_LPBIG_OFFBIG_LDFLAGS 12 #define _CS_POSIX_V6_LPBIG_OFFBIG_LIBS 13 #define _CS_POSIX_V6_WIDTH_RESTRICTED_ENVS 14 #endif __BEGIN_DECLS /* 1003.1-1990 */ void _exit(int) __dead2; int access(const char *, int); unsigned int alarm(unsigned int); int chdir(const char *); int chown(const char *, uid_t, gid_t); int close(int); void closefrom(int); int dup(int); int dup2(int, int); int execl(const char *, const char *, ...) __null_sentinel; int execle(const char *, const char *, ...); int execlp(const char *, const char *, ...) __null_sentinel; int execv(const char *, char * const *); int execve(const char *, char * const *, char * const *); int execvp(const char *, char * const *); pid_t fork(void); long fpathconf(int, int); char *getcwd(char *, size_t); gid_t getegid(void); uid_t geteuid(void); gid_t getgid(void); int getgroups(int, gid_t []); char *getlogin(void); pid_t getpgrp(void); pid_t getpid(void); pid_t getppid(void); uid_t getuid(void); int isatty(int); int link(const char *, const char *); #ifndef _LSEEK_DECLARED #define _LSEEK_DECLARED off_t lseek(int, off_t, int); #endif long pathconf(const char *, int); int pause(void); int pipe(int *); ssize_t read(int, void *, size_t); int rmdir(const char *); int setgid(gid_t); int setpgid(pid_t, pid_t); pid_t setsid(void); int setuid(uid_t); unsigned int sleep(unsigned int); long sysconf(int); pid_t tcgetpgrp(int); int tcsetpgrp(int, pid_t); char *ttyname(int); int ttyname_r(int, char *, size_t); int unlink(const char *); ssize_t write(int, const void *, size_t); /* 1003.2-1992 */ #if __POSIX_VISIBLE >= 199209 || __XSI_VISIBLE size_t confstr(int, char *, size_t); #ifndef _GETOPT_DECLARED #define _GETOPT_DECLARED int getopt(int, char * const [], const char *); extern char *optarg; /* getopt(3) external variables */ extern int optind, opterr, optopt; #endif /* _GETOPT_DECLARED */ #endif /* ISO/IEC 9945-1: 1996 */ #if __POSIX_VISIBLE >= 199506 || __XSI_VISIBLE int fsync(int); /* * ftruncate() was in the POSIX Realtime Extension (it's used for shared * memory), but truncate() was not. 
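 *
 * (Editor's illustration with hypothetical names: the usual shared-memory
 * setup that depends on ftruncate() being declared here:
 *
 *	int fd = shm_open("/my_region", O_RDWR | O_CREAT, 0600);
 *	ftruncate(fd, (off_t)region_size);
 *	void *p = mmap(NULL, region_size, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);
 *
 * "/my_region" and region_size are placeholders, not names from this
 * commit.)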
*/ #ifndef _FTRUNCATE_DECLARED #define _FTRUNCATE_DECLARED int ftruncate(int, off_t); #endif #endif #if __POSIX_VISIBLE >= 199506 int getlogin_r(char *, int); #endif /* 1003.1-2001 */ #if __POSIX_VISIBLE >= 200112 || __XSI_VISIBLE int fchown(int, uid_t, gid_t); ssize_t readlink(const char * __restrict, char * __restrict, size_t); #endif #if __POSIX_VISIBLE >= 200112 int gethostname(char *, size_t); int setegid(gid_t); int seteuid(uid_t); #endif /* 1003.1-2008 */ #if __POSIX_VISIBLE >= 200809 || __XSI_VISIBLE int getsid(pid_t _pid); int fchdir(int); int getpgid(pid_t _pid); int lchown(const char *, uid_t, gid_t); ssize_t pread(int, void *, size_t, off_t); ssize_t pwrite(int, const void *, size_t, off_t); /* See comment at ftruncate() above. */ #ifndef _TRUNCATE_DECLARED #define _TRUNCATE_DECLARED int truncate(const char *, off_t); #endif #endif /* __POSIX_VISIBLE >= 200809 || __XSI_VISIBLE */ #if __POSIX_VISIBLE >= 200809 int faccessat(int, const char *, int, int); int fchownat(int, const char *, uid_t, gid_t, int); int fexecve(int, char *const [], char *const []); int linkat(int, const char *, int, const char *, int); ssize_t readlinkat(int, const char * __restrict, char * __restrict, size_t); int symlinkat(const char *, int, const char *); int unlinkat(int, const char *, int); #endif /* __POSIX_VISIBLE >= 200809 */ /* * symlink() was originally in POSIX.1a, which was withdrawn after * being overtaken by events (1003.1-2001). It was in XPG4.2, and of * course has been in BSD since 4.2. */ #if __POSIX_VISIBLE >= 200112 || __XSI_VISIBLE >= 402 int symlink(const char * __restrict, const char * __restrict); #endif /* X/Open System Interfaces */ #if __XSI_VISIBLE char *crypt(const char *, const char *); /* char *ctermid(char *); */ /* XXX ??? */ int encrypt(char *, int); long gethostid(void); int lockf(int, int, off_t); int nice(int); int setregid(gid_t, gid_t); int setreuid(uid_t, uid_t); #ifndef _SWAB_DECLARED #define _SWAB_DECLARED void swab(const void * __restrict, void * __restrict, ssize_t); #endif /* _SWAB_DECLARED */ void sync(void); #endif /* __XSI_VISIBLE */ #if (__XSI_VISIBLE && __XSI_VISIBLE <= 500) || __BSD_VISIBLE int brk(const void *); int chroot(const char *); int getdtablesize(void); int getpagesize(void) __pure2; char *getpass(const char *); void *sbrk(intptr_t); #endif #if (__XSI_VISIBLE && __XSI_VISIBLE <= 600) || __BSD_VISIBLE char *getwd(char *); /* obsoleted by getcwd() */ useconds_t ualarm(useconds_t, useconds_t); int usleep(useconds_t); pid_t vfork(void) __returns_twice; #endif #if __BSD_VISIBLE struct timeval; /* select(2) */ int acct(const char *); int async_daemon(void); int check_utility_compat(const char *); const char * crypt_get_format(void); int crypt_set_format(const char *); int des_cipher(const char *, char *, long, int); int des_setkey(const char *key); int dup3(int, int, int); int eaccess(const char *, int); void endusershell(void); int exect(const char *, char * const *, char * const *); int execvP(const char *, const char *, char * const *); int feature_present(const char *); char *fflagstostr(u_long); int getdomainname(char *, int); int getgrouplist(const char *, gid_t, gid_t *, int *); int getloginclass(char *, size_t); mode_t getmode(const void *, mode_t); int getosreldate(void); int getpeereid(int, uid_t *, gid_t *); int getresgid(gid_t *, gid_t *, gid_t *); int getresuid(uid_t *, uid_t *, uid_t *); char *getusershell(void); int initgroups(const char *, gid_t); int iruserok(unsigned long, int, const char *, const char *); int iruserok_sa(const void *, 
int, int, const char *, const char *); int issetugid(void); void __FreeBSD_libc_enter_restricted_mode(void); long lpathconf(const char *, int); #ifndef _MKDTEMP_DECLARED char *mkdtemp(char *); #define _MKDTEMP_DECLARED #endif #ifndef _MKNOD_DECLARED int mknod(const char *, mode_t, dev_t); #define _MKNOD_DECLARED #endif #ifndef _MKSTEMP_DECLARED int mkstemp(char *); #define _MKSTEMP_DECLARED #endif int mkstemps(char *, int); #ifndef _MKTEMP_DECLARED char *mktemp(char *); #define _MKTEMP_DECLARED #endif int nfssvc(int, void *); int nlm_syscall(int, int, int, char **); int pipe2(int *, int); int profil(char *, size_t, vm_offset_t, int); int rcmd(char **, int, const char *, const char *, const char *, int *); int rcmd_af(char **, int, const char *, const char *, const char *, int *, int); int rcmdsh(char **, int, const char *, const char *, const char *, const char *); char *re_comp(const char *); int re_exec(const char *); int reboot(int); int revoke(const char *); pid_t rfork(int); pid_t rfork_thread(int, void *, int (*)(void *), void *); int rresvport(int *); int rresvport_af(int *, int); int ruserok(const char *, int, const char *, const char *); #if __BSD_VISIBLE #ifndef _SELECT_DECLARED #define _SELECT_DECLARED int select(int, fd_set *, fd_set *, fd_set *, struct timeval *); #endif #endif int setdomainname(const char *, int); int setgroups(int, const gid_t *); void sethostid(long); int sethostname(const char *, int); #ifndef _SETKEY_DECLARED int setkey(const char *); #define _SETKEY_DECLARED #endif int setlogin(const char *); int setloginclass(const char *); void *setmode(const char *); int setpgrp(pid_t, pid_t); /* obsoleted by setpgid() */ void setproctitle(const char *_fmt, ...) __printf0like(1, 2); int setresgid(gid_t, gid_t, gid_t); int setresuid(uid_t, uid_t, uid_t); int setrgid(gid_t); int setruid(uid_t); void setusershell(void); int strtofflags(char **, u_long *, u_long *); int swapon(const char *); int swapoff(const char *); int syscall(int, ...); off_t __syscall(quad_t, ...); int undelete(const char *); int unwhiteout(const char *); void *valloc(size_t); /* obsoleted by malloc() */ #ifndef _OPTRESET_DECLARED #define _OPTRESET_DECLARED extern int optreset; /* getopt(3) external variable */ #endif #endif /* __BSD_VISIBLE */ __END_DECLS #endif /* !_UNISTD_H_ */ Index: head/lib/libthr/thread/Makefile.inc =================================================================== --- head/lib/libthr/thread/Makefile.inc (revision 296161) +++ head/lib/libthr/thread/Makefile.inc (revision 296162) @@ -1,59 +1,60 @@ # $FreeBSD$ # thr sources .PATH: ${.CURDIR}/thread SRCS+= \ thr_affinity.c \ thr_attr.c \ thr_barrier.c \ thr_barrierattr.c \ thr_cancel.c \ thr_clean.c \ thr_concurrency.c \ thr_cond.c \ thr_condattr.c \ thr_create.c \ thr_ctrdtr.c \ thr_detach.c \ thr_equal.c \ thr_event.c \ thr_exit.c \ thr_fork.c \ thr_getprio.c \ thr_getcpuclockid.c \ thr_getschedparam.c \ thr_getthreadid_np.c \ thr_info.c \ thr_init.c \ thr_join.c \ thr_list.c \ thr_kern.c \ thr_kill.c \ thr_main_np.c \ thr_multi_np.c \ thr_mutex.c \ thr_mutexattr.c \ thr_once.c \ thr_printf.c \ + thr_pshared.c \ thr_pspinlock.c \ thr_resume_np.c \ thr_rtld.c \ thr_rwlock.c \ thr_rwlockattr.c \ thr_self.c \ thr_sem.c \ thr_setprio.c \ thr_setschedparam.c \ thr_sig.c \ thr_single_np.c \ thr_sleepq.c \ thr_spec.c \ thr_spinlock.c \ thr_stack.c \ thr_syscalls.c \ thr_suspend_np.c \ thr_switch_np.c \ thr_symbols.c \ thr_umtx.c \ thr_yield.c Index: head/lib/libthr/thread/thr_barrier.c 
=================================================================== --- head/lib/libthr/thread/thr_barrier.c (revision 296161) +++ head/lib/libthr/thread/thr_barrier.c (revision 296162) @@ -1,135 +1,168 @@ /*- * Copyright (c) 2003 David Xu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "namespace.h" #include #include #include #include "un-namespace.h" #include "thr_private.h" __weak_reference(_pthread_barrier_init, pthread_barrier_init); __weak_reference(_pthread_barrier_wait, pthread_barrier_wait); __weak_reference(_pthread_barrier_destroy, pthread_barrier_destroy); int _pthread_barrier_destroy(pthread_barrier_t *barrier) { - pthread_barrier_t bar; - struct pthread *curthread; + pthread_barrier_t bar; + struct pthread *curthread; + int pshared; if (barrier == NULL || *barrier == NULL) return (EINVAL); + if (*barrier == THR_PSHARED_PTR) { + bar = __thr_pshared_offpage(barrier, 0); + if (bar == NULL) { + *barrier = NULL; + return (0); + } + pshared = 1; + } else { + bar = *barrier; + pshared = 0; + } curthread = _get_curthread(); - bar = *barrier; THR_UMUTEX_LOCK(curthread, &bar->b_lock); if (bar->b_destroying) { THR_UMUTEX_UNLOCK(curthread, &bar->b_lock); return (EBUSY); } bar->b_destroying = 1; do { if (bar->b_waiters > 0) { bar->b_destroying = 0; THR_UMUTEX_UNLOCK(curthread, &bar->b_lock); return (EBUSY); } if (bar->b_refcount != 0) { _thr_ucond_wait(&bar->b_cv, &bar->b_lock, NULL, 0); THR_UMUTEX_LOCK(curthread, &bar->b_lock); } else break; } while (1); bar->b_destroying = 0; THR_UMUTEX_UNLOCK(curthread, &bar->b_lock); *barrier = NULL; - free(bar); + if (pshared) + __thr_pshared_destroy(barrier); + else + free(bar); return (0); } int _pthread_barrier_init(pthread_barrier_t *barrier, - const pthread_barrierattr_t *attr, unsigned count) + const pthread_barrierattr_t *attr, unsigned count) { - pthread_barrier_t bar; + pthread_barrier_t bar; + int pshared; - (void)attr; - if (barrier == NULL || count <= 0) return (EINVAL); - bar = calloc(1, sizeof(struct pthread_barrier)); - if (bar == NULL) - return (ENOMEM); + if (attr == NULL || *attr == NULL || + (*attr)->pshared == PTHREAD_PROCESS_PRIVATE) { + bar = calloc(1, sizeof(struct pthread_barrier)); + if (bar == NULL) + return (ENOMEM); + *barrier = bar; + pshared = 0; + } else { + bar = 
__thr_pshared_offpage(barrier, 1);
+		if (bar == NULL)
+			return (EFAULT);
+		*barrier = THR_PSHARED_PTR;
+		pshared = 1;
+	}
	_thr_umutex_init(&bar->b_lock);
	_thr_ucond_init(&bar->b_cv);
-	bar->b_count = count;
-	*barrier = bar;
-
+	if (pshared) {
+		bar->b_lock.m_flags |= USYNC_PROCESS_SHARED;
+		bar->b_cv.c_flags |= USYNC_PROCESS_SHARED;
+	}
+	bar->b_count = count;
	return (0);
}

int
_pthread_barrier_wait(pthread_barrier_t *barrier)
{
-	struct pthread *curthread = _get_curthread();
+	struct pthread *curthread;
	pthread_barrier_t bar;
	int64_t cycle;
	int ret;

	if (barrier == NULL || *barrier == NULL)
		return (EINVAL);

-	bar = *barrier;
+	if (*barrier == THR_PSHARED_PTR) {
+		bar = __thr_pshared_offpage(barrier, 0);
+		if (bar == NULL)
+			return (EINVAL);
+	} else {
+		bar = *barrier;
+	}
+	curthread = _get_curthread();
	THR_UMUTEX_LOCK(curthread, &bar->b_lock);
	if (++bar->b_waiters == bar->b_count) {
		/* Current thread is the last to reach the barrier */
		bar->b_waiters = 0;
		bar->b_cycle++;
		_thr_ucond_broadcast(&bar->b_cv);
		THR_UMUTEX_UNLOCK(curthread, &bar->b_lock);
		ret = PTHREAD_BARRIER_SERIAL_THREAD;
	} else {
		cycle = bar->b_cycle;
		bar->b_refcount++;
		do {
			_thr_ucond_wait(&bar->b_cv, &bar->b_lock, NULL, 0);
			THR_UMUTEX_LOCK(curthread, &bar->b_lock);
			/* test cycle to avoid bogus wakeup */
		} while (cycle == bar->b_cycle);
		if (--bar->b_refcount == 0 && bar->b_destroying)
			_thr_ucond_broadcast(&bar->b_cv);
		THR_UMUTEX_UNLOCK(curthread, &bar->b_lock);
		ret = 0;
	}
	return (ret);
}

Index: head/lib/libthr/thread/thr_barrierattr.c
===================================================================
--- head/lib/libthr/thread/thr_barrierattr.c	(revision 296161)
+++ head/lib/libthr/thread/thr_barrierattr.c	(revision 296162)
@@ -1,96 +1,94 @@
/*
 * Copyright (c) 2003 David Xu .
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice(s), this list of conditions and the following disclaimer as
 *    the first lines of this file unmodified other than the possible
 *    addition of one or more copyright notices.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice(s), this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
* * $FreeBSD$ */ #include "namespace.h" #include #include #include #include "un-namespace.h" #include "thr_private.h" __weak_reference(_pthread_barrierattr_destroy, pthread_barrierattr_destroy); __weak_reference(_pthread_barrierattr_init, pthread_barrierattr_init); __weak_reference(_pthread_barrierattr_setpshared, pthread_barrierattr_setpshared); __weak_reference(_pthread_barrierattr_getpshared, pthread_barrierattr_getpshared); int _pthread_barrierattr_destroy(pthread_barrierattr_t *attr) { if (attr == NULL || *attr == NULL) return (EINVAL); free(*attr); return (0); } int _pthread_barrierattr_getpshared(const pthread_barrierattr_t *attr, - int *pshared) + int *pshared) { if (attr == NULL || *attr == NULL) return (EINVAL); *pshared = (*attr)->pshared; return (0); } int _pthread_barrierattr_init(pthread_barrierattr_t *attr) { if (attr == NULL) return (EINVAL); if ((*attr = malloc(sizeof(struct pthread_barrierattr))) == NULL) return (ENOMEM); (*attr)->pshared = PTHREAD_PROCESS_PRIVATE; return (0); } int _pthread_barrierattr_setpshared(pthread_barrierattr_t *attr, int pshared) { - if (attr == NULL || *attr == NULL) - return (EINVAL); - - /* Only PTHREAD_PROCESS_PRIVATE is supported. */ - if (pshared != PTHREAD_PROCESS_PRIVATE) + if (attr == NULL || *attr == NULL || + (pshared != PTHREAD_PROCESS_PRIVATE && + pshared != PTHREAD_PROCESS_SHARED)) return (EINVAL); (*attr)->pshared = pshared; return (0); } Index: head/lib/libthr/thread/thr_cond.c =================================================================== --- head/lib/libthr/thread/thr_cond.c (revision 296161) +++ head/lib/libthr/thread/thr_cond.c (revision 296162) @@ -1,488 +1,519 @@ /* * Copyright (c) 2005 David Xu + * Copyright (c) 2015 The FreeBSD Foundation * All rights reserved. * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ */ #include "namespace.h" #include #include #include #include #include #include "un-namespace.h" #include "thr_private.h" /* * Prototypes */ int __pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex); int __pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec * abstime); static int cond_init(pthread_cond_t *cond, const pthread_condattr_t *attr); static int cond_wait_common(pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec *abstime, int cancel); static int cond_signal_common(pthread_cond_t *cond); static int cond_broadcast_common(pthread_cond_t *cond); /* * Double underscore versions are cancellation points. Single underscore * versions are not and are provided for libc internal usage (which * shouldn't introduce cancellation points). */ __weak_reference(__pthread_cond_wait, pthread_cond_wait); __weak_reference(__pthread_cond_timedwait, pthread_cond_timedwait); __weak_reference(_pthread_cond_init, pthread_cond_init); __weak_reference(_pthread_cond_destroy, pthread_cond_destroy); __weak_reference(_pthread_cond_signal, pthread_cond_signal); __weak_reference(_pthread_cond_broadcast, pthread_cond_broadcast); #define CV_PSHARED(cvp) (((cvp)->__flags & USYNC_PROCESS_SHARED) != 0) +static void +cond_init_body(struct pthread_cond *cvp, const struct pthread_cond_attr *cattr) +{ + + if (cattr == NULL) { + cvp->__clock_id = CLOCK_REALTIME; + } else { + if (cattr->c_pshared) + cvp->__flags |= USYNC_PROCESS_SHARED; + cvp->__clock_id = cattr->c_clockid; + } +} + static int cond_init(pthread_cond_t *cond, const pthread_condattr_t *cond_attr) { - struct pthread_cond *cvp; - int error = 0; + struct pthread_cond *cvp; + const struct pthread_cond_attr *cattr; + int pshared; - if ((cvp = (pthread_cond_t) - calloc(1, sizeof(struct pthread_cond))) == NULL) { - error = ENOMEM; + cattr = cond_attr != NULL ? *cond_attr : NULL; + if (cattr == NULL || cattr->c_pshared == PTHREAD_PROCESS_PRIVATE) { + pshared = 0; + cvp = calloc(1, sizeof(struct pthread_cond)); + if (cvp == NULL) + return (ENOMEM); } else { - /* - * Initialise the condition variable structure: - */ - if (cond_attr == NULL || *cond_attr == NULL) { - cvp->__clock_id = CLOCK_REALTIME; - } else { - if ((*cond_attr)->c_pshared) - cvp->__flags |= USYNC_PROCESS_SHARED; - cvp->__clock_id = (*cond_attr)->c_clockid; - } - *cond = cvp; + pshared = 1; + cvp = __thr_pshared_offpage(cond, 1); + if (cvp == NULL) + return (EFAULT); } - return (error); + + /* + * Initialise the condition variable structure: + */ + cond_init_body(cvp, cattr); + *cond = pshared ? 
THR_PSHARED_PTR : cvp; + return (0); } static int init_static(struct pthread *thread, pthread_cond_t *cond) { int ret; THR_LOCK_ACQUIRE(thread, &_cond_static_lock); if (*cond == NULL) ret = cond_init(cond, NULL); else ret = 0; THR_LOCK_RELEASE(thread, &_cond_static_lock); return (ret); } #define CHECK_AND_INIT_COND \ - if (__predict_false((cvp = (*cond)) <= THR_COND_DESTROYED)) { \ + if (*cond == THR_PSHARED_PTR) { \ + cvp = __thr_pshared_offpage(cond, 0); \ + if (cvp == NULL) \ + return (EINVAL); \ + } else if (__predict_false((cvp = (*cond)) <= THR_COND_DESTROYED)) { \ if (cvp == THR_COND_INITIALIZER) { \ int ret; \ ret = init_static(_get_curthread(), cond); \ if (ret) \ return (ret); \ } else if (cvp == THR_COND_DESTROYED) { \ return (EINVAL); \ } \ cvp = *cond; \ } int _pthread_cond_init(pthread_cond_t *cond, const pthread_condattr_t *cond_attr) { *cond = NULL; return (cond_init(cond, cond_attr)); } int _pthread_cond_destroy(pthread_cond_t *cond) { - struct pthread_cond *cvp; - int error = 0; + struct pthread_cond *cvp; + int error; - if ((cvp = *cond) == THR_COND_INITIALIZER) - error = 0; - else if (cvp == THR_COND_DESTROYED) + error = 0; + if (*cond == THR_PSHARED_PTR) { + cvp = __thr_pshared_offpage(cond, 0); + if (cvp != NULL) + __thr_pshared_destroy(cond); + *cond = THR_COND_DESTROYED; + } else if ((cvp = *cond) == THR_COND_INITIALIZER) { + /* nothing */ + } else if (cvp == THR_COND_DESTROYED) { error = EINVAL; - else { + } else { cvp = *cond; *cond = THR_COND_DESTROYED; - - /* - * Free the memory allocated for the condition - * variable structure: - */ free(cvp); } return (error); } /* * Cancellation behavior: * Thread may be canceled at start, if thread is canceled, it means it * did not get a wakeup from pthread_cond_signal(), otherwise, it is * not canceled. * Thread cancellation never cause wakeup from pthread_cond_signal() * to be lost. */ static int cond_wait_kernel(struct pthread_cond *cvp, struct pthread_mutex *mp, const struct timespec *abstime, int cancel) { struct pthread *curthread = _get_curthread(); int recurse; int error, error2 = 0; error = _mutex_cv_detach(mp, &recurse); if (error != 0) return (error); if (cancel) { _thr_cancel_enter2(curthread, 0); error = _thr_ucond_wait((struct ucond *)&cvp->__has_kern_waiters, (struct umutex *)&mp->m_lock, abstime, CVWAIT_ABSTIME|CVWAIT_CLOCKID); _thr_cancel_leave(curthread, 0); } else { error = _thr_ucond_wait((struct ucond *)&cvp->__has_kern_waiters, (struct umutex *)&mp->m_lock, abstime, CVWAIT_ABSTIME|CVWAIT_CLOCKID); } /* * Note that PP mutex and ROBUST mutex may return * interesting error codes. */ if (error == 0) { error2 = _mutex_cv_lock(mp, recurse); } else if (error == EINTR || error == ETIMEDOUT) { error2 = _mutex_cv_lock(mp, recurse); if (error2 == 0 && cancel) _thr_testcancel(curthread); if (error == EINTR) error = 0; } else { /* We know that it didn't unlock the mutex. */ error2 = _mutex_cv_attach(mp, recurse); if (error2 == 0 && cancel) _thr_testcancel(curthread); } return (error2 != 0 ? error2 : error); } /* * Thread waits in userland queue whenever possible, when thread * is signaled or broadcasted, it is removed from the queue, and * is saved in curthread's defer_waiters[] buffer, but won't be * woken up until mutex is unlocked. 
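 *
 * In outline (an editor's paraphrase of the function below, no new API):
 *
 *	set __has_user_waiters and unlock the mutex;
 *	enqueue curthread on the condition variable's sleep queue;
 *	loop: _thr_sleep() until signaled, canceled, or timed out,
 *	    rechecking wchan to reject spurious wakeups;
 *	on cancellation or timeout, dequeue and recompute
 *	    __has_user_waiters;
 *	finally retake the mutex via _mutex_cv_lock() with the saved
 *	    recursion count.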
*/ static int cond_wait_user(struct pthread_cond *cvp, struct pthread_mutex *mp, const struct timespec *abstime, int cancel) { struct pthread *curthread = _get_curthread(); struct sleepqueue *sq; int recurse; int error; int defered; if (curthread->wchan != NULL) PANIC("thread was already on queue."); if (cancel) _thr_testcancel(curthread); _sleepq_lock(cvp); /* * set __has_user_waiters before unlocking mutex, this allows * us to check it without locking in pthread_cond_signal(). */ cvp->__has_user_waiters = 1; defered = 0; (void)_mutex_cv_unlock(mp, &recurse, &defered); curthread->mutex_obj = mp; _sleepq_add(cvp, curthread); for(;;) { _thr_clear_wake(curthread); _sleepq_unlock(cvp); if (defered) { defered = 0; if ((mp->m_lock.m_owner & UMUTEX_CONTESTED) == 0) (void)_umtx_op_err(&mp->m_lock, UMTX_OP_MUTEX_WAKE2, mp->m_lock.m_flags, 0, 0); } if (curthread->nwaiter_defer > 0) { _thr_wake_all(curthread->defer_waiters, curthread->nwaiter_defer); curthread->nwaiter_defer = 0; } if (cancel) { _thr_cancel_enter2(curthread, 0); error = _thr_sleep(curthread, cvp->__clock_id, abstime); _thr_cancel_leave(curthread, 0); } else { error = _thr_sleep(curthread, cvp->__clock_id, abstime); } _sleepq_lock(cvp); if (curthread->wchan == NULL) { error = 0; break; } else if (cancel && SHOULD_CANCEL(curthread)) { sq = _sleepq_lookup(cvp); cvp->__has_user_waiters = _sleepq_remove(sq, curthread); _sleepq_unlock(cvp); curthread->mutex_obj = NULL; _mutex_cv_lock(mp, recurse); if (!THR_IN_CRITICAL(curthread)) _pthread_exit(PTHREAD_CANCELED); else /* this should not happen */ return (0); } else if (error == ETIMEDOUT) { sq = _sleepq_lookup(cvp); cvp->__has_user_waiters = _sleepq_remove(sq, curthread); break; } } _sleepq_unlock(cvp); curthread->mutex_obj = NULL; _mutex_cv_lock(mp, recurse); return (error); } static int cond_wait_common(pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec *abstime, int cancel) { struct pthread *curthread = _get_curthread(); struct pthread_cond *cvp; struct pthread_mutex *mp; int error; CHECK_AND_INIT_COND - mp = *mutex; + if (*mutex == THR_PSHARED_PTR) { + mp = __thr_pshared_offpage(mutex, 0); + if (mp == NULL) + return (EINVAL); + } else { + mp = *mutex; + } if ((error = _mutex_owned(curthread, mp)) != 0) return (error); if (curthread->attr.sched_policy != SCHED_OTHER || (mp->m_lock.m_flags & (UMUTEX_PRIO_PROTECT|UMUTEX_PRIO_INHERIT| USYNC_PROCESS_SHARED)) != 0 || (cvp->__flags & USYNC_PROCESS_SHARED) != 0) return cond_wait_kernel(cvp, mp, abstime, cancel); else return cond_wait_user(cvp, mp, abstime, cancel); } int _pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) { return (cond_wait_common(cond, mutex, NULL, 0)); } int __pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) { return (cond_wait_common(cond, mutex, NULL, 1)); } int _pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec * abstime) { if (abstime == NULL || abstime->tv_sec < 0 || abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000) return (EINVAL); return (cond_wait_common(cond, mutex, abstime, 0)); } int __pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec *abstime) { if (abstime == NULL || abstime->tv_sec < 0 || abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000) return (EINVAL); return (cond_wait_common(cond, mutex, abstime, 1)); } static int cond_signal_common(pthread_cond_t *cond) { struct pthread *curthread = _get_curthread(); struct pthread *td; struct pthread_cond *cvp; struct pthread_mutex 
*mp; struct sleepqueue *sq; int *waddr; int pshared; /* * If the condition variable is statically initialized, perform dynamic * initialization. */ CHECK_AND_INIT_COND pshared = CV_PSHARED(cvp); _thr_ucond_signal((struct ucond *)&cvp->__has_kern_waiters); if (pshared || cvp->__has_user_waiters == 0) return (0); curthread = _get_curthread(); waddr = NULL; _sleepq_lock(cvp); sq = _sleepq_lookup(cvp); if (sq == NULL) { _sleepq_unlock(cvp); return (0); } td = _sleepq_first(sq); mp = td->mutex_obj; cvp->__has_user_waiters = _sleepq_remove(sq, td); - if (mp->m_owner == curthread) { + if (mp->m_owner == TID(curthread)) { if (curthread->nwaiter_defer >= MAX_DEFER_WAITERS) { _thr_wake_all(curthread->defer_waiters, curthread->nwaiter_defer); curthread->nwaiter_defer = 0; } curthread->defer_waiters[curthread->nwaiter_defer++] = &td->wake_addr->value; mp->m_flags |= PMUTEX_FLAG_DEFERED; } else { waddr = &td->wake_addr->value; } _sleepq_unlock(cvp); if (waddr != NULL) _thr_set_wake(waddr); return (0); } struct broadcast_arg { struct pthread *curthread; unsigned int *waddrs[MAX_DEFER_WAITERS]; int count; }; static void drop_cb(struct pthread *td, void *arg) { struct broadcast_arg *ba = arg; struct pthread_mutex *mp; struct pthread *curthread = ba->curthread; mp = td->mutex_obj; - if (mp->m_owner == curthread) { + if (mp->m_owner == TID(curthread)) { if (curthread->nwaiter_defer >= MAX_DEFER_WAITERS) { _thr_wake_all(curthread->defer_waiters, curthread->nwaiter_defer); curthread->nwaiter_defer = 0; } curthread->defer_waiters[curthread->nwaiter_defer++] = &td->wake_addr->value; mp->m_flags |= PMUTEX_FLAG_DEFERED; } else { if (ba->count >= MAX_DEFER_WAITERS) { _thr_wake_all(ba->waddrs, ba->count); ba->count = 0; } ba->waddrs[ba->count++] = &td->wake_addr->value; } } static int cond_broadcast_common(pthread_cond_t *cond) { int pshared; struct pthread_cond *cvp; struct sleepqueue *sq; struct broadcast_arg ba; /* * If the condition variable is statically initialized, perform dynamic * initialization. */ CHECK_AND_INIT_COND pshared = CV_PSHARED(cvp); _thr_ucond_broadcast((struct ucond *)&cvp->__has_kern_waiters); if (pshared || cvp->__has_user_waiters == 0) return (0); ba.curthread = _get_curthread(); ba.count = 0; _sleepq_lock(cvp); sq = _sleepq_lookup(cvp); if (sq == NULL) { _sleepq_unlock(cvp); return (0); } _sleepq_drop(sq, drop_cb, &ba); cvp->__has_user_waiters = 0; _sleepq_unlock(cvp); if (ba.count > 0) _thr_wake_all(ba.waddrs, ba.count); return (0); } int _pthread_cond_signal(pthread_cond_t * cond) { return (cond_signal_common(cond)); } int _pthread_cond_broadcast(pthread_cond_t * cond) { return (cond_broadcast_common(cond)); } Index: head/lib/libthr/thread/thr_condattr.c =================================================================== --- head/lib/libthr/thread/thr_condattr.c (revision 296161) +++ head/lib/libthr/thread/thr_condattr.c (revision 296162) @@ -1,124 +1,125 @@ /* * Copyright (c) 1997 John Birrell . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "namespace.h" #include #include #include #include #include "un-namespace.h" #include "thr_private.h" __weak_reference(_pthread_condattr_init, pthread_condattr_init); __weak_reference(_pthread_condattr_destroy, pthread_condattr_destroy); __weak_reference(_pthread_condattr_getclock, pthread_condattr_getclock); __weak_reference(_pthread_condattr_setclock, pthread_condattr_setclock); __weak_reference(_pthread_condattr_getpshared, pthread_condattr_getpshared); __weak_reference(_pthread_condattr_setpshared, pthread_condattr_setpshared); int _pthread_condattr_init(pthread_condattr_t *attr) { pthread_condattr_t pattr; int ret; if ((pattr = (pthread_condattr_t) malloc(sizeof(struct pthread_cond_attr))) == NULL) { ret = ENOMEM; } else { memcpy(pattr, &_pthread_condattr_default, sizeof(struct pthread_cond_attr)); *attr = pattr; ret = 0; } return (ret); } int _pthread_condattr_destroy(pthread_condattr_t *attr) { int ret; if (attr == NULL || *attr == NULL) { ret = EINVAL; } else { free(*attr); *attr = NULL; ret = 0; } return(ret); } int _pthread_condattr_getclock(const pthread_condattr_t *attr, clockid_t *clock_id) { if (attr == NULL || *attr == NULL) return (EINVAL); *clock_id = (*attr)->c_clockid; return (0); } int _pthread_condattr_setclock(pthread_condattr_t *attr, clockid_t clock_id) { if (attr == NULL || *attr == NULL) return (EINVAL); if (clock_id != CLOCK_REALTIME && clock_id != CLOCK_VIRTUAL && clock_id != CLOCK_PROF && clock_id != CLOCK_MONOTONIC) { return (EINVAL); } (*attr)->c_clockid = clock_id; return (0); } int _pthread_condattr_getpshared(const pthread_condattr_t *attr, int *pshared) { + if (attr == NULL || *attr == NULL) return (EINVAL); - - *pshared = PTHREAD_PROCESS_PRIVATE; + *pshared = (*attr)->c_pshared; return (0); } int _pthread_condattr_setpshared(pthread_condattr_t *attr, int pshared) { - if (attr == NULL || *attr == NULL) - return (EINVAL); - if (pshared != PTHREAD_PROCESS_PRIVATE) + if (attr == NULL || *attr == NULL || + (pshared != PTHREAD_PROCESS_PRIVATE && + pshared != PTHREAD_PROCESS_SHARED)) return (EINVAL); + (*attr)->c_pshared = pshared; return (0); } Index: head/lib/libthr/thread/thr_create.c =================================================================== --- head/lib/libthr/thread/thr_create.c (revision 296161) +++ head/lib/libthr/thread/thr_create.c (revision 296162) @@ -1,292 +1,292 @@ /* * Copyright (c) 2003 Daniel M. Eischen * Copyright (c) 2005, David Xu * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include "namespace.h" #include #include #include #include #include #include #include #include #include #include #include "un-namespace.h" #include "libc_private.h" #include "thr_private.h" static int create_stack(struct pthread_attr *pattr); static void thread_start(struct pthread *curthread); __weak_reference(_pthread_create, pthread_create); int _pthread_create(pthread_t * thread, const pthread_attr_t * attr, void *(*start_routine) (void *), void *arg) { struct pthread *curthread, *new_thread; struct thr_param param; struct sched_param sched_param; struct rtprio rtp; - int ret = 0, locked, create_suspended; sigset_t set, oset; - cpuset_t *cpusetp = NULL; - int cpusetsize = 0; - int old_stack_prot; + cpuset_t *cpusetp; + int i, cpusetsize, create_suspended, locked, old_stack_prot, ret; + cpusetp = NULL; + ret = cpusetsize = 0; _thr_check_init(); /* * Tell libc and others now they need lock to protect their data. 
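 *
 * As an aside on the theme of this revision: with the process-shared
 * support added here, a synchronization object placed in shared memory
 * can now be initialized with PTHREAD_PROCESS_SHARED. A minimal usage
 * sketch (error handling omitted; the mmap() placement is only
 * illustrative):
 *
 *	pthread_condattr_t ca;
 *	pthread_cond_t *cvp;
 *
 *	cvp = mmap(NULL, sizeof(*cvp), PROT_READ | PROT_WRITE,
 *	    MAP_SHARED | MAP_ANON, -1, 0);
 *	pthread_condattr_init(&ca);
 *	pthread_condattr_setpshared(&ca, PTHREAD_PROCESS_SHARED);
 *	pthread_cond_init(cvp, &ca);
 *
 * pthread_cond_init() then stores THR_PSHARED_PTR in *cvp and keeps the
 * real object off-page, so any process mapping the same memory resolves
 * it through __thr_pshared_offpage().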
*/ if (_thr_isthreaded() == 0) { _malloc_first_thread(); if (_thr_setthreaded(1)) return (EAGAIN); } curthread = _get_curthread(); if ((new_thread = _thr_alloc(curthread)) == NULL) return (EAGAIN); memset(&param, 0, sizeof(param)); if (attr == NULL || *attr == NULL) /* Use the default thread attributes: */ new_thread->attr = _pthread_attr_default; else { new_thread->attr = *(*attr); cpusetp = new_thread->attr.cpuset; cpusetsize = new_thread->attr.cpusetsize; new_thread->attr.cpuset = NULL; new_thread->attr.cpusetsize = 0; } if (new_thread->attr.sched_inherit == PTHREAD_INHERIT_SCHED) { /* inherit scheduling contention scope */ if (curthread->attr.flags & PTHREAD_SCOPE_SYSTEM) new_thread->attr.flags |= PTHREAD_SCOPE_SYSTEM; else new_thread->attr.flags &= ~PTHREAD_SCOPE_SYSTEM; new_thread->attr.prio = curthread->attr.prio; new_thread->attr.sched_policy = curthread->attr.sched_policy; } new_thread->tid = TID_TERMINATED; old_stack_prot = _rtld_get_stack_prot(); if (create_stack(&new_thread->attr) != 0) { /* Insufficient memory to create a stack: */ _thr_free(curthread, new_thread); return (EAGAIN); } /* * Write a magic value to the thread structure * to help identify valid ones: */ new_thread->magic = THR_MAGIC; new_thread->start_routine = start_routine; new_thread->arg = arg; new_thread->cancel_enable = 1; new_thread->cancel_async = 0; /* Initialize the mutex queue: */ - TAILQ_INIT(&new_thread->mutexq); - TAILQ_INIT(&new_thread->pp_mutexq); + for (i = 0; i < TMQ_NITEMS; i++) + TAILQ_INIT(&new_thread->mq[i]); /* Initialise hooks in the thread structure: */ if (new_thread->attr.suspend == THR_CREATE_SUSPENDED) { new_thread->flags = THR_FLAGS_NEED_SUSPEND; create_suspended = 1; } else { create_suspended = 0; } new_thread->state = PS_RUNNING; if (new_thread->attr.flags & PTHREAD_CREATE_DETACHED) new_thread->flags |= THR_FLAGS_DETACHED; /* Add the new thread. */ new_thread->refcount = 1; _thr_link(curthread, new_thread); /* * Handle the race between __pthread_map_stacks_exec and * thread linkage. */ if (old_stack_prot != _rtld_get_stack_prot()) _thr_stack_fix_protection(new_thread); /* Return the thread pointer early so that the new thread can use it. */ (*thread) = new_thread; if (SHOULD_REPORT_EVENT(curthread, TD_CREATE) || cpusetp != NULL) { THR_THREAD_LOCK(curthread, new_thread); locked = 1; } else locked = 0; param.start_func = (void (*)(void *)) thread_start; param.arg = new_thread; param.stack_base = new_thread->attr.stackaddr_attr; param.stack_size = new_thread->attr.stacksize_attr; param.tls_base = (char *)new_thread->tcb; param.tls_size = sizeof(struct tcb); param.child_tid = &new_thread->tid; param.parent_tid = &new_thread->tid; param.flags = 0; if (new_thread->attr.flags & PTHREAD_SCOPE_SYSTEM) param.flags |= THR_SYSTEM_SCOPE; if (new_thread->attr.sched_inherit == PTHREAD_INHERIT_SCHED) param.rtp = NULL; else { sched_param.sched_priority = new_thread->attr.prio; _schedparam_to_rtp(new_thread->attr.sched_policy, &sched_param, &rtp); param.rtp = &rtp; } /* Schedule the new thread. */ if (create_suspended) { SIGFILLSET(set); SIGDELSET(set, SIGTRAP); __sys_sigprocmask(SIG_SETMASK, &set, &oset); new_thread->sigmask = oset; SIGDELSET(new_thread->sigmask, SIGCANCEL); } ret = thr_new(&param, sizeof(param)); if (ret != 0) { ret = errno; /* * Translate EPROCLIM into the well-known POSIX code EAGAIN.
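 *
 * POSIX does not define EPROCLIM; pthread_create() is specified to fail
 * with EAGAIN when a system-imposed limit on the number of threads would
 * be exceeded, so the kernel's "too many processes" error is folded into
 * that. Condensed, the pattern around this comment is:
 *
 *	ret = thr_new(&param, sizeof(param));
 *	if (ret != 0)
 *		ret = (errno == EPROCLIM) ? EAGAIN : errno;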
*/ if (ret == EPROCLIM) ret = EAGAIN; } if (create_suspended) __sys_sigprocmask(SIG_SETMASK, &oset, NULL); if (ret != 0) { if (!locked) THR_THREAD_LOCK(curthread, new_thread); new_thread->state = PS_DEAD; new_thread->tid = TID_TERMINATED; new_thread->flags |= THR_FLAGS_DETACHED; new_thread->refcount--; if (new_thread->flags & THR_FLAGS_NEED_SUSPEND) { new_thread->cycle++; _thr_umtx_wake(&new_thread->cycle, INT_MAX, 0); } _thr_try_gc(curthread, new_thread); /* thread lock released */ atomic_add_int(&_thread_active_threads, -1); } else if (locked) { if (cpusetp != NULL) { if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, TID(new_thread), cpusetsize, cpusetp)) { ret = errno; /* kill the new thread */ new_thread->force_exit = 1; new_thread->flags |= THR_FLAGS_DETACHED; _thr_try_gc(curthread, new_thread); /* thread lock released */ goto out; } } _thr_report_creation(curthread, new_thread); THR_THREAD_UNLOCK(curthread, new_thread); } out: if (ret) (*thread) = 0; return (ret); } static int create_stack(struct pthread_attr *pattr) { int ret; /* Check if a stack was specified in the thread attributes: */ if ((pattr->stackaddr_attr) != NULL) { pattr->guardsize_attr = 0; pattr->flags |= THR_STACK_USER; ret = 0; } else ret = _thr_stack_alloc(pattr); return (ret); } static void thread_start(struct pthread *curthread) { sigset_t set; if (curthread->attr.suspend == THR_CREATE_SUSPENDED) set = curthread->sigmask; /* * This is used as a serialization point to allow the parent * to report the 'new thread' event to the debugger or tweak the new * thread's attributes before the new thread does real-world work. */ THR_LOCK(curthread); THR_UNLOCK(curthread); if (curthread->force_exit) _pthread_exit(PTHREAD_CANCELED); if (curthread->attr.suspend == THR_CREATE_SUSPENDED) { #if 0 /* Done in THR_UNLOCK() */ _thr_ast(curthread); #endif /* * The parent thread has stored the signal mask for us, * so we should restore it now. */ __sys_sigprocmask(SIG_SETMASK, &set, NULL); } #ifdef _PTHREAD_FORCED_UNWIND curthread->unwind_stackend = (char *)curthread->attr.stackaddr_attr + curthread->attr.stacksize_attr; #endif /* Run the current thread's start routine with argument: */ _pthread_exit(curthread->start_routine(curthread->arg)); /* This point should never be reached. */ PANIC("Thread has resumed after exit"); } Index: head/lib/libthr/thread/thr_init.c =================================================================== --- head/lib/libthr/thread/thr_init.c (revision 296161) +++ head/lib/libthr/thread/thr_init.c (revision 296162) @@ -1,508 +1,512 @@ /* * Copyright (c) 2003 Daniel M. Eischen * Copyright (c) 1995-1998 John Birrell * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by John Birrell. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "namespace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "un-namespace.h" #include "libc_private.h" #include "thr_private.h" char *_usrstack; struct pthread *_thr_initial; int _libthr_debug; int _thread_event_mask; struct pthread *_thread_last_event; pthreadlist _thread_list = TAILQ_HEAD_INITIALIZER(_thread_list); pthreadlist _thread_gc_list = TAILQ_HEAD_INITIALIZER(_thread_gc_list); int _thread_active_threads = 1; atfork_head _thr_atfork_list = TAILQ_HEAD_INITIALIZER(_thr_atfork_list); struct urwlock _thr_atfork_lock = DEFAULT_URWLOCK; struct pthread_prio _thr_priorities[3] = { {RTP_PRIO_MIN, RTP_PRIO_MAX, 0}, /* FIFO */ {0, 0, 63}, /* OTHER */ {RTP_PRIO_MIN, RTP_PRIO_MAX, 0} /* RR */ }; struct pthread_attr _pthread_attr_default = { .sched_policy = SCHED_OTHER, .sched_inherit = PTHREAD_INHERIT_SCHED, .prio = 0, .suspend = THR_CREATE_RUNNING, .flags = PTHREAD_SCOPE_SYSTEM, .stackaddr_attr = NULL, .stacksize_attr = THR_STACK_DEFAULT, .guardsize_attr = 0, .cpusetsize = 0, .cpuset = NULL }; struct pthread_mutex_attr _pthread_mutexattr_default = { .m_type = PTHREAD_MUTEX_DEFAULT, .m_protocol = PTHREAD_PRIO_NONE, - .m_ceiling = 0 + .m_ceiling = 0, + .m_pshared = PTHREAD_PROCESS_PRIVATE, }; struct pthread_mutex_attr _pthread_mutexattr_adaptive_default = { .m_type = PTHREAD_MUTEX_ADAPTIVE_NP, .m_protocol = PTHREAD_PRIO_NONE, - .m_ceiling = 0 + .m_ceiling = 0, + .m_pshared = PTHREAD_PROCESS_PRIVATE, }; /* Default condition variable attributes: */ struct pthread_cond_attr _pthread_condattr_default = { .c_pshared = PTHREAD_PROCESS_PRIVATE, .c_clockid = CLOCK_REALTIME }; pid_t _thr_pid; int _thr_is_smp = 0; size_t _thr_guard_default; size_t _thr_stack_default = THR_STACK_DEFAULT; size_t _thr_stack_initial = THR_STACK_INITIAL; int _thr_page_size; int _thr_spinloops; int _thr_yieldloops; int _thr_queuefifo = 4; int _gc_count; struct umutex _mutex_static_lock = DEFAULT_UMUTEX; struct umutex _cond_static_lock = DEFAULT_UMUTEX; struct umutex _rwlock_static_lock = DEFAULT_UMUTEX; struct umutex _keytable_lock = DEFAULT_UMUTEX; struct urwlock _thr_list_lock = DEFAULT_URWLOCK; struct umutex _thr_event_lock = DEFAULT_UMUTEX; struct umutex _suspend_all_lock = DEFAULT_UMUTEX; struct pthread *_single_thread; int _suspend_all_cycle; int _suspend_all_waiters; int __pthread_cond_wait(pthread_cond_t *, pthread_mutex_t *); int __pthread_mutex_lock(pthread_mutex_t *); int __pthread_mutex_trylock(pthread_mutex_t *); void _thread_init_hack(void) __attribute__ ((constructor)); static void init_private(void); static void init_main_thread(struct pthread *thread); /* * All weak references used within libc should be 
in this table. * This is so that static libraries will work. */ STATIC_LIB_REQUIRE(_fork); STATIC_LIB_REQUIRE(_pthread_getspecific); STATIC_LIB_REQUIRE(_pthread_key_create); STATIC_LIB_REQUIRE(_pthread_key_delete); STATIC_LIB_REQUIRE(_pthread_mutex_destroy); STATIC_LIB_REQUIRE(_pthread_mutex_init); STATIC_LIB_REQUIRE(_pthread_mutex_lock); STATIC_LIB_REQUIRE(_pthread_mutex_trylock); STATIC_LIB_REQUIRE(_pthread_mutex_unlock); STATIC_LIB_REQUIRE(_pthread_mutexattr_init); STATIC_LIB_REQUIRE(_pthread_mutexattr_destroy); STATIC_LIB_REQUIRE(_pthread_mutexattr_settype); STATIC_LIB_REQUIRE(_pthread_once); STATIC_LIB_REQUIRE(_pthread_setspecific); STATIC_LIB_REQUIRE(_raise); STATIC_LIB_REQUIRE(_sem_destroy); STATIC_LIB_REQUIRE(_sem_getvalue); STATIC_LIB_REQUIRE(_sem_init); STATIC_LIB_REQUIRE(_sem_post); STATIC_LIB_REQUIRE(_sem_timedwait); STATIC_LIB_REQUIRE(_sem_trywait); STATIC_LIB_REQUIRE(_sem_wait); STATIC_LIB_REQUIRE(_sigaction); STATIC_LIB_REQUIRE(_sigprocmask); STATIC_LIB_REQUIRE(_sigsuspend); STATIC_LIB_REQUIRE(_sigtimedwait); STATIC_LIB_REQUIRE(_sigwait); STATIC_LIB_REQUIRE(_sigwaitinfo); STATIC_LIB_REQUIRE(_spinlock); STATIC_LIB_REQUIRE(_spinlock_debug); STATIC_LIB_REQUIRE(_spinunlock); STATIC_LIB_REQUIRE(_thread_init_hack); /* * These are needed when linking statically. All references within * libgcc (and in the future libc) to these routines are weak, but * if they are not (strongly) referenced by the application or other * libraries, then the actual functions will not be loaded. */ STATIC_LIB_REQUIRE(_pthread_once); STATIC_LIB_REQUIRE(_pthread_key_create); STATIC_LIB_REQUIRE(_pthread_key_delete); STATIC_LIB_REQUIRE(_pthread_getspecific); STATIC_LIB_REQUIRE(_pthread_setspecific); STATIC_LIB_REQUIRE(_pthread_mutex_init); STATIC_LIB_REQUIRE(_pthread_mutex_destroy); STATIC_LIB_REQUIRE(_pthread_mutex_lock); STATIC_LIB_REQUIRE(_pthread_mutex_trylock); STATIC_LIB_REQUIRE(_pthread_mutex_unlock); STATIC_LIB_REQUIRE(_pthread_create); /* Pull in all symbols required by libthread_db */ STATIC_LIB_REQUIRE(_thread_state_running); #define DUAL_ENTRY(entry) \ (pthread_func_t)entry, (pthread_func_t)entry static pthread_func_t jmp_table[][2] = { {DUAL_ENTRY(_pthread_atfork)}, /* PJT_ATFORK */ {DUAL_ENTRY(_pthread_attr_destroy)}, /* PJT_ATTR_DESTROY */ {DUAL_ENTRY(_pthread_attr_getdetachstate)}, /* PJT_ATTR_GETDETACHSTATE */ {DUAL_ENTRY(_pthread_attr_getguardsize)}, /* PJT_ATTR_GETGUARDSIZE */ {DUAL_ENTRY(_pthread_attr_getinheritsched)}, /* PJT_ATTR_GETINHERITSCHED */ {DUAL_ENTRY(_pthread_attr_getschedparam)}, /* PJT_ATTR_GETSCHEDPARAM */ {DUAL_ENTRY(_pthread_attr_getschedpolicy)}, /* PJT_ATTR_GETSCHEDPOLICY */ {DUAL_ENTRY(_pthread_attr_getscope)}, /* PJT_ATTR_GETSCOPE */ {DUAL_ENTRY(_pthread_attr_getstackaddr)}, /* PJT_ATTR_GETSTACKADDR */ {DUAL_ENTRY(_pthread_attr_getstacksize)}, /* PJT_ATTR_GETSTACKSIZE */ {DUAL_ENTRY(_pthread_attr_init)}, /* PJT_ATTR_INIT */ {DUAL_ENTRY(_pthread_attr_setdetachstate)}, /* PJT_ATTR_SETDETACHSTATE */ {DUAL_ENTRY(_pthread_attr_setguardsize)}, /* PJT_ATTR_SETGUARDSIZE */ {DUAL_ENTRY(_pthread_attr_setinheritsched)}, /* PJT_ATTR_SETINHERITSCHED */ {DUAL_ENTRY(_pthread_attr_setschedparam)}, /* PJT_ATTR_SETSCHEDPARAM */ {DUAL_ENTRY(_pthread_attr_setschedpolicy)}, /* PJT_ATTR_SETSCHEDPOLICY */ {DUAL_ENTRY(_pthread_attr_setscope)}, /* PJT_ATTR_SETSCOPE */ {DUAL_ENTRY(_pthread_attr_setstackaddr)}, /* PJT_ATTR_SETSTACKADDR */ {DUAL_ENTRY(_pthread_attr_setstacksize)}, /* PJT_ATTR_SETSTACKSIZE */ {DUAL_ENTRY(_pthread_cancel)}, /* PJT_CANCEL */ {DUAL_ENTRY(_pthread_cleanup_pop)}, /* 
PJT_CLEANUP_POP */ {DUAL_ENTRY(_pthread_cleanup_push)}, /* PJT_CLEANUP_PUSH */ {DUAL_ENTRY(_pthread_cond_broadcast)}, /* PJT_COND_BROADCAST */ {DUAL_ENTRY(_pthread_cond_destroy)}, /* PJT_COND_DESTROY */ {DUAL_ENTRY(_pthread_cond_init)}, /* PJT_COND_INIT */ {DUAL_ENTRY(_pthread_cond_signal)}, /* PJT_COND_SIGNAL */ {DUAL_ENTRY(_pthread_cond_timedwait)}, /* PJT_COND_TIMEDWAIT */ {(pthread_func_t)__pthread_cond_wait, (pthread_func_t)_pthread_cond_wait}, /* PJT_COND_WAIT */ {DUAL_ENTRY(_pthread_detach)}, /* PJT_DETACH */ {DUAL_ENTRY(_pthread_equal)}, /* PJT_EQUAL */ {DUAL_ENTRY(_pthread_exit)}, /* PJT_EXIT */ {DUAL_ENTRY(_pthread_getspecific)}, /* PJT_GETSPECIFIC */ {DUAL_ENTRY(_pthread_join)}, /* PJT_JOIN */ {DUAL_ENTRY(_pthread_key_create)}, /* PJT_KEY_CREATE */ {DUAL_ENTRY(_pthread_key_delete)}, /* PJT_KEY_DELETE*/ {DUAL_ENTRY(_pthread_kill)}, /* PJT_KILL */ {DUAL_ENTRY(_pthread_main_np)}, /* PJT_MAIN_NP */ {DUAL_ENTRY(_pthread_mutexattr_destroy)}, /* PJT_MUTEXATTR_DESTROY */ {DUAL_ENTRY(_pthread_mutexattr_init)}, /* PJT_MUTEXATTR_INIT */ {DUAL_ENTRY(_pthread_mutexattr_settype)}, /* PJT_MUTEXATTR_SETTYPE */ {DUAL_ENTRY(_pthread_mutex_destroy)}, /* PJT_MUTEX_DESTROY */ {DUAL_ENTRY(_pthread_mutex_init)}, /* PJT_MUTEX_INIT */ {(pthread_func_t)__pthread_mutex_lock, (pthread_func_t)_pthread_mutex_lock}, /* PJT_MUTEX_LOCK */ {(pthread_func_t)__pthread_mutex_trylock, (pthread_func_t)_pthread_mutex_trylock},/* PJT_MUTEX_TRYLOCK */ {DUAL_ENTRY(_pthread_mutex_unlock)}, /* PJT_MUTEX_UNLOCK */ {DUAL_ENTRY(_pthread_once)}, /* PJT_ONCE */ {DUAL_ENTRY(_pthread_rwlock_destroy)}, /* PJT_RWLOCK_DESTROY */ {DUAL_ENTRY(_pthread_rwlock_init)}, /* PJT_RWLOCK_INIT */ {DUAL_ENTRY(_pthread_rwlock_rdlock)}, /* PJT_RWLOCK_RDLOCK */ {DUAL_ENTRY(_pthread_rwlock_tryrdlock)},/* PJT_RWLOCK_TRYRDLOCK */ {DUAL_ENTRY(_pthread_rwlock_trywrlock)},/* PJT_RWLOCK_TRYWRLOCK */ {DUAL_ENTRY(_pthread_rwlock_unlock)}, /* PJT_RWLOCK_UNLOCK */ {DUAL_ENTRY(_pthread_rwlock_wrlock)}, /* PJT_RWLOCK_WRLOCK */ {DUAL_ENTRY(_pthread_self)}, /* PJT_SELF */ {DUAL_ENTRY(_pthread_setcancelstate)}, /* PJT_SETCANCELSTATE */ {DUAL_ENTRY(_pthread_setcanceltype)}, /* PJT_SETCANCELTYPE */ {DUAL_ENTRY(_pthread_setspecific)}, /* PJT_SETSPECIFIC */ {DUAL_ENTRY(_pthread_sigmask)}, /* PJT_SIGMASK */ {DUAL_ENTRY(_pthread_testcancel)}, /* PJT_TESTCANCEL */ {DUAL_ENTRY(__pthread_cleanup_pop_imp)},/* PJT_CLEANUP_POP_IMP */ {DUAL_ENTRY(__pthread_cleanup_push_imp)},/* PJT_CLEANUP_PUSH_IMP */ {DUAL_ENTRY(_pthread_cancel_enter)}, /* PJT_CANCEL_ENTER */ {DUAL_ENTRY(_pthread_cancel_leave)} /* PJT_CANCEL_LEAVE */ }; static int init_once = 0; /* * For the shared version of the threads library, the above is sufficient. * But for the archive version of the library, we need a little bit more. * Namely, we must arrange for this particular module to be pulled in from * the archive library at link time. To accomplish that, we define and * initialize a variable, "_thread_autoinit_dummy_decl". This variable is * referenced (as an extern) from libc/stdlib/exit.c. This will always * create a need for this module, ensuring that it is present in the * executable. */ extern int _thread_autoinit_dummy_decl; int _thread_autoinit_dummy_decl = 0; void _thread_init_hack(void) { _libpthread_init(NULL); } /* * Threaded process initialization. 
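 *
 * (A note on the jmp_table above: libc ships weak pthread stubs that
 * dispatch through a jump table, and _libpthread_init() copies these
 * entries into it so the real libthr implementations take over once the
 * library is loaded. DUAL_ENTRY() installs the same function in both the
 * cancellation-point and the non-cancellation-point slot; rows such as
 * PJT_COND_WAIT install a distinct pair instead:
 *
 *	{(pthread_func_t)__pthread_cond_wait,	(cancellation point)
 *	 (pthread_func_t)_pthread_cond_wait},	(libc-internal, not a
 *						 cancellation point)
 *
 * matching the double-/single-underscore convention used throughout
 * libthr.)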
* * This is only called under two conditions: * * 1) Some thread routines have detected that the library hasn't yet * been initialized (_thr_initial == NULL && curthread == NULL), or * * 2) An explicit call to reinitialize after a fork (indicated * by curthread != NULL) */ void _libpthread_init(struct pthread *curthread) { int fd, first, dlopened; /* Check if this function has already been called: */ if ((_thr_initial != NULL) && (curthread == NULL)) /* Only initialize the threaded application once. */ return; /* * Check the size of the jump table to make sure it is preset * with the correct number of entries. */ if (sizeof(jmp_table) != (sizeof(pthread_func_t) * PJT_MAX * 2)) PANIC("Thread jump table not properly initialized"); memcpy(__thr_jtable, jmp_table, sizeof(jmp_table)); __thr_interpose_libc(); /* * Check for the special case of this process running as * or in place of init as pid = 1: */ if ((_thr_pid = getpid()) == 1) { /* * Setup a new session for this process which is * assumed to be running as root. */ if (setsid() == -1) PANIC("Can't set session ID"); if (revoke(_PATH_CONSOLE) != 0) PANIC("Can't revoke console"); if ((fd = __sys_openat(AT_FDCWD, _PATH_CONSOLE, O_RDWR)) < 0) PANIC("Can't open console"); if (setlogin("root") == -1) PANIC("Can't set login to root"); if (_ioctl(fd, TIOCSCTTY, (char *) NULL) == -1) PANIC("Can't set controlling terminal"); } /* Initialize pthread private data. */ init_private(); /* Set the initial thread. */ if (curthread == NULL) { first = 1; /* Create and initialize the initial thread. */ curthread = _thr_alloc(NULL); if (curthread == NULL) PANIC("Can't allocate initial thread"); init_main_thread(curthread); } else { first = 0; } /* * Add the thread to the thread list queue. */ THR_LIST_ADD(curthread); _thread_active_threads = 1; /* Setup the thread specific data */ _tcb_set(curthread->tcb); if (first) { _thr_initial = curthread; dlopened = _rtld_is_dlopened(&_thread_autoinit_dummy_decl) != 0; _thr_signal_init(dlopened); if (_thread_event_mask & TD_CREATE) _thr_report_creation(curthread, curthread); /* * Always use our rtld lock implementation. * It is faster because it postpones signal handlers * instead of calling sigprocmask(2). */ _thr_rtld_init(); } } /* * This function and pthread_create() do a lot of the same things. * It'd be nice to consolidate the common stuff in one place. */ static void init_main_thread(struct pthread *thread) { struct sched_param sched_param; + int i; /* Setup the thread attributes. */ thr_self(&thread->tid); thread->attr = _pthread_attr_default; /* * Set up the thread stack. * * Create a red zone below the main stack. All other stacks * are constrained to a maximum size by the parameters * passed to mmap(), but this stack is only limited by * resource limits, so this stack needs an explicitly mapped * red zone to protect the thread stack that is just beyond. */ if (mmap(_usrstack - _thr_stack_initial - _thr_guard_default, _thr_guard_default, 0, MAP_ANON, -1, 0) == MAP_FAILED) PANIC("Cannot allocate red zone for initial thread"); /* * Mark the stack as an application supplied stack so that it * isn't deallocated. * * XXX - I'm not sure it would hurt anything to deallocate * the main thread stack because deallocation doesn't * actually free() it; it just puts it in the free * stack queue for later reuse. 
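 *
 * The red zone itself relies on the mmap() above mapping the guard
 * region with protection 0, i.e. PROT_NONE: any load or store into it
 * faults, so a main-stack overflow trips the guard instead of silently
 * running into the next thread's stack. Spelled out, an equivalent and
 * more explicit form of that call would be:
 *
 *	mmap(_usrstack - _thr_stack_initial - _thr_guard_default,
 *	    _thr_guard_default, PROT_NONE, MAP_ANON, -1, 0);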
*/ thread->attr.stackaddr_attr = _usrstack - _thr_stack_initial; thread->attr.stacksize_attr = _thr_stack_initial; thread->attr.guardsize_attr = _thr_guard_default; thread->attr.flags |= THR_STACK_USER; /* * Write a magic value to the thread structure * to help identify valid ones: */ thread->magic = THR_MAGIC; thread->cancel_enable = 1; thread->cancel_async = 0; - /* Initialize the mutex queue: */ - TAILQ_INIT(&thread->mutexq); - TAILQ_INIT(&thread->pp_mutexq); + /* Initialize the mutex queues */ + for (i = 0; i < TMQ_NITEMS; i++) + TAILQ_INIT(&thread->mq[i]); thread->state = PS_RUNNING; _thr_getscheduler(thread->tid, &thread->attr.sched_policy, &sched_param); thread->attr.prio = sched_param.sched_priority; #ifdef _PTHREAD_FORCED_UNWIND thread->unwind_stackend = _usrstack; #endif /* Others cleared to zero by thr_alloc() */ } static void init_private(void) { struct rlimit rlim; size_t len; int mib[2]; char *env, *env_bigstack, *env_splitstack; _thr_umutex_init(&_mutex_static_lock); _thr_umutex_init(&_cond_static_lock); _thr_umutex_init(&_rwlock_static_lock); _thr_umutex_init(&_keytable_lock); _thr_urwlock_init(&_thr_atfork_lock); _thr_umutex_init(&_thr_event_lock); _thr_umutex_init(&_suspend_all_lock); _thr_once_init(); _thr_spinlock_init(); _thr_list_init(); + __thr_pshared_init(); _thr_wake_addr_init(); _sleepq_init(); _single_thread = NULL; _suspend_all_waiters = 0; /* * Avoid reinitializing some things if they don't need to be, * e.g. after a fork(). */ if (init_once == 0) { /* Find the stack top */ mib[0] = CTL_KERN; mib[1] = KERN_USRSTACK; len = sizeof (_usrstack); if (sysctl(mib, 2, &_usrstack, &len, NULL, 0) == -1) PANIC("Cannot get kern.usrstack from sysctl"); env_bigstack = getenv("LIBPTHREAD_BIGSTACK_MAIN"); env_splitstack = getenv("LIBPTHREAD_SPLITSTACK_MAIN"); if (env_bigstack != NULL || env_splitstack == NULL) { if (getrlimit(RLIMIT_STACK, &rlim) == -1) PANIC("Cannot get stack rlimit"); _thr_stack_initial = rlim.rlim_cur; } len = sizeof(_thr_is_smp); sysctlbyname("kern.smp.cpus", &_thr_is_smp, &len, NULL, 0); _thr_is_smp = (_thr_is_smp > 1); _thr_page_size = getpagesize(); _thr_guard_default = _thr_page_size; _pthread_attr_default.guardsize_attr = _thr_guard_default; _pthread_attr_default.stacksize_attr = _thr_stack_default; env = getenv("LIBPTHREAD_SPINLOOPS"); if (env) _thr_spinloops = atoi(env); env = getenv("LIBPTHREAD_YIELDLOOPS"); if (env) _thr_yieldloops = atoi(env); env = getenv("LIBPTHREAD_QUEUE_FIFO"); if (env) _thr_queuefifo = atoi(env); TAILQ_INIT(&_thr_atfork_list); } init_once = 1; } Index: head/lib/libthr/thread/thr_mutex.c =================================================================== --- head/lib/libthr/thread/thr_mutex.c (revision 296161) +++ head/lib/libthr/thread/thr_mutex.c (revision 296162) @@ -1,798 +1,958 @@ /* * Copyright (c) 1995 John Birrell . * Copyright (c) 2006 David Xu . + * Copyright (c) 2015 The FreeBSD Foundation + * * All rights reserved. * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by John Birrell. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "namespace.h" #include #include #include #include #include #include #include #include "un-namespace.h" #include "thr_private.h" -#if defined(_PTHREADS_INVARIANTS) -#define MUTEX_INIT_LINK(m) do { \ - (m)->m_qe.tqe_prev = NULL; \ - (m)->m_qe.tqe_next = NULL; \ -} while (0) -#define MUTEX_ASSERT_IS_OWNED(m) do { \ - if (__predict_false((m)->m_qe.tqe_prev == NULL))\ - PANIC("mutex is not on list"); \ -} while (0) -#define MUTEX_ASSERT_NOT_OWNED(m) do { \ - if (__predict_false((m)->m_qe.tqe_prev != NULL || \ - (m)->m_qe.tqe_next != NULL)) \ - PANIC("mutex is on list"); \ -} while (0) -#else -#define MUTEX_INIT_LINK(m) -#define MUTEX_ASSERT_IS_OWNED(m) -#define MUTEX_ASSERT_NOT_OWNED(m) -#endif - /* * For adaptive mutexes, how many times to spin doing trylock2 * before entering the kernel to block */ #define MUTEX_ADAPTIVE_SPINS 2000 /* * Prototypes */ int __pthread_mutex_init(pthread_mutex_t *mutex, const pthread_mutexattr_t *mutex_attr); int __pthread_mutex_trylock(pthread_mutex_t *mutex); int __pthread_mutex_lock(pthread_mutex_t *mutex); int __pthread_mutex_timedlock(pthread_mutex_t *mutex, const struct timespec *abstime); int _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex, void *(calloc_cb)(size_t, size_t)); int _pthread_mutex_getspinloops_np(pthread_mutex_t *mutex, int *count); int _pthread_mutex_setspinloops_np(pthread_mutex_t *mutex, int count); int __pthread_mutex_setspinloops_np(pthread_mutex_t *mutex, int count); int _pthread_mutex_setyieldloops_np(pthread_mutex_t *mutex, int count); int _pthread_mutex_getyieldloops_np(pthread_mutex_t *mutex, int *count); int __pthread_mutex_setyieldloops_np(pthread_mutex_t *mutex, int count); static int mutex_self_trylock(pthread_mutex_t); static int mutex_self_lock(pthread_mutex_t, const struct timespec *abstime); static int mutex_unlock_common(struct pthread_mutex *, int, int *); static int mutex_lock_sleep(struct pthread *, pthread_mutex_t, const struct timespec *); __weak_reference(__pthread_mutex_init, pthread_mutex_init); __strong_reference(__pthread_mutex_init, _pthread_mutex_init); __weak_reference(__pthread_mutex_lock, pthread_mutex_lock); __strong_reference(__pthread_mutex_lock, 
_pthread_mutex_lock); __weak_reference(__pthread_mutex_timedlock, pthread_mutex_timedlock); __strong_reference(__pthread_mutex_timedlock, _pthread_mutex_timedlock); __weak_reference(__pthread_mutex_trylock, pthread_mutex_trylock); __strong_reference(__pthread_mutex_trylock, _pthread_mutex_trylock); /* Single underscore versions provided for libc internal usage: */ /* No difference between libc and application usage of these: */ __weak_reference(_pthread_mutex_destroy, pthread_mutex_destroy); __weak_reference(_pthread_mutex_unlock, pthread_mutex_unlock); __weak_reference(_pthread_mutex_getprioceiling, pthread_mutex_getprioceiling); __weak_reference(_pthread_mutex_setprioceiling, pthread_mutex_setprioceiling); __weak_reference(__pthread_mutex_setspinloops_np, pthread_mutex_setspinloops_np); __strong_reference(__pthread_mutex_setspinloops_np, _pthread_mutex_setspinloops_np); __weak_reference(_pthread_mutex_getspinloops_np, pthread_mutex_getspinloops_np); __weak_reference(__pthread_mutex_setyieldloops_np, pthread_mutex_setyieldloops_np); __strong_reference(__pthread_mutex_setyieldloops_np, _pthread_mutex_setyieldloops_np); __weak_reference(_pthread_mutex_getyieldloops_np, pthread_mutex_getyieldloops_np); __weak_reference(_pthread_mutex_isowned_np, pthread_mutex_isowned_np); +static void +mutex_init_link(struct pthread_mutex *m) +{ + +#if defined(_PTHREADS_INVARIANTS) + m->m_qe.tqe_prev = NULL; + m->m_qe.tqe_next = NULL; + m->m_pqe.tqe_prev = NULL; + m->m_pqe.tqe_next = NULL; +#endif +} + +static void +mutex_assert_is_owned(struct pthread_mutex *m) +{ + +#if defined(_PTHREADS_INVARIANTS) + if (__predict_false(m->m_qe.tqe_prev == NULL)) + PANIC("mutex is not on list"); +#endif +} + +static void +mutex_assert_not_owned(struct pthread_mutex *m) +{ + +#if defined(_PTHREADS_INVARIANTS) + if (__predict_false(m->m_qe.tqe_prev != NULL || + m->m_qe.tqe_next != NULL)) + PANIC("mutex is on list"); +#endif +} + static int -mutex_init(pthread_mutex_t *mutex, - const struct pthread_mutex_attr *mutex_attr, - void *(calloc_cb)(size_t, size_t)) +is_pshared_mutex(struct pthread_mutex *m) { - const struct pthread_mutex_attr *attr; - struct pthread_mutex *pmutex; - if (mutex_attr == NULL) { - attr = &_pthread_mutexattr_default; - } else { - attr = mutex_attr; - if (attr->m_type < PTHREAD_MUTEX_ERRORCHECK || - attr->m_type >= PTHREAD_MUTEX_TYPE_MAX) - return (EINVAL); - if (attr->m_protocol < PTHREAD_PRIO_NONE || - attr->m_protocol > PTHREAD_PRIO_PROTECT) - return (EINVAL); - } - if ((pmutex = (pthread_mutex_t) - calloc_cb(1, sizeof(struct pthread_mutex))) == NULL) - return (ENOMEM); + return ((m->m_lock.m_flags & USYNC_PROCESS_SHARED) != 0); +} +static int +mutex_check_attr(const struct pthread_mutex_attr *attr) +{ + + if (attr->m_type < PTHREAD_MUTEX_ERRORCHECK || + attr->m_type >= PTHREAD_MUTEX_TYPE_MAX) + return (EINVAL); + if (attr->m_protocol < PTHREAD_PRIO_NONE || + attr->m_protocol > PTHREAD_PRIO_PROTECT) + return (EINVAL); + return (0); +} + +static void +mutex_init_body(struct pthread_mutex *pmutex, + const struct pthread_mutex_attr *attr) +{ + pmutex->m_flags = attr->m_type; - pmutex->m_owner = NULL; + pmutex->m_owner = 0; pmutex->m_count = 0; pmutex->m_spinloops = 0; pmutex->m_yieldloops = 0; - MUTEX_INIT_LINK(pmutex); - switch(attr->m_protocol) { + mutex_init_link(pmutex); + switch (attr->m_protocol) { case PTHREAD_PRIO_NONE: pmutex->m_lock.m_owner = UMUTEX_UNOWNED; pmutex->m_lock.m_flags = 0; break; case PTHREAD_PRIO_INHERIT: pmutex->m_lock.m_owner = UMUTEX_UNOWNED; pmutex->m_lock.m_flags = 
UMUTEX_PRIO_INHERIT; break; case PTHREAD_PRIO_PROTECT: pmutex->m_lock.m_owner = UMUTEX_CONTESTED; pmutex->m_lock.m_flags = UMUTEX_PRIO_PROTECT; pmutex->m_lock.m_ceilings[0] = attr->m_ceiling; break; } + if (attr->m_pshared == PTHREAD_PROCESS_SHARED) + pmutex->m_lock.m_flags |= USYNC_PROCESS_SHARED; if (PMUTEX_TYPE(pmutex->m_flags) == PTHREAD_MUTEX_ADAPTIVE_NP) { pmutex->m_spinloops = _thr_spinloops ? _thr_spinloops: MUTEX_ADAPTIVE_SPINS; pmutex->m_yieldloops = _thr_yieldloops; } +} +static int +mutex_init(pthread_mutex_t *mutex, + const struct pthread_mutex_attr *mutex_attr, + void *(calloc_cb)(size_t, size_t)) +{ + const struct pthread_mutex_attr *attr; + struct pthread_mutex *pmutex; + int error; + + if (mutex_attr == NULL) { + attr = &_pthread_mutexattr_default; + } else { + attr = mutex_attr; + error = mutex_check_attr(attr); + if (error != 0) + return (error); + } + if ((pmutex = (pthread_mutex_t) + calloc_cb(1, sizeof(struct pthread_mutex))) == NULL) + return (ENOMEM); + mutex_init_body(pmutex, attr); *mutex = pmutex; return (0); } static int init_static(struct pthread *thread, pthread_mutex_t *mutex) { int ret; THR_LOCK_ACQUIRE(thread, &_mutex_static_lock); if (*mutex == THR_MUTEX_INITIALIZER) ret = mutex_init(mutex, &_pthread_mutexattr_default, calloc); else if (*mutex == THR_ADAPTIVE_MUTEX_INITIALIZER) - ret = mutex_init(mutex, &_pthread_mutexattr_adaptive_default, calloc); + ret = mutex_init(mutex, &_pthread_mutexattr_adaptive_default, + calloc); else ret = 0; THR_LOCK_RELEASE(thread, &_mutex_static_lock); return (ret); } static void set_inherited_priority(struct pthread *curthread, struct pthread_mutex *m) { struct pthread_mutex *m2; - m2 = TAILQ_LAST(&curthread->pp_mutexq, mutex_queue); + m2 = TAILQ_LAST(&curthread->mq[TMQ_NORM_PP], mutex_queue); if (m2 != NULL) m->m_lock.m_ceilings[1] = m2->m_lock.m_ceilings[0]; else m->m_lock.m_ceilings[1] = -1; } int __pthread_mutex_init(pthread_mutex_t *mutex, const pthread_mutexattr_t *mutex_attr) { - return mutex_init(mutex, mutex_attr ? *mutex_attr : NULL, calloc); + struct pthread_mutex *pmtx; + int ret; + + if (mutex_attr != NULL) { + ret = mutex_check_attr(*mutex_attr); + if (ret != 0) + return (ret); + } + if (mutex_attr == NULL || + (*mutex_attr)->m_pshared == PTHREAD_PROCESS_PRIVATE) { + return (mutex_init(mutex, mutex_attr ? *mutex_attr : NULL, + calloc)); + } + pmtx = __thr_pshared_offpage(mutex, 1); + if (pmtx == NULL) + return (EFAULT); + *mutex = THR_PSHARED_PTR; + mutex_init_body(pmtx, *mutex_attr); + return (0); } /* This function is used internally by malloc. */ int _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex, void *(calloc_cb)(size_t, size_t)) { static const struct pthread_mutex_attr attr = { .m_type = PTHREAD_MUTEX_NORMAL, .m_protocol = PTHREAD_PRIO_NONE, - .m_ceiling = 0 + .m_ceiling = 0, + .m_pshared = PTHREAD_PROCESS_PRIVATE, }; int ret; ret = mutex_init(mutex, &attr, calloc_cb); if (ret == 0) (*mutex)->m_flags |= PMUTEX_FLAG_PRIVATE; return (ret); } -void -_mutex_fork(struct pthread *curthread) +/* + * Fix mutex ownership for child process. + * + * Process-private mutex ownership is transmitted from the forking + * thread to the child process. + * + * Process-shared mutexes should not be inherited because their owner + * is the forking thread, which lives in the parent process; they are + * removed from the owned mutex list. + */ +static void +queue_fork(struct pthread *curthread, struct mutex_queue *q, + struct mutex_queue *qp, uint bit) { struct pthread_mutex *m; - /* - * Fix mutex ownership for child process.
- * note that process shared mutex should not - * be inherited because owner is forking thread - * which is in parent process, they should be - * removed from the owned mutex list, current, - * process shared mutex is not supported, so I - * am not worried. - */ + TAILQ_INIT(q); + TAILQ_FOREACH(m, qp, m_pqe) { + TAILQ_INSERT_TAIL(q, m, m_qe); + m->m_lock.m_owner = TID(curthread) | bit; + m->m_owner = TID(curthread); + } +} - TAILQ_FOREACH(m, &curthread->mutexq, m_qe) - m->m_lock.m_owner = TID(curthread); - TAILQ_FOREACH(m, &curthread->pp_mutexq, m_qe) - m->m_lock.m_owner = TID(curthread) | UMUTEX_CONTESTED; +void +_mutex_fork(struct pthread *curthread) +{ + + queue_fork(curthread, &curthread->mq[TMQ_NORM], + &curthread->mq[TMQ_NORM_PRIV], 0); + queue_fork(curthread, &curthread->mq[TMQ_NORM_PP], + &curthread->mq[TMQ_NORM_PP_PRIV], UMUTEX_CONTESTED); } int _pthread_mutex_destroy(pthread_mutex_t *mutex) { - pthread_mutex_t m; + pthread_mutex_t m, m1; int ret; m = *mutex; if (m < THR_MUTEX_DESTROYED) { ret = 0; } else if (m == THR_MUTEX_DESTROYED) { ret = EINVAL; } else { - if (m->m_owner != NULL) { + if (m == THR_PSHARED_PTR) { + m1 = __thr_pshared_offpage(mutex, 0); + if (m1 != NULL) { + mutex_assert_not_owned(m1); + __thr_pshared_destroy(mutex); + } + *mutex = THR_MUTEX_DESTROYED; + return (0); + } + if (m->m_owner != 0) { ret = EBUSY; } else { *mutex = THR_MUTEX_DESTROYED; - MUTEX_ASSERT_NOT_OWNED(m); + mutex_assert_not_owned(m); free(m); ret = 0; } } return (ret); } -#define ENQUEUE_MUTEX(curthread, m) \ - do { \ - (m)->m_owner = curthread; \ - /* Add to the list of owned mutexes: */ \ - MUTEX_ASSERT_NOT_OWNED((m)); \ - if (((m)->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0) \ - TAILQ_INSERT_TAIL(&curthread->mutexq, (m), m_qe);\ - else \ - TAILQ_INSERT_TAIL(&curthread->pp_mutexq, (m), m_qe);\ - } while (0) +static int +mutex_qidx(struct pthread_mutex *m) +{ -#define DEQUEUE_MUTEX(curthread, m) \ - (m)->m_owner = NULL; \ - MUTEX_ASSERT_IS_OWNED(m); \ - if (__predict_true(((m)->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0)) \ - TAILQ_REMOVE(&curthread->mutexq, (m), m_qe); \ - else { \ - TAILQ_REMOVE(&curthread->pp_mutexq, (m), m_qe); \ - set_inherited_priority(curthread, m); \ - } \ - MUTEX_INIT_LINK(m); + if ((m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0) + return (TMQ_NORM); + return (TMQ_NORM_PP); +} -#define CHECK_AND_INIT_MUTEX \ - if (__predict_false((m = *mutex) <= THR_MUTEX_DESTROYED)) { \ - if (m == THR_MUTEX_DESTROYED) \ - return (EINVAL); \ - int ret; \ - ret = init_static(_get_curthread(), mutex); \ - if (ret) \ - return (ret); \ - m = *mutex; \ - } +static void +enqueue_mutex(struct pthread *curthread, struct pthread_mutex *m) +{ + int qidx; + m->m_owner = TID(curthread); + /* Add to the list of owned mutexes: */ + mutex_assert_not_owned(m); + qidx = mutex_qidx(m); + TAILQ_INSERT_TAIL(&curthread->mq[qidx], m, m_qe); + if (!is_pshared_mutex(m)) + TAILQ_INSERT_TAIL(&curthread->mq[qidx + 1], m, m_pqe); +} + +static void +dequeue_mutex(struct pthread *curthread, struct pthread_mutex *m) +{ + int qidx; + + m->m_owner = 0; + mutex_assert_is_owned(m); + qidx = mutex_qidx(m); + TAILQ_REMOVE(&curthread->mq[qidx], m, m_qe); + if (!is_pshared_mutex(m)) + TAILQ_REMOVE(&curthread->mq[qidx + 1], m, m_pqe); + if ((m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) != 0) + set_inherited_priority(curthread, m); + mutex_init_link(m); +} + static int -mutex_trylock_common(pthread_mutex_t *mutex) +check_and_init_mutex(pthread_mutex_t *mutex, struct pthread_mutex **m) { - struct pthread *curthread = 
_get_curthread(); - struct pthread_mutex *m = *mutex; + int ret; + + *m = *mutex; + ret = 0; + if (*m == THR_PSHARED_PTR) { + *m = __thr_pshared_offpage(mutex, 0); + if (*m == NULL) + ret = EINVAL; + } else if (__predict_false(*m <= THR_MUTEX_DESTROYED)) { + if (*m == THR_MUTEX_DESTROYED) { + ret = EINVAL; + } else { + ret = init_static(_get_curthread(), mutex); + if (ret == 0) + *m = *mutex; + } + } + return (ret); +} + +int +__pthread_mutex_trylock(pthread_mutex_t *mutex) +{ + struct pthread *curthread; + struct pthread_mutex *m; uint32_t id; int ret; + ret = check_and_init_mutex(mutex, &m); + if (ret != 0) + return (ret); + curthread = _get_curthread(); id = TID(curthread); if (m->m_flags & PMUTEX_FLAG_PRIVATE) THR_CRITICAL_ENTER(curthread); ret = _thr_umutex_trylock(&m->m_lock, id); if (__predict_true(ret == 0)) { - ENQUEUE_MUTEX(curthread, m); - } else if (m->m_owner == curthread) { + enqueue_mutex(curthread, m); + } else if (m->m_owner == id) { ret = mutex_self_trylock(m); } /* else {} */ if (ret && (m->m_flags & PMUTEX_FLAG_PRIVATE)) THR_CRITICAL_LEAVE(curthread); return (ret); } -int -__pthread_mutex_trylock(pthread_mutex_t *mutex) -{ - struct pthread_mutex *m; - - CHECK_AND_INIT_MUTEX - - return (mutex_trylock_common(mutex)); -} - static int mutex_lock_sleep(struct pthread *curthread, struct pthread_mutex *m, const struct timespec *abstime) { uint32_t id, owner; int count; int ret; - if (m->m_owner == curthread) - return mutex_self_lock(m, abstime); - id = TID(curthread); + if (m->m_owner == id) + return (mutex_self_lock(m, abstime)); + /* * For adaptive mutexes, spin for a bit in the expectation * that if the application requests this mutex type then * the lock is likely to be released quickly and it is * faster than entering the kernel */ if (__predict_false( (m->m_lock.m_flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) != 0)) goto sleep_in_kernel; if (!_thr_is_smp) goto yield_loop; count = m->m_spinloops; while (count--) { owner = m->m_lock.m_owner; if ((owner & ~UMUTEX_CONTESTED) == 0) { if (atomic_cmpset_acq_32(&m->m_lock.m_owner, owner, id|owner)) { ret = 0; goto done; } } CPU_SPINWAIT; } yield_loop: count = m->m_yieldloops; while (count--) { _sched_yield(); owner = m->m_lock.m_owner; if ((owner & ~UMUTEX_CONTESTED) == 0) { if (atomic_cmpset_acq_32(&m->m_lock.m_owner, owner, id|owner)) { ret = 0; goto done; } } } sleep_in_kernel: if (abstime == NULL) { ret = __thr_umutex_lock(&m->m_lock, id); } else if (__predict_false( abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000)) { ret = EINVAL; } else { ret = __thr_umutex_timedlock(&m->m_lock, id, abstime); } done: if (ret == 0) - ENQUEUE_MUTEX(curthread, m); + enqueue_mutex(curthread, m); return (ret); } static inline int mutex_lock_common(struct pthread_mutex *m, const struct timespec *abstime, int cvattach) { struct pthread *curthread = _get_curthread(); int ret; if (!cvattach && m->m_flags & PMUTEX_FLAG_PRIVATE) THR_CRITICAL_ENTER(curthread); if (_thr_umutex_trylock2(&m->m_lock, TID(curthread)) == 0) { - ENQUEUE_MUTEX(curthread, m); + enqueue_mutex(curthread, m); ret = 0; } else { ret = mutex_lock_sleep(curthread, m, abstime); } if (ret && (m->m_flags & PMUTEX_FLAG_PRIVATE) && !cvattach) THR_CRITICAL_LEAVE(curthread); return (ret); } int __pthread_mutex_lock(pthread_mutex_t *mutex) { - struct pthread_mutex *m; + struct pthread_mutex *m; + int ret; _thr_check_init(); - - CHECK_AND_INIT_MUTEX - - return (mutex_lock_common(m, NULL, 0)); + ret = check_and_init_mutex(mutex, &m); + if (ret == 0) + ret = mutex_lock_common(m, 
NULL, 0); + return (ret); } int -__pthread_mutex_timedlock(pthread_mutex_t *mutex, const struct timespec *abstime) +__pthread_mutex_timedlock(pthread_mutex_t *mutex, + const struct timespec *abstime) { - struct pthread_mutex *m; + struct pthread_mutex *m; + int ret; _thr_check_init(); - - CHECK_AND_INIT_MUTEX - - return (mutex_lock_common(m, abstime, 0)); + ret = check_and_init_mutex(mutex, &m); + if (ret == 0) + ret = mutex_lock_common(m, abstime, 0); + return (ret); } int _pthread_mutex_unlock(pthread_mutex_t *mutex) { struct pthread_mutex *mp; - mp = *mutex; + if (*mutex == THR_PSHARED_PTR) { + mp = __thr_pshared_offpage(mutex, 0); + if (mp == NULL) + return (EINVAL); + } else { + mp = *mutex; + } return (mutex_unlock_common(mp, 0, NULL)); } int _mutex_cv_lock(struct pthread_mutex *m, int count) { int error; error = mutex_lock_common(m, NULL, 1); if (error == 0) m->m_count = count; return (error); } int _mutex_cv_unlock(struct pthread_mutex *m, int *count, int *defer) { /* * Clear the count in case this is a recursive mutex. */ *count = m->m_count; m->m_count = 0; (void)mutex_unlock_common(m, 1, defer); return (0); } int _mutex_cv_attach(struct pthread_mutex *m, int count) { struct pthread *curthread = _get_curthread(); - ENQUEUE_MUTEX(curthread, m); + enqueue_mutex(curthread, m); m->m_count = count; return (0); } int _mutex_cv_detach(struct pthread_mutex *mp, int *recurse) { struct pthread *curthread = _get_curthread(); int defered; int error; if ((error = _mutex_owned(curthread, mp)) != 0) return (error); /* * Clear the count in case this is a recursive mutex. */ *recurse = mp->m_count; mp->m_count = 0; - DEQUEUE_MUTEX(curthread, mp); + dequeue_mutex(curthread, mp); /* Will this happen in real-world ? */ if ((mp->m_flags & PMUTEX_FLAG_DEFERED) != 0) { defered = 1; mp->m_flags &= ~PMUTEX_FLAG_DEFERED; } else defered = 0; if (defered) { _thr_wake_all(curthread->defer_waiters, curthread->nwaiter_defer); curthread->nwaiter_defer = 0; } return (0); } static int mutex_self_trylock(struct pthread_mutex *m) { int ret; switch (PMUTEX_TYPE(m->m_flags)) { case PTHREAD_MUTEX_ERRORCHECK: case PTHREAD_MUTEX_NORMAL: case PTHREAD_MUTEX_ADAPTIVE_NP: ret = EBUSY; break; case PTHREAD_MUTEX_RECURSIVE: /* Increment the lock count: */ if (m->m_count + 1 > 0) { m->m_count++; ret = 0; } else ret = EAGAIN; break; default: /* Trap invalid mutex types; */ ret = EINVAL; } return (ret); } static int mutex_self_lock(struct pthread_mutex *m, const struct timespec *abstime) { struct timespec ts1, ts2; int ret; switch (PMUTEX_TYPE(m->m_flags)) { case PTHREAD_MUTEX_ERRORCHECK: case PTHREAD_MUTEX_ADAPTIVE_NP: if (abstime) { if (abstime->tv_sec < 0 || abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000) { ret = EINVAL; } else { clock_gettime(CLOCK_REALTIME, &ts1); TIMESPEC_SUB(&ts2, abstime, &ts1); __sys_nanosleep(&ts2, NULL); ret = ETIMEDOUT; } } else { /* * POSIX specifies that mutexes should return * EDEADLK if a recursive lock is detected. */ ret = EDEADLK; } break; case PTHREAD_MUTEX_NORMAL: /* * What SS2 define as a 'normal' mutex. Intentionally * deadlock on attempts to get a lock you already own. 
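 *
 * Summarizing the self-lock semantics implemented by this switch:
 * ERRORCHECK and ADAPTIVE mutexes fail with EDEADLK (or sleep out the
 * timeout and return ETIMEDOUT in the timed variants), NORMAL mutexes
 * deadlock by sleeping forever, and RECURSIVE mutexes just bump
 * m_count. For example (illustrative application code):
 *
 *	pthread_mutexattr_t ma;
 *	pthread_mutex_t m;
 *
 *	pthread_mutexattr_init(&ma);
 *	pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
 *	pthread_mutex_init(&m, &ma);
 *	pthread_mutex_lock(&m);
 *	assert(pthread_mutex_lock(&m) == EDEADLK);
 *
 * With PTHREAD_MUTEX_NORMAL instead, the second lock call would never
 * return.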
*/ ret = 0; if (abstime) { if (abstime->tv_sec < 0 || abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000) { ret = EINVAL; } else { clock_gettime(CLOCK_REALTIME, &ts1); TIMESPEC_SUB(&ts2, abstime, &ts1); __sys_nanosleep(&ts2, NULL); ret = ETIMEDOUT; } } else { ts1.tv_sec = 30; ts1.tv_nsec = 0; for (;;) __sys_nanosleep(&ts1, NULL); } break; case PTHREAD_MUTEX_RECURSIVE: /* Increment the lock count: */ if (m->m_count + 1 > 0) { m->m_count++; ret = 0; } else ret = EAGAIN; break; default: /* Trap invalid mutex types; */ ret = EINVAL; } return (ret); } static int mutex_unlock_common(struct pthread_mutex *m, int cv, int *mtx_defer) { struct pthread *curthread = _get_curthread(); uint32_t id; int defered, error; if (__predict_false(m <= THR_MUTEX_DESTROYED)) { if (m == THR_MUTEX_DESTROYED) return (EINVAL); return (EPERM); } + id = TID(curthread); + /* * Check if the running thread is not the owner of the mutex. */ - if (__predict_false(m->m_owner != curthread)) + if (__predict_false(m->m_owner != id)) return (EPERM); error = 0; - id = TID(curthread); if (__predict_false( PMUTEX_TYPE(m->m_flags) == PTHREAD_MUTEX_RECURSIVE && m->m_count > 0)) { m->m_count--; } else { if ((m->m_flags & PMUTEX_FLAG_DEFERED) != 0) { defered = 1; m->m_flags &= ~PMUTEX_FLAG_DEFERED; } else defered = 0; - DEQUEUE_MUTEX(curthread, m); + dequeue_mutex(curthread, m); error = _thr_umutex_unlock2(&m->m_lock, id, mtx_defer); if (mtx_defer == NULL && defered) { _thr_wake_all(curthread->defer_waiters, curthread->nwaiter_defer); curthread->nwaiter_defer = 0; } } if (!cv && m->m_flags & PMUTEX_FLAG_PRIVATE) THR_CRITICAL_LEAVE(curthread); return (error); } int _pthread_mutex_getprioceiling(pthread_mutex_t *mutex, - int *prioceiling) + int *prioceiling) { struct pthread_mutex *m; - int ret; - m = *mutex; - if ((m <= THR_MUTEX_DESTROYED) || - (m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0) - ret = EINVAL; - else { - *prioceiling = m->m_lock.m_ceilings[0]; - ret = 0; + if (*mutex == THR_PSHARED_PTR) { + m = __thr_pshared_offpage(mutex, 0); + if (m == NULL) + return (EINVAL); + } else { + m = *mutex; + if (m <= THR_MUTEX_DESTROYED) + return (EINVAL); } - - return (ret); + if ((m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0) + return (EINVAL); + *prioceiling = m->m_lock.m_ceilings[0]; + return (0); } int _pthread_mutex_setprioceiling(pthread_mutex_t *mutex, - int ceiling, int *old_ceiling) + int ceiling, int *old_ceiling) { - struct pthread *curthread = _get_curthread(); + struct pthread *curthread; struct pthread_mutex *m, *m1, *m2; + struct mutex_queue *q, *qp; int ret; - m = *mutex; - if ((m <= THR_MUTEX_DESTROYED) || - (m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0) + if (*mutex == THR_PSHARED_PTR) { + m = __thr_pshared_offpage(mutex, 0); + if (m == NULL) + return (EINVAL); + } else { + m = *mutex; + if (m <= THR_MUTEX_DESTROYED) + return (EINVAL); + } + if ((m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0) return (EINVAL); ret = __thr_umutex_set_ceiling(&m->m_lock, ceiling, old_ceiling); if (ret != 0) return (ret); - if (m->m_owner == curthread) { - MUTEX_ASSERT_IS_OWNED(m); + curthread = _get_curthread(); + if (m->m_owner == TID(curthread)) { + mutex_assert_is_owned(m); m1 = TAILQ_PREV(m, mutex_queue, m_qe); m2 = TAILQ_NEXT(m, m_qe); if ((m1 != NULL && m1->m_lock.m_ceilings[0] > (u_int)ceiling) || (m2 != NULL && m2->m_lock.m_ceilings[0] < (u_int)ceiling)) { - TAILQ_REMOVE(&curthread->pp_mutexq, m, m_qe); - TAILQ_FOREACH(m2, &curthread->pp_mutexq, m_qe) { + q = &curthread->mq[TMQ_NORM_PP]; + qp = &curthread->mq[TMQ_NORM_PP_PRIV]; + 
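/*
+		 * Editor's note: the new ceiling may break the ordering
+		 * of the PRIO_PROTECT queue, so m is pulled from the
+		 * full queue (q) and, unless it is process-shared, from
+		 * the private queue (qp), then reinserted below in
+		 * ceiling order.
+		 */
+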
TAILQ_REMOVE(q, m, m_qe); + if (!is_pshared_mutex(m)) + TAILQ_REMOVE(qp, m, m_pqe); + TAILQ_FOREACH(m2, q, m_qe) { if (m2->m_lock.m_ceilings[0] > (u_int)ceiling) { TAILQ_INSERT_BEFORE(m2, m, m_qe); + if (!is_pshared_mutex(m)) { + while (m2 != NULL && + is_pshared_mutex(m2)) { + m2 = TAILQ_PREV(m2, + mutex_queue, m_qe); + } + if (m2 == NULL) { + TAILQ_INSERT_HEAD(qp, + m, m_pqe); + } else { + TAILQ_INSERT_BEFORE(m2, + m, m_pqe); + } + } return (0); } } - TAILQ_INSERT_TAIL(&curthread->pp_mutexq, m, m_qe); + TAILQ_INSERT_TAIL(q, m, m_qe); + if (!is_pshared_mutex(m)) + TAILQ_INSERT_TAIL(qp, m, m_pqe); } } return (0); } int _pthread_mutex_getspinloops_np(pthread_mutex_t *mutex, int *count) { - struct pthread_mutex *m; + struct pthread_mutex *m; + int ret; - CHECK_AND_INIT_MUTEX - - *count = m->m_spinloops; - return (0); + ret = check_and_init_mutex(mutex, &m); + if (ret == 0) + *count = m->m_spinloops; + return (ret); } int __pthread_mutex_setspinloops_np(pthread_mutex_t *mutex, int count) { - struct pthread_mutex *m; + struct pthread_mutex *m; + int ret; - CHECK_AND_INIT_MUTEX - - m->m_spinloops = count; - return (0); + ret = check_and_init_mutex(mutex, &m); + if (ret == 0) + m->m_spinloops = count; + return (ret); } int _pthread_mutex_getyieldloops_np(pthread_mutex_t *mutex, int *count) { - struct pthread_mutex *m; + struct pthread_mutex *m; + int ret; - CHECK_AND_INIT_MUTEX - - *count = m->m_yieldloops; - return (0); + ret = check_and_init_mutex(mutex, &m); + if (ret == 0) + *count = m->m_yieldloops; + return (ret); } int __pthread_mutex_setyieldloops_np(pthread_mutex_t *mutex, int count) { - struct pthread_mutex *m; + struct pthread_mutex *m; + int ret; - CHECK_AND_INIT_MUTEX - - m->m_yieldloops = count; + ret = check_and_init_mutex(mutex, &m); + if (ret == 0) + m->m_yieldloops = count; - return (0); + return (ret); } int _pthread_mutex_isowned_np(pthread_mutex_t *mutex) { struct pthread_mutex *m; - m = *mutex; - if (m <= THR_MUTEX_DESTROYED) - return (0); - return (m->m_owner == _get_curthread()); + if (*mutex == THR_PSHARED_PTR) { + m = __thr_pshared_offpage(mutex, 0); + if (m == NULL) + return (0); + } else { + m = *mutex; + if (m <= THR_MUTEX_DESTROYED) + return (0); + } + return (m->m_owner == TID(_get_curthread())); } int _mutex_owned(struct pthread *curthread, const struct pthread_mutex *mp) { if (__predict_false(mp <= THR_MUTEX_DESTROYED)) { if (mp == THR_MUTEX_DESTROYED) return (EINVAL); return (EPERM); } - if (mp->m_owner != curthread) + if (mp->m_owner != TID(curthread)) return (EPERM); return (0); } Index: head/lib/libthr/thread/thr_mutexattr.c =================================================================== --- head/lib/libthr/thread/thr_mutexattr.c (revision 296161) +++ head/lib/libthr/thread/thr_mutexattr.c (revision 296162) @@ -1,255 +1,252 @@ /* * Copyright (c) 1996 Jeffrey Hsu . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Copyright (c) 1997 John Birrell . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include "namespace.h" #include #include #include #include #include #include "un-namespace.h" #include "thr_private.h" __weak_reference(_pthread_mutexattr_init, pthread_mutexattr_init); __weak_reference(_pthread_mutexattr_setkind_np, pthread_mutexattr_setkind_np); __weak_reference(_pthread_mutexattr_getkind_np, pthread_mutexattr_getkind_np); __weak_reference(_pthread_mutexattr_gettype, pthread_mutexattr_gettype); __weak_reference(_pthread_mutexattr_settype, pthread_mutexattr_settype); __weak_reference(_pthread_mutexattr_destroy, pthread_mutexattr_destroy); __weak_reference(_pthread_mutexattr_getpshared, pthread_mutexattr_getpshared); __weak_reference(_pthread_mutexattr_setpshared, pthread_mutexattr_setpshared); __weak_reference(_pthread_mutexattr_getprotocol, pthread_mutexattr_getprotocol); __weak_reference(_pthread_mutexattr_setprotocol, pthread_mutexattr_setprotocol); __weak_reference(_pthread_mutexattr_getprioceiling, pthread_mutexattr_getprioceiling); __weak_reference(_pthread_mutexattr_setprioceiling, pthread_mutexattr_setprioceiling); int _pthread_mutexattr_init(pthread_mutexattr_t *attr) { int ret; pthread_mutexattr_t pattr; if ((pattr = (pthread_mutexattr_t) malloc(sizeof(struct pthread_mutex_attr))) == NULL) { ret = ENOMEM; } else { memcpy(pattr, &_pthread_mutexattr_default, sizeof(struct pthread_mutex_attr)); *attr = pattr; ret = 0; } return (ret); } int _pthread_mutexattr_setkind_np(pthread_mutexattr_t *attr, int kind) { int ret; if (attr == NULL || *attr == NULL) { errno = EINVAL; ret = -1; } else { (*attr)->m_type = kind; ret = 0; } return(ret); } int _pthread_mutexattr_getkind_np(pthread_mutexattr_t attr) { int ret; if (attr == NULL) { errno = EINVAL; ret = -1; } else { ret = attr->m_type; } return(ret); } int _pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type) { int ret; if (attr == NULL || *attr == NULL || type >= PTHREAD_MUTEX_TYPE_MAX) { ret = EINVAL; } else { (*attr)->m_type = type; ret = 0; } return(ret); } int _pthread_mutexattr_gettype(pthread_mutexattr_t *attr, int *type) { int ret; if (attr == NULL || *attr == NULL || (*attr)->m_type >= PTHREAD_MUTEX_TYPE_MAX) { ret = EINVAL; } else { *type = (*attr)->m_type; ret = 0; } return ret; } int _pthread_mutexattr_destroy(pthread_mutexattr_t *attr) { int ret; if (attr == NULL || *attr == NULL) { ret = EINVAL; } else { free(*attr); *attr = NULL; ret = 0; } return(ret); } int _pthread_mutexattr_getpshared(const pthread_mutexattr_t *attr, int *pshared) { if (attr == NULL || *attr == NULL) return (EINVAL); - - *pshared = PTHREAD_PROCESS_PRIVATE; + *pshared = (*attr)->m_pshared; return (0); } int _pthread_mutexattr_setpshared(pthread_mutexattr_t *attr, int pshared) { - if (attr == NULL || *attr == NULL) + if (attr == NULL || *attr == NULL || + (pshared != PTHREAD_PROCESS_PRIVATE && + pshared != PTHREAD_PROCESS_SHARED)) return (EINVAL); - - /* Only PTHREAD_PROCESS_PRIVATE is supported. 
*/ - if (pshared != PTHREAD_PROCESS_PRIVATE) - return (EINVAL); - + (*attr)->m_pshared = pshared; return (0); } int _pthread_mutexattr_getprotocol(pthread_mutexattr_t *mattr, int *protocol) { int ret = 0; if ((mattr == NULL) || (*mattr == NULL)) ret = EINVAL; else *protocol = (*mattr)->m_protocol; return(ret); } int _pthread_mutexattr_setprotocol(pthread_mutexattr_t *mattr, int protocol) { int ret = 0; if ((mattr == NULL) || (*mattr == NULL) || (protocol < PTHREAD_PRIO_NONE) || (protocol > PTHREAD_PRIO_PROTECT)) ret = EINVAL; else { (*mattr)->m_protocol = protocol; (*mattr)->m_ceiling = THR_MAX_RR_PRIORITY; } return(ret); } int _pthread_mutexattr_getprioceiling(pthread_mutexattr_t *mattr, int *prioceiling) { int ret = 0; if ((mattr == NULL) || (*mattr == NULL)) ret = EINVAL; else if ((*mattr)->m_protocol != PTHREAD_PRIO_PROTECT) ret = EINVAL; else *prioceiling = (*mattr)->m_ceiling; return(ret); } int _pthread_mutexattr_setprioceiling(pthread_mutexattr_t *mattr, int prioceiling) { int ret = 0; if ((mattr == NULL) || (*mattr == NULL)) ret = EINVAL; else if ((*mattr)->m_protocol != PTHREAD_PRIO_PROTECT) ret = EINVAL; else (*mattr)->m_ceiling = prioceiling; return(ret); } Index: head/lib/libthr/thread/thr_private.h =================================================================== --- head/lib/libthr/thread/thr_private.h (revision 296161) +++ head/lib/libthr/thread/thr_private.h (revision 296162) @@ -1,941 +1,958 @@ /* * Copyright (C) 2005 Daniel M. Eischen * Copyright (c) 2005 David Xu * Copyright (c) 1995-1998 John Birrell . * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _THR_PRIVATE_H #define _THR_PRIVATE_H /* * Include files. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define SYM_FB10(sym) __CONCAT(sym, _fb10) #define SYM_FBP10(sym) __CONCAT(sym, _fbp10) #define WEAK_REF(sym, alias) __weak_reference(sym, alias) #define SYM_COMPAT(sym, impl, ver) __sym_compat(sym, impl, ver) #define SYM_DEFAULT(sym, impl, ver) __sym_default(sym, impl, ver) #define FB10_COMPAT(func, sym) \ WEAK_REF(func, SYM_FB10(sym)); \ SYM_COMPAT(sym, SYM_FB10(sym), FBSD_1.0) #define FB10_COMPAT_PRIVATE(func, sym) \ WEAK_REF(func, SYM_FBP10(sym)); \ SYM_DEFAULT(sym, SYM_FBP10(sym), FBSDprivate_1.0) #include "pthread_md.h" #include "thr_umtx.h" #include "thread_db.h" #ifdef _PTHREAD_FORCED_UNWIND #define _BSD_SOURCE #include #endif typedef TAILQ_HEAD(pthreadlist, pthread) pthreadlist; typedef TAILQ_HEAD(atfork_head, pthread_atfork) atfork_head; TAILQ_HEAD(mutex_queue, pthread_mutex); /* Signal to do cancellation */ #define SIGCANCEL SIGTHR /* * Kernel fatal error handler macro. */ #define PANIC(string) _thread_exit(__FILE__,__LINE__,string) /* Output debug messages like this: */ #define stdout_debug(args...) _thread_printf(STDOUT_FILENO, ##args) #define stderr_debug(args...) _thread_printf(STDERR_FILENO, ##args) #ifdef _PTHREADS_INVARIANTS #define THR_ASSERT(cond, msg) do { \ if (__predict_false(!(cond))) \ PANIC(msg); \ } while (0) #else #define THR_ASSERT(cond, msg) #endif #ifdef PIC # define STATIC_LIB_REQUIRE(name) #else # define STATIC_LIB_REQUIRE(name) __asm (".globl " #name) #endif #define TIMESPEC_ADD(dst, src, val) \ do { \ (dst)->tv_sec = (src)->tv_sec + (val)->tv_sec; \ (dst)->tv_nsec = (src)->tv_nsec + (val)->tv_nsec; \ if ((dst)->tv_nsec >= 1000000000) { \ (dst)->tv_sec++; \ (dst)->tv_nsec -= 1000000000; \ } \ } while (0) #define TIMESPEC_SUB(dst, src, val) \ do { \ (dst)->tv_sec = (src)->tv_sec - (val)->tv_sec; \ (dst)->tv_nsec = (src)->tv_nsec - (val)->tv_nsec; \ if ((dst)->tv_nsec < 0) { \ (dst)->tv_sec--; \ (dst)->tv_nsec += 1000000000; \ } \ } while (0) +/* Magic cookie set for shared pthread locks and cv's pointers */ +#define THR_PSHARED_PTR \ + ((void *)(uintptr_t)((1ULL << (NBBY * sizeof(long) - 1)) | 1)) + /* XXX These values should be same as those defined in pthread.h */ #define THR_MUTEX_INITIALIZER ((struct pthread_mutex *)NULL) #define THR_ADAPTIVE_MUTEX_INITIALIZER ((struct pthread_mutex *)1) #define THR_MUTEX_DESTROYED ((struct pthread_mutex *)2) #define THR_COND_INITIALIZER ((struct pthread_cond *)NULL) #define THR_COND_DESTROYED ((struct pthread_cond *)1) #define THR_RWLOCK_INITIALIZER ((struct pthread_rwlock *)NULL) #define THR_RWLOCK_DESTROYED ((struct pthread_rwlock *)1) #define PMUTEX_FLAG_TYPE_MASK 0x0ff #define PMUTEX_FLAG_PRIVATE 0x100 #define PMUTEX_FLAG_DEFERED 0x200 #define PMUTEX_TYPE(mtxflags) ((mtxflags) & PMUTEX_FLAG_TYPE_MASK) #define MAX_DEFER_WAITERS 50 struct pthread_mutex { /* * Lock for accesses to this structure. */ struct umutex m_lock; int m_flags; - struct pthread *m_owner; + uint32_t m_owner; int m_count; int m_spinloops; int m_yieldloops; /* - * Link for all mutexes a thread currently owns. + * Link for all mutexes a thread currently owns, of the same + * prio type. */ TAILQ_ENTRY(pthread_mutex) m_qe; + /* Link for all private mutexes a thread currently owns. 
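+	 * (Editor's note: process-shared mutexes are linked only via
+	 * m_qe; the is_pshared_mutex() checks in thr_mutex.c keep
+	 * them off this private list.)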
*/ + TAILQ_ENTRY(pthread_mutex) m_pqe; }; struct pthread_mutex_attr { enum pthread_mutextype m_type; int m_protocol; int m_ceiling; + int m_pshared; }; #define PTHREAD_MUTEXATTR_STATIC_INITIALIZER \ { PTHREAD_MUTEX_DEFAULT, PTHREAD_PRIO_NONE, 0, MUTEX_FLAGS_PRIVATE } struct pthread_cond { __uint32_t __has_user_waiters; __uint32_t __has_kern_waiters; __uint32_t __flags; __uint32_t __clock_id; }; struct pthread_cond_attr { int c_pshared; int c_clockid; }; struct pthread_barrier { struct umutex b_lock; struct ucond b_cv; int64_t b_cycle; int b_count; int b_waiters; int b_refcount; int b_destroying; }; struct pthread_barrierattr { int pshared; }; struct pthread_spinlock { struct umutex s_lock; }; /* * Flags for condition variables. */ #define COND_FLAGS_PRIVATE 0x01 #define COND_FLAGS_INITED 0x02 #define COND_FLAGS_BUSY 0x04 /* * Cleanup definitions. */ struct pthread_cleanup { struct pthread_cleanup *prev; void (*routine)(void *); void *routine_arg; int onheap; }; #define THR_CLEANUP_PUSH(td, func, arg) { \ struct pthread_cleanup __cup; \ \ __cup.routine = func; \ __cup.routine_arg = arg; \ __cup.onheap = 0; \ __cup.prev = (td)->cleanup; \ (td)->cleanup = &__cup; #define THR_CLEANUP_POP(td, exec) \ (td)->cleanup = __cup.prev; \ if ((exec) != 0) \ __cup.routine(__cup.routine_arg); \ } struct pthread_atfork { TAILQ_ENTRY(pthread_atfork) qe; void (*prepare)(void); void (*parent)(void); void (*child)(void); }; struct pthread_attr { #define pthread_attr_start_copy sched_policy int sched_policy; int sched_inherit; int prio; int suspend; #define THR_STACK_USER 0x100 /* 0xFF reserved for */ int flags; void *stackaddr_attr; size_t stacksize_attr; size_t guardsize_attr; #define pthread_attr_end_copy cpuset cpuset_t *cpuset; size_t cpusetsize; }; struct wake_addr { struct wake_addr *link; unsigned int value; char pad[12]; }; struct sleepqueue { TAILQ_HEAD(, pthread) sq_blocked; SLIST_HEAD(, sleepqueue) sq_freeq; LIST_ENTRY(sleepqueue) sq_hash; SLIST_ENTRY(sleepqueue) sq_flink; void *sq_wchan; int sq_type; }; /* * Thread creation state attributes. */ #define THR_CREATE_RUNNING 0 #define THR_CREATE_SUSPENDED 1 /* * Miscellaneous definitions. */ #define THR_STACK_DEFAULT (sizeof(void *) / 4 * 1024 * 1024) /* * Maximum size of initial thread's stack. This perhaps deserves to be larger * than the stacks of other threads, since many applications are likely to run * almost entirely on this stack. */ #define THR_STACK_INITIAL (THR_STACK_DEFAULT * 2) /* * Define priorities returned by kernel. */ #define THR_MIN_PRIORITY (_thr_priorities[SCHED_OTHER-1].pri_min) #define THR_MAX_PRIORITY (_thr_priorities[SCHED_OTHER-1].pri_max) #define THR_DEF_PRIORITY (_thr_priorities[SCHED_OTHER-1].pri_default) #define THR_MIN_RR_PRIORITY (_thr_priorities[SCHED_RR-1].pri_min) #define THR_MAX_RR_PRIORITY (_thr_priorities[SCHED_RR-1].pri_max) #define THR_DEF_RR_PRIORITY (_thr_priorities[SCHED_RR-1].pri_default) /* XXX The SCHED_FIFO should have same priority range as SCHED_RR */ #define THR_MIN_FIFO_PRIORITY (_thr_priorities[SCHED_FIFO_1].pri_min) #define THR_MAX_FIFO_PRIORITY (_thr_priorities[SCHED_FIFO-1].pri_max) #define THR_DEF_FIFO_PRIORITY (_thr_priorities[SCHED_FIFO-1].pri_default) struct pthread_prio { int pri_min; int pri_max; int pri_default; }; struct pthread_rwlockattr { int pshared; }; struct pthread_rwlock { struct urwlock lock; - struct pthread *owner; + uint32_t owner; }; /* * Thread states. 
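 * (Editor's note: the 1:1 implementation distinguishes only
 * PS_RUNNING and PS_DEAD.)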
*/ enum pthread_state { PS_RUNNING, PS_DEAD }; struct pthread_specific_elem { const void *data; int seqno; }; struct pthread_key { volatile int allocated; int seqno; void (*destructor)(void *); }; /* * lwpid_t is 32-bit but kernel thr API exports tid as long type * to preserve the ABI for the M:N model in the very early days (r131431). */ #define TID(thread) ((uint32_t) ((thread)->tid)) /* * Thread structure. */ struct pthread { #define _pthread_startzero tid /* Kernel thread id. */ long tid; #define TID_TERMINATED 1 /* * Lock for accesses to this thread structure. */ struct umutex lock; /* Internal condition variable cycle number. */ uint32_t cycle; /* How many low-level locks the thread holds. */ int locklevel; /* * Set to non-zero when this thread has entered a critical * region. We allow for recursive entries into critical regions. */ int critical_count; /* Signal blocked counter. */ int sigblock; /* Queue entry for list of all threads. */ TAILQ_ENTRY(pthread) tle; /* link for all threads in process */ /* Queue entry for GC lists. */ TAILQ_ENTRY(pthread) gcle; /* Hash queue entry. */ LIST_ENTRY(pthread) hle; /* Sleep queue entry */ TAILQ_ENTRY(pthread) wle; /* Threads reference count. */ int refcount; /* * Thread start routine, argument, stack pointer and thread * attributes. */ void *(*start_routine)(void *); void *arg; struct pthread_attr attr; #define SHOULD_CANCEL(thr) \ ((thr)->cancel_pending && (thr)->cancel_enable && \ (thr)->no_cancel == 0) /* Cancellation is enabled */ int cancel_enable; /* Cancellation request is pending */ int cancel_pending; /* Thread is at cancellation point */ int cancel_point; /* Cancellation is temporarily disabled */ int no_cancel; /* Asynchronous cancellation is enabled */ int cancel_async; /* Cancellation is in progress */ int cancelling; /* Thread temporary signal mask. */ sigset_t sigmask; /* Thread should unblock SIGCANCEL. */ int unblock_sigcancel; /* In sigsuspend state */ int in_sigsuspend; /* deferred signal info */ siginfo_t deferred_siginfo; /* signal mask to restore. */ sigset_t deferred_sigmask; /* the sigaction should be used for deferred signal. */ struct sigaction deferred_sigact; /* deferred signal delivery is performed, do not reenter. */ int deferred_run; /* Force new thread to exit. */ int force_exit; /* Thread state: */ enum pthread_state state; /* * Error variable used instead of errno. The function __error() * returns a pointer to this. */ int error; /* * The joiner is the thread that is joining to this thread. The * join status keeps track of a join operation to another thread. */ struct pthread *joiner; /* Miscellaneous flags; only set with scheduling lock held. */ int flags; #define THR_FLAGS_PRIVATE 0x0001 #define THR_FLAGS_NEED_SUSPEND 0x0002 /* thread should be suspended */ #define THR_FLAGS_SUSPENDED 0x0004 /* thread is suspended */ #define THR_FLAGS_DETACHED 0x0008 /* thread is detached */ /* Thread list flags; only set with thread list lock held. */ int tlflags; #define TLFLAGS_GC_SAFE 0x0001 /* thread safe for cleaning */ #define TLFLAGS_IN_TDLIST 0x0002 /* thread in all thread list */ #define TLFLAGS_IN_GCLIST 0x0004 /* thread in gc list */ - /* Queue of currently owned NORMAL or PRIO_INHERIT type mutexes. */ - struct mutex_queue mutexq; + /* + * Queues of the owned mutexes. The private queue must have the + * index of the corresponding full queue plus 1. 
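+	 * (Editor's note: i.e. the private variant of queue q is
+	 * mq[q + 1]; see TMQ_NORM_PRIV == TMQ_NORM + 1 and
+	 * TMQ_NORM_PP_PRIV == TMQ_NORM_PP + 1 below.)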
+ */ +#define TMQ_NORM 0 /* NORMAL or PRIO_INHERIT normal */ +#define TMQ_NORM_PRIV 1 /* NORMAL or PRIO_INHERIT normal priv */ +#define TMQ_NORM_PP 2 /* PRIO_PROTECT normal mutexes */ +#define TMQ_NORM_PP_PRIV 3 /* PRIO_PROTECT normal priv */ +#define TMQ_NITEMS 4 + struct mutex_queue mq[TMQ_NITEMS]; - /* Queue of all owned PRIO_PROTECT mutexes. */ - struct mutex_queue pp_mutexq; - void *ret; struct pthread_specific_elem *specific; int specific_data_count; /* Number rwlocks rdlocks held. */ int rdlock_count; /* * Current locks bitmap for rtld. */ int rtld_bits; /* Thread control block */ struct tcb *tcb; /* Cleanup handlers Link List */ struct pthread_cleanup *cleanup; #ifdef _PTHREAD_FORCED_UNWIND struct _Unwind_Exception ex; void *unwind_stackend; int unwind_disabled; #endif /* * Magic value to help recognize a valid thread structure * from an invalid one: */ #define THR_MAGIC ((u_int32_t) 0xd09ba115) u_int32_t magic; /* Enable event reporting */ int report_events; /* Event mask */ int event_mask; /* Event */ td_event_msg_t event_buf; /* Wait channel */ void *wchan; /* Referenced mutex. */ struct pthread_mutex *mutex_obj; /* Thread will sleep. */ int will_sleep; /* Number of threads deferred. */ int nwaiter_defer; /* Deferred threads from pthread_cond_signal. */ unsigned int *defer_waiters[MAX_DEFER_WAITERS]; #define _pthread_endzero wake_addr struct wake_addr *wake_addr; #define WAKE_ADDR(td) ((td)->wake_addr) /* Sleep queue */ struct sleepqueue *sleepqueue; }; #define THR_SHOULD_GC(thrd) \ ((thrd)->refcount == 0 && (thrd)->state == PS_DEAD && \ ((thrd)->flags & THR_FLAGS_DETACHED) != 0) #define THR_IN_CRITICAL(thrd) \ (((thrd)->locklevel > 0) || \ ((thrd)->critical_count > 0)) #define THR_CRITICAL_ENTER(thrd) \ (thrd)->critical_count++ #define THR_CRITICAL_LEAVE(thrd) \ do { \ (thrd)->critical_count--; \ _thr_ast(thrd); \ } while (0) #define THR_UMUTEX_TRYLOCK(thrd, lck) \ _thr_umutex_trylock((lck), TID(thrd)) #define THR_UMUTEX_LOCK(thrd, lck) \ _thr_umutex_lock((lck), TID(thrd)) #define THR_UMUTEX_TIMEDLOCK(thrd, lck, timo) \ _thr_umutex_timedlock((lck), TID(thrd), (timo)) #define THR_UMUTEX_UNLOCK(thrd, lck) \ _thr_umutex_unlock((lck), TID(thrd)) #define THR_LOCK_ACQUIRE(thrd, lck) \ do { \ (thrd)->locklevel++; \ _thr_umutex_lock(lck, TID(thrd)); \ } while (0) #define THR_LOCK_ACQUIRE_SPIN(thrd, lck) \ do { \ (thrd)->locklevel++; \ _thr_umutex_lock_spin(lck, TID(thrd)); \ } while (0) #ifdef _PTHREADS_INVARIANTS #define THR_ASSERT_LOCKLEVEL(thrd) \ do { \ if (__predict_false((thrd)->locklevel <= 0)) \ _thr_assert_lock_level(); \ } while (0) #else #define THR_ASSERT_LOCKLEVEL(thrd) #endif #define THR_LOCK_RELEASE(thrd, lck) \ do { \ THR_ASSERT_LOCKLEVEL(thrd); \ _thr_umutex_unlock((lck), TID(thrd)); \ (thrd)->locklevel--; \ _thr_ast(thrd); \ } while (0) #define THR_LOCK(curthrd) THR_LOCK_ACQUIRE(curthrd, &(curthrd)->lock) #define THR_UNLOCK(curthrd) THR_LOCK_RELEASE(curthrd, &(curthrd)->lock) #define THR_THREAD_LOCK(curthrd, thr) THR_LOCK_ACQUIRE(curthrd, &(thr)->lock) #define THR_THREAD_UNLOCK(curthrd, thr) THR_LOCK_RELEASE(curthrd, &(thr)->lock) #define THREAD_LIST_RDLOCK(curthrd) \ do { \ (curthrd)->locklevel++; \ _thr_rwl_rdlock(&_thr_list_lock); \ } while (0) #define THREAD_LIST_WRLOCK(curthrd) \ do { \ (curthrd)->locklevel++; \ _thr_rwl_wrlock(&_thr_list_lock); \ } while (0) #define THREAD_LIST_UNLOCK(curthrd) \ do { \ _thr_rwl_unlock(&_thr_list_lock); \ (curthrd)->locklevel--; \ _thr_ast(curthrd); \ } while (0) /* * Macros to insert/remove threads to the all thread list and * 
the gc list. */ #define THR_LIST_ADD(thrd) do { \ if (((thrd)->tlflags & TLFLAGS_IN_TDLIST) == 0) { \ TAILQ_INSERT_HEAD(&_thread_list, thrd, tle); \ _thr_hash_add(thrd); \ (thrd)->tlflags |= TLFLAGS_IN_TDLIST; \ } \ } while (0) #define THR_LIST_REMOVE(thrd) do { \ if (((thrd)->tlflags & TLFLAGS_IN_TDLIST) != 0) { \ TAILQ_REMOVE(&_thread_list, thrd, tle); \ _thr_hash_remove(thrd); \ (thrd)->tlflags &= ~TLFLAGS_IN_TDLIST; \ } \ } while (0) #define THR_GCLIST_ADD(thrd) do { \ if (((thrd)->tlflags & TLFLAGS_IN_GCLIST) == 0) { \ TAILQ_INSERT_HEAD(&_thread_gc_list, thrd, gcle);\ (thrd)->tlflags |= TLFLAGS_IN_GCLIST; \ _gc_count++; \ } \ } while (0) #define THR_GCLIST_REMOVE(thrd) do { \ if (((thrd)->tlflags & TLFLAGS_IN_GCLIST) != 0) { \ TAILQ_REMOVE(&_thread_gc_list, thrd, gcle); \ (thrd)->tlflags &= ~TLFLAGS_IN_GCLIST; \ _gc_count--; \ } \ } while (0) #define THR_REF_ADD(curthread, pthread) { \ THR_CRITICAL_ENTER(curthread); \ pthread->refcount++; \ } while (0) #define THR_REF_DEL(curthread, pthread) { \ pthread->refcount--; \ THR_CRITICAL_LEAVE(curthread); \ } while (0) #define GC_NEEDED() (_gc_count >= 5) #define SHOULD_REPORT_EVENT(curthr, e) \ (curthr->report_events && \ (((curthr)->event_mask | _thread_event_mask ) & e) != 0) extern int __isthreaded; /* * Global variables for the pthread kernel. */ extern char *_usrstack __hidden; extern struct pthread *_thr_initial __hidden; /* For debugger */ extern int _libthr_debug; extern int _thread_event_mask; extern struct pthread *_thread_last_event; /* List of all threads: */ extern pthreadlist _thread_list; /* List of threads needing GC: */ extern pthreadlist _thread_gc_list __hidden; extern int _thread_active_threads; extern atfork_head _thr_atfork_list __hidden; extern struct urwlock _thr_atfork_lock __hidden; /* Default thread attributes: */ extern struct pthread_attr _pthread_attr_default __hidden; /* Default mutex attributes: */ extern struct pthread_mutex_attr _pthread_mutexattr_default __hidden; extern struct pthread_mutex_attr _pthread_mutexattr_adaptive_default __hidden; /* Default condition variable attributes: */ extern struct pthread_cond_attr _pthread_condattr_default __hidden; extern struct pthread_prio _thr_priorities[] __hidden; extern pid_t _thr_pid __hidden; extern int _thr_is_smp __hidden; extern size_t _thr_guard_default __hidden; extern size_t _thr_stack_default __hidden; extern size_t _thr_stack_initial __hidden; extern int _thr_page_size __hidden; extern int _thr_spinloops __hidden; extern int _thr_yieldloops __hidden; extern int _thr_queuefifo __hidden; /* Garbage thread count. */ extern int _gc_count __hidden; extern struct umutex _mutex_static_lock __hidden; extern struct umutex _cond_static_lock __hidden; extern struct umutex _rwlock_static_lock __hidden; extern struct umutex _keytable_lock __hidden; extern struct urwlock _thr_list_lock __hidden; extern struct umutex _thr_event_lock __hidden; extern struct umutex _suspend_all_lock __hidden; extern int _suspend_all_waiters __hidden; extern int _suspend_all_cycle __hidden; extern struct pthread *_single_thread __hidden; /* * Function prototype definitions. 
*/ __BEGIN_DECLS int _thr_setthreaded(int) __hidden; int _mutex_cv_lock(struct pthread_mutex *, int) __hidden; int _mutex_cv_unlock(struct pthread_mutex *, int *, int *) __hidden; int _mutex_cv_attach(struct pthread_mutex *, int) __hidden; int _mutex_cv_detach(struct pthread_mutex *, int *) __hidden; int _mutex_owned(struct pthread *, const struct pthread_mutex *) __hidden; int _mutex_reinit(pthread_mutex_t *) __hidden; void _mutex_fork(struct pthread *curthread) __hidden; void _libpthread_init(struct pthread *) __hidden; struct pthread *_thr_alloc(struct pthread *) __hidden; void _thread_exit(const char *, int, const char *) __hidden __dead2; int _thr_ref_add(struct pthread *, struct pthread *, int) __hidden; void _thr_ref_delete(struct pthread *, struct pthread *) __hidden; void _thr_ref_delete_unlocked(struct pthread *, struct pthread *) __hidden; int _thr_find_thread(struct pthread *, struct pthread *, int) __hidden; void _thr_rtld_init(void) __hidden; void _thr_rtld_postfork_child(void) __hidden; int _thr_stack_alloc(struct pthread_attr *) __hidden; void _thr_stack_free(struct pthread_attr *) __hidden; void _thr_free(struct pthread *, struct pthread *) __hidden; void _thr_gc(struct pthread *) __hidden; void _thread_cleanupspecific(void) __hidden; void _thread_printf(int, const char *, ...) __hidden; void _thr_spinlock_init(void) __hidden; void _thr_cancel_enter(struct pthread *) __hidden; void _thr_cancel_enter2(struct pthread *, int) __hidden; void _thr_cancel_leave(struct pthread *, int) __hidden; void _thr_testcancel(struct pthread *) __hidden; void _thr_signal_block(struct pthread *) __hidden; void _thr_signal_unblock(struct pthread *) __hidden; void _thr_signal_init(int) __hidden; void _thr_signal_deinit(void) __hidden; int _thr_send_sig(struct pthread *, int sig) __hidden; void _thr_list_init(void) __hidden; void _thr_hash_add(struct pthread *) __hidden; void _thr_hash_remove(struct pthread *) __hidden; struct pthread *_thr_hash_find(struct pthread *) __hidden; void _thr_link(struct pthread *, struct pthread *) __hidden; void _thr_unlink(struct pthread *, struct pthread *) __hidden; void _thr_assert_lock_level(void) __hidden __dead2; void _thr_ast(struct pthread *) __hidden; void _thr_once_init(void) __hidden; void _thr_report_creation(struct pthread *curthread, struct pthread *newthread) __hidden; void _thr_report_death(struct pthread *curthread) __hidden; int _thr_getscheduler(lwpid_t, int *, struct sched_param *) __hidden; int _thr_setscheduler(lwpid_t, int, const struct sched_param *) __hidden; void _thr_signal_prefork(void) __hidden; void _thr_signal_postfork(void) __hidden; void _thr_signal_postfork_child(void) __hidden; void _thr_suspend_all_lock(struct pthread *) __hidden; void _thr_suspend_all_unlock(struct pthread *) __hidden; void _thr_try_gc(struct pthread *, struct pthread *) __hidden; int _rtp_to_schedparam(const struct rtprio *rtp, int *policy, struct sched_param *param) __hidden; int _schedparam_to_rtp(int policy, const struct sched_param *param, struct rtprio *rtp) __hidden; void _thread_bp_create(void); void _thread_bp_death(void); int _sched_yield(void); void _pthread_cleanup_push(void (*)(void *), void *); void _pthread_cleanup_pop(int); void _pthread_exit_mask(void *status, sigset_t *mask) __dead2 __hidden; void _pthread_cancel_enter(int maycancel); void _pthread_cancel_leave(int maycancel); /* #include */ #ifdef _SYS_FCNTL_H_ int __sys_fcntl(int, int, ...); int __sys_openat(int, const char *, int, ...); #endif /* #include */ #ifdef _SIGNAL_H_ int 
__sys_kill(pid_t, int); int __sys_sigaction(int, const struct sigaction *, struct sigaction *); int __sys_sigpending(sigset_t *); int __sys_sigprocmask(int, const sigset_t *, sigset_t *); int __sys_sigsuspend(const sigset_t *); int __sys_sigreturn(const ucontext_t *); int __sys_sigaltstack(const struct sigaltstack *, struct sigaltstack *); int __sys_sigwait(const sigset_t *, int *); int __sys_sigtimedwait(const sigset_t *, siginfo_t *, const struct timespec *); int __sys_sigwaitinfo(const sigset_t *set, siginfo_t *info); #endif /* #include */ #ifdef _TIME_H_ int __sys_nanosleep(const struct timespec *, struct timespec *); #endif /* #include */ #ifdef _SYS_UCONTEXT_H_ int __sys_setcontext(const ucontext_t *ucp); int __sys_swapcontext(ucontext_t *oucp, const ucontext_t *ucp); #endif /* #include */ #ifdef _UNISTD_H_ int __sys_close(int); int __sys_fork(void); pid_t __sys_getpid(void); ssize_t __sys_read(int, void *, size_t); void __sys_exit(int); #endif static inline int _thr_isthreaded(void) { return (__isthreaded != 0); } static inline int _thr_is_inited(void) { return (_thr_initial != NULL); } static inline void _thr_check_init(void) { if (_thr_initial == NULL) _libpthread_init(NULL); } struct wake_addr *_thr_alloc_wake_addr(void); void _thr_release_wake_addr(struct wake_addr *); int _thr_sleep(struct pthread *, int, const struct timespec *); void _thr_wake_addr_init(void) __hidden; static inline void _thr_clear_wake(struct pthread *td) { td->wake_addr->value = 0; } static inline int _thr_is_woken(struct pthread *td) { return td->wake_addr->value != 0; } static inline void _thr_set_wake(unsigned int *waddr) { *waddr = 1; _thr_umtx_wake(waddr, INT_MAX, 0); } void _thr_wake_all(unsigned int *waddrs[], int) __hidden; static inline struct pthread * _sleepq_first(struct sleepqueue *sq) { return TAILQ_FIRST(&sq->sq_blocked); } void _sleepq_init(void) __hidden; struct sleepqueue *_sleepq_alloc(void) __hidden; void _sleepq_free(struct sleepqueue *) __hidden; void _sleepq_lock(void *) __hidden; void _sleepq_unlock(void *) __hidden; struct sleepqueue *_sleepq_lookup(void *) __hidden; void _sleepq_add(void *, struct pthread *) __hidden; int _sleepq_remove(struct sleepqueue *, struct pthread *) __hidden; void _sleepq_drop(struct sleepqueue *, void (*cb)(struct pthread *, void *arg), void *) __hidden; int _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex, void *(calloc_cb)(size_t, size_t)); struct dl_phdr_info; void __pthread_cxa_finalize(struct dl_phdr_info *phdr_info); void _thr_tsd_unload(struct dl_phdr_info *phdr_info) __hidden; void _thr_sigact_unload(struct dl_phdr_info *phdr_info) __hidden; void _thr_stack_fix_protection(struct pthread *thrd); int *__error_threaded(void) __hidden; void __thr_interpose_libc(void) __hidden; pid_t __thr_fork(void); int __thr_setcontext(const ucontext_t *ucp); int __thr_sigaction(int sig, const struct sigaction *act, struct sigaction *oact) __hidden; int __thr_sigprocmask(int how, const sigset_t *set, sigset_t *oset); int __thr_sigsuspend(const sigset_t * set); int __thr_sigtimedwait(const sigset_t *set, siginfo_t *info, const struct timespec * timeout); int __thr_sigwait(const sigset_t *set, int *sig); int __thr_sigwaitinfo(const sigset_t *set, siginfo_t *info); int __thr_swapcontext(ucontext_t *oucp, const ucontext_t *ucp); void __thr_map_stacks_exec(void); struct _spinlock; void __thr_spinunlock(struct _spinlock *lck); void __thr_spinlock(struct _spinlock *lck); struct tcb *_tcb_ctor(struct pthread *, int); void _tcb_dtor(struct tcb *); + +void 
__thr_pshared_init(void) __hidden; +void *__thr_pshared_offpage(void *key, int doalloc) __hidden; +void __thr_pshared_destroy(void *key) __hidden; __END_DECLS #endif /* !_THR_PRIVATE_H */ Index: head/lib/libthr/thread/thr_pshared.c =================================================================== --- head/lib/libthr/thread/thr_pshared.c (nonexistent) +++ head/lib/libthr/thread/thr_pshared.c (revision 296162) @@ -0,0 +1,223 @@ +/*- + * Copyright (c) 2015 The FreeBSD Foundation + * + * This software was developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include "namespace.h" +#include +#include "un-namespace.h" + +#include "thr_private.h" + +struct psh { + LIST_ENTRY(psh) link; + void *key; + void *val; +}; + +LIST_HEAD(pshared_hash_head, psh); +#define HASH_SIZE 128 +static struct pshared_hash_head pshared_hash[HASH_SIZE]; +#define PSHARED_KEY_HASH(key) (((unsigned long)(key) >> 8) % HASH_SIZE) +/* XXXKIB: lock could be split to per-hash chain, if appears contested */ +static struct urwlock pshared_lock = DEFAULT_URWLOCK; + +void +__thr_pshared_init(void) +{ + int i; + + _thr_urwlock_init(&pshared_lock); + for (i = 0; i < HASH_SIZE; i++) + LIST_INIT(&pshared_hash[i]); +} + +static void +pshared_rlock(struct pthread *curthread) +{ + + curthread->locklevel++; + _thr_rwl_rdlock(&pshared_lock); +} + +static void +pshared_wlock(struct pthread *curthread) +{ + + curthread->locklevel++; + _thr_rwl_wrlock(&pshared_lock); +} + +static void +pshared_unlock(struct pthread *curthread) +{ + + _thr_rwl_unlock(&pshared_lock); + curthread->locklevel--; + _thr_ast(curthread); +} + +static void +pshared_gc(struct pthread *curthread) +{ + struct pshared_hash_head *hd; + struct psh *h, *h1; + int error, i; + + pshared_wlock(curthread); + for (i = 0; i < HASH_SIZE; i++) { + hd = &pshared_hash[i]; + LIST_FOREACH_SAFE(h, hd, link, h1) { + error = _umtx_op(NULL, UMTX_OP_SHM, UMTX_SHM_ALIVE, + h->val, NULL); + if (error == 0) + continue; + LIST_REMOVE(h, link); + munmap(h->val, PAGE_SIZE); + free(h); + } + } + pshared_unlock(curthread); +} + +static void * +pshared_lookup(void *key) +{ + struct pshared_hash_head *hd; + struct psh *h; + + hd = &pshared_hash[PSHARED_KEY_HASH(key)]; + LIST_FOREACH(h, hd, link) { + if (h->key == key) + return (h->val); + } + return (NULL); +} + +static int +pshared_insert(void *key, void **val) +{ + struct pshared_hash_head *hd; + struct psh *h; + + hd = &pshared_hash[PSHARED_KEY_HASH(key)]; + LIST_FOREACH(h, hd, link) { + if (h->key == key) { + if (h->val != *val) { + munmap(*val, PAGE_SIZE); + *val = h->val; + } + return (1); + } + } + + h = malloc(sizeof(*h)); + if (h == NULL) + return (0); + h->key = key; + h->val = *val; + LIST_INSERT_HEAD(hd, h, link); + return (1); +} + +static void * +pshared_remove(void *key) +{ + struct pshared_hash_head *hd; + struct psh *h; + void *val; + + hd = &pshared_hash[PSHARED_KEY_HASH(key)]; + LIST_FOREACH(h, hd, link) { + if (h->key == key) { + LIST_REMOVE(h, link); + val = h->val; + free(h); + return (val); + } + } + return (NULL); +} + +static void +pshared_clean(void *key, void *val) +{ + + if (val != NULL) + munmap(val, PAGE_SIZE); + _umtx_op(NULL, UMTX_OP_SHM, UMTX_SHM_DESTROY, key, NULL); +} + +void * +__thr_pshared_offpage(void *key, int doalloc) +{ + struct pthread *curthread; + void *res; + int fd, ins_done; + + curthread = _get_curthread(); + pshared_rlock(curthread); + res = pshared_lookup(key); + pshared_unlock(curthread); + if (res != NULL) + return (res); + fd = _umtx_op(NULL, UMTX_OP_SHM, doalloc ? 
UMTX_SHM_CREAT : + UMTX_SHM_LOOKUP, key, NULL); + if (fd == -1) + return (NULL); + res = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + if (res == MAP_FAILED) + return (NULL); + pshared_wlock(curthread); + ins_done = pshared_insert(key, &res); + pshared_unlock(curthread); + if (!ins_done) { + pshared_clean(key, res); + res = NULL; + } + return (res); +} + +void +__thr_pshared_destroy(void *key) +{ + struct pthread *curthread; + void *val; + + curthread = _get_curthread(); + pshared_wlock(curthread); + val = pshared_remove(key); + pshared_unlock(curthread); + pshared_clean(key, val); + pshared_gc(curthread); +} Property changes on: head/lib/libthr/thread/thr_pshared.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/lib/libthr/thread/thr_rwlock.c =================================================================== --- head/lib/libthr/thread/thr_rwlock.c (revision 296161) +++ head/lib/libthr/thread/thr_rwlock.c (revision 296162) @@ -1,324 +1,348 @@ /*- * Copyright (c) 1998 Alex Nash * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include #include #include #include "namespace.h" #include #include "un-namespace.h" #include "thr_private.h" __weak_reference(_pthread_rwlock_destroy, pthread_rwlock_destroy); __weak_reference(_pthread_rwlock_init, pthread_rwlock_init); __weak_reference(_pthread_rwlock_rdlock, pthread_rwlock_rdlock); __weak_reference(_pthread_rwlock_timedrdlock, pthread_rwlock_timedrdlock); __weak_reference(_pthread_rwlock_tryrdlock, pthread_rwlock_tryrdlock); __weak_reference(_pthread_rwlock_trywrlock, pthread_rwlock_trywrlock); __weak_reference(_pthread_rwlock_unlock, pthread_rwlock_unlock); __weak_reference(_pthread_rwlock_wrlock, pthread_rwlock_wrlock); __weak_reference(_pthread_rwlock_timedwrlock, pthread_rwlock_timedwrlock); #define CHECK_AND_INIT_RWLOCK \ - if (__predict_false((prwlock = (*rwlock)) <= THR_RWLOCK_DESTROYED)) { \ + if (*rwlock == THR_PSHARED_PTR) { \ + prwlock = __thr_pshared_offpage(rwlock, 0); \ + if (prwlock == NULL) \ + return (EINVAL); \ + } else if (__predict_false((prwlock = (*rwlock)) <= \ + THR_RWLOCK_DESTROYED)) { \ if (prwlock == THR_RWLOCK_INITIALIZER) { \ int ret; \ ret = init_static(_get_curthread(), rwlock); \ if (ret) \ return (ret); \ } else if (prwlock == THR_RWLOCK_DESTROYED) { \ return (EINVAL); \ } \ prwlock = *rwlock; \ } /* * Prototypes */ static int -rwlock_init(pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr __unused) +rwlock_init(pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr) { pthread_rwlock_t prwlock; - prwlock = (pthread_rwlock_t)calloc(1, sizeof(struct pthread_rwlock)); - if (prwlock == NULL) - return (ENOMEM); - *rwlock = prwlock; + if (attr == NULL || *attr == NULL || + (*attr)->pshared == PTHREAD_PROCESS_PRIVATE) { + prwlock = calloc(1, sizeof(struct pthread_rwlock)); + if (prwlock == NULL) + return (ENOMEM); + *rwlock = prwlock; + } else { + prwlock = __thr_pshared_offpage(rwlock, 1); + if (prwlock == NULL) + return (EFAULT); + prwlock->lock.rw_flags |= USYNC_PROCESS_SHARED; + *rwlock = THR_PSHARED_PTR; + } return (0); } int _pthread_rwlock_destroy (pthread_rwlock_t *rwlock) { pthread_rwlock_t prwlock; int ret; prwlock = *rwlock; if (prwlock == THR_RWLOCK_INITIALIZER) ret = 0; else if (prwlock == THR_RWLOCK_DESTROYED) ret = EINVAL; - else { + else if (prwlock == THR_PSHARED_PTR) { *rwlock = THR_RWLOCK_DESTROYED; - + __thr_pshared_destroy(rwlock); + ret = 0; + } else { + *rwlock = THR_RWLOCK_DESTROYED; free(prwlock); ret = 0; } return (ret); } static int init_static(struct pthread *thread, pthread_rwlock_t *rwlock) { int ret; THR_LOCK_ACQUIRE(thread, &_rwlock_static_lock); if (*rwlock == THR_RWLOCK_INITIALIZER) ret = rwlock_init(rwlock, NULL); else ret = 0; THR_LOCK_RELEASE(thread, &_rwlock_static_lock); return (ret); } int -_pthread_rwlock_init (pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr) +_pthread_rwlock_init(pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr) { + *rwlock = NULL; return (rwlock_init(rwlock, attr)); } static int rwlock_rdlock_common(pthread_rwlock_t *rwlock, const struct timespec *abstime) { struct pthread *curthread = _get_curthread(); pthread_rwlock_t prwlock; int flags; int ret; CHECK_AND_INIT_RWLOCK if (curthread->rdlock_count) { /* * To avoid having to track all the rdlocks held by * a thread or all of the threads that hold a rdlock, * we keep a simple count of all the rdlocks held by * a thread. If a thread holds any rdlocks it is * possible that it is attempting to take a recursive * rdlock. 
If there are blocked writers and precedence * is given to them, then that would result in the thread * deadlocking. So allowing a thread to take the rdlock * when it already has one or more rdlocks avoids the * deadlock. I hope the reader can follow that logic ;-) */ flags = URWLOCK_PREFER_READER; } else { flags = 0; } /* * POSIX said the validity of the abstimeout parameter need * not be checked if the lock can be immediately acquired. */ ret = _thr_rwlock_tryrdlock(&prwlock->lock, flags); if (ret == 0) { curthread->rdlock_count++; return (ret); } if (__predict_false(abstime && (abstime->tv_nsec >= 1000000000 || abstime->tv_nsec < 0))) return (EINVAL); for (;;) { /* goto kernel and lock it */ ret = __thr_rwlock_rdlock(&prwlock->lock, flags, abstime); if (ret != EINTR) break; /* if interrupted, try to lock it in userland again. */ if (_thr_rwlock_tryrdlock(&prwlock->lock, flags) == 0) { ret = 0; break; } } if (ret == 0) curthread->rdlock_count++; return (ret); } int _pthread_rwlock_rdlock (pthread_rwlock_t *rwlock) { return (rwlock_rdlock_common(rwlock, NULL)); } int _pthread_rwlock_timedrdlock (pthread_rwlock_t *rwlock, const struct timespec *abstime) { return (rwlock_rdlock_common(rwlock, abstime)); } int _pthread_rwlock_tryrdlock (pthread_rwlock_t *rwlock) { struct pthread *curthread = _get_curthread(); pthread_rwlock_t prwlock; int flags; int ret; CHECK_AND_INIT_RWLOCK if (curthread->rdlock_count) { /* * To avoid having to track all the rdlocks held by * a thread or all of the threads that hold a rdlock, * we keep a simple count of all the rdlocks held by * a thread. If a thread holds any rdlocks it is * possible that it is attempting to take a recursive * rdlock. If there are blocked writers and precedence * is given to them, then that would result in the thread * deadlocking. So allowing a thread to take the rdlock * when it already has one or more rdlocks avoids the * deadlock. I hope the reader can follow that logic ;-) */ flags = URWLOCK_PREFER_READER; } else { flags = 0; } ret = _thr_rwlock_tryrdlock(&prwlock->lock, flags); if (ret == 0) curthread->rdlock_count++; return (ret); } int _pthread_rwlock_trywrlock (pthread_rwlock_t *rwlock) { struct pthread *curthread = _get_curthread(); pthread_rwlock_t prwlock; int ret; CHECK_AND_INIT_RWLOCK ret = _thr_rwlock_trywrlock(&prwlock->lock); if (ret == 0) - prwlock->owner = curthread; + prwlock->owner = TID(curthread); return (ret); } static int rwlock_wrlock_common (pthread_rwlock_t *rwlock, const struct timespec *abstime) { struct pthread *curthread = _get_curthread(); pthread_rwlock_t prwlock; int ret; CHECK_AND_INIT_RWLOCK /* * POSIX said the validity of the abstimeout parameter need * not be checked if the lock can be immediately acquired. */ ret = _thr_rwlock_trywrlock(&prwlock->lock); if (ret == 0) { - prwlock->owner = curthread; + prwlock->owner = TID(curthread); return (ret); } if (__predict_false(abstime && - (abstime->tv_nsec >= 1000000000 || abstime->tv_nsec < 0))) + (abstime->tv_nsec >= 1000000000 || abstime->tv_nsec < 0))) return (EINVAL); for (;;) { /* goto kernel and lock it */ ret = __thr_rwlock_wrlock(&prwlock->lock, abstime); if (ret == 0) { - prwlock->owner = curthread; + prwlock->owner = TID(curthread); break; } if (ret != EINTR) break; /* if interrupted, try to lock it in userland again. 
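	 * (Editor's note: EINTR means a signal interrupted the kernel
	 * sleep; retrying the userland trylock first picks up a
	 * release that happened in the meantime and avoids reentering
	 * the kernel needlessly.)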
*/ if (_thr_rwlock_trywrlock(&prwlock->lock) == 0) { ret = 0; - prwlock->owner = curthread; + prwlock->owner = TID(curthread); break; } } return (ret); } int _pthread_rwlock_wrlock (pthread_rwlock_t *rwlock) { return (rwlock_wrlock_common (rwlock, NULL)); } int _pthread_rwlock_timedwrlock (pthread_rwlock_t *rwlock, const struct timespec *abstime) { return (rwlock_wrlock_common (rwlock, abstime)); } int -_pthread_rwlock_unlock (pthread_rwlock_t *rwlock) +_pthread_rwlock_unlock(pthread_rwlock_t *rwlock) { struct pthread *curthread = _get_curthread(); pthread_rwlock_t prwlock; int ret; int32_t state; - prwlock = *rwlock; + if (*rwlock == THR_PSHARED_PTR) { + prwlock = __thr_pshared_offpage(rwlock, 0); + if (prwlock == NULL) + return (EINVAL); + } else { + prwlock = *rwlock; + } if (__predict_false(prwlock <= THR_RWLOCK_DESTROYED)) return (EINVAL); state = prwlock->lock.rw_state; if (state & URWLOCK_WRITE_OWNER) { - if (__predict_false(prwlock->owner != curthread)) + if (__predict_false(prwlock->owner != TID(curthread))) return (EPERM); - prwlock->owner = NULL; + prwlock->owner = 0; } ret = _thr_rwlock_unlock(&prwlock->lock); if (ret == 0 && (state & URWLOCK_WRITE_OWNER) == 0) curthread->rdlock_count--; return (ret); } Index: head/lib/libthr/thread/thr_rwlockattr.c =================================================================== --- head/lib/libthr/thread/thr_rwlockattr.c (revision 296161) +++ head/lib/libthr/thread/thr_rwlockattr.c (revision 296162) @@ -1,99 +1,91 @@ /*- * Copyright (c) 1998 Alex Nash * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include "namespace.h" #include #include #include #include "un-namespace.h" #include "thr_private.h" __weak_reference(_pthread_rwlockattr_destroy, pthread_rwlockattr_destroy); __weak_reference(_pthread_rwlockattr_getpshared, pthread_rwlockattr_getpshared); __weak_reference(_pthread_rwlockattr_init, pthread_rwlockattr_init); __weak_reference(_pthread_rwlockattr_setpshared, pthread_rwlockattr_setpshared); int _pthread_rwlockattr_destroy(pthread_rwlockattr_t *rwlockattr) { pthread_rwlockattr_t prwlockattr; if (rwlockattr == NULL) - return(EINVAL); - + return (EINVAL); prwlockattr = *rwlockattr; - if (prwlockattr == NULL) - return(EINVAL); - + return (EINVAL); free(prwlockattr); - - return(0); + return (0); } int _pthread_rwlockattr_getpshared(const pthread_rwlockattr_t *rwlockattr, - int *pshared) + int *pshared) { - *pshared = (*rwlockattr)->pshared; - return(0); + *pshared = (*rwlockattr)->pshared; + return (0); } int _pthread_rwlockattr_init(pthread_rwlockattr_t *rwlockattr) { pthread_rwlockattr_t prwlockattr; if (rwlockattr == NULL) - return(EINVAL); + return (EINVAL); - prwlockattr = (pthread_rwlockattr_t) - malloc(sizeof(struct pthread_rwlockattr)); - + prwlockattr = malloc(sizeof(struct pthread_rwlockattr)); if (prwlockattr == NULL) - return(ENOMEM); + return (ENOMEM); - prwlockattr->pshared = PTHREAD_PROCESS_PRIVATE; - *rwlockattr = prwlockattr; - - return(0); + prwlockattr->pshared = PTHREAD_PROCESS_PRIVATE; + *rwlockattr = prwlockattr; + return (0); } int _pthread_rwlockattr_setpshared(pthread_rwlockattr_t *rwlockattr, int pshared) { - /* Only PTHREAD_PROCESS_PRIVATE is supported. */ - if (pshared != PTHREAD_PROCESS_PRIVATE) - return(EINVAL); + if (pshared != PTHREAD_PROCESS_PRIVATE && + pshared != PTHREAD_PROCESS_SHARED) + return (EINVAL); (*rwlockattr)->pshared = pshared; - - return(0); + return (0); } Index: head/sys/kern/kern_resource.c =================================================================== --- head/sys/kern/kern_resource.c (revision 296161) +++ head/sys/kern/kern_resource.c (revision 296162) @@ -1,1434 +1,1441 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures"); static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures"); #define UIHASH(uid) (&uihashtbl[(uid) & uihash]) static struct rwlock uihashtbl_lock; static LIST_HEAD(uihashhead, uidinfo) *uihashtbl; static u_long uihash; /* size of hash table - 1 */ static void calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up, struct timeval *sp); static int donice(struct thread *td, struct proc *chgp, int n); static struct uidinfo *uilookup(uid_t uid); static void ruxagg_locked(struct rusage_ext *rux, struct thread *td); /* * Resource controls and accounting. */ #ifndef _SYS_SYSPROTO_H_ struct getpriority_args { int which; int who; }; #endif int sys_getpriority(struct thread *td, register struct getpriority_args *uap) { struct proc *p; struct pgrp *pg; int error, low; error = 0; low = PRIO_MAX + 1; switch (uap->which) { case PRIO_PROCESS: if (uap->who == 0) low = td->td_proc->p_nice; else { p = pfind(uap->who); if (p == NULL) break; if (p_cansee(td, p) == 0) low = p->p_nice; PROC_UNLOCK(p); } break; case PRIO_PGRP: sx_slock(&proctree_lock); if (uap->who == 0) { pg = td->td_proc->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(uap->who); if (pg == NULL) { sx_sunlock(&proctree_lock); break; } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p_cansee(td, p) == 0) { if (p->p_nice < low) low = p->p_nice; } PROC_UNLOCK(p); } PGRP_UNLOCK(pg); break; case PRIO_USER: if (uap->who == 0) uap->who = td->td_ucred->cr_uid; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p_cansee(td, p) == 0 && p->p_ucred->cr_uid == uap->who) { if (p->p_nice < low) low = p->p_nice; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); break; default: error = EINVAL; break; } if (low == PRIO_MAX + 1 && error == 0) error = ESRCH; td->td_retval[0] = low; return (error); } #ifndef _SYS_SYSPROTO_H_ struct setpriority_args { int which; int who; int prio; }; #endif int sys_setpriority(struct thread *td, struct setpriority_args *uap) { struct proc *curp, *p; struct pgrp *pg; int found = 0, error = 0; curp = td->td_proc; switch (uap->which) { case PRIO_PROCESS: if (uap->who == 0) { PROC_LOCK(curp); error = donice(td, curp, uap->prio); PROC_UNLOCK(curp); } else { p = pfind(uap->who); if (p == NULL) break; error = p_cansee(td, p); if (error == 0) error = donice(td, p, uap->prio); PROC_UNLOCK(p); } found++; break; case PRIO_PGRP: sx_slock(&proctree_lock); if (uap->who == 0) { pg = curp->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(uap->who); if (pg == NULL) { sx_sunlock(&proctree_lock); break; } } 
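[For PRIO_PGRP and PRIO_USER, sys_getpriority() above scans every matching process and reports the lowest, i.e. most favourable, nice value; a who argument of 0 means the caller's own process, process group, or uid. A short userspace sketch using the standard call; since -1 is a valid nice value, errno must be cleared before the call and tested afterwards.]

#include <sys/resource.h>
#include <errno.h>

/*
 * Usage sketch for the aggregation above: PRIO_PGRP with who == 0
 * reports the lowest nice value in the caller's process group.
 */
int
lowest_nice_in_pgrp(void)
{
	int nice;

	errno = 0;
	nice = getpriority(PRIO_PGRP, 0);
	if (nice == -1 && errno != 0)
		return (-1);		/* e.g. ESRCH */
	return (nice);
}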
sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p_cansee(td, p) == 0) { error = donice(td, p, uap->prio); found++; } PROC_UNLOCK(p); } PGRP_UNLOCK(pg); break; case PRIO_USER: if (uap->who == 0) uap->who = td->td_ucred->cr_uid; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p->p_ucred->cr_uid == uap->who && p_cansee(td, p) == 0) { error = donice(td, p, uap->prio); found++; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); break; default: error = EINVAL; break; } if (found == 0 && error == 0) error = ESRCH; return (error); } /* * Set "nice" for a (whole) process. */ static int donice(struct thread *td, struct proc *p, int n) { int error; PROC_LOCK_ASSERT(p, MA_OWNED); if ((error = p_cansched(td, p))) return (error); if (n > PRIO_MAX) n = PRIO_MAX; if (n < PRIO_MIN) n = PRIO_MIN; if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0) return (EACCES); sched_nice(p, n); return (0); } static int unprivileged_idprio; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_idprio, CTLFLAG_RW, &unprivileged_idprio, 0, "Allow non-root users to set an idle priority"); /* * Set realtime priority for LWP. */ #ifndef _SYS_SYSPROTO_H_ struct rtprio_thread_args { int function; lwpid_t lwpid; struct rtprio *rtp; }; #endif int sys_rtprio_thread(struct thread *td, struct rtprio_thread_args *uap) { struct proc *p; struct rtprio rtp; struct thread *td1; int cierror, error; /* Perform copyin before acquiring locks if needed. */ if (uap->function == RTP_SET) cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); else cierror = 0; if (uap->lwpid == 0 || uap->lwpid == td->td_tid) { p = td->td_proc; td1 = td; PROC_LOCK(p); } else { /* Only look up thread in current process */ td1 = tdfind(uap->lwpid, curproc->p_pid); if (td1 == NULL) return (ESRCH); p = td1->td_proc; } switch (uap->function) { case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; pri_to_rtp(td1, &rtp); PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: if ((error = p_cansched(td, p)) || (error = cierror)) break; /* Disallow setting rtprio in most cases if not superuser. */ /* * Realtime priority has to be restricted for reasons which * should be obvious. However, for idleprio processes, there is * a potential for system deadlock if an idleprio process gains * a lock on a resource that other processes need (and the * idleprio process can't run due to a CPU-bound normal * process). Fix me! XXX * * This problem is not only related to idleprio process. * A user level program can obtain a file lock and hold it * indefinitely. Additionally, without idleprio processes it is * still conceivable that a program with low priority will never * get to run. In short, allowing this feature might make it * easier to lock a resource indefinitely, but it is not the * only thing that makes it possible. */ if (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_REALTIME || (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_IDLE && unprivileged_idprio == 0)) { error = priv_check(td, PRIV_SCHED_RTPRIO); if (error) break; } error = rtp_to_pri(&rtp, td1); break; default: error = EINVAL; break; } PROC_UNLOCK(p); return (error); } /* * Set realtime priority. 
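[As the comments in sys_rtprio_thread() above explain, entering the realtime class, and by default the idle class too, requires PRIV_SCHED_RTPRIO unless the security.bsd.unprivileged_idprio sysctl is enabled. A minimal userspace sketch of the corresponding call; rtprio_thread(2) with lwpid 0 targets the calling thread.]

#include <sys/types.h>
#include <sys/rtprio.h>

/*
 * Usage sketch: move the calling thread into the idle class at the
 * weakest priority.  Per the checks above, this fails with EPERM for
 * unprivileged users unless unprivileged_idprio has been enabled.
 */
int
make_self_idle(void)
{
	struct rtprio rtp;

	rtp.type = RTP_PRIO_IDLE;
	rtp.prio = RTP_PRIO_MAX;
	return (rtprio_thread(RTP_SET, 0, &rtp));
}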
*/ #ifndef _SYS_SYSPROTO_H_ struct rtprio_args { int function; pid_t pid; struct rtprio *rtp; }; #endif int sys_rtprio(struct thread *td, register struct rtprio_args *uap) { struct proc *p; struct thread *tdp; struct rtprio rtp; int cierror, error; /* Perform copyin before acquiring locks if needed. */ if (uap->function == RTP_SET) cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); else cierror = 0; if (uap->pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); } switch (uap->function) { case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; /* * Return OUR priority if no pid specified, * or if one is, report the highest priority * in the process. There isn't much more you can do as * there is only room to return a single priority. * Note: specifying our own pid is not the same * as leaving it zero. */ if (uap->pid == 0) { pri_to_rtp(td, &rtp); } else { struct rtprio rtp2; rtp.type = RTP_PRIO_IDLE; rtp.prio = RTP_PRIO_MAX; FOREACH_THREAD_IN_PROC(p, tdp) { pri_to_rtp(tdp, &rtp2); if (rtp2.type < rtp.type || (rtp2.type == rtp.type && rtp2.prio < rtp.prio)) { rtp.type = rtp2.type; rtp.prio = rtp2.prio; } } } PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: if ((error = p_cansched(td, p)) || (error = cierror)) break; /* * Disallow setting rtprio in most cases if not superuser. * See the comment in sys_rtprio_thread about idprio * threads holding a lock. */ if (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_REALTIME || (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_IDLE && !unprivileged_idprio)) { error = priv_check(td, PRIV_SCHED_RTPRIO); if (error) break; } /* * If we are setting our own priority, set just our * thread but if we are doing another process, * do all the threads on that process. If we * specify our own pid we do the latter. 
*/ if (uap->pid == 0) { error = rtp_to_pri(&rtp, td); } else { FOREACH_THREAD_IN_PROC(p, td) { if ((error = rtp_to_pri(&rtp, td)) != 0) break; } } break; default: error = EINVAL; break; } PROC_UNLOCK(p); return (error); } int rtp_to_pri(struct rtprio *rtp, struct thread *td) { u_char newpri, oldclass, oldpri; switch (RTP_PRIO_BASE(rtp->type)) { case RTP_PRIO_REALTIME: if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); newpri = PRI_MIN_REALTIME + rtp->prio; break; case RTP_PRIO_NORMAL: if (rtp->prio > (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE)) return (EINVAL); newpri = PRI_MIN_TIMESHARE + rtp->prio; break; case RTP_PRIO_IDLE: if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); newpri = PRI_MIN_IDLE + rtp->prio; break; default: return (EINVAL); } thread_lock(td); oldclass = td->td_pri_class; sched_class(td, rtp->type); /* XXX fix */ oldpri = td->td_user_pri; sched_user_prio(td, newpri); if (td->td_user_pri != oldpri && (oldclass != RTP_PRIO_NORMAL || td->td_pri_class != RTP_PRIO_NORMAL)) sched_prio(td, td->td_user_pri); if (TD_ON_UPILOCK(td) && oldpri != newpri) { critical_enter(); thread_unlock(td); umtx_pi_adjust(td, oldpri); critical_exit(); } else thread_unlock(td); return (0); } void pri_to_rtp(struct thread *td, struct rtprio *rtp) { thread_lock(td); switch (PRI_BASE(td->td_pri_class)) { case PRI_REALTIME: rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME; break; case PRI_TIMESHARE: rtp->prio = td->td_base_user_pri - PRI_MIN_TIMESHARE; break; case PRI_IDLE: rtp->prio = td->td_base_user_pri - PRI_MIN_IDLE; break; default: break; } rtp->type = td->td_pri_class; thread_unlock(td); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osetrlimit_args { u_int which; struct orlimit *rlp; }; #endif int osetrlimit(struct thread *td, register struct osetrlimit_args *uap) { struct orlimit olim; struct rlimit lim; int error; if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit)))) return (error); lim.rlim_cur = olim.rlim_cur; lim.rlim_max = olim.rlim_max; error = kern_setrlimit(td, uap->which, &lim); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ogetrlimit_args { u_int which; struct orlimit *rlp; }; #endif int ogetrlimit(struct thread *td, register struct ogetrlimit_args *uap) { struct orlimit olim; struct rlimit rl; int error; if (uap->which >= RLIM_NLIMITS) return (EINVAL); lim_rlimit(td, uap->which, &rl); /* * XXX would be more correct to convert only RLIM_INFINITY to the * old RLIM_INFINITY and fail with EOVERFLOW for other larger * values. Most 64->32 and 32->16 conversions, including not * unimportant ones of uids are even more broken than what we * do here (they blindly truncate). We don't do this correctly * here since we have little experience with EOVERFLOW yet. * Elsewhere, getuid() can't fail... */ olim.rlim_cur = rl.rlim_cur > 0x7fffffff ? 0x7fffffff : rl.rlim_cur; olim.rlim_max = rl.rlim_max > 0x7fffffff ? 0x7fffffff : rl.rlim_max; error = copyout(&olim, uap->rlp, sizeof(olim)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct __setrlimit_args { u_int which; struct rlimit *rlp; }; #endif int sys_setrlimit(struct thread *td, register struct __setrlimit_args *uap) { struct rlimit alim; int error; if ((error = copyin(uap->rlp, &alim, sizeof(struct rlimit)))) return (error); error = kern_setrlimit(td, uap->which, &alim); return (error); } static void lim_cb(void *arg) { struct rlimit rlim; struct thread *td; struct proc *p; p = arg; PROC_LOCK_ASSERT(p, MA_OWNED); /* * Check if the process exceeds its cpu resource allocation. 
If * it reaches the max, arrange to kill the process in ast(). */ if (p->p_cpulimit == RLIM_INFINITY) return; PROC_STATLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { ruxagg(p, td); } PROC_STATUNLOCK(p); if (p->p_rux.rux_runtime > p->p_cpulimit * cpu_tickrate()) { lim_rlimit_proc(p, RLIMIT_CPU, &rlim); if (p->p_rux.rux_runtime >= rlim.rlim_max * cpu_tickrate()) { killproc(p, "exceeded maximum CPU limit"); } else { if (p->p_cpulimit < rlim.rlim_max) p->p_cpulimit += 5; kern_psignal(p, SIGXCPU); } } if ((p->p_flag & P_WEXIT) == 0) callout_reset_sbt(&p->p_limco, SBT_1S, 0, lim_cb, p, C_PREL(1)); } int kern_setrlimit(struct thread *td, u_int which, struct rlimit *limp) { return (kern_proc_setrlimit(td, td->td_proc, which, limp)); } int kern_proc_setrlimit(struct thread *td, struct proc *p, u_int which, struct rlimit *limp) { struct plimit *newlim, *oldlim; register struct rlimit *alimp; struct rlimit oldssiz; int error; if (which >= RLIM_NLIMITS) return (EINVAL); /* * Preserve historical bugs by treating negative limits as unsigned. */ if (limp->rlim_cur < 0) limp->rlim_cur = RLIM_INFINITY; if (limp->rlim_max < 0) limp->rlim_max = RLIM_INFINITY; oldssiz.rlim_cur = 0; newlim = lim_alloc(); PROC_LOCK(p); oldlim = p->p_limit; alimp = &oldlim->pl_rlimit[which]; if (limp->rlim_cur > alimp->rlim_max || limp->rlim_max > alimp->rlim_max) if ((error = priv_check(td, PRIV_PROC_SETRLIMIT))) { PROC_UNLOCK(p); lim_free(newlim); return (error); } if (limp->rlim_cur > limp->rlim_max) limp->rlim_cur = limp->rlim_max; lim_copy(newlim, oldlim); alimp = &newlim->pl_rlimit[which]; switch (which) { case RLIMIT_CPU: if (limp->rlim_cur != RLIM_INFINITY && p->p_cpulimit == RLIM_INFINITY) callout_reset_sbt(&p->p_limco, SBT_1S, 0, lim_cb, p, C_PREL(1)); p->p_cpulimit = limp->rlim_cur; break; case RLIMIT_DATA: if (limp->rlim_cur > maxdsiz) limp->rlim_cur = maxdsiz; if (limp->rlim_max > maxdsiz) limp->rlim_max = maxdsiz; break; case RLIMIT_STACK: if (limp->rlim_cur > maxssiz) limp->rlim_cur = maxssiz; if (limp->rlim_max > maxssiz) limp->rlim_max = maxssiz; oldssiz = *alimp; if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(&oldssiz, RLIMIT_STACK); break; case RLIMIT_NOFILE: if (limp->rlim_cur > maxfilesperproc) limp->rlim_cur = maxfilesperproc; if (limp->rlim_max > maxfilesperproc) limp->rlim_max = maxfilesperproc; break; case RLIMIT_NPROC: if (limp->rlim_cur > maxprocperuid) limp->rlim_cur = maxprocperuid; if (limp->rlim_max > maxprocperuid) limp->rlim_max = maxprocperuid; if (limp->rlim_cur < 1) limp->rlim_cur = 1; if (limp->rlim_max < 1) limp->rlim_max = 1; break; } if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(limp, which); *alimp = *limp; p->p_limit = newlim; PROC_UPDATE_COW(p); PROC_UNLOCK(p); lim_free(oldlim); if (which == RLIMIT_STACK && /* * Skip calls from exec_new_vmspace(), done when stack is * not mapped yet. */ (td != curthread || (p->p_flag & P_INEXEC) == 0)) { /* * Stack is allocated to the max at exec time with only * "rlim_cur" bytes accessible. If stack limit is going * up make more accessible, if going down make inaccessible. 
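[The lim_cb() callout above implements a graduated RLIMIT_CPU policy: once accumulated CPU time passes the soft limit the process gets SIGXCPU and p_cpulimit is stretched by 5 seconds at a time, so the signal repeats roughly every 5 CPU-seconds; reaching the hard limit kills the process outright. A userspace sketch that exercises this, using only standard calls.]

#include <sys/resource.h>
#include <signal.h>
#include <unistd.h>

static void
xcpu_handler(int sig)
{
	(void)sig;
	/* write(2) is async-signal-safe. */
	(void)write(2, "soft CPU limit exceeded\n", 24);
}

/* Arm a soft/hard CPU-time limit pair for the lim_cb() policy above. */
void
arm_cpu_limit(rlim_t soft, rlim_t hard)
{
	struct rlimit rl = { .rlim_cur = soft, .rlim_max = hard };

	(void)signal(SIGXCPU, xcpu_handler);
	(void)setrlimit(RLIMIT_CPU, &rl);
}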
*/ if (limp->rlim_cur != oldssiz.rlim_cur) { vm_offset_t addr; vm_size_t size; vm_prot_t prot; if (limp->rlim_cur > oldssiz.rlim_cur) { prot = p->p_sysent->sv_stackprot; size = limp->rlim_cur - oldssiz.rlim_cur; addr = p->p_sysent->sv_usrstack - limp->rlim_cur; } else { prot = VM_PROT_NONE; size = oldssiz.rlim_cur - limp->rlim_cur; addr = p->p_sysent->sv_usrstack - oldssiz.rlim_cur; } addr = trunc_page(addr); size = round_page(size); (void)vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot, FALSE); } } return (0); } #ifndef _SYS_SYSPROTO_H_ struct __getrlimit_args { u_int which; struct rlimit *rlp; }; #endif /* ARGSUSED */ int sys_getrlimit(struct thread *td, register struct __getrlimit_args *uap) { struct rlimit rlim; int error; if (uap->which >= RLIM_NLIMITS) return (EINVAL); lim_rlimit(td, uap->which, &rlim); error = copyout(&rlim, uap->rlp, sizeof(struct rlimit)); return (error); } /* * Transform the running time and tick information for children of proc p * into user and system time usage. */ void calccru(struct proc *p, struct timeval *up, struct timeval *sp) { PROC_LOCK_ASSERT(p, MA_OWNED); calcru1(p, &p->p_crux, up, sp); } /* * Transform the running time and tick information in proc p into user * and system time usage. If appropriate, include the current time slice * on this CPU. */ void calcru(struct proc *p, struct timeval *up, struct timeval *sp) { struct thread *td; uint64_t runtime, u; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_STATLOCK_ASSERT(p, MA_OWNED); /* * If we are getting stats for the current process, then add in the * stats that this thread has accumulated in its current time slice. * We reset the thread and CPU state as if we had performed a context * switch right here. */ td = curthread; if (td->td_proc == p) { u = cpu_ticks(); runtime = u - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, u); } /* Make sure the per-thread stats are current. */ FOREACH_THREAD_IN_PROC(p, td) { if (td->td_incruntime == 0) continue; ruxagg(p, td); } calcru1(p, &p->p_rux, up, sp); } /* Collect resource usage for a single thread. */ void rufetchtd(struct thread *td, struct rusage *ru) { struct proc *p; uint64_t runtime, u; p = td->td_proc; PROC_STATLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * If we are getting stats for the current thread, then add in the * stats that this thread has accumulated in its current time slice. * We reset the thread and CPU state as if we had performed a context * switch right here. */ if (td == curthread) { u = cpu_ticks(); runtime = u - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, u); } ruxagg(p, td); *ru = td->td_ru; calcru1(p, &td->td_rux, &ru->ru_utime, &ru->ru_stime); } static void calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up, struct timeval *sp) { /* {user, system, interrupt, total} {ticks, usec}: */ uint64_t ut, uu, st, su, it, tt, tu; ut = ruxp->rux_uticks; st = ruxp->rux_sticks; it = ruxp->rux_iticks; tt = ut + st + it; if (tt == 0) { /* Avoid divide by zero */ st = 1; tt = 1; } tu = cputick2usec(ruxp->rux_runtime); if ((int64_t)tu < 0) { /* XXX: this should be an assert /phk */ printf("calcru: negative runtime of %jd usec for pid %d (%s)\n", (intmax_t)tu, p->p_pid, p->p_comm); tu = ruxp->rux_tu; } if (tu >= ruxp->rux_tu) { /* * The normal case, time increased. * Enforce monotonicity of bucketed numbers. 
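[In compressed form, the bucketing calcru1() performs: the total runtime tu, in microseconds, is split between user and system time in proportion to the sampled statclock ticks, and each bucket is clamped so successive calls never report time running backwards. A simplified model; the backwards-clock special cases handled later in calcru1() are omitted.]

#include <stdint.h>

/*
 * Simplified model of calcru1(): split tu by tick ratio, keeping each
 * bucket monotonic.  Interrupt ticks (it) absorb the remainder.
 */
void
split_runtime(uint64_t tu, uint64_t ut, uint64_t st, uint64_t it,
    uint64_t *uu, uint64_t *su)
{
	uint64_t tt;

	tt = ut + st + it;
	if (tt == 0) {			/* avoid divide by zero */
		st = 1;
		tt = 1;
	}
	if ((tu * ut) / tt > *uu)
		*uu = (tu * ut) / tt;	/* monotonic user time */
	if ((tu * st) / tt > *su)
		*su = (tu * st) / tt;	/* monotonic system time */
}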
*/ uu = (tu * ut) / tt; if (uu < ruxp->rux_uu) uu = ruxp->rux_uu; su = (tu * st) / tt; if (su < ruxp->rux_su) su = ruxp->rux_su; } else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) { /* * When we calibrate the cputicker, it is not uncommon to * see the presumably fixed frequency increase slightly over * time as a result of thermal stabilization and NTP * discipline (of the reference clock). We therefore ignore * a bit of backwards slop because we expect to catch up * shortly. We use a 3 microsecond limit to catch low * counts and a 1% limit for high counts. */ uu = ruxp->rux_uu; su = ruxp->rux_su; tu = ruxp->rux_tu; } else { /* tu < ruxp->rux_tu */ /* * What happened here was likely that a laptop, which ran at * a reduced clock frequency at boot, kicked into high gear. * The wisdom of spamming this message in that case is * dubious, but it might also be indicative of something * serious, so lets keep it and hope laptops can be made * more truthful about their CPU speed via ACPI. */ printf("calcru: runtime went backwards from %ju usec " "to %ju usec for pid %d (%s)\n", (uintmax_t)ruxp->rux_tu, (uintmax_t)tu, p->p_pid, p->p_comm); uu = (tu * ut) / tt; su = (tu * st) / tt; } ruxp->rux_uu = uu; ruxp->rux_su = su; ruxp->rux_tu = tu; up->tv_sec = uu / 1000000; up->tv_usec = uu % 1000000; sp->tv_sec = su / 1000000; sp->tv_usec = su % 1000000; } #ifndef _SYS_SYSPROTO_H_ struct getrusage_args { int who; struct rusage *rusage; }; #endif int sys_getrusage(register struct thread *td, register struct getrusage_args *uap) { struct rusage ru; int error; error = kern_getrusage(td, uap->who, &ru); if (error == 0) error = copyout(&ru, uap->rusage, sizeof(struct rusage)); return (error); } int kern_getrusage(struct thread *td, int who, struct rusage *rup) { struct proc *p; int error; error = 0; p = td->td_proc; PROC_LOCK(p); switch (who) { case RUSAGE_SELF: rufetchcalc(p, rup, &rup->ru_utime, &rup->ru_stime); break; case RUSAGE_CHILDREN: *rup = p->p_stats->p_cru; calccru(p, &rup->ru_utime, &rup->ru_stime); break; case RUSAGE_THREAD: PROC_STATLOCK(p); thread_lock(td); rufetchtd(td, rup); thread_unlock(td); PROC_STATUNLOCK(p); break; default: error = EINVAL; } PROC_UNLOCK(p); return (error); } void rucollect(struct rusage *ru, struct rusage *ru2) { long *ip, *ip2; int i; if (ru->ru_maxrss < ru2->ru_maxrss) ru->ru_maxrss = ru2->ru_maxrss; ip = &ru->ru_first; ip2 = &ru2->ru_first; for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--) *ip++ += *ip2++; } void ruadd(struct rusage *ru, struct rusage_ext *rux, struct rusage *ru2, struct rusage_ext *rux2) { rux->rux_runtime += rux2->rux_runtime; rux->rux_uticks += rux2->rux_uticks; rux->rux_sticks += rux2->rux_sticks; rux->rux_iticks += rux2->rux_iticks; rux->rux_uu += rux2->rux_uu; rux->rux_su += rux2->rux_su; rux->rux_tu += rux2->rux_tu; rucollect(ru, ru2); } /* * Aggregate tick counts into the proc's rusage_ext. */ static void ruxagg_locked(struct rusage_ext *rux, struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); PROC_STATLOCK_ASSERT(td->td_proc, MA_OWNED); rux->rux_runtime += td->td_incruntime; rux->rux_uticks += td->td_uticks; rux->rux_sticks += td->td_sticks; rux->rux_iticks += td->td_iticks; } void ruxagg(struct proc *p, struct thread *td) { thread_lock(td); ruxagg_locked(&p->p_rux, td); ruxagg_locked(&td->td_rux, td); td->td_incruntime = 0; td->td_uticks = 0; td->td_iticks = 0; td->td_sticks = 0; thread_unlock(td); } /* * Update the rusage_ext structure and fetch a valid aggregate rusage * for proc p if storage for one is supplied. 
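[kern_getrusage() above dispatches on RUSAGE_SELF, RUSAGE_CHILDREN, and RUSAGE_THREAD; for the per-thread case the kernel first folds the thread's pending tick counts in through ruxagg()/rufetchtd(). Userspace usage is the ordinary getrusage(2) call:]

#include <sys/resource.h>

/*
 * Usage sketch: RUSAGE_THREAD reports the calling thread alone, after
 * the kernel aggregates its pending ticks as shown above.
 */
long
thread_user_usec(void)
{
	struct rusage ru;

	if (getrusage(RUSAGE_THREAD, &ru) != 0)
		return (-1);
	return (ru.ru_utime.tv_sec * 1000000L + ru.ru_utime.tv_usec);
}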
*/ void rufetch(struct proc *p, struct rusage *ru) { struct thread *td; PROC_STATLOCK_ASSERT(p, MA_OWNED); *ru = p->p_ru; if (p->p_numthreads > 0) { FOREACH_THREAD_IN_PROC(p, td) { ruxagg(p, td); rucollect(ru, &td->td_ru); } } } /* * Atomically perform a rufetch and a calcru together. * Consumers, can safely assume the calcru is executed only once * rufetch is completed. */ void rufetchcalc(struct proc *p, struct rusage *ru, struct timeval *up, struct timeval *sp) { PROC_STATLOCK(p); rufetch(p, ru); calcru(p, up, sp); PROC_STATUNLOCK(p); } /* * Allocate a new resource limits structure and initialize its * reference count and mutex pointer. */ struct plimit * lim_alloc() { struct plimit *limp; limp = malloc(sizeof(struct plimit), M_PLIMIT, M_WAITOK); refcount_init(&limp->pl_refcnt, 1); return (limp); } struct plimit * lim_hold(struct plimit *limp) { refcount_acquire(&limp->pl_refcnt); return (limp); } void lim_fork(struct proc *p1, struct proc *p2) { PROC_LOCK_ASSERT(p1, MA_OWNED); PROC_LOCK_ASSERT(p2, MA_OWNED); p2->p_limit = lim_hold(p1->p_limit); callout_init_mtx(&p2->p_limco, &p2->p_mtx, 0); if (p1->p_cpulimit != RLIM_INFINITY) callout_reset_sbt(&p2->p_limco, SBT_1S, 0, lim_cb, p2, C_PREL(1)); } void lim_free(struct plimit *limp) { if (refcount_release(&limp->pl_refcnt)) free((void *)limp, M_PLIMIT); } /* * Make a copy of the plimit structure. * We share these structures copy-on-write after fork. */ void lim_copy(struct plimit *dst, struct plimit *src) { KASSERT(dst->pl_refcnt <= 1, ("lim_copy to shared limit")); bcopy(src->pl_rlimit, dst->pl_rlimit, sizeof(src->pl_rlimit)); } /* * Return the hard limit for a particular system resource. The * which parameter specifies the index into the rlimit array. */ rlim_t lim_max(struct thread *td, int which) { struct rlimit rl; lim_rlimit(td, which, &rl); return (rl.rlim_max); } rlim_t lim_max_proc(struct proc *p, int which) { struct rlimit rl; lim_rlimit_proc(p, which, &rl); return (rl.rlim_max); } /* * Return the current (soft) limit for a particular system resource. * The which parameter which specifies the index into the rlimit array */ rlim_t lim_cur(struct thread *td, int which) { struct rlimit rl; lim_rlimit(td, which, &rl); return (rl.rlim_cur); } rlim_t lim_cur_proc(struct proc *p, int which) { struct rlimit rl; lim_rlimit_proc(p, which, &rl); return (rl.rlim_cur); } /* * Return a copy of the entire rlimit structure for the system limit * specified by 'which' in the rlimit structure pointed to by 'rlp'. */ void lim_rlimit(struct thread *td, int which, struct rlimit *rlp) { struct proc *p = td->td_proc; MPASS(td == curthread); KASSERT(which >= 0 && which < RLIM_NLIMITS, ("request for invalid resource limit")); *rlp = td->td_limit->pl_rlimit[which]; if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(rlp, which); } void lim_rlimit_proc(struct proc *p, int which, struct rlimit *rlp) { PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(which >= 0 && which < RLIM_NLIMITS, ("request for invalid resource limit")); *rlp = p->p_limit->pl_rlimit[which]; if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(rlp, which); } void uihashinit() { uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash); rw_init(&uihashtbl_lock, "uidinfo hash"); } /* * Look up a uidinfo struct for the parameter uid. * uihashtbl_lock must be locked. * Increase refcount on uidinfo struct returned. 
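[lim_alloc(), lim_hold(), lim_copy(), and lim_free() above implement copy-on-write sharing of the resource-limit table across fork(): children share one reference-counted plimit, and kern_proc_setrlimit() clones it before editing. A minimal model of the scheme; the kernel uses refcount(9) under the process lock, while this sketch uses C11 atomics and elides error handling.]

#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct limtab {
	atomic_int	refcnt;
	long		rlim[16];
};

struct limtab *
limtab_hold(struct limtab *l)
{
	atomic_fetch_add(&l->refcnt, 1);
	return (l);		/* what lim_fork() does for the child */
}

void
limtab_free(struct limtab *l)
{
	if (atomic_fetch_sub(&l->refcnt, 1) == 1)
		free(l);	/* last reference gone */
}

struct limtab *
limtab_set(struct limtab *old, int which, long val)
{
	struct limtab *new = malloc(sizeof(*new));

	memcpy(new->rlim, old->rlim, sizeof(new->rlim));
	atomic_init(&new->refcnt, 1);
	new->rlim[which] = val;	/* edit only the private copy */
	limtab_free(old);	/* drop the shared reference */
	return (new);
}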
*/ static struct uidinfo * uilookup(uid_t uid) { struct uihashhead *uipp; struct uidinfo *uip; rw_assert(&uihashtbl_lock, RA_LOCKED); uipp = UIHASH(uid); LIST_FOREACH(uip, uipp, ui_hash) if (uip->ui_uid == uid) { uihold(uip); break; } return (uip); } /* * Find or allocate a struct uidinfo for a particular uid. * Returns with uidinfo struct referenced. * uifree() should be called on a struct uidinfo when released. */ struct uidinfo * uifind(uid_t uid) { struct uidinfo *new_uip, *uip; rw_rlock(&uihashtbl_lock); uip = uilookup(uid); rw_runlock(&uihashtbl_lock); if (uip != NULL) return (uip); new_uip = malloc(sizeof(*new_uip), M_UIDINFO, M_WAITOK | M_ZERO); racct_create(&new_uip->ui_racct); refcount_init(&new_uip->ui_ref, 1); new_uip->ui_uid = uid; mtx_init(&new_uip->ui_vmsize_mtx, "ui_vmsize", NULL, MTX_DEF); rw_wlock(&uihashtbl_lock); /* * There's a chance someone created our uidinfo while we * were in malloc and not holding the lock, so we have to * make sure we don't insert a duplicate uidinfo. */ if ((uip = uilookup(uid)) == NULL) { LIST_INSERT_HEAD(UIHASH(uid), new_uip, ui_hash); rw_wunlock(&uihashtbl_lock); uip = new_uip; } else { rw_wunlock(&uihashtbl_lock); racct_destroy(&new_uip->ui_racct); mtx_destroy(&new_uip->ui_vmsize_mtx); free(new_uip, M_UIDINFO); } return (uip); } /* * Place another refcount on a uidinfo struct. */ void uihold(struct uidinfo *uip) { refcount_acquire(&uip->ui_ref); } /*- * Since uidinfo structs have a long lifetime, we use an * opportunistic refcounting scheme to avoid locking the lookup hash * for each release. * * If the refcount hits 0, we need to free the structure, * which means we need to lock the hash. * Optimal case: * After locking the struct and lowering the refcount, if we find * that we don't need to free, simply unlock and return. * Suboptimal case: * If refcount lowering results in need to free, bump the count * back up, lose the lock and acquire the locks in the proper * order to try again. */ void uifree(struct uidinfo *uip) { int old; /* Prepare for optimal case. */ old = uip->ui_ref; if (old > 1 && atomic_cmpset_int(&uip->ui_ref, old, old - 1)) return; /* Prepare for suboptimal case. */ rw_wlock(&uihashtbl_lock); if (refcount_release(&uip->ui_ref) == 0) { rw_wunlock(&uihashtbl_lock); return; } racct_destroy(&uip->ui_racct); LIST_REMOVE(uip, ui_hash); rw_wunlock(&uihashtbl_lock); if (uip->ui_sbsize != 0) printf("freeing uidinfo: uid = %d, sbsize = %ld\n", uip->ui_uid, uip->ui_sbsize); if (uip->ui_proccnt != 0) printf("freeing uidinfo: uid = %d, proccnt = %ld\n", uip->ui_uid, uip->ui_proccnt); if (uip->ui_vmsize != 0) printf("freeing uidinfo: uid = %d, swapuse = %lld\n", uip->ui_uid, (unsigned long long)uip->ui_vmsize); mtx_destroy(&uip->ui_vmsize_mtx); free(uip, M_UIDINFO); } #ifdef RACCT void ui_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void (*pre)(void), void (*post)(void), void *arg2, void *arg3) { struct uidinfo *uip; struct uihashhead *uih; rw_rlock(&uihashtbl_lock); if (pre != NULL) (pre)(); for (uih = &uihashtbl[uihash]; uih >= uihashtbl; uih--) { LIST_FOREACH(uip, uih, ui_hash) { (callback)(uip->ui_racct, arg2, arg3); } } if (post != NULL) (post)(); rw_runlock(&uihashtbl_lock); } #endif static inline int chglimit(struct uidinfo *uip, long *limit, int diff, rlim_t max, const char *name) { /* Don't allow them to exceed max, but allow subtraction. 
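[The uifree() comment above describes the scheme: the hash write lock is only needed when the final reference goes away, so an ordinary release is done with a lock-free compare-and-swap. A compact model of that fast path in C11 atomics; a return of 0 means the caller may be dropping the last reference and must take the write lock and retry, exactly as uifree() does.]

#include <stdatomic.h>

int
ref_release_fast(atomic_int *ref)
{
	int old;

	old = atomic_load(ref);
	while (old > 1) {
		/* On failure, old is reloaded and the loop retries. */
		if (atomic_compare_exchange_weak(ref, &old, old - 1))
			return (1);	/* released without locking */
	}
	return (0);
}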
*/ if (diff > 0 && max != 0) { if (atomic_fetchadd_long(limit, (long)diff) + diff > max) { atomic_subtract_long(limit, (long)diff); return (0); } } else { atomic_add_long(limit, (long)diff); if (*limit < 0) printf("negative %s for uid = %d\n", name, uip->ui_uid); } return (1); } /* * Change the count associated with number of processes * a given user is using. When 'max' is 0, don't enforce a limit */ int chgproccnt(struct uidinfo *uip, int diff, rlim_t max) { return (chglimit(uip, &uip->ui_proccnt, diff, max, "proccnt")); } /* * Change the total socket buffer size a user has used. */ int chgsbsize(struct uidinfo *uip, u_int *hiwat, u_int to, rlim_t max) { int diff, rv; diff = to - *hiwat; if (diff > 0 && max == 0) { rv = 0; } else { rv = chglimit(uip, &uip->ui_sbsize, diff, max, "sbsize"); if (rv != 0) *hiwat = to; } return (rv); } /* * Change the count associated with number of pseudo-terminals * a given user is using. When 'max' is 0, don't enforce a limit */ int chgptscnt(struct uidinfo *uip, int diff, rlim_t max) { return (chglimit(uip, &uip->ui_ptscnt, diff, max, "ptscnt")); } int chgkqcnt(struct uidinfo *uip, int diff, rlim_t max) { return (chglimit(uip, &uip->ui_kqcnt, diff, max, "kqcnt")); } + +int +chgumtxcnt(struct uidinfo *uip, int diff, rlim_t max) +{ + + return (chglimit(uip, &uip->ui_umtxcnt, diff, max, "umtxcnt")); +} Index: head/sys/kern/kern_umtx.c =================================================================== --- head/sys/kern/kern_umtx.c (revision 296161) +++ head/sys/kern/kern_umtx.c (revision 296162) @@ -1,3856 +1,4178 @@ /*- + * Copyright (c) 2015 The FreeBSD Foundation * Copyright (c) 2004, David Xu * Copyright (c) 2002, Jeffrey Roberson * All rights reserved. * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
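[chglimit() charges optimistically: it adds first, then backs the addition out if the post-add total overshoots max; max == 0 means unlimited, and decrements always succeed. The chgumtxcnt() wrapper added by this commit reuses the pattern for a new per-uid count of process-shared umtx objects. A standalone model with C11 atomics:]

#include <stdatomic.h>

int
charge(atomic_long *count, long diff, long max)
{

	if (diff > 0 && max != 0) {
		/* fetch_add returns the old value; old + diff is the total. */
		if (atomic_fetch_add(count, diff) + diff > max) {
			atomic_fetch_sub(count, diff);
			return (0);	/* over the limit, charge refused */
		}
		return (1);
	}
	atomic_fetch_add(count, diff);	/* subtraction always allowed */
	return (1);
}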
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_umtx_profiling.h" #include #include +#include +#include +#include #include #include #include +#include #include #include #include +#include +#include +#include #include #include #include #include #include #include #include #include +#include #include #include +#include + #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #endif #define _UMUTEX_TRY 1 #define _UMUTEX_WAIT 2 #ifdef UMTX_PROFILING #define UPROF_PERC_BIGGER(w, f, sw, sf) \ (((w) > (sw)) || ((w) == (sw) && (f) > (sf))) #endif /* Priority inheritance mutex info. */ struct umtx_pi { /* Owner thread */ struct thread *pi_owner; /* Reference count */ int pi_refcount; /* List entry to link umtx holding by thread */ TAILQ_ENTRY(umtx_pi) pi_link; /* List entry in hash */ TAILQ_ENTRY(umtx_pi) pi_hashlink; /* List for waiters */ TAILQ_HEAD(,umtx_q) pi_blocked; /* Identify a userland lock object */ struct umtx_key pi_key; }; /* A userland synchronous object user. */ struct umtx_q { /* Linked list for the hash. */ TAILQ_ENTRY(umtx_q) uq_link; /* Umtx key. */ struct umtx_key uq_key; /* Umtx flags. */ int uq_flags; #define UQF_UMTXQ 0x0001 /* The thread waits on. */ struct thread *uq_thread; /* * Blocked on PI mutex. read can use chain lock * or umtx_lock, write must have both chain lock and * umtx_lock being hold. */ struct umtx_pi *uq_pi_blocked; /* On blocked list */ TAILQ_ENTRY(umtx_q) uq_lockq; /* Thread contending with us */ TAILQ_HEAD(,umtx_pi) uq_pi_contested; /* Inherited priority from PP mutex */ u_char uq_inherited_pri; /* Spare queue ready to be reused */ struct umtxq_queue *uq_spare_queue; /* The queue we on */ struct umtxq_queue *uq_cur_queue; }; TAILQ_HEAD(umtxq_head, umtx_q); /* Per-key wait-queue */ struct umtxq_queue { struct umtxq_head head; struct umtx_key key; LIST_ENTRY(umtxq_queue) link; int length; }; LIST_HEAD(umtxq_list, umtxq_queue); /* Userland lock object's wait-queue chain */ struct umtxq_chain { /* Lock for this chain. */ struct mtx uc_lock; /* List of sleep queues. */ struct umtxq_list uc_queue[2]; #define UMTX_SHARED_QUEUE 0 #define UMTX_EXCLUSIVE_QUEUE 1 LIST_HEAD(, umtxq_queue) uc_spare_queue; /* Busy flag */ char uc_busy; /* Chain lock waiters */ int uc_waiters; /* All PI in the list */ TAILQ_HEAD(,umtx_pi) uc_pi_list; #ifdef UMTX_PROFILING u_int length; u_int max_length; #endif }; #define UMTXQ_LOCKED_ASSERT(uc) mtx_assert(&(uc)->uc_lock, MA_OWNED) /* * Don't propagate time-sharing priority, there is a security reason, * a user can simply introduce PI-mutex, let thread A lock the mutex, * and let another thread B block on the mutex, because B is * sleeping, its priority will be boosted, this causes A's priority to * be boosted via priority propagating too and will never be lowered even * if it is using 100%CPU, this is unfair to other processes. */ #define UPRI(td) (((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\ (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\ PRI_MAX_TIMESHARE : (td)->td_user_pri) #define GOLDEN_RATIO_PRIME 2654404609U #define UMTX_CHAINS 512 #define UMTX_SHIFTS (__WORD_BIT - 9) #define GET_SHARE(flags) \ (((flags) & USYNC_PROCESS_SHARED) == 0 ? 
THREAD_SHARE : PROCESS_SHARE) #define BUSY_SPINS 200 struct abs_timeout { int clockid; struct timespec cur; struct timespec end; }; static uma_zone_t umtx_pi_zone; static struct umtxq_chain umtxq_chains[2][UMTX_CHAINS]; static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory"); static int umtx_pi_allocated; static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug"); SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD, &umtx_pi_allocated, 0, "Allocated umtx_pi"); #ifdef UMTX_PROFILING static long max_length; SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length"); static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats"); #endif +static void umtx_shm_init(void); static void umtxq_sysinit(void *); static void umtxq_hash(struct umtx_key *key); static struct umtxq_chain *umtxq_getchain(struct umtx_key *key); static void umtxq_lock(struct umtx_key *key); static void umtxq_unlock(struct umtx_key *key); static void umtxq_busy(struct umtx_key *key); static void umtxq_unbusy(struct umtx_key *key); static void umtxq_insert_queue(struct umtx_q *uq, int q); static void umtxq_remove_queue(struct umtx_q *uq, int q); static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *); static int umtxq_count(struct umtx_key *key); static struct umtx_pi *umtx_pi_alloc(int); static void umtx_pi_free(struct umtx_pi *pi); static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags); static void umtx_thread_cleanup(struct thread *td); static void umtx_exec_hook(void *arg __unused, struct proc *p __unused, struct image_params *imgp __unused); SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL); #define umtxq_signal(key, nwake) umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE) #define umtxq_insert(uq) umtxq_insert_queue((uq), UMTX_SHARED_QUEUE) #define umtxq_remove(uq) umtxq_remove_queue((uq), UMTX_SHARED_QUEUE) static struct mtx umtx_lock; #ifdef UMTX_PROFILING static void umtx_init_profiling(void) { struct sysctl_oid *chain_oid; char chain_name[10]; int i; for (i = 0; i < UMTX_CHAINS; ++i) { snprintf(chain_name, sizeof(chain_name), "%d", i); chain_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO, chain_name, CTLFLAG_RD, NULL, "umtx hash stats"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL); } } static int sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS) { char buf[512]; struct sbuf sb; struct umtxq_chain *uc; u_int fract, i, j, tot, whole; u_int sf0, sf1, sf2, sf3, sf4; u_int si0, si1, si2, si3, si4; u_int sw0, sw1, sw2, sw3, sw4; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); for (i = 0; i < 2; i++) { tot = 0; for (j = 0; j < UMTX_CHAINS; ++j) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); tot += uc->max_length; mtx_unlock(&uc->uc_lock); } if (tot == 0) sbuf_printf(&sb, "%u) Empty ", i); else { sf0 = sf1 = sf2 = sf3 = sf4 = 0; si0 = si1 = si2 = si3 = si4 = 0; sw0 = sw1 = sw2 = sw3 = sw4 = 0; for (j = 0; j < UMTX_CHAINS; j++) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); whole = uc->max_length * 100; mtx_unlock(&uc->uc_lock); fract = (whole % tot) * 100; if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) { sf0 = fract; si0 = j; sw0 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw1, sf1)) { sf1 = fract; si1 = j; sw1 
= whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw2, sf2)) { sf2 = fract; si2 = j; sw2 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw3, sf3)) { sf3 = fract; si3 = j; sw3 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw4, sf4)) { sf4 = fract; si4 = j; sw4 = whole; } } sbuf_printf(&sb, "queue %u:\n", i); sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot, sf0 / tot, si0); sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot, sf1 / tot, si1); sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot, sf2 / tot, si2); sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot, sf3 / tot, si3); sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot, sf4 / tot, si4); } } sbuf_trim(&sb); sbuf_finish(&sb); sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (0); } static int sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS) { struct umtxq_chain *uc; u_int i, j; int clear, error; clear = 0; error = sysctl_handle_int(oidp, &clear, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (clear != 0) { for (i = 0; i < 2; ++i) { for (j = 0; j < UMTX_CHAINS; ++j) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); uc->length = 0; uc->max_length = 0; mtx_unlock(&uc->uc_lock); } } } return (0); } SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics"); SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length"); #endif static void umtxq_sysinit(void *arg __unused) { int i, j; umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); for (i = 0; i < 2; ++i) { for (j = 0; j < UMTX_CHAINS; ++j) { mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL, MTX_DEF | MTX_DUPOK); LIST_INIT(&umtxq_chains[i][j].uc_queue[0]); LIST_INIT(&umtxq_chains[i][j].uc_queue[1]); LIST_INIT(&umtxq_chains[i][j].uc_spare_queue); TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list); umtxq_chains[i][j].uc_busy = 0; umtxq_chains[i][j].uc_waiters = 0; #ifdef UMTX_PROFILING umtxq_chains[i][j].length = 0; umtxq_chains[i][j].max_length = 0; #endif } } #ifdef UMTX_PROFILING umtx_init_profiling(); #endif mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF); EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL, EVENTHANDLER_PRI_ANY); + umtx_shm_init(); } struct umtx_q * umtxq_alloc(void) { struct umtx_q *uq; uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO); uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO); TAILQ_INIT(&uq->uq_spare_queue->head); TAILQ_INIT(&uq->uq_pi_contested); uq->uq_inherited_pri = PRI_MAX; return (uq); } void umtxq_free(struct umtx_q *uq) { MPASS(uq->uq_spare_queue != NULL); free(uq->uq_spare_queue, M_UMTX); free(uq, M_UMTX); } static inline void umtxq_hash(struct umtx_key *key) { unsigned n = (uintptr_t)key->info.both.a + key->info.both.b; key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS; } static inline struct umtxq_chain * umtxq_getchain(struct umtx_key *key) { if (key->type <= TYPE_SEM) return (&umtxq_chains[1][key->hash]); return (&umtxq_chains[0][key->hash]); } /* * Lock a chain. */ static inline void umtxq_lock(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_lock(&uc->uc_lock); } /* * Unlock a chain. 
*/ static inline void umtxq_unlock(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_unlock(&uc->uc_lock); } /* * Set chain to busy state when following operation * may be blocked (kernel mutex can not be used). */ static inline void umtxq_busy(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_assert(&uc->uc_lock, MA_OWNED); if (uc->uc_busy) { #ifdef SMP if (smp_cpus > 1) { int count = BUSY_SPINS; if (count > 0) { umtxq_unlock(key); while (uc->uc_busy && --count > 0) cpu_spinwait(); umtxq_lock(key); } } #endif while (uc->uc_busy) { uc->uc_waiters++; msleep(uc, &uc->uc_lock, 0, "umtxqb", 0); uc->uc_waiters--; } } uc->uc_busy = 1; } /* * Unbusy a chain. */ static inline void umtxq_unbusy(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_assert(&uc->uc_lock, MA_OWNED); KASSERT(uc->uc_busy != 0, ("not busy")); uc->uc_busy = 0; if (uc->uc_waiters) wakeup_one(uc); } static inline void umtxq_unbusy_unlocked(struct umtx_key *key) { umtxq_lock(key); umtxq_unbusy(key); umtxq_unlock(key); } static struct umtxq_queue * umtxq_queue_lookup(struct umtx_key *key, int q) { struct umtxq_queue *uh; struct umtxq_chain *uc; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); LIST_FOREACH(uh, &uc->uc_queue[q], link) { if (umtx_key_match(&uh->key, key)) return (uh); } return (NULL); } static inline void umtxq_insert_queue(struct umtx_q *uq, int q) { struct umtxq_queue *uh; struct umtxq_chain *uc; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue")); uh = umtxq_queue_lookup(&uq->uq_key, q); if (uh != NULL) { LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link); } else { uh = uq->uq_spare_queue; uh->key = uq->uq_key; LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link); #ifdef UMTX_PROFILING uc->length++; if (uc->length > uc->max_length) { uc->max_length = uc->length; if (uc->max_length > max_length) max_length = uc->max_length; } #endif } uq->uq_spare_queue = NULL; TAILQ_INSERT_TAIL(&uh->head, uq, uq_link); uh->length++; uq->uq_flags |= UQF_UMTXQ; uq->uq_cur_queue = uh; return; } static inline void umtxq_remove_queue(struct umtx_q *uq, int q) { struct umtxq_chain *uc; struct umtxq_queue *uh; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); if (uq->uq_flags & UQF_UMTXQ) { uh = uq->uq_cur_queue; TAILQ_REMOVE(&uh->head, uq, uq_link); uh->length--; uq->uq_flags &= ~UQF_UMTXQ; if (TAILQ_EMPTY(&uh->head)) { KASSERT(uh->length == 0, ("inconsistent umtxq_queue length")); #ifdef UMTX_PROFILING uc->length--; #endif LIST_REMOVE(uh, link); } else { uh = LIST_FIRST(&uc->uc_spare_queue); KASSERT(uh != NULL, ("uc_spare_queue is empty")); LIST_REMOVE(uh, link); } uq->uq_spare_queue = uh; uq->uq_cur_queue = NULL; } } /* * Check if there are multiple waiters */ static int umtxq_count(struct umtx_key *key) { struct umtxq_chain *uc; struct umtxq_queue *uh; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); if (uh != NULL) return (uh->length); return (0); } /* * Check if there are multiple PI waiters and returns first * waiter. 
*/ static int umtxq_count_pi(struct umtx_key *key, struct umtx_q **first) { struct umtxq_chain *uc; struct umtxq_queue *uh; *first = NULL; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); if (uh != NULL) { *first = TAILQ_FIRST(&uh->head); return (uh->length); } return (0); } static int umtxq_check_susp(struct thread *td) { struct proc *p; int error; /* * The check for TDF_NEEDSUSPCHK is racy, but it is enough to * eventually break the lockstep loop. */ if ((td->td_flags & TDF_NEEDSUSPCHK) == 0) return (0); error = 0; p = td->td_proc; PROC_LOCK(p); if (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) { if (p->p_flag & P_SINGLE_EXIT) error = EINTR; else error = ERESTART; } PROC_UNLOCK(p); return (error); } /* * Wake up threads waiting on an userland object. */ static int umtxq_signal_queue(struct umtx_key *key, int n_wake, int q) { struct umtxq_chain *uc; struct umtxq_queue *uh; struct umtx_q *uq; int ret; ret = 0; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); uh = umtxq_queue_lookup(key, q); if (uh != NULL) { while ((uq = TAILQ_FIRST(&uh->head)) != NULL) { umtxq_remove_queue(uq, q); wakeup(uq); if (++ret >= n_wake) return (ret); } } return (ret); } /* * Wake up specified thread. */ static inline void umtxq_signal_thread(struct umtx_q *uq) { struct umtxq_chain *uc; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); umtxq_remove(uq); wakeup(uq); } static inline int tstohz(const struct timespec *tsp) { struct timeval tv; TIMESPEC_TO_TIMEVAL(&tv, tsp); return tvtohz(&tv); } static void abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute, const struct timespec *timeout) { timo->clockid = clockid; if (!absolute) { kern_clock_gettime(curthread, clockid, &timo->end); timo->cur = timo->end; timespecadd(&timo->end, timeout); } else { timo->end = *timeout; kern_clock_gettime(curthread, clockid, &timo->cur); } } static void abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime) { abs_timeout_init(timo, umtxtime->_clockid, (umtxtime->_flags & UMTX_ABSTIME) != 0, &umtxtime->_timeout); } static inline void abs_timeout_update(struct abs_timeout *timo) { kern_clock_gettime(curthread, timo->clockid, &timo->cur); } static int abs_timeout_gethz(struct abs_timeout *timo) { struct timespec tts; if (timespeccmp(&timo->end, &timo->cur, <=)) return (-1); tts = timo->end; timespecsub(&tts, &timo->cur); return (tstohz(&tts)); } /* * Put thread into sleep state, before sleeping, check if * thread was removed from umtx queue. */ static inline int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime) { struct umtxq_chain *uc; int error, timo; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); for (;;) { if (!(uq->uq_flags & UQF_UMTXQ)) return (0); if (abstime != NULL) { timo = abs_timeout_gethz(abstime); if (timo < 0) return (ETIMEDOUT); } else timo = 0; error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo); if (error != EWOULDBLOCK) { umtxq_lock(&uq->uq_key); break; } if (abstime != NULL) abs_timeout_update(abstime); umtxq_lock(&uq->uq_key); } return (error); } /* * Convert userspace address into unique logical address. 
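[The abs_timeout helpers above convert a relative timeout into an absolute deadline on the requested clock exactly once, so that a sleep interrupted by signals can recompute the remaining time instead of accumulating drift. A userspace model of the same bookkeeping:]

#include <time.h>

struct deadline {
	clockid_t	clockid;
	struct timespec	end;
};

void
deadline_init(struct deadline *d, clockid_t clockid, int absolute,
    const struct timespec *timeout)
{

	d->clockid = clockid;
	if (absolute) {
		d->end = *timeout;
		return;
	}
	(void)clock_gettime(clockid, &d->end);
	d->end.tv_sec += timeout->tv_sec;
	d->end.tv_nsec += timeout->tv_nsec;
	if (d->end.tv_nsec >= 1000000000L) {	/* normalize */
		d->end.tv_sec++;
		d->end.tv_nsec -= 1000000000L;
	}
}

int
deadline_expired(const struct deadline *d)
{
	struct timespec now;

	(void)clock_gettime(d->clockid, &now);
	return (now.tv_sec > d->end.tv_sec ||
	    (now.tv_sec == d->end.tv_sec && now.tv_nsec >= d->end.tv_nsec));
}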
*/ int umtx_key_get(const void *addr, int type, int share, struct umtx_key *key) { struct thread *td = curthread; vm_map_t map; vm_map_entry_t entry; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; key->type = type; if (share == THREAD_SHARE) { key->shared = 0; key->info.private.vs = td->td_proc->p_vmspace; key->info.private.addr = (uintptr_t)addr; } else { MPASS(share == PROCESS_SHARE || share == AUTO_SHARE); map = &td->td_proc->p_vmspace->vm_map; if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE, &entry, &key->info.shared.object, &pindex, &prot, &wired) != KERN_SUCCESS) { return (EFAULT); } if ((share == PROCESS_SHARE) || (share == AUTO_SHARE && VM_INHERIT_SHARE == entry->inheritance)) { key->shared = 1; key->info.shared.offset = (vm_offset_t)addr - entry->start + entry->offset; vm_object_reference(key->info.shared.object); } else { key->shared = 0; key->info.private.vs = td->td_proc->p_vmspace; key->info.private.addr = (uintptr_t)addr; } vm_map_lookup_done(map, entry); } umtxq_hash(key); return (0); } /* * Release key. */ void umtx_key_release(struct umtx_key *key) { if (key->shared) vm_object_deallocate(key->info.shared.object); } /* * Fetch and compare value, sleep on the address if value is not changed. */ static int do_wait(struct thread *td, void *addr, u_long id, struct _umtx_time *timeout, int compat32, int is_private) { struct abs_timeout timo; struct umtx_q *uq; u_long tmp; uint32_t tmp32; int error = 0; uq = td->td_umtxq; if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); if (compat32 == 0) { error = fueword(addr, &tmp); if (error != 0) error = EFAULT; } else { error = fueword32(addr, &tmp32); if (error == 0) tmp = tmp32; else error = EFAULT; } umtxq_lock(&uq->uq_key); if (error == 0) { if (tmp == id) error = umtxq_sleep(uq, "uwait", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else umtxq_remove(uq); } else if ((uq->uq_flags & UQF_UMTXQ) != 0) { umtxq_remove(uq); } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } /* * Wake up threads sleeping on the specified address. */ int kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private) { struct umtx_key key; int ret; if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0) return (ret); umtxq_lock(&key); umtxq_signal(&key, n_wake); umtxq_unlock(&key); umtx_key_release(&key); return (0); } /* * Lock PTHREAD_PRIO_NONE protocol POSIX mutex. */ static int do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int mode) { struct abs_timeout timo; struct umtx_q *uq; uint32_t owner, old, id; int error, rv; id = td->td_tid; uq = td->td_umtxq; error = 0; if (timeout != NULL) abs_timeout_init2(&timo, timeout); /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { rv = fueword32(&m->m_owner, &owner); if (rv == -1) return (EFAULT); if (mode == _UMUTEX_WAIT) { if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED) return (0); } else { /* * Try the uncontested case. This should be done in userland. */ rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id); /* The address was invalid. */ if (rv == -1) return (EFAULT); /* The acquire succeeded. 
*/ if (owner == UMUTEX_UNOWNED) return (0); /* If no one owns it but it is contested try to acquire it. */ if (owner == UMUTEX_CONTESTED) { rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) return (EFAULT); if (owner == UMUTEX_CONTESTED) return (0); rv = umtxq_check_susp(td); if (rv != 0) return (rv); /* If this failed the lock has changed, restart. */ continue; } } if (mode == _UMUTEX_TRY) return (EBUSY); /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) return (error); if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ rv = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_remove(uq); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); if (old == owner) error = umtxq_sleep(uq, "umtxn", timeout == NULL ? NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0) error = umtxq_check_susp(td); } return (0); } /* * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex. */ static int do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags) { struct umtx_key key; uint32_t owner, old, id; int error; int count; id = td->td_tid; /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); if ((owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED); if (error == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ error = casueword32(&m->m_owner, owner, &old, count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED); umtxq_lock(&key); umtxq_signal(&key,1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); if (error == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } /* * Check if the mutex is available and wake up a waiter, * only for simple mutex. 
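[do_lock_normal() and do_unlock_normal() above are the kernel half of a futex-style protocol: userland tries to CAS the owner word directly and only enters the kernel when that fails, at which point the kernel ORs in UMUTEX_CONTESTED so the eventual unlocker knows a waiter must be woken. A sketch of the userland half; sys_lock() and sys_unlock() are hypothetical stand-ins for the _umtx_op(2) slow paths.]

#include <stdatomic.h>
#include <stdint.h>

#define	UNOWNED		0u
#define	CONTESTED	0x80000000u	/* UMUTEX_CONTESTED analogue */

void	sys_lock(_Atomic uint32_t *m);		/* hypothetical */
void	sys_unlock(_Atomic uint32_t *m);	/* hypothetical */

void
mutex_lock(_Atomic uint32_t *m, uint32_t tid)
{
	uint32_t old = UNOWNED;

	/* Fast path: CAS UNOWNED -> tid; any other value forces the
	 * kernel, which sets CONTESTED and sleeps. */
	if (!atomic_compare_exchange_strong(m, &old, tid))
		sys_lock(m);
}

void
mutex_unlock(_Atomic uint32_t *m, uint32_t tid)
{
	uint32_t old = tid;

	/* Fails once CONTESTED is set (word is tid | CONTESTED), so a
	 * sleeping waiter is always woken via the slow path. */
	if (!atomic_compare_exchange_strong(m, &old, UNOWNED))
		sys_unlock(m);
}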
*/ static int do_wake_umutex(struct thread *td, struct umutex *m) { struct umtx_key key; uint32_t owner; uint32_t flags; int error; int count; error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != 0) return (0); error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); if (count <= 1) { error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, UMUTEX_UNOWNED); if (error == -1) error = EFAULT; } umtxq_lock(&key); if (error == 0 && count != 0 && (owner & ~UMUTEX_CONTESTED) == 0) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } /* * Check if the mutex has waiters and tries to fix contention bit. */ static int do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags) { struct umtx_key key; uint32_t owner, old; int type; int error; int count; switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) { case 0: type = TYPE_NORMAL_UMUTEX; break; case UMUTEX_PRIO_INHERIT: type = TYPE_PI_UMUTEX; break; case UMUTEX_PRIO_PROTECT: type = TYPE_PP_UMUTEX; break; default: return (EINVAL); } if ((error = umtx_key_get(m, type, GET_SHARE(flags), &key)) != 0) return (error); owner = 0; umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * Only repair contention bit if there is a waiter, this means the mutex * is still being referenced by userland code, otherwise don't update * any memory. */ if (count > 1) { error = fueword32(&m->m_owner, &owner); if (error == -1) error = EFAULT; while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); if (error == -1) { error = EFAULT; break; } if (old == owner) break; owner = old; error = umtxq_check_susp(td); if (error != 0) break; } } else if (count == 1) { error = fueword32(&m->m_owner, &owner); if (error == -1) error = EFAULT; while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 && (owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); if (error == -1) { error = EFAULT; break; } if (old == owner) break; owner = old; error = umtxq_check_susp(td); if (error != 0) break; } } umtxq_lock(&key); if (error == EFAULT) { umtxq_signal(&key, INT_MAX); } else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } static inline struct umtx_pi * umtx_pi_alloc(int flags) { struct umtx_pi *pi; pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags); TAILQ_INIT(&pi->pi_blocked); atomic_add_int(&umtx_pi_allocated, 1); return (pi); } static inline void umtx_pi_free(struct umtx_pi *pi) { uma_zfree(umtx_pi_zone, pi); atomic_add_int(&umtx_pi_allocated, -1); } /* * Adjust the thread's position on a pi_state after its priority has been * changed. */ static int umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td) { struct umtx_q *uq, *uq1, *uq2; struct thread *td1; mtx_assert(&umtx_lock, MA_OWNED); if (pi == NULL) return (0); uq = td->td_umtxq; /* * Check if the thread needs to be moved on the blocked chain. * It needs to be moved if either its priority is lower than * the previous thread or higher than the next thread. 
*/ uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq); uq2 = TAILQ_NEXT(uq, uq_lockq); if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) || (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) { /* * Remove thread from blocked chain and determine where * it should be moved to. */ TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq); TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) { td1 = uq1->uq_thread; MPASS(td1->td_proc->p_magic == P_MAGIC); if (UPRI(td1) > UPRI(td)) break; } if (uq1 == NULL) TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq); else TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq); } return (1); } static struct umtx_pi * umtx_pi_next(struct umtx_pi *pi) { struct umtx_q *uq_owner; if (pi->pi_owner == NULL) return (NULL); uq_owner = pi->pi_owner->td_umtxq; if (uq_owner == NULL) return (NULL); return (uq_owner->uq_pi_blocked); } /* * Floyd's Cycle-Finding Algorithm. */ static bool umtx_pi_check_loop(struct umtx_pi *pi) { struct umtx_pi *pi1; /* fast iterator */ mtx_assert(&umtx_lock, MA_OWNED); if (pi == NULL) return (false); pi1 = pi; for (;;) { pi = umtx_pi_next(pi); if (pi == NULL) break; pi1 = umtx_pi_next(pi1); if (pi1 == NULL) break; pi1 = umtx_pi_next(pi1); if (pi1 == NULL) break; if (pi == pi1) return (true); } return (false); } /* * Propagate priority when a thread is blocked on POSIX * PI mutex. */ static void umtx_propagate_priority(struct thread *td) { struct umtx_q *uq; struct umtx_pi *pi; int pri; mtx_assert(&umtx_lock, MA_OWNED); pri = UPRI(td); uq = td->td_umtxq; pi = uq->uq_pi_blocked; if (pi == NULL) return; if (umtx_pi_check_loop(pi)) return; for (;;) { td = pi->pi_owner; if (td == NULL || td == curthread) return; MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); thread_lock(td); if (td->td_lend_user_pri > pri) sched_lend_user_prio(td, pri); else { thread_unlock(td); break; } thread_unlock(td); /* * Pick up the lock that td is blocked on. */ uq = td->td_umtxq; pi = uq->uq_pi_blocked; if (pi == NULL) break; /* Resort td on the list if needed. */ umtx_pi_adjust_thread(pi, td); } } /* * Unpropagate priority for a PI mutex when a thread blocked on * it is interrupted by signal or resumed by others. */ static void umtx_repropagate_priority(struct umtx_pi *pi) { struct umtx_q *uq, *uq_owner; struct umtx_pi *pi2; int pri; mtx_assert(&umtx_lock, MA_OWNED); if (umtx_pi_check_loop(pi)) return; while (pi != NULL && pi->pi_owner != NULL) { pri = PRI_MAX; uq_owner = pi->pi_owner->td_umtxq; TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) { uq = TAILQ_FIRST(&pi2->pi_blocked); if (uq != NULL) { if (pri > UPRI(uq->uq_thread)) pri = UPRI(uq->uq_thread); } } if (pri > uq_owner->uq_inherited_pri) pri = uq_owner->uq_inherited_pri; thread_lock(pi->pi_owner); sched_lend_user_prio(pi->pi_owner, pri); thread_unlock(pi->pi_owner); if ((pi = uq_owner->uq_pi_blocked) != NULL) umtx_pi_adjust_thread(pi, uq_owner->uq_thread); } } /* * Insert a PI mutex into owned list. */ static void umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner) { struct umtx_q *uq_owner; uq_owner = owner->td_umtxq; mtx_assert(&umtx_lock, MA_OWNED); if (pi->pi_owner != NULL) panic("pi_owner != NULL"); pi->pi_owner = owner; TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link); } /* * Disown a PI mutex, and remove it from the owned list. */ static void umtx_pi_disown(struct umtx_pi *pi) { mtx_assert(&umtx_lock, MA_OWNED); TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link); pi->pi_owner = NULL; } /* * Claim ownership of a PI mutex. 
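 *
 * First, a self-contained rendition of the tortoise-and-hare walk that
 * umtx_pi_check_loop() above performs along the pi_owner -> uq_pi_blocked
 * edges; struct node and its next pointer are illustrative stand-ins for
 * that chain.
 */

struct node { struct node *next; };

static bool
node_chain_has_loop(struct node *slow)
{
        struct node *fast;              /* fast iterator, as pi1 above */

        if (slow == NULL)
                return (false);
        fast = slow;
        for (;;) {
                slow = slow->next;              /* one edge per round */
                if (slow == NULL)
                        break;
                fast = fast->next;              /* two edges per round */
                if (fast == NULL)
                        break;
                fast = fast->next;
                if (fast == NULL)
                        break;
                if (slow == fast)       /* the chain closed on itself */
                        return (true);
        }
        return (false);
}

/*
 * Claim ownership of a PI mutex.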
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
        struct umtx_q *uq;

        mtx_lock(&umtx_lock);
        if (pi->pi_owner == owner) {
                mtx_unlock(&umtx_lock);
                return (0);
        }

        if (pi->pi_owner != NULL) {
                /*
                 * Userland may have already messed up the mutex, sigh.
                 */
                mtx_unlock(&umtx_lock);
                return (EPERM);
        }
        umtx_pi_setowner(pi, owner);
        uq = TAILQ_FIRST(&pi->pi_blocked);
        if (uq != NULL) {
                int pri;

                pri = UPRI(uq->uq_thread);
                thread_lock(owner);
                if (pri < UPRI(owner))
                        sched_lend_user_prio(owner, pri);
                thread_unlock(owner);
        }
        mtx_unlock(&umtx_lock);
        return (0);
}

/*
 * Adjust a thread's position in the wait queue of the PI mutex it is
 * blocked on; this may set off a new round of priority propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
        struct umtx_q *uq;
        struct umtx_pi *pi;

        uq = td->td_umtxq;
        mtx_lock(&umtx_lock);
        /*
         * Pick up the lock that td is blocked on.
         */
        pi = uq->uq_pi_blocked;
        if (pi != NULL) {
                umtx_pi_adjust_thread(pi, td);
                umtx_repropagate_priority(pi);
        }
        mtx_unlock(&umtx_lock);
}

/*
 * Sleep on a PI mutex.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner,
    const char *wmesg, struct abs_timeout *timo)
{
        struct umtxq_chain *uc;
        struct thread *td, *td1;
        struct umtx_q *uq1;
        int pri;
        int error = 0;

        td = uq->uq_thread;
        KASSERT(td == curthread, ("inconsistent uq_thread"));
        uc = umtxq_getchain(&uq->uq_key);
        UMTXQ_LOCKED_ASSERT(uc);
        KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
        umtxq_insert(uq);
        mtx_lock(&umtx_lock);
        if (pi->pi_owner == NULL) {
                mtx_unlock(&umtx_lock);
                /* XXX Only look up thread in current process. */
                td1 = tdfind(owner, curproc->p_pid);
                mtx_lock(&umtx_lock);
                if (td1 != NULL) {
                        if (pi->pi_owner == NULL)
                                umtx_pi_setowner(pi, td1);
                        PROC_UNLOCK(td1->td_proc);
                }
        }

        TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
                pri = UPRI(uq1->uq_thread);
                if (pri > UPRI(td))
                        break;
        }

        if (uq1 != NULL)
                TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
        else
                TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

        uq->uq_pi_blocked = pi;
        thread_lock(td);
        td->td_flags |= TDF_UPIBLOCKED;
        thread_unlock(td);
        umtx_propagate_priority(td);
        mtx_unlock(&umtx_lock);
        umtxq_unbusy(&uq->uq_key);

        error = umtxq_sleep(uq, wmesg, timo);
        umtxq_remove(uq);

        mtx_lock(&umtx_lock);
        uq->uq_pi_blocked = NULL;
        thread_lock(td);
        td->td_flags &= ~TDF_UPIBLOCKED;
        thread_unlock(td);
        TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
        umtx_repropagate_priority(pi);
        mtx_unlock(&umtx_lock);
        umtxq_unlock(&uq->uq_key);

        return (error);
}

/*
 * Increment the reference count for a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
        struct umtxq_chain *uc;

        uc = umtxq_getchain(&pi->pi_key);
        UMTXQ_LOCKED_ASSERT(uc);
        pi->pi_refcount++;
}

/*
 * Decrease the reference count for a PI mutex; if the counter drops to
 * zero, its memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
        struct umtxq_chain *uc;

        uc = umtxq_getchain(&pi->pi_key);
        UMTXQ_LOCKED_ASSERT(uc);
        KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
        if (--pi->pi_refcount == 0) {
                mtx_lock(&umtx_lock);
                if (pi->pi_owner != NULL)
                        umtx_pi_disown(pi);
                KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
                        ("blocked queue not empty"));
                mtx_unlock(&umtx_lock);
                TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
                umtx_pi_free(pi);
        }
}

/*
 * Find a PI mutex in the hash table.
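 *
 * For orientation, the userland face of the PI machinery above (hedged:
 * this is the POSIX layer that FreeBSD's libthr builds on top of
 * do_lock_pi()/do_unlock_pi(); a mutex created this way carries
 * UMUTEX_PRIO_INHERIT in m_flags, so its contested paths land here):
 */

#include <pthread.h>

static int
make_pi_mutex(pthread_mutex_t *m)
{
        pthread_mutexattr_t attr;
        int error;

        error = pthread_mutexattr_init(&attr);
        if (error == 0)
                error = pthread_mutexattr_setprotocol(&attr,
                    PTHREAD_PRIO_INHERIT);
        if (error == 0)
                error = pthread_mutex_init(m, &attr);
        pthread_mutexattr_destroy(&attr);
        return (error);
}

/*
 * Find a PI mutex in the hash table.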
*/ static struct umtx_pi * umtx_pi_lookup(struct umtx_key *key) { struct umtxq_chain *uc; struct umtx_pi *pi; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) { if (umtx_key_match(&pi->pi_key, key)) { return (pi); } } return (NULL); } /* * Insert a PI mutex into hash table. */ static inline void umtx_pi_insert(struct umtx_pi *pi) { struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink); } /* * Lock a PI mutex. */ static int do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int try) { struct abs_timeout timo; struct umtx_q *uq; struct umtx_pi *pi, *new_pi; uint32_t id, owner, old; int error, rv; id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); pi = umtx_pi_lookup(&uq->uq_key); if (pi == NULL) { new_pi = umtx_pi_alloc(M_NOWAIT); if (new_pi == NULL) { umtxq_unlock(&uq->uq_key); new_pi = umtx_pi_alloc(M_WAITOK); umtxq_lock(&uq->uq_key); pi = umtx_pi_lookup(&uq->uq_key); if (pi != NULL) { umtx_pi_free(new_pi); new_pi = NULL; } } if (new_pi != NULL) { new_pi->pi_key = uq->uq_key; umtx_pi_insert(new_pi); pi = new_pi; } } umtx_pi_ref(pi); umtxq_unlock(&uq->uq_key); /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { /* * Try the uncontested case. This should be done in userland. */ rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } /* The acquire succeeded. */ if (owner == UMUTEX_UNOWNED) { error = 0; break; } /* If no one owns it but it is contested try to acquire it. */ if (owner == UMUTEX_CONTESTED) { rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } if (owner == UMUTEX_CONTESTED) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); error = umtx_pi_claim(pi, td); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); if (error != 0) { /* * Since we're going to return an * error, restore the m_owner to its * previous, unowned state to avoid * compounding the problem. */ (void)casuword32(&m->m_owner, id | UMUTEX_CONTESTED, UMUTEX_CONTESTED); } break; } error = umtxq_check_susp(td); if (error != 0) break; /* If this failed the lock has changed, restart. */ continue; } if ((owner & ~UMUTEX_CONTESTED) == id) { error = EDEADLK; break; } if (try != 0) { error = EBUSY; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ rv = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } umtxq_lock(&uq->uq_key); /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ if (old == owner) { error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED, "umtxpi", timeout == NULL ? 
NULL : &timo); if (error != 0) continue; } else { umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } error = umtxq_check_susp(td); if (error != 0) break; } umtxq_lock(&uq->uq_key); umtx_pi_unref(pi); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Unlock a PI mutex. */ static int do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags) { struct umtx_key key; struct umtx_q *uq_first, *uq_first2, *uq_me; struct umtx_pi *pi, *pi2; uint32_t owner, old, id; int error; int count; int pri; id = td->td_tid; /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); /* This should be done in userland */ if ((owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED); if (error == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count_pi(&key, &uq_first); if (uq_first != NULL) { mtx_lock(&umtx_lock); pi = uq_first->uq_pi_blocked; KASSERT(pi != NULL, ("pi == NULL?")); if (pi->pi_owner != td) { mtx_unlock(&umtx_lock); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); /* userland messed the mutex */ return (EPERM); } uq_me = td->td_umtxq; umtx_pi_disown(pi); /* get highest priority thread which is still sleeping. */ uq_first = TAILQ_FIRST(&pi->pi_blocked); while (uq_first != NULL && (uq_first->uq_flags & UQF_UMTXQ) == 0) { uq_first = TAILQ_NEXT(uq_first, uq_lockq); } pri = PRI_MAX; TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) { uq_first2 = TAILQ_FIRST(&pi2->pi_blocked); if (uq_first2 != NULL) { if (pri > UPRI(uq_first2->uq_thread)) pri = UPRI(uq_first2->uq_thread); } } thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); if (uq_first) umtxq_signal_thread(uq_first); } else { pi = umtx_pi_lookup(&key); /* * A umtx_pi can exist if a signal or timeout removed the * last waiter from the umtxq, but there is still * a thread in do_lock_pi() holding the umtx_pi. */ if (pi != NULL) { /* * The umtx_pi can be unowned, such as when a thread * has just entered do_lock_pi(), allocated the * umtx_pi, and unlocked the umtxq. * If the current thread owns it, it must disown it. */ mtx_lock(&umtx_lock); if (pi->pi_owner == td) umtx_pi_disown(pi); mtx_unlock(&umtx_lock); } } umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ error = casueword32(&m->m_owner, owner, &old, count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED); umtxq_unbusy_unlocked(&key); umtx_key_release(&key); if (error == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } /* * Lock a PP mutex. 
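 *
 * The ceiling arithmetic below is easy to misread, so here is the same
 * translation as a hedged standalone helper: m_ceilings[0] holds an
 * rtprio(2)-style value (0 is the strongest), which do_lock_pp() inverts
 * and rebases onto the kernel's global priority scale.  RTP_PRIO_MAX is
 * from <sys/rtprio.h>, PRI_MIN_REALTIME from <sys/priority.h>; the
 * helper name is illustrative.
 */

static int
pp_ceiling_to_global_pri(uint32_t ceiling)
{
        uint32_t inv;

        /* Invert; an out-of-range ceiling wraps above RTP_PRIO_MAX,
         * the same unsigned trick the EINVAL check below relies on. */
        inv = RTP_PRIO_MAX - ceiling;
        if (inv > RTP_PRIO_MAX)
                return (-1);    /* caller would map this to EINVAL */
        return (PRI_MIN_REALTIME + inv);
}

/*
 * Lock a PP mutex.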
*/ static int do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int try) { struct abs_timeout timo; struct umtx_q *uq, *uq2; struct umtx_pi *pi; uint32_t ceiling; uint32_t owner, id; int error, pri, old_inherited_pri, su, rv; id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0); for (;;) { old_inherited_pri = uq->uq_inherited_pri; umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); rv = fueword32(&m->m_ceilings[0], &ceiling); if (rv == -1) { error = EFAULT; goto out; } ceiling = RTP_PRIO_MAX - ceiling; if (ceiling > RTP_PRIO_MAX) { error = EINVAL; goto out; } mtx_lock(&umtx_lock); if (UPRI(td) < PRI_MIN_REALTIME + ceiling) { mtx_unlock(&umtx_lock); error = EINVAL; goto out; } if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) { uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling; thread_lock(td); if (uq->uq_inherited_pri < UPRI(td)) sched_lend_user_prio(td, uq->uq_inherited_pri); thread_unlock(td); } mtx_unlock(&umtx_lock); rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } if (owner == UMUTEX_CONTESTED) { error = 0; break; } if (try != 0) { error = EBUSY; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "umtxpp", timeout == NULL ? NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); mtx_lock(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } if (error != 0) { mtx_lock(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } out: umtxq_unbusy_unlocked(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Unlock a PP mutex. */ static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags) { struct umtx_key key; struct umtx_q *uq, *uq2; struct umtx_pi *pi; uint32_t owner, id; uint32_t rceiling; int error, pri, new_inherited_pri, su; id = td->td_tid; uq = td->td_umtxq; su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0); /* * Make sure we own this mtx. 
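         * (The m_owner word carries the owner's tid in its low bits, so
         * only the owning thread may drop a PP mutex; the rceiling value
         * read from m_ceilings[1] below then selects the priority to
         * restore, with (uint32_t)-1 meaning "no boost", i.e. PRI_MAX.)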
*/ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t)); if (error != 0) return (error); if (rceiling == -1) new_inherited_pri = PRI_MAX; else { rceiling = RTP_PRIO_MAX - rceiling; if (rceiling > RTP_PRIO_MAX) return (EINVAL); new_inherited_pri = PRI_MIN_REALTIME + rceiling; } if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); umtxq_unlock(&key); /* * For priority protected mutex, always set unlocked state * to UMUTEX_CONTESTED, so that userland always enters kernel * to lock the mutex, it is necessary because thread priority * has to be adjusted for such mutex. */ error = suword32(&m->m_owner, UMUTEX_CONTESTED); umtxq_lock(&key); if (error == 0) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); if (error == -1) error = EFAULT; else { mtx_lock(&umtx_lock); if (su != 0) uq->uq_inherited_pri = new_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } umtx_key_release(&key); return (error); } static int do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling, uint32_t *old_ceiling) { struct umtx_q *uq; uint32_t save_ceiling; uint32_t owner, id; uint32_t flags; int error, rv; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); if ((flags & UMUTEX_PRIO_PROTECT) == 0) return (EINVAL); if (ceiling > RTP_PRIO_MAX) return (EINVAL); id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); for (;;) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); rv = fueword32(&m->m_ceilings[0], &save_ceiling); if (rv == -1) { error = EFAULT; break; } rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); if (rv == -1) { error = EFAULT; break; } if (owner == UMUTEX_CONTESTED) { suword32(&m->m_ceilings[0], ceiling); suword32(&m->m_owner, UMUTEX_CONTESTED); error = 0; break; } if ((owner & ~UMUTEX_CONTESTED) == id) { suword32(&m->m_ceilings[0], ceiling); error = 0; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "umtxpp", NULL); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); } umtxq_lock(&uq->uq_key); if (error == 0) umtxq_signal(&uq->uq_key, INT_MAX); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0 && old_ceiling != NULL) suword32(old_ceiling, save_ceiling); return (error); } /* * Lock a userland POSIX mutex. 
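 *
 * do_set_ceiling() above is reachable from userland through the POSIX
 * wrapper shown below (hedged: libthr routes this through
 * UMTX_OP_SET_CEILING, and it only makes sense for PTHREAD_PRIO_PROTECT
 * mutexes; bump_ceiling() is an illustrative name):
 */

#include <pthread.h>
#include <stdio.h>

static void
bump_ceiling(pthread_mutex_t *m, int new_ceiling)
{
        int old;

        if (pthread_mutex_setprioceiling(m, new_ceiling, &old) == 0)
                printf("ceiling raised from %d to %d\n", old, new_ceiling);
}

/*
 * Lock a userland POSIX mutex.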
 */
static int
do_lock_umutex(struct thread *td, struct umutex *m,
    struct _umtx_time *timeout, int mode)
{
        uint32_t flags;
        int error;

        error = fueword32(&m->m_flags, &flags);
        if (error == -1)
                return (EFAULT);

        switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
        case 0:
                error = do_lock_normal(td, m, flags, timeout, mode);
                break;
        case UMUTEX_PRIO_INHERIT:
                error = do_lock_pi(td, m, flags, timeout, mode);
                break;
        case UMUTEX_PRIO_PROTECT:
                error = do_lock_pp(td, m, flags, timeout, mode);
                break;
        default:
                return (EINVAL);
        }
        if (timeout == NULL) {
                if (error == EINTR && mode != _UMUTEX_WAIT)
                        error = ERESTART;
        } else {
                /* Timed-locking is not restarted. */
                if (error == ERESTART)
                        error = EINTR;
        }
        return (error);
}

/*
 * Unlock a userland POSIX mutex.
 */
static int
do_unlock_umutex(struct thread *td, struct umutex *m)
{
        uint32_t flags;
        int error;

        error = fueword32(&m->m_flags, &flags);
        if (error == -1)
                return (EFAULT);

        switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
        case 0:
                return (do_unlock_normal(td, m, flags));
        case UMUTEX_PRIO_INHERIT:
                return (do_unlock_pi(td, m, flags));
        case UMUTEX_PRIO_PROTECT:
                return (do_unlock_pp(td, m, flags));
        }

        return (EINVAL);
}

static int
do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
    struct timespec *timeout, u_long wflags)
{
        struct abs_timeout timo;
        struct umtx_q *uq;
        uint32_t flags, clockid, hasw;
        int error;

        uq = td->td_umtxq;
        error = fueword32(&cv->c_flags, &flags);
        if (error == -1)
                return (EFAULT);
        error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
        if (error != 0)
                return (error);

        if ((wflags & CVWAIT_CLOCKID) != 0) {
                error = fueword32(&cv->c_clockid, &clockid);
                if (error == -1) {
                        umtx_key_release(&uq->uq_key);
                        return (EFAULT);
                }
                if (clockid < CLOCK_REALTIME ||
                    clockid >= CLOCK_THREAD_CPUTIME_ID) {
                        /* hmm, only HW clock id will work. */
                        umtx_key_release(&uq->uq_key);
                        return (EINVAL);
                }
        } else {
                clockid = CLOCK_REALTIME;
        }

        umtxq_lock(&uq->uq_key);
        umtxq_busy(&uq->uq_key);
        umtxq_insert(uq);
        umtxq_unlock(&uq->uq_key);

        /*
         * Set c_has_waiters to 1 before releasing the user mutex; also,
         * don't modify the cache line when it is unnecessary.
         */
        error = fueword32(&cv->c_has_waiters, &hasw);
        if (error == 0 && hasw == 0)
                suword32(&cv->c_has_waiters, 1);

        umtxq_unbusy_unlocked(&uq->uq_key);

        error = do_unlock_umutex(td, m);

        if (timeout != NULL)
                abs_timeout_init(&timo, clockid,
                    ((wflags & CVWAIT_ABSTIME) != 0), timeout);

        umtxq_lock(&uq->uq_key);
        if (error == 0) {
                error = umtxq_sleep(uq, "ucond", timeout == NULL ?
                    NULL : &timo);
        }

        if ((uq->uq_flags & UQF_UMTXQ) == 0)
                error = 0;
        else {
                /*
                 * This must be a timeout, an interruption by a signal, or
                 * a spurious wakeup; clear the c_has_waiters flag when
                 * necessary.
                 */
                umtxq_busy(&uq->uq_key);
                if ((uq->uq_flags & UQF_UMTXQ) != 0) {
                        int oldlen = uq->uq_cur_queue->length;
                        umtxq_remove(uq);
                        if (oldlen == 1) {
                                umtxq_unlock(&uq->uq_key);
                                suword32(&cv->c_has_waiters, 0);
                                umtxq_lock(&uq->uq_key);
                        }
                }
                umtxq_unbusy(&uq->uq_key);
                if (error == ERESTART)
                        error = EINTR;
        }

        umtxq_unlock(&uq->uq_key);
        umtx_key_release(&uq->uq_key);
        return (error);
}

/*
 * Signal a userland condition variable.
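 *
 * Before the signal side, a waiter-side sketch matching do_cv_wait()
 * above.  Hedged: libthr's real path adds cancellation and error
 * handling; a zero-filled struct ucond is assumed to be a valid idle
 * condvar, and umutex_lock() is the illustrative helper from the sketch
 * after do_unlock_normal().  The argument mapping mirrors
 * __umtx_op_cv_wait() later in this file: obj is the ucond, uaddr1 the
 * umutex, val the CVWAIT_* flags, uaddr2 the optional timespec.
 */

#include <sys/types.h>
#include <sys/umtx.h>

static int
ucond_wait(struct ucond *cv, struct umutex *m)
{
        int error;

        /* The kernel queues this thread, sets c_has_waiters and drops
         * the umutex; relocking after wakeup is userland's job. */
        error = _umtx_op(cv, UMTX_OP_CV_WAIT, 0, m, NULL);
        umutex_lock(m);
        return (error);
}

/*
 * Signal a userland condition variable.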
*/ static int do_cv_signal(struct thread *td, struct ucond *cv) { struct umtx_key key; int error, cnt, nwake; uint32_t flags; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); nwake = umtxq_signal(&key, 1); if (cnt <= nwake) { umtxq_unlock(&key); error = suword32(&cv->c_has_waiters, 0); if (error == -1) error = EFAULT; umtxq_lock(&key); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } static int do_cv_broadcast(struct thread *td, struct ucond *cv) { struct umtx_key key; int error; uint32_t flags; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); umtxq_signal(&key, INT_MAX); umtxq_unlock(&key); error = suword32(&cv->c_has_waiters, 0); if (error == -1) error = EFAULT; umtxq_unbusy_unlocked(&key); umtx_key_release(&key); return (error); } static int do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags, wrflags; int32_t state, oldstate; int32_t blocked_readers; int error, rv; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); wrflags = URWLOCK_WRITE_OWNER; if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER)) wrflags |= URWLOCK_WRITE_WAITERS; for (;;) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } /* try to lock it */ while (!(state & wrflags)) { if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) { umtx_key_release(&uq->uq_key); return (EAGAIN); } rv = casueword32(&rwlock->rw_state, state, &oldstate, state + 1); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (oldstate == state) { umtx_key_release(&uq->uq_key); return (0); } error = umtxq_check_susp(td); if (error != 0) break; state = oldstate; } if (error) break; /* grab monitor lock */ umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * re-read the state, in case it changed between the try-lock above * and the check below */ rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) error = EFAULT; /* set read contention bit */ while (error == 0 && (state & wrflags) && !(state & URWLOCK_READ_WAITERS)) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_READ_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) goto sleep; state = oldstate; error = umtxq_check_susp(td); if (error != 0) break; } if (error != 0) { umtxq_unbusy_unlocked(&uq->uq_key); break; } /* state is changed while setting flags, restart */ if (!(state & wrflags)) { umtxq_unbusy_unlocked(&uq->uq_key); error = umtxq_check_susp(td); if (error != 0) break; continue; } sleep: /* contention bit is set, before sleeping, increase read waiter count */ rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_readers, blocked_readers+1); while (state & wrflags) { umtxq_lock(&uq->uq_key); umtxq_insert(uq); 
umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "urdlck", timeout == NULL ? NULL : &timo); umtxq_busy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); if (error) break; rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { error = EFAULT; break; } } /* decrease read waiter count, and may clear read contention bit */ rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_readers, blocked_readers-1); if (blocked_readers == 1) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) error = EFAULT; while (error == 0) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_READ_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) break; state = oldstate; error = umtxq_check_susp(td); } } umtxq_unbusy_unlocked(&uq->uq_key); if (error != 0) break; } umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } static int do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags; int32_t state, oldstate; int32_t blocked_writers; int32_t blocked_readers; int error, rv; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); blocked_readers = 0; for (;;) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_WRITE_OWNER); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (oldstate == state) { umtx_key_release(&uq->uq_key); return (0); } state = oldstate; error = umtxq_check_susp(td); if (error != 0) break; } if (error) { if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) && blocked_readers != 0) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } break; } /* grab monitor lock */ umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * re-read the state, in case it changed between the try-lock above * and the check below */ rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) error = EFAULT; while (error == 0 && ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) && (state & URWLOCK_WRITE_WAITERS) == 0) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_WRITE_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) goto sleep; state = oldstate; error = umtxq_check_susp(td); if (error != 0) break; } if (error != 0) { umtxq_unbusy_unlocked(&uq->uq_key); break; } if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) { umtxq_unbusy_unlocked(&uq->uq_key); error = umtxq_check_susp(td); if (error != 0) break; continue; } sleep: rv = fueword32(&rwlock->rw_blocked_writers, &blocked_writers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_writers, blocked_writers+1); while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) { umtxq_lock(&uq->uq_key); umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE); 
umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "uwrlck", timeout == NULL ? NULL : &timo); umtxq_busy(&uq->uq_key); umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE); umtxq_unlock(&uq->uq_key); if (error) break; rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { error = EFAULT; break; } } rv = fueword32(&rwlock->rw_blocked_writers, &blocked_writers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_writers, blocked_writers-1); if (blocked_writers == 1) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_WRITE_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) break; state = oldstate; error = umtxq_check_susp(td); /* * We are leaving the URWLOCK_WRITE_WAITERS * behind, but this should not harm the * correctness. */ if (error != 0) break; } rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } } else blocked_readers = 0; umtxq_unbusy_unlocked(&uq->uq_key); } umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } static int do_rw_unlock(struct thread *td, struct urwlock *rwlock) { struct umtx_q *uq; uint32_t flags; int32_t state, oldstate; int error, rv, q, count; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); error = fueword32(&rwlock->rw_state, &state); if (error == -1) { error = EFAULT; goto out; } if (state & URWLOCK_WRITE_OWNER) { for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_WRITE_OWNER); if (rv == -1) { error = EFAULT; goto out; } if (oldstate != state) { state = oldstate; if (!(oldstate & URWLOCK_WRITE_OWNER)) { error = EPERM; goto out; } error = umtxq_check_susp(td); if (error != 0) goto out; } else break; } } else if (URWLOCK_READER_COUNT(state) != 0) { for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state - 1); if (rv == -1) { error = EFAULT; goto out; } if (oldstate != state) { state = oldstate; if (URWLOCK_READER_COUNT(oldstate) == 0) { error = EPERM; goto out; } error = umtxq_check_susp(td); if (error != 0) goto out; } else break; } } else { error = EPERM; goto out; } count = 0; if (!(flags & URWLOCK_PREFER_READER)) { if (state & URWLOCK_WRITE_WAITERS) { count = 1; q = UMTX_EXCLUSIVE_QUEUE; } else if (state & URWLOCK_READ_WAITERS) { count = INT_MAX; q = UMTX_SHARED_QUEUE; } } else { if (state & URWLOCK_READ_WAITERS) { count = INT_MAX; q = UMTX_SHARED_QUEUE; } else if (state & URWLOCK_WRITE_WAITERS) { count = 1; q = UMTX_EXCLUSIVE_QUEUE; } } if (count) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_signal_queue(&uq->uq_key, count, q); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } out: umtx_key_release(&uq->uq_key); return (error); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags, count, count1; int error, rv; uq = td->td_umtxq; error = fueword32(&sem->_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, 
timeout); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); rv = casueword32(&sem->_has_waiters, 0, &count1, 1); if (rv == 0) rv = fueword32(&sem->_count, &count); if (rv == -1 || count != 0) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (rv == -1 ? EFAULT : 0); } umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { umtxq_remove(uq); /* A relative timeout cannot be restarted. */ if (error == ERESTART && timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) error = EINTR; } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland semaphore. */ static int do_sem_wake(struct thread *td, struct _usem *sem) { struct umtx_key key; int error, cnt; uint32_t flags; error = fueword32(&sem->_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); if (cnt > 0) { umtxq_signal(&key, 1); /* * Check if count is greater than 0, this means the memory is * still being referenced by user code, so we can safely * update _has_waiters flag. */ if (cnt == 1) { umtxq_unlock(&key); error = suword32(&sem->_has_waiters, 0); umtxq_lock(&key); if (error == -1) error = EFAULT; } } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } #endif static int do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t count, flags; int error, rv; uq = td->td_umtxq; flags = fuword32(&sem->_flags); error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); rv = fueword32(&sem->_count, &count); if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } for (;;) { if (USEM_COUNT(count) != 0) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (0); } if (count == USEM_HAS_WAITERS) break; rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS); if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } if (count == 0) break; } umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { umtxq_remove(uq); /* A relative timeout cannot be restarted. */ if (error == ERESTART && timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) error = EINTR; } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland semaphore. 
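 *
 * A hedged userland counterpart to do_sem2_wait()/do_sem2_wake():
 * roughly what libc's sem_wait()/sem_post() reduce to on a struct
 * _usem2.  USEM_COUNT() and USEM_HAS_WAITERS come from <sys/umtx.h>;
 * the _Atomic casts on the volatile _count word are, again, a
 * sketch-level liberty.
 */

#include <sys/types.h>
#include <sys/umtx.h>
#include <stdatomic.h>

static void
usem2_wait(struct _usem2 *sem)
{
        uint32_t c;

        for (;;) {
                c = atomic_load((_Atomic uint32_t *)&sem->_count);
                /* Fast path: grab a token without entering the kernel. */
                while (USEM_COUNT(c) != 0) {
                        if (atomic_compare_exchange_weak(
                            (_Atomic uint32_t *)&sem->_count, &c, c - 1))
                                return;
                }
                /* The kernel sets USEM_HAS_WAITERS and sleeps while the
                 * count stays zero, exactly the loop in do_sem2_wait(). */
                (void)_umtx_op(sem, UMTX_OP_SEM2_WAIT, 0, NULL, NULL);
        }
}

static void
usem2_post(struct _usem2 *sem)
{
        uint32_t c;

        c = atomic_fetch_add((_Atomic uint32_t *)&sem->_count, 1);
        if ((c & USEM_HAS_WAITERS) != 0)
                (void)_umtx_op(sem, UMTX_OP_SEM2_WAKE, 0, NULL, NULL);
}

/*
 * Signal a userland semaphore.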
*/ static int do_sem2_wake(struct thread *td, struct _usem2 *sem) { struct umtx_key key; int error, cnt, rv; uint32_t count, flags; rv = fueword32(&sem->_flags, &flags); if (rv == -1) return (EFAULT); if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); if (cnt > 0) { umtxq_signal(&key, 1); /* * If this was the last sleeping thread, clear the waiters * flag in _count. */ if (cnt == 1) { umtxq_unlock(&key); rv = fueword32(&sem->_count, &count); while (rv != -1 && count & USEM_HAS_WAITERS) rv = casueword32(&sem->_count, count, &count, count & ~USEM_HAS_WAITERS); if (rv == -1) error = EFAULT; umtxq_lock(&key); } } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } inline int umtx_copyin_timeout(const void *addr, struct timespec *tsp) { int error; error = copyin(addr, tsp, sizeof(struct timespec)); if (error == 0) { if (tsp->tv_sec < 0 || tsp->tv_nsec >= 1000000000 || tsp->tv_nsec < 0) error = EINVAL; } return (error); } static inline int umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp) { int error; if (size <= sizeof(struct timespec)) { tp->_clockid = CLOCK_REALTIME; tp->_flags = 0; error = copyin(addr, &tp->_timeout, sizeof(struct timespec)); } else error = copyin(addr, tp, sizeof(struct _umtx_time)); if (error != 0) return (error); if (tp->_timeout.tv_sec < 0 || tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0) return (EINVAL); return (0); } static int __umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap) { return (EOPNOTSUPP); } static int __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout, *tm_p; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return do_wait(td, uap->obj, uap->val, tm_p, 0, 0); } static int __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout, *tm_p; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return do_wait(td, uap->obj, uap->val, tm_p, 1, 0); } static int __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return do_wait(td, uap->obj, uap->val, tm_p, 1, 1); } static int __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap) { return (kern_umtx_wake(td, uap->obj, uap->val, 0)); } #define BATCH_SIZE 128 static int __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap) { int count = uap->val; void *uaddrs[BATCH_SIZE]; char **upp = (char **)uap->obj; int tocopy; int error = 0; int i, pos = 0; while (count > 0) { tocopy = count; if (tocopy > BATCH_SIZE) tocopy = BATCH_SIZE; error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *)); if (error != 0) break; for (i = 0; i < tocopy; ++i) kern_umtx_wake(td, uaddrs[i], INT_MAX, 1); count -= tocopy; pos += tocopy; } return (error); } static int __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap) { return (kern_umtx_wake(td, uap->obj, uap->val, 1)); } static int __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap) { struct 
_umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return do_lock_umutex(td, uap->obj, tm_p, 0); } static int __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap) { return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY); } static int __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT); } static int __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap) { return do_wake_umutex(td, uap->obj); } static int __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap) { return do_unlock_umutex(td, uap->obj); } static int __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap) { return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1); } static int __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = umtx_copyin_timeout(uap->uaddr2, &timeout); if (error != 0) return (error); ts = &timeout; } return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val)); } static int __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap) { return do_cv_signal(td, uap->obj); } static int __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap) { return do_cv_broadcast(td, uap->obj); } static int __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_rdlock(td, uap->obj, uap->val, 0); } else { error = umtx_copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_rdlock(td, uap->obj, uap->val, &timeout); } return (error); } static int __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_wrlock(td, uap->obj, 0); } else { error = umtx_copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_wrlock(td, uap->obj, &timeout); } return (error); } static int __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap) { return do_rw_unlock(td, uap->obj); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). 
*/ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_sem_wait(td, uap->obj, tm_p)); } static int __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap) { return (do_sem_wake(td, uap->obj)); } #endif static int __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap) { return (do_wake2_umutex(td, uap->obj, uap->val)); } static int __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_sem2_wait(td, uap->obj, tm_p)); } static int __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap) { return (do_sem2_wake(td, uap->obj)); } +#define USHM_OBJ_UMTX(o) \ + ((struct umtx_shm_obj_list *)(&(o)->umtx_data)) + +#define USHMF_REG_LINKED 0x0001 +#define USHMF_OBJ_LINKED 0x0002 +struct umtx_shm_reg { + TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link; + LIST_ENTRY(umtx_shm_reg) ushm_obj_link; + struct umtx_key ushm_key; + struct ucred *ushm_cred; + struct shmfd *ushm_obj; + u_int ushm_refcnt; + u_int ushm_flags; +}; + +LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg); +TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg); + +static uma_zone_t umtx_shm_reg_zone; +static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS]; +static struct mtx umtx_shm_lock; +static struct umtx_shm_reg_head umtx_shm_reg_delfree = + TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree); + +static void umtx_shm_free_reg(struct umtx_shm_reg *reg); + +static void +umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused) +{ + struct umtx_shm_reg_head d; + struct umtx_shm_reg *reg, *reg1; + + TAILQ_INIT(&d); + mtx_lock(&umtx_shm_lock); + TAILQ_CONCAT(&d, &umtx_shm_reg_delfree, ushm_reg_link); + mtx_unlock(&umtx_shm_lock); + TAILQ_FOREACH_SAFE(reg, &d, ushm_reg_link, reg1) { + TAILQ_REMOVE(&d, reg, ushm_reg_link); + umtx_shm_free_reg(reg); + } +} + +static struct task umtx_shm_reg_delfree_task = + TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL); + +static struct umtx_shm_reg * +umtx_shm_find_reg_locked(const struct umtx_key *key) +{ + struct umtx_shm_reg *reg; + struct umtx_shm_reg_head *reg_head; + + KASSERT(key->shared, ("umtx_p_find_rg: private key")); + mtx_assert(&umtx_shm_lock, MA_OWNED); + reg_head = &umtx_shm_registry[key->hash]; + TAILQ_FOREACH(reg, reg_head, ushm_reg_link) { + KASSERT(reg->ushm_key.shared, + ("non-shared key on reg %p %d", reg, reg->ushm_key.shared)); + if (reg->ushm_key.info.shared.object == + key->info.shared.object && + reg->ushm_key.info.shared.offset == + key->info.shared.offset) { + KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM")); + KASSERT(reg->ushm_refcnt > 0, + ("reg %p refcnt 0 onlist", reg)); + KASSERT((reg->ushm_flags & USHMF_REG_LINKED) != 0, + ("reg %p not linked", reg)); + reg->ushm_refcnt++; + return (reg); + } + } + return (NULL); +} + +static struct umtx_shm_reg * +umtx_shm_find_reg(const struct umtx_key *key) +{ + struct umtx_shm_reg *reg; + + mtx_lock(&umtx_shm_lock); + reg = umtx_shm_find_reg_locked(key); + mtx_unlock(&umtx_shm_lock); + return (reg); +} + +static void +umtx_shm_free_reg(struct umtx_shm_reg *reg) +{ + + chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0); + crfree(reg->ushm_cred); + shm_drop(reg->ushm_obj); + 
uma_zfree(umtx_shm_reg_zone, reg); +} + +static bool +umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool force) +{ + bool res; + + mtx_assert(&umtx_shm_lock, MA_OWNED); + KASSERT(reg->ushm_refcnt > 0, ("ushm_reg %p refcnt 0", reg)); + reg->ushm_refcnt--; + res = reg->ushm_refcnt == 0; + if (res || force) { + if ((reg->ushm_flags & USHMF_REG_LINKED) != 0) { + TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash], + reg, ushm_reg_link); + reg->ushm_flags &= ~USHMF_REG_LINKED; + } + if ((reg->ushm_flags & USHMF_OBJ_LINKED) != 0) { + LIST_REMOVE(reg, ushm_obj_link); + reg->ushm_flags &= ~USHMF_OBJ_LINKED; + } + } + return (res); +} + +static void +umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool force) +{ + vm_object_t object; + bool dofree; + + if (force) { + object = reg->ushm_obj->shm_object; + VM_OBJECT_WLOCK(object); + object->flags |= OBJ_UMTXDEAD; + VM_OBJECT_WUNLOCK(object); + } + mtx_lock(&umtx_shm_lock); + dofree = umtx_shm_unref_reg_locked(reg, force); + mtx_unlock(&umtx_shm_lock); + if (dofree) + umtx_shm_free_reg(reg); +} + +void +umtx_shm_object_init(vm_object_t object) +{ + + LIST_INIT(USHM_OBJ_UMTX(object)); +} + +void +umtx_shm_object_terminated(vm_object_t object) +{ + struct umtx_shm_reg *reg, *reg1; + bool dofree; + + dofree = false; + mtx_lock(&umtx_shm_lock); + LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, reg1) { + if (umtx_shm_unref_reg_locked(reg, true)) { + TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg, + ushm_reg_link); + dofree = true; + } + } + mtx_unlock(&umtx_shm_lock); + if (dofree) + taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task); +} + +static int +umtx_shm_create_reg(struct thread *td, const struct umtx_key *key, + struct umtx_shm_reg **res) +{ + struct umtx_shm_reg *reg, *reg1; + struct ucred *cred; + int error; + + reg = umtx_shm_find_reg(key); + if (reg != NULL) { + *res = reg; + return (0); + } + cred = td->td_ucred; + if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP))) + return (ENOMEM); + reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO); + reg->ushm_refcnt = 1; + bcopy(key, ®->ushm_key, sizeof(*key)); + reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR); + reg->ushm_cred = crhold(cred); + error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE); + if (error != 0) { + umtx_shm_free_reg(reg); + return (error); + } + mtx_lock(&umtx_shm_lock); + reg1 = umtx_shm_find_reg_locked(key); + if (reg1 != NULL) { + mtx_unlock(&umtx_shm_lock); + umtx_shm_free_reg(reg); + *res = reg1; + return (0); + } + reg->ushm_refcnt++; + TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link); + LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg, + ushm_obj_link); + reg->ushm_flags = USHMF_REG_LINKED | USHMF_OBJ_LINKED; + mtx_unlock(&umtx_shm_lock); + *res = reg; + return (0); +} + +static int +umtx_shm_alive(struct thread *td, void *addr) +{ + vm_map_t map; + vm_map_entry_t entry; + vm_object_t object; + vm_pindex_t pindex; + vm_prot_t prot; + int res, ret; + boolean_t wired; + + map = &td->td_proc->p_vmspace->vm_map; + res = vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry, + &object, &pindex, &prot, &wired); + if (res != KERN_SUCCESS) + return (EFAULT); + if (object == NULL) + ret = EINVAL; + else + ret = (object->flags & OBJ_UMTXDEAD) != 0 ? 
ENOTTY : 0; + vm_map_lookup_done(map, entry); + return (ret); +} + +static void +umtx_shm_init(void) +{ + int i; + + umtx_shm_reg_zone = uma_zcreate("umtx_shm", sizeof(struct umtx_shm_reg), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF); + for (i = 0; i < nitems(umtx_shm_registry); i++) + TAILQ_INIT(&umtx_shm_registry[i]); +} + +static int +umtx_shm(struct thread *td, void *addr, u_int flags) +{ + struct umtx_key key; + struct umtx_shm_reg *reg; + struct file *fp; + int error, fd; + + if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP | + UMTX_SHM_DESTROY| UMTX_SHM_ALIVE)) != 1) + return (EINVAL); + if ((flags & UMTX_SHM_ALIVE) != 0) + return (umtx_shm_alive(td, addr)); + error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key); + if (error != 0) + return (error); + KASSERT(key.shared == 1, ("non-shared key")); + if ((flags & UMTX_SHM_CREAT) != 0) { + error = umtx_shm_create_reg(td, &key, ®); + } else { + reg = umtx_shm_find_reg(&key); + if (reg == NULL) + error = ESRCH; + } + umtx_key_release(&key); + if (error != 0) + return (error); + KASSERT(reg != NULL, ("no reg")); + if ((flags & UMTX_SHM_DESTROY) != 0) { + umtx_shm_unref_reg(reg, true); + } else { +#if 0 +#ifdef MAC + error = mac_posixshm_check_open(td->td_ucred, + reg->ushm_obj, FFLAGS(O_RDWR)); + if (error == 0) +#endif + error = shm_access(reg->ushm_obj, td->td_ucred, + FFLAGS(O_RDWR)); + if (error == 0) +#endif + error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL); + if (error == 0) { + shm_hold(reg->ushm_obj); + finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj, + &shm_ops); + td->td_retval[0] = fd; + fdrop(fp, td); + } + } + umtx_shm_unref_reg(reg, false); + return (error); +} + +static int +__umtx_op_shm(struct thread *td, struct _umtx_op_args *uap) +{ + + return (umtx_shm(td, uap->uaddr1, uap->val)); +} + typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap); static const _umtx_op_func op_table[] = { [UMTX_OP_RESERVED0] = __umtx_op_unimpl, [UMTX_OP_RESERVED1] = __umtx_op_unimpl, [UMTX_OP_WAIT] = __umtx_op_wait, [UMTX_OP_WAKE] = __umtx_op_wake, [UMTX_OP_MUTEX_TRYLOCK] = __umtx_op_trylock_umutex, [UMTX_OP_MUTEX_LOCK] = __umtx_op_lock_umutex, [UMTX_OP_MUTEX_UNLOCK] = __umtx_op_unlock_umutex, [UMTX_OP_SET_CEILING] = __umtx_op_set_ceiling, [UMTX_OP_CV_WAIT] = __umtx_op_cv_wait, [UMTX_OP_CV_SIGNAL] = __umtx_op_cv_signal, [UMTX_OP_CV_BROADCAST] = __umtx_op_cv_broadcast, [UMTX_OP_WAIT_UINT] = __umtx_op_wait_uint, [UMTX_OP_RW_RDLOCK] = __umtx_op_rw_rdlock, [UMTX_OP_RW_WRLOCK] = __umtx_op_rw_wrlock, [UMTX_OP_RW_UNLOCK] = __umtx_op_rw_unlock, [UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private, [UMTX_OP_WAKE_PRIVATE] = __umtx_op_wake_private, [UMTX_OP_MUTEX_WAIT] = __umtx_op_wait_umutex, [UMTX_OP_MUTEX_WAKE] = __umtx_op_wake_umutex, #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) [UMTX_OP_SEM_WAIT] = __umtx_op_sem_wait, [UMTX_OP_SEM_WAKE] = __umtx_op_sem_wake, #else [UMTX_OP_SEM_WAIT] = __umtx_op_unimpl, [UMTX_OP_SEM_WAKE] = __umtx_op_unimpl, #endif [UMTX_OP_NWAKE_PRIVATE] = __umtx_op_nwake_private, [UMTX_OP_MUTEX_WAKE2] = __umtx_op_wake2_umutex, [UMTX_OP_SEM2_WAIT] = __umtx_op_sem2_wait, [UMTX_OP_SEM2_WAKE] = __umtx_op_sem2_wake, + [UMTX_OP_SHM] = __umtx_op_shm, }; int sys__umtx_op(struct thread *td, struct _umtx_op_args *uap) { if ((unsigned)uap->op < nitems(op_table)) return (*op_table[uap->op])(td, uap); return (EINVAL); } #ifdef COMPAT_FREEBSD32 struct timespec32 { int32_t tv_sec; int32_t tv_nsec; }; struct umtx_time32 { struct 
timespec32 timeout; uint32_t flags; uint32_t clockid; }; static inline int umtx_copyin_timeout32(void *addr, struct timespec *tsp) { struct timespec32 ts32; int error; error = copyin(addr, &ts32, sizeof(struct timespec32)); if (error == 0) { if (ts32.tv_sec < 0 || ts32.tv_nsec >= 1000000000 || ts32.tv_nsec < 0) error = EINVAL; else { tsp->tv_sec = ts32.tv_sec; tsp->tv_nsec = ts32.tv_nsec; } } return (error); } static inline int umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp) { struct umtx_time32 t32; int error; t32.clockid = CLOCK_REALTIME; t32.flags = 0; if (size <= sizeof(struct timespec32)) error = copyin(addr, &t32.timeout, sizeof(struct timespec32)); else error = copyin(addr, &t32, sizeof(struct umtx_time32)); if (error != 0) return (error); if (t32.timeout.tv_sec < 0 || t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0) return (EINVAL); tp->_timeout.tv_sec = t32.timeout.tv_sec; tp->_timeout.tv_nsec = t32.timeout.tv_nsec; tp->_flags = t32.flags; tp->_clockid = t32.clockid; return (0); } static int __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return do_wait(td, uap->obj, uap->val, tm_p, 1, 0); } static int __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return do_lock_umutex(td, uap->obj, tm_p, 0); } static int __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT); } static int __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = umtx_copyin_timeout32(uap->uaddr2, &timeout); if (error != 0) return (error); ts = &timeout; } return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val)); } static int __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_rdlock(td, uap->obj, uap->val, 0); } else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_rdlock(td, uap->obj, uap->val, &timeout); } return (error); } static int __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). 
 */
        if (uap->uaddr2 == NULL) {
                error = do_rw_wrlock(td, uap->obj, 0);
        } else {
                error = umtx_copyin_umtx_time32(uap->uaddr2,
                    (size_t)uap->uaddr1, &timeout);
                if (error != 0)
                        return (error);
                error = do_rw_wrlock(td, uap->obj, &timeout);
        }
        return (error);
}

static int
__umtx_op_wait_uint_private_compat32(struct thread *td,
    struct _umtx_op_args *uap)
{
        struct _umtx_time *tm_p, timeout;
        int error;

        if (uap->uaddr2 == NULL)
                tm_p = NULL;
        else {
                error = umtx_copyin_umtx_time32(uap->uaddr2,
                    (size_t)uap->uaddr1, &timeout);
                if (error != 0)
                        return (error);
                tm_p = &timeout;
        }
        return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
}

#if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
static int
__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
{
        struct _umtx_time *tm_p, timeout;
        int error;

        /* Allow a null timespec (wait forever). */
        if (uap->uaddr2 == NULL)
                tm_p = NULL;
        else {
                error = umtx_copyin_umtx_time32(uap->uaddr2,
                    (size_t)uap->uaddr1, &timeout);
                if (error != 0)
                        return (error);
                tm_p = &timeout;
        }
        return (do_sem_wait(td, uap->obj, tm_p));
}
#endif

static int
__umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
{
        struct _umtx_time *tm_p, timeout;
        int error;

        /* Allow a null timespec (wait forever). */
        if (uap->uaddr2 == NULL)
                tm_p = NULL;
        else {
                error = umtx_copyin_umtx_time32(uap->uaddr2,
                    (size_t)uap->uaddr1, &timeout);
                if (error != 0)
                        return (error);
                tm_p = &timeout;
        }
        return (do_sem2_wait(td, uap->obj, tm_p));
}

static int
__umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
{
        int count = uap->val;
        uint32_t uaddrs[BATCH_SIZE];
        uint32_t **upp = (uint32_t **)uap->obj;
        int tocopy;
        int error = 0;
        int i, pos = 0;

        while (count > 0) {
                tocopy = count;
                if (tocopy > BATCH_SIZE)
                        tocopy = BATCH_SIZE;
                error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
                if (error != 0)
                        break;
                for (i = 0; i < tocopy; ++i)
                        kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
                            INT_MAX, 1);
                count -= tocopy;
                pos += tocopy;
        }
        return (error);
}

static const _umtx_op_func op_table_compat32[] = {
        [UMTX_OP_RESERVED0]     = __umtx_op_unimpl,
        [UMTX_OP_RESERVED1]     = __umtx_op_unimpl,
        [UMTX_OP_WAIT]          = __umtx_op_wait_compat32,
        [UMTX_OP_WAKE]          = __umtx_op_wake,
        [UMTX_OP_MUTEX_TRYLOCK] = __umtx_op_trylock_umutex,
        [UMTX_OP_MUTEX_LOCK]    = __umtx_op_lock_umutex_compat32,
        [UMTX_OP_MUTEX_UNLOCK]  = __umtx_op_unlock_umutex,
        [UMTX_OP_SET_CEILING]   = __umtx_op_set_ceiling,
        [UMTX_OP_CV_WAIT]       = __umtx_op_cv_wait_compat32,
        [UMTX_OP_CV_SIGNAL]     = __umtx_op_cv_signal,
        [UMTX_OP_CV_BROADCAST]  = __umtx_op_cv_broadcast,
        [UMTX_OP_WAIT_UINT]     = __umtx_op_wait_compat32,
        [UMTX_OP_RW_RDLOCK]     = __umtx_op_rw_rdlock_compat32,
        [UMTX_OP_RW_WRLOCK]     = __umtx_op_rw_wrlock_compat32,
        [UMTX_OP_RW_UNLOCK]     = __umtx_op_rw_unlock,
        [UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private_compat32,
        [UMTX_OP_WAKE_PRIVATE]  = __umtx_op_wake_private,
        [UMTX_OP_MUTEX_WAIT]    = __umtx_op_wait_umutex_compat32,
        [UMTX_OP_MUTEX_WAKE]    = __umtx_op_wake_umutex,
#if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
        [UMTX_OP_SEM_WAIT]      = __umtx_op_sem_wait_compat32,
        [UMTX_OP_SEM_WAKE]      = __umtx_op_sem_wake,
#else
        [UMTX_OP_SEM_WAIT]      = __umtx_op_unimpl,
        [UMTX_OP_SEM_WAKE]      = __umtx_op_unimpl,
#endif
        [UMTX_OP_NWAKE_PRIVATE] = __umtx_op_nwake_private32,
        [UMTX_OP_MUTEX_WAKE2]   = __umtx_op_wake2_umutex,
        [UMTX_OP_SEM2_WAIT]     = __umtx_op_sem2_wait_compat32,
        [UMTX_OP_SEM2_WAKE]     = __umtx_op_sem2_wake,
+       [UMTX_OP_SHM]           = __umtx_op_shm,
};

int
freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
{
        if ((unsigned)uap->op <
nitems(op_table_compat32)) { return (*op_table_compat32[uap->op])(td, (struct _umtx_op_args *)uap); } return (EINVAL); } #endif void umtx_thread_init(struct thread *td) { td->td_umtxq = umtxq_alloc(); td->td_umtxq->uq_thread = td; } void umtx_thread_fini(struct thread *td) { umtxq_free(td->td_umtxq); } /* * It will be called when new thread is created, e.g fork(). */ void umtx_thread_alloc(struct thread *td) { struct umtx_q *uq; uq = td->td_umtxq; uq->uq_inherited_pri = PRI_MAX; KASSERT(uq->uq_flags == 0, ("uq_flags != 0")); KASSERT(uq->uq_thread == td, ("uq_thread != td")); KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL")); KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty")); } /* * exec() hook. */ static void umtx_exec_hook(void *arg __unused, struct proc *p __unused, struct image_params *imgp __unused) { umtx_thread_cleanup(curthread); } /* * thread_exit() hook. */ void umtx_thread_exit(struct thread *td) { umtx_thread_cleanup(td); } /* * clean up umtx data. */ static void umtx_thread_cleanup(struct thread *td) { struct umtx_q *uq; struct umtx_pi *pi; if ((uq = td->td_umtxq) == NULL) return; mtx_lock(&umtx_lock); uq->uq_inherited_pri = PRI_MAX; while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) { pi->pi_owner = NULL; TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link); } mtx_unlock(&umtx_lock); thread_lock(td); sched_lend_user_prio(td, PRI_MAX); thread_unlock(td); } Index: head/sys/kern/uipc_shm.c =================================================================== --- head/sys/kern/uipc_shm.c (revision 296161) +++ head/sys/kern/uipc_shm.c (revision 296162) @@ -1,1085 +1,1080 @@ /*- * Copyright (c) 2006, 2011 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Support for shared swap-backed anonymous memory objects via * shm_open(2) and shm_unlink(2). While most of the implementation is * here, vm_mmap.c contains mapping logic changes. * * TODO: * * (1) Need to export data to a userland tool via a sysctl. Should ipcs(1) * and ipcrm(1) be expanded or should new tools to manage both POSIX * kernel semaphores and POSIX shared memory be written? * * (2) Add support for this file type to fstat(1). * * (3) Resource limits? Does this need its own resource limits or are the * existing limits in mmap(2) sufficient? 
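
Since this file carries the whole kernel side of the interface, a quick refresher on the userland contract it serves (standard POSIX usage, nothing new in this change):

#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

static int
make_shared_region(size_t len, void **out)
{
	void *p;
	int fd;

	fd = shm_open("/example", O_RDWR | O_CREAT, 0600);
	if (fd == -1)
		return (-1);
	/* ftruncate() on a shm fd ends up in shm_dotruncate() below. */
	if (ftruncate(fd, len) == -1 ||
	    (p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
	    fd, 0)) == MAP_FAILED) {
		close(fd);
		return (-1);
	}
	close(fd);	/* the mapping keeps the object referenced */
	*out = p;
	return (0);
}
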
*/ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct shm_mapping { char *sm_path; Fnv32_t sm_fnv; struct shmfd *sm_shmfd; LIST_ENTRY(shm_mapping) sm_link; }; static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor"); static LIST_HEAD(, shm_mapping) *shm_dictionary; static struct sx shm_dict_lock; static struct mtx shm_timestamp_lock; static u_long shm_hash; static struct unrhdr *shm_ino_unr; static dev_t shm_dev_ino; #define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash]) -static int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags); -static struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode); static void shm_init(void *arg); -static void shm_drop(struct shmfd *shmfd); -static struct shmfd *shm_hold(struct shmfd *shmfd); static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); -static int shm_dotruncate(struct shmfd *shmfd, off_t length); static fo_rdwr_t shm_read; static fo_rdwr_t shm_write; static fo_truncate_t shm_truncate; static fo_stat_t shm_stat; static fo_close_t shm_close; static fo_chmod_t shm_chmod; static fo_chown_t shm_chown; static fo_seek_t shm_seek; static fo_fill_kinfo_t shm_fill_kinfo; static fo_mmap_t shm_mmap; /* File descriptor operations. */ -static struct fileops shm_ops = { +struct fileops shm_ops = { .fo_read = shm_read, .fo_write = shm_write, .fo_truncate = shm_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = invfo_poll, .fo_kqfilter = invfo_kqfilter, .fo_stat = shm_stat, .fo_close = shm_close, .fo_chmod = shm_chmod, .fo_chown = shm_chown, .fo_sendfile = vn_sendfile, .fo_seek = shm_seek, .fo_fill_kinfo = shm_fill_kinfo, .fo_mmap = shm_mmap, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; FEATURE(posix_shm, "POSIX shared memory"); static int uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) { vm_page_t m; vm_pindex_t idx; size_t tlen; int error, offset, rv; idx = OFF_TO_IDX(uio->uio_offset); offset = uio->uio_offset & PAGE_MASK; tlen = MIN(PAGE_SIZE - offset, len); VM_OBJECT_WLOCK(obj); /* * Read I/O without either a corresponding resident page or swap * page: use zero_region. This is intended to avoid instantiating * pages on read from a sparse region. */ if (uio->uio_rw == UIO_READ && vm_page_lookup(obj, idx) == NULL && !vm_pager_has_page(obj, idx, NULL, NULL)) { VM_OBJECT_WUNLOCK(obj); return (uiomove(__DECONST(void *, zero_region), tlen, uio)); } /* * Parallel reads of the page content from disk are prevented * by exclusive busy. * * Although the tmpfs vnode lock is held here, it is * nonetheless safe to sleep waiting for a free page. The * pageout daemon does not need to acquire the tmpfs vnode * lock to page out tobj's pages because tobj is a OBJT_SWAP * type object. 
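
The zero_region short-circuit above is user-visible: reading a hole in a swap-backed object returns zeroes without instantiating any pages. A small demonstration (a sketch; error handling collapsed into assert for brevity):

#include <sys/mman.h>
#include <assert.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static void
read_from_hole(void)
{
	static char buf[4096], zero[4096];
	int fd;

	fd = shm_open(SHM_ANON, O_RDWR, 0600);
	assert(fd != -1);
	assert(ftruncate(fd, 16 * 1024 * 1024) == 0);	/* sparse: no pages yet */
	assert(pread(fd, buf, sizeof(buf), 0) == (ssize_t)sizeof(buf));
	memset(zero, 0, sizeof(zero));
	assert(memcmp(buf, zero, sizeof(buf)) == 0);	/* served from zero_region */
	close(fd);
}
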
*/ m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL); if (m->valid != VM_PAGE_BITS_ALL) { if (vm_pager_has_page(obj, idx, NULL, NULL)) { rv = vm_pager_get_pages(obj, &m, 1, NULL, NULL); if (rv != VM_PAGER_OK) { printf( "uiomove_object: vm_obj %p idx %jd valid %x pager error %d\n", obj, idx, m->valid, rv); vm_page_lock(m); vm_page_free(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(obj); return (EIO); } } else vm_page_zero_invalid(m, TRUE); } vm_page_xunbusy(m); vm_page_lock(m); vm_page_hold(m); if (m->queue == PQ_NONE) { vm_page_deactivate(m); } else { /* Requeue to maintain LRU ordering. */ vm_page_requeue(m); } vm_page_unlock(m); VM_OBJECT_WUNLOCK(obj); error = uiomove_fromphys(&m, offset, tlen, uio); if (uio->uio_rw == UIO_WRITE && error == 0) { VM_OBJECT_WLOCK(obj); vm_page_dirty(m); vm_pager_page_unswapped(m); VM_OBJECT_WUNLOCK(obj); } vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); return (error); } int uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio) { ssize_t resid; size_t len; int error; error = 0; while ((resid = uio->uio_resid) > 0) { if (obj_size <= uio->uio_offset) break; len = MIN(obj_size - uio->uio_offset, resid); if (len == 0) break; error = uiomove_object_page(obj, len, uio); if (error != 0 || resid == uio->uio_resid) break; } return (error); } static int shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) { struct shmfd *shmfd; off_t foffset; int error; shmfd = fp->f_data; foffset = foffset_lock(fp, 0); error = 0; switch (whence) { case L_INCR: if (foffset < 0 || (offset > 0 && foffset > OFF_MAX - offset)) { error = EOVERFLOW; break; } offset += foffset; break; case L_XTND: if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) { error = EOVERFLOW; break; } offset += shmfd->shm_size; break; case L_SET: break; default: error = EINVAL; } if (error == 0) { if (offset < 0 || offset > shmfd->shm_size) error = EINVAL; else td->td_uretoff.tdu_off = offset; } foffset_unlock(fp, offset, error != 0 ? 
FOF_NOUPDATE : 0); return (error); } static int shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); if ((flags & FOF_OFFSET) == 0) { rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); } else { rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); } error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif return (shm_dotruncate(shmfd, length)); } static int shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif /* * Attempt to return sanish values for fstat() on a memory file * descriptor. */ bzero(sb, sizeof(*sb)); sb->st_blksize = PAGE_SIZE; sb->st_size = shmfd->shm_size; sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize; mtx_lock(&shm_timestamp_lock); sb->st_atim = shmfd->shm_atime; sb->st_ctim = shmfd->shm_ctime; sb->st_mtim = shmfd->shm_mtime; sb->st_birthtim = shmfd->shm_birthtime; sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */ sb->st_uid = shmfd->shm_uid; sb->st_gid = shmfd->shm_gid; mtx_unlock(&shm_timestamp_lock); sb->st_dev = shm_dev_ino; sb->st_ino = shmfd->shm_ino; return (0); } static int shm_close(struct file *fp, struct thread *td) { struct shmfd *shmfd; shmfd = fp->f_data; fp->f_data = NULL; shm_drop(shmfd); return (0); } -static int +int shm_dotruncate(struct shmfd *shmfd, off_t length) { vm_object_t object; vm_page_t m; vm_pindex_t idx, nobjsize; vm_ooffset_t delta; int base, rv; object = shmfd->shm_object; VM_OBJECT_WLOCK(object); if (length == shmfd->shm_size) { VM_OBJECT_WUNLOCK(object); return (0); } nobjsize = OFF_TO_IDX(length + PAGE_MASK); /* Are we shrinking? If so, trim the end. */ if (length < shmfd->shm_size) { /* * Disallow any requests to shrink the size if this * object is mapped into the kernel. */ if (shmfd->shm_kmappings > 0) { VM_OBJECT_WUNLOCK(object); return (EBUSY); } /* * Zero the truncated part of the last page.
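
A worked example of the partial-page arithmetic that follows: with PAGE_SIZE 4096, truncating to length 10000 leaves page index OFF_TO_IDX(10000) == 2 as the last page, and base = 10000 & PAGE_MASK == 1808, so bytes [1808, 4096) of that page are zeroed while pages 3 and up are removed outright. In plain C:

#include <stdio.h>

int
main(void)
{
	const unsigned page_size = 4096, page_mask = page_size - 1;
	unsigned length = 10000;
	unsigned base = length & page_mask;	/* 1808: first stale byte */
	unsigned idx = length / page_size;	/* 2: last page kept */

	printf("zero [%u, %u) of page %u\n", base, page_size, idx);
	return (0);
}
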
*/ base = length & PAGE_MASK; if (base != 0) { idx = OFF_TO_IDX(length); retry: m = vm_page_lookup(object, idx); if (m != NULL) { if (vm_page_sleep_if_busy(m, "shmtrc")) goto retry; } else if (vm_pager_has_page(object, idx, NULL, NULL)) { m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL); if (m == NULL) { VM_OBJECT_WUNLOCK(object); VM_WAIT; VM_OBJECT_WLOCK(object); goto retry; } else if (m->valid != VM_PAGE_BITS_ALL) rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); else /* A cached page was reactivated. */ rv = VM_PAGER_OK; vm_page_lock(m); if (rv == VM_PAGER_OK) { vm_page_deactivate(m); vm_page_unlock(m); vm_page_xunbusy(m); } else { vm_page_free(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); return (EIO); } } if (m != NULL) { pmap_zero_page_area(m, base, PAGE_SIZE - base); KASSERT(m->valid == VM_PAGE_BITS_ALL, ("shm_dotruncate: page %p is invalid", m)); vm_page_dirty(m); vm_pager_page_unswapped(m); } } delta = ptoa(object->size - nobjsize); /* Toss in memory pages. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, 0); /* Toss pages from swap. */ if (object->type == OBJT_SWAP) swap_pager_freespace(object, nobjsize, delta); /* Free the swap accounted for shm */ swap_release_by_cred(delta, object->cred); object->charge -= delta; } else { /* Attempt to reserve the swap */ delta = ptoa(nobjsize - object->size); if (!swap_reserve_by_cred(delta, object->cred)) { VM_OBJECT_WUNLOCK(object); return (ENOMEM); } object->charge += delta; } shmfd->shm_size = length; mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_ctime); shmfd->shm_mtime = shmfd->shm_ctime; mtx_unlock(&shm_timestamp_lock); object->size = nobjsize; VM_OBJECT_WUNLOCK(object); return (0); } /* * shmfd object management including creation and reference counting * routines. */ -static struct shmfd * +struct shmfd * shm_alloc(struct ucred *ucred, mode_t mode) { struct shmfd *shmfd; int ino; shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO); shmfd->shm_size = 0; shmfd->shm_uid = ucred->cr_uid; shmfd->shm_gid = ucred->cr_gid; shmfd->shm_mode = mode; shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL, shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); shmfd->shm_object->pg_color = 0; VM_OBJECT_WLOCK(shmfd->shm_object); vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING); vm_object_set_flag(shmfd->shm_object, OBJ_COLORED | OBJ_NOSPLIT); VM_OBJECT_WUNLOCK(shmfd->shm_object); vfs_timestamp(&shmfd->shm_birthtime); shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = shmfd->shm_birthtime; ino = alloc_unr(shm_ino_unr); if (ino == -1) shmfd->shm_ino = 0; else shmfd->shm_ino = ino; refcount_init(&shmfd->shm_refs, 1); mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF); rangelock_init(&shmfd->shm_rl); #ifdef MAC mac_posixshm_init(shmfd); mac_posixshm_create(ucred, shmfd); #endif return (shmfd); } -static struct shmfd * +struct shmfd * shm_hold(struct shmfd *shmfd) { refcount_acquire(&shmfd->shm_refs); return (shmfd); } -static void +void shm_drop(struct shmfd *shmfd) { if (refcount_release(&shmfd->shm_refs)) { #ifdef MAC mac_posixshm_destroy(shmfd); #endif rangelock_destroy(&shmfd->shm_rl); mtx_destroy(&shmfd->shm_mtx); vm_object_deallocate(shmfd->shm_object); if (shmfd->shm_ino != 0) free_unr(shm_ino_unr, shmfd->shm_ino); free(shmfd, M_SHMFD); } } /* * Determine if the credentials have sufficient permissions for a * specified combination of FREAD and FWRITE. 
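
One step back before the permission check: shm_alloc(), shm_hold() and shm_drop() above are now exported (their static qualifiers were dropped) so the new UMTX_OP_SHM code can manage shmfd lifetimes directly. The usage contract, as an in-kernel sketch (the example_* names are hypothetical):

static struct shmfd *
example_grab(struct shmfd *shmfd)
{
	/* Every shm_hold() must be paired with exactly one shm_drop(). */
	return (shm_hold(shmfd));
}

static void
example_release(struct shmfd *shmfd)
{
	/*
	 * The final shm_drop() destroys the range lock and mutex and
	 * drops the last reference on the backing VM object.
	 */
	shm_drop(shmfd);
}
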
*/ -static int +int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) { accmode_t accmode; int error; accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; mtx_lock(&shm_timestamp_lock); error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, accmode, ucred, NULL); mtx_unlock(&shm_timestamp_lock); return (error); } /* * Dictionary management. We maintain an in-kernel dictionary to map * paths to shmfd objects. We use the FNV hash on the path to store * the mappings in a hash table. */ static void shm_init(void *arg) { mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); sx_init(&shm_dict_lock, "shm dictionary"); shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash); shm_ino_unr = new_unrhdr(1, INT32_MAX, NULL); KASSERT(shm_ino_unr != NULL, ("shm fake inodes not initialized")); shm_dev_ino = devfs_alloc_cdp_inode(); KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized")); } SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL); static struct shmfd * shm_lookup(char *path, Fnv32_t fnv) { struct shm_mapping *map; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) return (map->sm_shmfd); } return (NULL); } static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd) { struct shm_mapping *map; map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK); map->sm_path = path; map->sm_fnv = fnv; map->sm_shmfd = shm_hold(shmfd); shmfd->shm_path = path; LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link); } static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred) { struct shm_mapping *map; int error; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) { #ifdef MAC error = mac_posixshm_check_unlink(ucred, map->sm_shmfd); if (error) return (error); #endif error = shm_access(map->sm_shmfd, ucred, FREAD | FWRITE); if (error) return (error); map->sm_shmfd->shm_path = NULL; LIST_REMOVE(map, sm_link); shm_drop(map->sm_shmfd); free(map->sm_path, M_SHMFD); free(map, M_SHMFD); return (0); } } return (ENOENT); } int kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode, struct filecaps *fcaps) { struct filedesc *fdp; struct shmfd *shmfd; struct file *fp; char *path; Fnv32_t fnv; mode_t cmode; int fd, error; #ifdef CAPABILITY_MODE /* * shm_open(2) is only allowed for anonymous objects. */ if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON)) return (ECAPMODE); #endif if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR) return (EINVAL); if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) return (EINVAL); fdp = td->td_proc->p_fd; cmode = (mode & ~fdp->fd_cmask) & ACCESSPERMS; error = falloc_caps(td, &fp, &fd, O_CLOEXEC, fcaps); if (error) return (error); /* A SHM_ANON path pointer creates an anonymous object. */ if (userpath == SHM_ANON) { /* A read-only anonymous object is pointless. */ if ((flags & O_ACCMODE) == O_RDONLY) { fdclose(td, fp, fd); fdrop(fp, td); return (EINVAL); } shmfd = shm_alloc(td->td_ucred, cmode); } else { path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); error = copyinstr(userpath, path, MAXPATHLEN, NULL); #ifdef KTRACE if (error == 0 && KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif /* Require paths to start with a '/' character. 
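
The dictionary above addresses its buckets with an FNV-1 hash of the path masked by shm_hash, the bucket-count-minus-one mask that hashinit(9) returns. An illustration of the addressing (example_bucket is hypothetical; the hash and mask usage match shm_lookup() and SHM_HASH()):

#include <sys/param.h>
#include <sys/fnv_hash.h>

static u_long
example_bucket(char *path, u_long mask)
{
	Fnv32_t fnv;

	fnv = fnv_32_str(path, FNV1_32_INIT);	/* same hash as shm_lookup() */
	return (fnv & mask);	/* index into shm_dictionary, cf. SHM_HASH() */
}
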
*/ if (error == 0 && path[0] != '/') error = EINVAL; if (error) { fdclose(td, fp, fd); fdrop(fp, td); free(path, M_SHMFD); return (error); } fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); shmfd = shm_lookup(path, fnv); if (shmfd == NULL) { /* Object does not yet exist, create it if requested. */ if (flags & O_CREAT) { #ifdef MAC error = mac_posixshm_check_create(td->td_ucred, path); if (error == 0) { #endif shmfd = shm_alloc(td->td_ucred, cmode); shm_insert(path, fnv, shmfd); #ifdef MAC } #endif } else { free(path, M_SHMFD); error = ENOENT; } } else { /* * Object already exists, obtain a new * reference if requested and permitted. */ free(path, M_SHMFD); if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; else { #ifdef MAC error = mac_posixshm_check_open(td->td_ucred, shmfd, FFLAGS(flags & O_ACCMODE)); if (error == 0) #endif error = shm_access(shmfd, td->td_ucred, FFLAGS(flags & O_ACCMODE)); } /* * Truncate the file back to zero length if * O_TRUNC was specified and the object was * opened with read/write. */ if (error == 0 && (flags & (O_ACCMODE | O_TRUNC)) == (O_RDWR | O_TRUNC)) { #ifdef MAC error = mac_posixshm_check_truncate( td->td_ucred, fp->f_cred, shmfd); if (error == 0) #endif shm_dotruncate(shmfd, 0); } if (error == 0) shm_hold(shmfd); } sx_xunlock(&shm_dict_lock); if (error) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } } finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* System calls. */ int sys_shm_open(struct thread *td, struct shm_open_args *uap) { return (kern_shm_open(td, uap->path, uap->flags, uap->mode, NULL)); } int sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap) { char *path; Fnv32_t fnv; int error; path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); error = copyinstr(uap->path, path, MAXPATHLEN, NULL); if (error) { free(path, M_TEMP); return (error); } #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); error = shm_remove(path, fnv, td->td_ucred); sx_xunlock(&shm_dict_lock); free(path, M_TEMP); return (error); } int shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct shmfd *shmfd; vm_prot_t maxprot; int error; shmfd = fp->f_data; maxprot = VM_PROT_NONE; /* FREAD should always be set. */ if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; /* Don't permit shared writable mappings on read-only descriptors. */ if ((flags & MAP_SHARED) != 0 && (maxprot & VM_PROT_WRITE) == 0 && (prot & VM_PROT_WRITE) != 0) return (EACCES); maxprot &= cap_maxprot; #ifdef MAC error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags); if (error != 0) return (error); #endif /* * XXXRW: This validation is probably insufficient, and subject to * sign errors. It should be fixed. 
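
Stepping back to kern_shm_open() above: the flag handling gives shm_open(2) the familiar open(2) semantics for O_CREAT, O_EXCL and O_TRUNC. A userland sketch:

#include <sys/mman.h>
#include <errno.h>
#include <fcntl.h>

static int
create_fresh(const char *path)	/* must start with '/' */
{
	int fd;

	fd = shm_open(path, O_RDWR | O_CREAT | O_EXCL, 0600);
	if (fd == -1 && errno == EEXIST)
		/* Lost the race: open the existing object and empty it. */
		fd = shm_open(path, O_RDWR | O_TRUNC, 0);
	return (fd);
}
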
*/ if (foff >= shmfd->shm_size || foff + objsize > round_page(shmfd->shm_size)) return (EINVAL); mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_atime); mtx_unlock(&shm_timestamp_lock); vm_object_reference(shmfd->shm_object); error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, shmfd->shm_object, foff, FALSE, td); if (error != 0) vm_object_deallocate(shmfd->shm_object); return (error); } static int shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); /* * SUSv4 says that x bits of permission need not be affected. * Be consistent with our shm_open there. */ #ifdef MAC error = mac_posixshm_check_setmode(active_cred, shmfd, mode); if (error != 0) goto out; #endif error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, VADMIN, active_cred, NULL); if (error != 0) goto out; shmfd->shm_mode = mode & ACCESSPERMS; out: mtx_unlock(&shm_timestamp_lock); return (error); } static int shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); #ifdef MAC error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid); if (error != 0) goto out; #endif if (uid == (uid_t)-1) uid = shmfd->shm_uid; if (gid == (gid_t)-1) gid = shmfd->shm_gid; if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) || (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0))) goto out; shmfd->shm_uid = uid; shmfd->shm_gid = gid; out: mtx_unlock(&shm_timestamp_lock); return (error); } /* * Helper routines to allow the backing object of a shared memory file * descriptor to be mapped in the kernel. */ int shm_map(struct file *fp, size_t size, off_t offset, void **memp) { struct shmfd *shmfd; vm_offset_t kva, ofs; vm_object_t obj; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; obj = shmfd->shm_object; VM_OBJECT_WLOCK(obj); /* * XXXRW: This validation is probably insufficient, and subject to * sign errors. It should be fixed. */ if (offset >= shmfd->shm_size || offset + size > round_page(shmfd->shm_size)) { VM_OBJECT_WUNLOCK(obj); return (EINVAL); } shmfd->shm_kmappings++; vm_object_reference_locked(obj); VM_OBJECT_WUNLOCK(obj); /* Map the object into the kernel_map and wire it. */ kva = vm_map_min(kernel_map); ofs = offset & PAGE_MASK; offset = trunc_page(offset); size = round_page(size + ofs); rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0, VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0); if (rv == KERN_SUCCESS) { rv = vm_map_wire(kernel_map, kva, kva + size, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); if (rv == KERN_SUCCESS) { *memp = (void *)(kva + ofs); return (0); } vm_map_remove(kernel_map, kva, kva + size); } else vm_object_deallocate(obj); /* On failure, drop our mapping reference. */ VM_OBJECT_WLOCK(obj); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (vm_mmap_to_errno(rv)); } /* * We require the caller to unmap the entire entry. This allows us to * safely decrement shm_kmappings when a mapping is removed.
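
shm_map()/shm_unmap() above are the in-kernel counterpart of mmap() on a shm descriptor. To make the page math concrete: for offset 0x1234 and size 0x100, ofs = 0x234, the map entry covers trunc_page(0x1234) = 0x1000 through round_page(0x334) = one page, and the caller receives kva + 0x234. A hypothetical consumer (the helper name is illustrative; the API is real):

static int
example_kernel_peek(struct file *fp, off_t off, size_t len)
{
	void *mem;
	int error;

	error = shm_map(fp, len, off, &mem);	/* wires the backing pages */
	if (error != 0)
		return (error);
	/* ... access len bytes at mem ... */
	return (shm_unmap(fp, mem, len));	/* must cover the whole entry */
}
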
*/ int shm_unmap(struct file *fp, void *mem, size_t size) { struct shmfd *shmfd; vm_map_entry_t entry; vm_offset_t kva, ofs; vm_object_t obj; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; vm_map_t map; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; kva = (vm_offset_t)mem; ofs = kva & PAGE_MASK; kva = trunc_page(kva); size = round_page(size + ofs); map = kernel_map; rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry, &obj, &pindex, &prot, &wired); if (rv != KERN_SUCCESS) return (EINVAL); if (entry->start != kva || entry->end != kva + size) { vm_map_lookup_done(map, entry); return (EINVAL); } vm_map_lookup_done(map, entry); if (obj != shmfd->shm_object) return (EINVAL); vm_map_remove(map, kva, kva + size); VM_OBJECT_WLOCK(obj); KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped")); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (0); } static int shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct shmfd *shmfd; kif->kf_type = KF_TYPE_SHM; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode; /* XXX */ mtx_unlock(&shm_timestamp_lock); kif->kf_un.kf_file.kf_file_size = shmfd->shm_size; if (shmfd->shm_path != NULL) { sx_slock(&shm_dict_lock); if (shmfd->shm_path != NULL) strlcpy(kif->kf_path, shmfd->shm_path, sizeof(kif->kf_path)); sx_sunlock(&shm_dict_lock); } return (0); } Index: head/sys/sys/mman.h =================================================================== --- head/sys/sys/mman.h (revision 296161) +++ head/sys/sys/mman.h (revision 296162) @@ -1,271 +1,278 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)mman.h 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_MMAN_H_ #define _SYS_MMAN_H_ #include #include #if __BSD_VISIBLE /* * Inheritance for minherit() */ #define INHERIT_SHARE 0 #define INHERIT_COPY 1 #define INHERIT_NONE 2 #endif /* * Protections are chosen from these bits, or-ed together */ #define PROT_NONE 0x00 /* no permissions */ #define PROT_READ 0x01 /* pages can be read */ #define PROT_WRITE 0x02 /* pages can be written */ #define PROT_EXEC 0x04 /* pages can be executed */ /* * Flags contain sharing type and options. * Sharing types; choose one. */ #define MAP_SHARED 0x0001 /* share changes */ #define MAP_PRIVATE 0x0002 /* changes are private */ #if __BSD_VISIBLE #define MAP_COPY MAP_PRIVATE /* Obsolete */ #endif /* * Other flags */ #define MAP_FIXED 0x0010 /* map addr must be exactly as requested */ #if __BSD_VISIBLE #define MAP_RESERVED0020 0x0020 /* previously unimplemented MAP_RENAME */ #define MAP_RESERVED0040 0x0040 /* previously unimplemented MAP_NORESERVE */ #define MAP_RESERVED0080 0x0080 /* previously misimplemented MAP_INHERIT */ #define MAP_RESERVED0100 0x0100 /* previously unimplemented MAP_NOEXTEND */ #define MAP_HASSEMAPHORE 0x0200 /* region may contain semaphores */ #define MAP_STACK 0x0400 /* region grows down, like a stack */ #define MAP_NOSYNC 0x0800 /* page to but do not sync underlying file */ /* * Mapping type */ #define MAP_FILE 0x0000 /* map from file (default) */ #define MAP_ANON 0x1000 /* allocated from memory, swap space */ #ifndef _KERNEL #define MAP_ANONYMOUS MAP_ANON /* For compatibility. */ #endif /* !_KERNEL */ /* * Extended flags */ #define MAP_EXCL 0x00004000 /* for MAP_FIXED, fail if address is used */ #define MAP_NOCORE 0x00020000 /* dont include these pages in a coredump */ #define MAP_PREFAULT_READ 0x00040000 /* prefault mapping for reading */ #ifdef __LP64__ #define MAP_32BIT 0x00080000 /* map in the low 2GB of address space */ #endif /* * Request specific alignment (n == log2 of the desired alignment). * * MAP_ALIGNED_SUPER requests optimal superpage alignment, but does * not enforce a specific alignment. 
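
Usage of the alignment-request macros defined just below: n is the log2 of the desired alignment, so a 2 MB-aligned anonymous mapping is requested with n = 21.

#include <sys/mman.h>

static void *
alloc_2m_aligned(size_t len)
{
	return (mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE | MAP_ALIGNED(21), -1, 0));
}
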
*/ #define MAP_ALIGNED(n) ((n) << MAP_ALIGNMENT_SHIFT) #define MAP_ALIGNMENT_SHIFT 24 #define MAP_ALIGNMENT_MASK MAP_ALIGNED(0xff) #define MAP_ALIGNED_SUPER MAP_ALIGNED(1) /* align on a superpage */ #endif /* __BSD_VISIBLE */ #if __POSIX_VISIBLE >= 199309 /* * Process memory locking */ #define MCL_CURRENT 0x0001 /* Lock only current memory */ #define MCL_FUTURE 0x0002 /* Lock all future memory as well */ #endif /* * Error return from mmap() */ #define MAP_FAILED ((void *)-1) /* * msync() flags */ #define MS_SYNC 0x0000 /* msync synchronously */ #define MS_ASYNC 0x0001 /* return immediately */ #define MS_INVALIDATE 0x0002 /* invalidate all cached data */ /* * Advice to madvise */ #define _MADV_NORMAL 0 /* no further special treatment */ #define _MADV_RANDOM 1 /* expect random page references */ #define _MADV_SEQUENTIAL 2 /* expect sequential page references */ #define _MADV_WILLNEED 3 /* will need these pages */ #define _MADV_DONTNEED 4 /* dont need these pages */ #if __BSD_VISIBLE #define MADV_NORMAL _MADV_NORMAL #define MADV_RANDOM _MADV_RANDOM #define MADV_SEQUENTIAL _MADV_SEQUENTIAL #define MADV_WILLNEED _MADV_WILLNEED #define MADV_DONTNEED _MADV_DONTNEED #define MADV_FREE 5 /* dont need these pages, and junk contents */ #define MADV_NOSYNC 6 /* try to avoid flushes to physical media */ #define MADV_AUTOSYNC 7 /* revert to default flushing strategy */ #define MADV_NOCORE 8 /* do not include these pages in a core file */ #define MADV_CORE 9 /* revert to including pages in a core file */ #define MADV_PROTECT 10 /* protect process from pageout kill */ /* * Return bits from mincore */ #define MINCORE_INCORE 0x1 /* Page is incore */ #define MINCORE_REFERENCED 0x2 /* Page has been referenced by us */ #define MINCORE_MODIFIED 0x4 /* Page has been modified by us */ #define MINCORE_REFERENCED_OTHER 0x8 /* Page has been referenced */ #define MINCORE_MODIFIED_OTHER 0x10 /* Page has been modified */ #define MINCORE_SUPER 0x20 /* Page is a "super" page */ /* * Anonymous object constant for shm_open(). */ #define SHM_ANON ((char *)1) #endif /* __BSD_VISIBLE */ /* * XXX missing POSIX_TYPED_MEM_* macros and * posix_typed_mem_info structure. */ #if __POSIX_VISIBLE >= 200112 #define POSIX_MADV_NORMAL _MADV_NORMAL #define POSIX_MADV_RANDOM _MADV_RANDOM #define POSIX_MADV_SEQUENTIAL _MADV_SEQUENTIAL #define POSIX_MADV_WILLNEED _MADV_WILLNEED #define POSIX_MADV_DONTNEED _MADV_DONTNEED #endif #ifndef _MODE_T_DECLARED typedef __mode_t mode_t; #define _MODE_T_DECLARED #endif #ifndef _OFF_T_DECLARED typedef __off_t off_t; #define _OFF_T_DECLARED #endif #ifndef _SIZE_T_DECLARED typedef __size_t size_t; #define _SIZE_T_DECLARED #endif #if defined(_KERNEL) || defined(_WANT_FILE) #include #include #include #include #include struct file; struct shmfd { size_t shm_size; vm_object_t shm_object; int shm_refs; uid_t shm_uid; gid_t shm_gid; mode_t shm_mode; int shm_kmappings; /* * Values maintained solely to make this a better-behaved file * descriptor for fstat() to run on. 
*/ struct timespec shm_atime; struct timespec shm_mtime; struct timespec shm_ctime; struct timespec shm_birthtime; ino_t shm_ino; struct label *shm_label; /* MAC label */ const char *shm_path; struct rangelock shm_rl; struct mtx shm_mtx; }; #endif #ifdef _KERNEL int shm_map(struct file *fp, size_t size, off_t offset, void **memp); int shm_unmap(struct file *fp, void *mem, size_t size); +int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags); +struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode); +struct shmfd *shm_hold(struct shmfd *shmfd); +void shm_drop(struct shmfd *shmfd); +int shm_dotruncate(struct shmfd *shmfd, off_t length); + +extern struct fileops shm_ops; #else /* !_KERNEL */ __BEGIN_DECLS /* * XXX not yet implemented: posix_mem_offset(), posix_typed_mem_get_info(), * posix_typed_mem_open(). */ #if __BSD_VISIBLE int getpagesizes(size_t *, int); int madvise(void *, size_t, int); int mincore(const void *, size_t, char *); int minherit(void *, size_t, int); #endif int mlock(const void *, size_t); #ifndef _MMAP_DECLARED #define _MMAP_DECLARED void * mmap(void *, size_t, int, int, int, off_t); #endif int mprotect(const void *, size_t, int); int msync(void *, size_t, int); int munlock(const void *, size_t); int munmap(void *, size_t); #if __POSIX_VISIBLE >= 200112 int posix_madvise(void *, size_t, int); #endif #if __POSIX_VISIBLE >= 199309 int mlockall(int); int munlockall(void); int shm_open(const char *, int, mode_t); int shm_unlink(const char *); #endif __END_DECLS #endif /* !_KERNEL */ #endif /* !_SYS_MMAN_H_ */ Index: head/sys/sys/resource.h =================================================================== --- head/sys/sys/resource.h (revision 296161) +++ head/sys/sys/resource.h (revision 296162) @@ -1,185 +1,187 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)resource.h 8.4 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_RESOURCE_H_ #define _SYS_RESOURCE_H_ #include #include #include /* * Process priority specifications to get/setpriority. 
*/ #define PRIO_MIN -20 #define PRIO_MAX 20 #define PRIO_PROCESS 0 #define PRIO_PGRP 1 #define PRIO_USER 2 /* * Resource utilization information. * * All fields are only modified by curthread and * no locks are required to read. */ #define RUSAGE_SELF 0 #define RUSAGE_CHILDREN -1 #define RUSAGE_THREAD 1 struct rusage { struct timeval ru_utime; /* user time used */ struct timeval ru_stime; /* system time used */ long ru_maxrss; /* max resident set size */ #define ru_first ru_ixrss long ru_ixrss; /* integral shared memory size */ long ru_idrss; /* integral unshared data " */ long ru_isrss; /* integral unshared stack " */ long ru_minflt; /* page reclaims */ long ru_majflt; /* page faults */ long ru_nswap; /* swaps */ long ru_inblock; /* block input operations */ long ru_oublock; /* block output operations */ long ru_msgsnd; /* messages sent */ long ru_msgrcv; /* messages received */ long ru_nsignals; /* signals received */ long ru_nvcsw; /* voluntary context switches */ long ru_nivcsw; /* involuntary " */ #define ru_last ru_nivcsw }; #if __BSD_VISIBLE struct __wrusage { struct rusage wru_self; struct rusage wru_children; }; #endif /* * Resource limits */ #define RLIMIT_CPU 0 /* maximum cpu time in seconds */ #define RLIMIT_FSIZE 1 /* maximum file size */ #define RLIMIT_DATA 2 /* data size */ #define RLIMIT_STACK 3 /* stack size */ #define RLIMIT_CORE 4 /* core file size */ #define RLIMIT_RSS 5 /* resident set size */ #define RLIMIT_MEMLOCK 6 /* locked-in-memory address space */ #define RLIMIT_NPROC 7 /* number of processes */ #define RLIMIT_NOFILE 8 /* number of open files */ #define RLIMIT_SBSIZE 9 /* maximum size of all socket buffers */ #define RLIMIT_VMEM 10 /* virtual process size (incl. mmap) */ #define RLIMIT_AS RLIMIT_VMEM /* standard name for RLIMIT_VMEM */ #define RLIMIT_NPTS 11 /* pseudo-terminals */ #define RLIMIT_SWAP 12 /* swap used */ #define RLIMIT_KQUEUES 13 /* kqueues allocated */ +#define RLIMIT_UMTXP 14 /* process-shared umtx */ -#define RLIM_NLIMITS 14 /* number of resource limits */ +#define RLIM_NLIMITS 15 /* number of resource limits */ #define RLIM_INFINITY ((rlim_t)(((uint64_t)1 << 63) - 1)) /* XXX Missing: RLIM_SAVED_MAX, RLIM_SAVED_CUR */ /* * Resource limit string identifiers */ #ifdef _RLIMIT_IDENT static const char *rlimit_ident[RLIM_NLIMITS] = { "cpu", "fsize", "data", "stack", "core", "rss", "memlock", "nproc", "nofile", "sbsize", "vmem", "npts", "swap", "kqueues", + "umtx", }; #endif #ifndef _RLIM_T_DECLARED typedef __rlim_t rlim_t; #define _RLIM_T_DECLARED #endif struct rlimit { rlim_t rlim_cur; /* current (soft) limit */ rlim_t rlim_max; /* maximum value for rlim_cur */ }; #if __BSD_VISIBLE struct orlimit { __int32_t rlim_cur; /* current (soft) limit */ __int32_t rlim_max; /* maximum value for rlim_cur */ }; struct loadavg { __fixpt_t ldavg[3]; long fscale; }; #define CP_USER 0 #define CP_NICE 1 #define CP_SYS 2 #define CP_INTR 3 #define CP_IDLE 4 #define CPUSTATES 5 #endif /* __BSD_VISIBLE */ #ifdef _KERNEL extern struct loadavg averunnable; void read_cpu_time(long *cp_time); /* Writes array of CPUSTATES */ #else __BEGIN_DECLS /* XXX 2nd arg to [gs]etpriority() should be an id_t */ int getpriority(int, int); int getrlimit(int, struct rlimit *); int getrusage(int, struct rusage *); int setpriority(int, int, int); int setrlimit(int, const struct rlimit *); __END_DECLS #endif /* _KERNEL */ #endif /* !_SYS_RESOURCE_H_ */ Index: head/sys/sys/resourcevar.h =================================================================== --- head/sys/sys/resourcevar.h 
(revision 296161) +++ head/sys/sys/resourcevar.h (revision 296162) @@ -1,164 +1,166 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)resourcevar.h 8.4 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_RESOURCEVAR_H_ #define _SYS_RESOURCEVAR_H_ #include #include #ifdef _KERNEL #include #include #endif /* * Kernel per-process accounting / statistics * (not necessarily resident except when running). * * Locking key: * b - created at fork, never changes * c - locked by proc mtx * k - only accessed by curthread * w - locked by proc itim lock * w2 - locked by proc prof lock */ struct pstats { #define pstat_startzero p_cru struct rusage p_cru; /* Stats for reaped children. */ struct itimerval p_timer[3]; /* (w) Virtual-time timers. */ #define pstat_endzero pstat_startcopy #define pstat_startcopy p_prof struct uprof { /* Profile arguments. */ caddr_t pr_base; /* (c + w2) Buffer base. */ u_long pr_size; /* (c + w2) Buffer size. */ u_long pr_off; /* (c + w2) PC offset. */ u_long pr_scale; /* (c + w2) PC scaling. */ } p_prof; #define pstat_endcopy p_start struct timeval p_start; /* (b) Starting time. */ }; #ifdef _KERNEL /* * Kernel shareable process resource limits. Because this structure * is moderately large but changes infrequently, it is normally * shared copy-on-write after forks. */ struct plimit { struct rlimit pl_rlimit[RLIM_NLIMITS]; int pl_refcnt; /* number of references */ }; struct racct; /*- * Per uid resource consumption. This structure is used to track * the total resource consumption (process count, socket buffer size, * etc) for the uid and impose limits. 
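
The RLIMIT_UMTXP limit added above is consumed like any other resource limit; the per-uid counter it caps (ui_umtxcnt) and its chgumtxcnt() helper appear below. A userland sketch:

#include <sys/resource.h>

static int
raise_umtx_limit(rlim_t n)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_UMTXP, &rl) == -1)
		return (-1);
	rl.rlim_cur = n < rl.rlim_max ? n : rl.rlim_max;
	return (setrlimit(RLIMIT_UMTXP, &rl));
}
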
* * Locking guide: * (a) Constant from inception * (b) Lockless, updated using atomics * (c) Locked by global uihashtbl_lock * (d) Locked by the ui_vmsize_mtx */ struct uidinfo { LIST_ENTRY(uidinfo) ui_hash; /* (c) hash chain of uidinfos */ struct mtx ui_vmsize_mtx; vm_ooffset_t ui_vmsize; /* (d) swap reservation by uid */ long ui_sbsize; /* (b) socket buffer space consumed */ long ui_proccnt; /* (b) number of processes */ long ui_ptscnt; /* (b) number of pseudo-terminals */ long ui_kqcnt; /* (b) number of kqueues */ + long ui_umtxcnt; /* (b) number of shared umtxs */ uid_t ui_uid; /* (a) uid */ u_int ui_ref; /* (b) reference count */ #ifdef RACCT struct racct *ui_racct; /* (a) resource accounting */ #endif }; #define UIDINFO_VMSIZE_LOCK(ui) mtx_lock(&((ui)->ui_vmsize_mtx)) #define UIDINFO_VMSIZE_UNLOCK(ui) mtx_unlock(&((ui)->ui_vmsize_mtx)) struct proc; struct rusage_ext; struct thread; void addupc_intr(struct thread *td, uintfptr_t pc, u_int ticks); void addupc_task(struct thread *td, uintfptr_t pc, u_int ticks); void calccru(struct proc *p, struct timeval *up, struct timeval *sp); void calcru(struct proc *p, struct timeval *up, struct timeval *sp); int chgkqcnt(struct uidinfo *uip, int diff, rlim_t max); int chgproccnt(struct uidinfo *uip, int diff, rlim_t maxval); int chgsbsize(struct uidinfo *uip, u_int *hiwat, u_int to, rlim_t maxval); int chgptscnt(struct uidinfo *uip, int diff, rlim_t maxval); +int chgumtxcnt(struct uidinfo *uip, int diff, rlim_t maxval); int fuswintr(void *base); int kern_proc_setrlimit(struct thread *td, struct proc *p, u_int which, struct rlimit *limp); struct plimit *lim_alloc(void); void lim_copy(struct plimit *dst, struct plimit *src); rlim_t lim_cur(struct thread *td, int which); rlim_t lim_cur_proc(struct proc *p, int which); void lim_fork(struct proc *p1, struct proc *p2); void lim_free(struct plimit *limp); struct plimit *lim_hold(struct plimit *limp); rlim_t lim_max(struct thread *td, int which); rlim_t lim_max_proc(struct proc *p, int which); void lim_rlimit(struct thread *td, int which, struct rlimit *rlp); void lim_rlimit_proc(struct proc *p, int which, struct rlimit *rlp); void ruadd(struct rusage *ru, struct rusage_ext *rux, struct rusage *ru2, struct rusage_ext *rux2); void rucollect(struct rusage *ru, struct rusage *ru2); void rufetch(struct proc *p, struct rusage *ru); void rufetchcalc(struct proc *p, struct rusage *ru, struct timeval *up, struct timeval *sp); void rufetchtd(struct thread *td, struct rusage *ru); void ruxagg(struct proc *p, struct thread *td); int suswintr(void *base, int word); struct uidinfo *uifind(uid_t uid); void uifree(struct uidinfo *uip); void uihashinit(void); void uihold(struct uidinfo *uip); #ifdef RACCT void ui_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void (*pre)(void), void (*post)(void), void *arg2, void *arg3); #endif #endif /* _KERNEL */ #endif /* !_SYS_RESOURCEVAR_H_ */ Index: head/sys/sys/umtx.h =================================================================== --- head/sys/sys/umtx.h (revision 296161) +++ head/sys/sys/umtx.h (revision 296162) @@ -1,166 +1,174 @@ /*- * Copyright (c) 2002, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ * */ #ifndef _SYS_UMTX_H_ #define _SYS_UMTX_H_ #include #define USYNC_PROCESS_SHARED 0x0001 /* Process shared sync objs */ #define UMUTEX_UNOWNED 0x0 #define UMUTEX_CONTESTED 0x80000000U #define UMUTEX_PRIO_INHERIT 0x0004 /* Priority inherited mutex */ #define UMUTEX_PRIO_PROTECT 0x0008 /* Priority protect mutex */ /* urwlock flags */ #define URWLOCK_PREFER_READER 0x0002 #define URWLOCK_WRITE_OWNER 0x80000000U #define URWLOCK_WRITE_WAITERS 0x40000000U #define URWLOCK_READ_WAITERS 0x20000000U #define URWLOCK_MAX_READERS 0x1fffffffU #define URWLOCK_READER_COUNT(c) ((c) & URWLOCK_MAX_READERS) /* _usem flags */ #define SEM_NAMED 0x0002 /* _usem2 count field */ #define USEM_HAS_WAITERS 0x80000000U #define USEM_MAX_COUNT 0x7fffffffU #define USEM_COUNT(c) ((c) & USEM_MAX_COUNT) /* op code for _umtx_op */ #define UMTX_OP_RESERVED0 0 #define UMTX_OP_RESERVED1 1 #define UMTX_OP_WAIT 2 #define UMTX_OP_WAKE 3 #define UMTX_OP_MUTEX_TRYLOCK 4 #define UMTX_OP_MUTEX_LOCK 5 #define UMTX_OP_MUTEX_UNLOCK 6 #define UMTX_OP_SET_CEILING 7 #define UMTX_OP_CV_WAIT 8 #define UMTX_OP_CV_SIGNAL 9 #define UMTX_OP_CV_BROADCAST 10 #define UMTX_OP_WAIT_UINT 11 #define UMTX_OP_RW_RDLOCK 12 #define UMTX_OP_RW_WRLOCK 13 #define UMTX_OP_RW_UNLOCK 14 #define UMTX_OP_WAIT_UINT_PRIVATE 15 #define UMTX_OP_WAKE_PRIVATE 16 #define UMTX_OP_MUTEX_WAIT 17 #define UMTX_OP_MUTEX_WAKE 18 /* deprecated */ #define UMTX_OP_SEM_WAIT 19 /* deprecated */ #define UMTX_OP_SEM_WAKE 20 /* deprecated */ #define UMTX_OP_NWAKE_PRIVATE 21 #define UMTX_OP_MUTEX_WAKE2 22 #define UMTX_OP_SEM2_WAIT 23 #define UMTX_OP_SEM2_WAKE 24 +#define UMTX_OP_SHM 25 /* Flags for UMTX_OP_CV_WAIT */ #define CVWAIT_CHECK_UNPARKING 0x01 #define CVWAIT_ABSTIME 0x02 #define CVWAIT_CLOCKID 0x04 #define UMTX_ABSTIME 0x01 #define UMTX_CHECK_UNPARKING CVWAIT_CHECK_UNPARKING +/* Flags for UMTX_OP_SHM */ +#define UMTX_SHM_CREAT 0x0001 +#define UMTX_SHM_LOOKUP 0x0002 +#define UMTX_SHM_DESTROY 0x0004 +#define UMTX_SHM_ALIVE 0x0008 + #ifndef _KERNEL int _umtx_op(void *obj, int op, u_long val, void *uaddr, void *uaddr2); #else /* * The umtx_key structure is used by both the Linux futex code and the * umtx implementation to map userland addresses to unique keys. 
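
A plausible userland sketch of the new UMTX_OP_SHM operation (the flag names above are real, but the argument conventions live in the umtx_shm() implementation in kern_umtx.c, which is outside this excerpt; treat the uaddr/val assignment below as an assumption): val carries one UMTX_SHM_* flag and uaddr identifies the shared word backing the process-shared object.

#include <sys/types.h>
#include <sys/umtx.h>

static int
pshared_create(void *key_addr)
{
	/* On success the kernel is expected to return a shm descriptor. */
	return (_umtx_op(NULL, UMTX_OP_SHM, UMTX_SHM_CREAT, key_addr, NULL));
}
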
*/ enum { TYPE_SIMPLE_WAIT, TYPE_CV, TYPE_SEM, TYPE_SIMPLE_LOCK, TYPE_NORMAL_UMUTEX, TYPE_PI_UMUTEX, TYPE_PP_UMUTEX, TYPE_RWLOCK, - TYPE_FUTEX + TYPE_FUTEX, + TYPE_SHM, }; /* Key to represent a unique userland synchronous object */ struct umtx_key { int hash; int type; int shared; union { struct { struct vm_object *object; uintptr_t offset; } shared; struct { struct vmspace *vs; uintptr_t addr; } private; struct { void *a; uintptr_t b; } both; } info; }; #define THREAD_SHARE 0 #define PROCESS_SHARE 1 #define AUTO_SHARE 2 struct thread; static inline int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2) { return (k1->type == k2->type && k1->info.both.a == k2->info.both.a && k1->info.both.b == k2->info.both.b); } int umtx_copyin_timeout(const void *, struct timespec *); int umtx_key_get(const void *, int, int, struct umtx_key *); void umtx_key_release(struct umtx_key *); struct umtx_q *umtxq_alloc(void); void umtxq_free(struct umtx_q *); int kern_umtx_wake(struct thread *, void *, int, int); void umtx_pi_adjust(struct thread *, u_char); void umtx_thread_init(struct thread *); void umtx_thread_fini(struct thread *); void umtx_thread_alloc(struct thread *); void umtx_thread_exit(struct thread *); #endif /* !_KERNEL */ #endif /* !_SYS_UMTX_H_ */ Index: head/sys/vm/vm_object.c =================================================================== --- head/sys/vm/vm_object.c (revision 296161) +++ head/sys/vm/vm_object.c (revision 296162) @@ -1,2627 +1,2632 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory object module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int old_msync; SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0, "Use old (insecure) msync behavior"); static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *clearobjflags, boolean_t *eio); static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags); static void vm_object_qcollapse(vm_object_t object); static void vm_object_vndeallocate(vm_object_t object); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. * * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. 
* * The only items within the object structure which are * modified after time of creation are: * reference count locked by object's lock * pager routine locked by object's lock * */ struct object_q vm_object_list; struct mtx vm_object_list_mtx; /* lock for object list and count */ struct vm_object kernel_object_store; struct vm_object kmem_object_store; static SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0, "VM object stats"); static long object_collapses; SYSCTL_LONG(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD, &object_collapses, 0, "VM object collapses"); static long object_bypasses; SYSCTL_LONG(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD, &object_bypasses, 0, "VM object bypasses"); static uma_zone_t obj_zone; static int vm_object_zinit(void *mem, int size, int flags); #ifdef INVARIANTS static void vm_object_zdtor(void *mem, int size, void *arg); static void vm_object_zdtor(void *mem, int size, void *arg) { vm_object_t object; object = (vm_object_t)mem; KASSERT(object->ref_count == 0, ("object %p ref_count = %d", object, object->ref_count)); KASSERT(TAILQ_EMPTY(&object->memq), ("object %p has resident pages in its memq", object)); KASSERT(vm_radix_is_empty(&object->rtree), ("object %p has resident pages in its trie", object)); #if VM_NRESERVLEVEL > 0 KASSERT(LIST_EMPTY(&object->rvq), ("object %p has reservations", object)); #endif KASSERT(vm_object_cache_is_empty(object), ("object %p has cached pages", object)); KASSERT(object->paging_in_progress == 0, ("object %p paging_in_progress = %d", object, object->paging_in_progress)); KASSERT(object->resident_page_count == 0, ("object %p resident_page_count = %d", object, object->resident_page_count)); KASSERT(object->shadow_count == 0, ("object %p shadow_count = %d", object, object->shadow_count)); KASSERT(object->type == OBJT_DEAD, ("object %p has non-dead type %d", object, object->type)); } #endif static int vm_object_zinit(void *mem, int size, int flags) { vm_object_t object; object = (vm_object_t)mem; rw_init_flags(&object->lock, "vm object", RW_DUPOK | RW_NEW); /* These are true for any object that has been freed */ object->type = OBJT_DEAD; object->ref_count = 0; object->rtree.rt_root = 0; object->rtree.rt_flags = 0; object->paging_in_progress = 0; object->resident_page_count = 0; object->shadow_count = 0; object->cache.rt_root = 0; object->cache.rt_flags = 0; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); return (0); } static void _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) { TAILQ_INIT(&object->memq); LIST_INIT(&object->shadow_head); object->type = type; switch (type) { case OBJT_DEAD: panic("_vm_object_allocate: can't create OBJT_DEAD"); case OBJT_DEFAULT: case OBJT_SWAP: object->flags = OBJ_ONEMAPPING; break; case OBJT_DEVICE: case OBJT_SG: object->flags = OBJ_FICTITIOUS | OBJ_UNMANAGED; break; case OBJT_MGTDEVICE: object->flags = OBJ_FICTITIOUS; break; case OBJT_PHYS: object->flags = OBJ_UNMANAGED; break; case OBJT_VNODE: object->flags = 0; break; default: panic("_vm_object_allocate: type %d is undefined", type); } object->size = size; object->generation = 1; object->ref_count = 1; object->memattr = VM_MEMATTR_DEFAULT; object->cred = NULL; object->charge = 0; object->handle = NULL; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; #if VM_NRESERVLEVEL > 0 LIST_INIT(&object->rvq); #endif + umtx_shm_object_init(object); } /* * vm_object_init: * * Initialize the VM objects module. 
*/ void vm_object_init(void) { TAILQ_INIT(&vm_object_list); mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF); rw_init(&kernel_object->lock, "kernel vm object"); _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kernel_object); #if VM_NRESERVLEVEL > 0 kernel_object->flags |= OBJ_COLORED; kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif rw_init(&kmem_object->lock, "kmem vm object"); _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kmem_object); #if VM_NRESERVLEVEL > 0 kmem_object->flags |= OBJ_COLORED; kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif /* * The lock portion of struct vm_object must be type stable due * to vm_pageout_fallback_object_lock locking a vm object * without holding any references to it. */ obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL, #ifdef INVARIANTS vm_object_zdtor, #else NULL, #endif vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); vm_radix_init(); } void vm_object_clear_flag(vm_object_t object, u_short bits) { VM_OBJECT_ASSERT_WLOCKED(object); object->flags &= ~bits; } /* * Sets the default memory attribute for the specified object. Pages * that are allocated to this object are by default assigned this memory * attribute. * * Presently, this function must be called before any pages are allocated * to the object. In the future, this requirement may be relaxed for * "default" and "swap" objects. */ int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr) { VM_OBJECT_ASSERT_WLOCKED(object); switch (object->type) { case OBJT_DEFAULT: case OBJT_DEVICE: case OBJT_MGTDEVICE: case OBJT_PHYS: case OBJT_SG: case OBJT_SWAP: case OBJT_VNODE: if (!TAILQ_EMPTY(&object->memq)) return (KERN_FAILURE); break; case OBJT_DEAD: return (KERN_INVALID_ARGUMENT); default: panic("vm_object_set_memattr: object %p is of undefined type", object); } object->memattr = memattr; return (KERN_SUCCESS); } void vm_object_pip_add(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress += i; } void vm_object_pip_subtract(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress -= i; } void vm_object_pip_wakeup(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress--; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wakeupn(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); if (i) object->paging_in_progress -= i; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wait(vm_object_t object, char *waitid) { VM_OBJECT_ASSERT_WLOCKED(object); while (object->paging_in_progress) { object->flags |= OBJ_PIPWNT; VM_OBJECT_SLEEP(object, object, PVM, waitid, 0); } } /* * vm_object_allocate: * * Returns a new object with the given size. */ vm_object_t vm_object_allocate(objtype_t type, vm_pindex_t size) { vm_object_t object; object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(type, size, object); return (object); } /* * vm_object_reference: * * Gets another reference to the given object. Note: OBJ_DEAD * objects can be referenced during final cleaning. 
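 *
 *	A typical pairing, as a hedged sketch (caller code; names are
 *	illustrative only):
 *
 *		vm_object_reference(obj);
 *		... use obj without holding its lock ...
 *		vm_object_deallocate(obj);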
*/ void vm_object_reference(vm_object_t object) { if (object == NULL) return; VM_OBJECT_WLOCK(object); vm_object_reference_locked(object); VM_OBJECT_WUNLOCK(object); } /* * vm_object_reference_locked: * * Gets another reference to the given object. * * The object must be locked. */ void vm_object_reference_locked(vm_object_t object) { struct vnode *vp; VM_OBJECT_ASSERT_WLOCKED(object); object->ref_count++; if (object->type == OBJT_VNODE) { vp = object->handle; vref(vp); } } /* * Handle deallocating an object of type OBJT_VNODE. */ static void vm_object_vndeallocate(vm_object_t object) { struct vnode *vp = (struct vnode *) object->handle; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_VNODE, ("vm_object_vndeallocate: not a vnode object")); KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp")); #ifdef INVARIANTS if (object->ref_count == 0) { vprint("vm_object_vndeallocate", vp); panic("vm_object_vndeallocate: bad object reference count"); } #endif + if (object->ref_count == 1) + umtx_shm_object_terminated(object); + /* * The test for text of vp vnode does not need a bypass to * reach right VV_TEXT there, since it is obtained from * object->handle. */ if (object->ref_count > 1 || (vp->v_vflag & VV_TEXT) == 0) { object->ref_count--; VM_OBJECT_WUNLOCK(object); /* vrele may need the vnode lock. */ vrele(vp); } else { vhold(vp); VM_OBJECT_WUNLOCK(object); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vdrop(vp); VM_OBJECT_WLOCK(object); object->ref_count--; if (object->type == OBJT_DEAD) { VM_OBJECT_WUNLOCK(object); VOP_UNLOCK(vp, 0); } else { if (object->ref_count == 0) VOP_UNSET_TEXT(vp); VM_OBJECT_WUNLOCK(object); vput(vp); } } } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. */ void vm_object_deallocate(vm_object_t object) { vm_object_t temp; struct vnode *vp; while (object != NULL) { VM_OBJECT_WLOCK(object); if (object->type == OBJT_VNODE) { vm_object_vndeallocate(object); return; } KASSERT(object->ref_count != 0, ("vm_object_deallocate: object deallocated too many times: %d", object->type)); /* * If the reference count goes to 0 we start calling * vm_object_terminate() on the object chain. * A ref count of 1 may be a special case depending on the * shadow count being 0 or 1. 
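		 *
		 * Concretely (a summary of the cases handled below): with
		 * one remaining reference and no shadows, OBJ_ONEMAPPING
		 * may be set again; with exactly one shadow, this object
		 * may instead be collapsed into that shadow.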
*/ object->ref_count--; if (object->ref_count > 1) { VM_OBJECT_WUNLOCK(object); return; } else if (object->ref_count == 1) { if (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS) != 0) { vp = object->un_pager.swp.swp_tmpfs; vhold(vp); VM_OBJECT_WUNLOCK(object); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEAD || object->ref_count != 1) { VM_OBJECT_WUNLOCK(object); VOP_UNLOCK(vp, 0); vdrop(vp); return; } if ((object->flags & OBJ_TMPFS) != 0) VOP_UNSET_TEXT(vp); VOP_UNLOCK(vp, 0); vdrop(vp); } if (object->shadow_count == 0 && object->handle == NULL && (object->type == OBJT_DEFAULT || (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS_NODE) == 0))) { vm_object_set_flag(object, OBJ_ONEMAPPING); } else if ((object->shadow_count == 1) && (object->handle == NULL) && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { vm_object_t robject; robject = LIST_FIRST(&object->shadow_head); KASSERT(robject != NULL, ("vm_object_deallocate: ref_count: %d, shadow_count: %d", object->ref_count, object->shadow_count)); KASSERT((robject->flags & OBJ_TMPFS_NODE) == 0, ("shadowed tmpfs v_object %p", object)); if (!VM_OBJECT_TRYWLOCK(robject)) { /* * Avoid a potential deadlock. */ object->ref_count++; VM_OBJECT_WUNLOCK(object); /* * More likely than not the thread * holding robject's lock has lower * priority than the current thread. * Let the lower priority thread run. */ pause("vmo_de", 1); continue; } /* * Collapse object into its shadow unless its * shadow is dead. In that case, object will * be deallocated by the thread that is * deallocating its shadow. */ if ((robject->flags & OBJ_DEAD) == 0 && (robject->handle == NULL) && (robject->type == OBJT_DEFAULT || robject->type == OBJT_SWAP)) { robject->ref_count++; retry: if (robject->paging_in_progress) { VM_OBJECT_WUNLOCK(object); vm_object_pip_wait(robject, "objde1"); temp = robject->backing_object; if (object == temp) { VM_OBJECT_WLOCK(object); goto retry; } } else if (object->paging_in_progress) { VM_OBJECT_WUNLOCK(robject); object->flags |= OBJ_PIPWNT; VM_OBJECT_SLEEP(object, object, PDROP | PVM, "objde2", 0); VM_OBJECT_WLOCK(robject); temp = robject->backing_object; if (object == temp) { VM_OBJECT_WLOCK(object); goto retry; } } else VM_OBJECT_WUNLOCK(object); if (robject->ref_count == 1) { robject->ref_count--; object = robject; goto doterm; } object = robject; vm_object_collapse(object); VM_OBJECT_WUNLOCK(object); continue; } VM_OBJECT_WUNLOCK(robject); } VM_OBJECT_WUNLOCK(object); return; } doterm: + umtx_shm_object_terminated(object); temp = object->backing_object; if (temp != NULL) { KASSERT((object->flags & OBJ_TMPFS_NODE) == 0, ("shadowed tmpfs v_object 2 %p", object)); VM_OBJECT_WLOCK(temp); LIST_REMOVE(object, shadow_list); temp->shadow_count--; VM_OBJECT_WUNLOCK(temp); object->backing_object = NULL; } /* * Don't double-terminate, we could be in a termination * recursion due to the terminate having to sync data * to disk. */ if ((object->flags & OBJ_DEAD) == 0) vm_object_terminate(object); else VM_OBJECT_WUNLOCK(object); object = temp; } } /* * vm_object_destroy removes the object from the global object list * and frees the space for the object. */ void vm_object_destroy(vm_object_t object) { /* * Release the allocation charge. */ if (object->cred != NULL) { swap_release_by_cred(object->charge, object->cred); object->charge = 0; crfree(object->cred); object->cred = NULL; } /* * Free the space for the object. 
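	 * (The zone is created with UMA_ZONE_NOFREE, so the storage is
	 * type-stable: it is recycled, never unmapped.)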
*/ uma_zfree(obj_zone, object); } /* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. * * The object must be locked. * This routine may block. */ void vm_object_terminate(vm_object_t object) { vm_page_t p, p_next; VM_OBJECT_ASSERT_WLOCKED(object); /* * Make sure no one uses us. */ vm_object_set_flag(object, OBJ_DEAD); /* * wait for the pageout daemon to be done with the object */ vm_object_pip_wait(object, "objtrm"); KASSERT(!object->paging_in_progress, ("vm_object_terminate: pageout in progress")); /* * Clean and free the pages, as appropriate. All references to the * object are gone, so we don't need to lock it. */ if (object->type == OBJT_VNODE) { struct vnode *vp = (struct vnode *)object->handle; /* * Clean pages and flush buffers. */ vm_object_page_clean(object, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(object); vinvalbuf(vp, V_SAVE, 0, 0); VM_OBJECT_WLOCK(object); } KASSERT(object->ref_count == 0, ("vm_object_terminate: object with references, ref_count=%d", object->ref_count)); /* * Free any remaining pageable pages. This also removes them from the * paging queues. However, don't free wired pages, just remove them * from the object. Rather than incrementally removing each page from * the object, the page and object are reset to any empty state. */ TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) { vm_page_assert_unbusied(p); vm_page_lock(p); /* * Optimize the page's removal from the object by resetting * its "object" field. Specifically, if the page is not * wired, then the effect of this assignment is that * vm_page_free()'s call to vm_page_remove() will return * immediately without modifying the page or the object. */ p->object = NULL; if (p->wire_count == 0) { vm_page_free(p); PCPU_INC(cnt.v_pfree); } vm_page_unlock(p); } /* * If the object contained any pages, then reset it to an empty state. * None of the object's fields, including "resident_page_count", were * modified by the preceding loop. */ if (object->resident_page_count != 0) { vm_radix_reclaim_allnodes(&object->rtree); TAILQ_INIT(&object->memq); object->resident_page_count = 0; if (object->type == OBJT_VNODE) vdrop(object->handle); } #if VM_NRESERVLEVEL > 0 if (__predict_false(!LIST_EMPTY(&object->rvq))) vm_reserv_break_all(object); #endif if (__predict_false(!vm_object_cache_is_empty(object))) vm_page_cache_free(object, 0, 0); KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT || object->type == OBJT_SWAP, ("%s: non-swap obj %p has cred", __func__, object)); /* * Let the pager know object is dead. */ vm_pager_deallocate(object); VM_OBJECT_WUNLOCK(object); vm_object_destroy(object); } /* * Make the page read-only so that we can clear the object flags. However, if * this is a nosync mmap then the object is likely to stay dirty so do not * mess with the page and do not clear the object flags. Returns TRUE if the * page should be flushed, and FALSE otherwise. */ static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags) { /* * If we have been asked to skip nosync pages and this is a * nosync page, skip it. Note that the object flags were not * cleared in this case so we do not have to set them. */ if ((flags & OBJPC_NOSYNC) != 0 && (p->oflags & VPO_NOSYNC) != 0) { *clearobjflags = FALSE; return (FALSE); } else { pmap_remove_write(p); return (p->dirty != 0); } } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. Leaves page * on whatever queue it is currently on. 
If NOSYNC is set then do not * write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC), * leaving the object dirty. * * When stuffing pages asynchronously, allow clustering. XXX we need a * synchronous clustering mode implementation. * * Odd semantics: if start == end, we clean everything. * * The object must be locked. * * Returns FALSE if some page from the range was not written, as * reported by the pager, and TRUE otherwise. */ boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags) { vm_page_t np, p; vm_pindex_t pi, tend, tstart; int curgeneration, n, pagerflags; boolean_t clearobjflags, eio, res; VM_OBJECT_ASSERT_WLOCKED(object); /* * The OBJ_MIGHTBEDIRTY flag is only set for OBJT_VNODE * objects. The check below prevents the function from * operating on non-vnode objects. */ if ((object->flags & OBJ_MIGHTBEDIRTY) == 0 || object->resident_page_count == 0) return (TRUE); pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) != 0 ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK; pagerflags |= (flags & OBJPC_INVAL) != 0 ? VM_PAGER_PUT_INVAL : 0; tstart = OFF_TO_IDX(start); tend = (end == 0) ? object->size : OFF_TO_IDX(end + PAGE_MASK); clearobjflags = tstart == 0 && tend >= object->size; res = TRUE; rescan: curgeneration = object->generation; for (p = vm_page_find_least(object, tstart); p != NULL; p = np) { pi = p->pindex; if (pi >= tend) break; np = TAILQ_NEXT(p, listq); if (p->valid == 0) continue; if (vm_page_sleep_if_busy(p, "vpcwai")) { if (object->generation != curgeneration) { if ((flags & OBJPC_SYNC) != 0) goto rescan; else clearobjflags = FALSE; } np = vm_page_find_least(object, pi); continue; } if (!vm_object_page_remove_write(p, flags, &clearobjflags)) continue; n = vm_object_page_collect_flush(object, p, pagerflags, flags, &clearobjflags, &eio); if (eio) { res = FALSE; clearobjflags = FALSE; } if (object->generation != curgeneration) { if ((flags & OBJPC_SYNC) != 0) goto rescan; else clearobjflags = FALSE; } /* * If the VOP_PUTPAGES() did a truncated write, so * that even the first page of the run is not fully * written, vm_pageout_flush() returns 0 as the run * length. Since the condition that caused truncated * write may be permanent, e.g. exhausted free space, * accepting n == 0 would cause an infinite loop. * * Forwarding the iterator leaves the unwritten page * behind, but there is not much we can do there if * filesystem refuses to write it. */ if (n == 0) { n = 1; clearobjflags = FALSE; } np = vm_page_find_least(object, pi + n); } #if 0 VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC) ? 
MNT_WAIT : 0); #endif if (clearobjflags) vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY); return (res); } static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *clearobjflags, boolean_t *eio) { vm_page_t ma[vm_pageout_page_count], p_first, tp; int count, i, mreq, runlen; vm_page_lock_assert(p, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(object); count = 1; mreq = 0; for (tp = p; count < vm_pageout_page_count; count++) { tp = vm_page_next(tp); if (tp == NULL || vm_page_busied(tp)) break; if (!vm_object_page_remove_write(tp, flags, clearobjflags)) break; } for (p_first = p; count < vm_pageout_page_count; count++) { tp = vm_page_prev(p_first); if (tp == NULL || vm_page_busied(tp)) break; if (!vm_object_page_remove_write(tp, flags, clearobjflags)) break; p_first = tp; mreq++; } for (tp = p_first, i = 0; i < count; tp = TAILQ_NEXT(tp, listq), i++) ma[i] = tp; vm_pageout_flush(ma, count, pagerflags, mreq, &runlen, eio); return (runlen); } /* * Note that there is absolutely no sense in writing out * anonymous objects, so we track down the vnode object * to write out. * We invalidate (remove) all pages from the address space * for semantic correctness. * * If the backing object is a device object with unmanaged pages, then any * mappings to the specified range of pages must be removed before this * function is called. * * Note: certain anonymous maps, such as MAP_NOSYNC maps, * may start out with a NULL object. */ boolean_t vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size, boolean_t syncio, boolean_t invalidate) { vm_object_t backing_object; struct vnode *vp; struct mount *mp; int error, flags, fsync_after; boolean_t res; if (object == NULL) return (TRUE); res = TRUE; error = 0; VM_OBJECT_WLOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_WLOCK(backing_object); offset += object->backing_object_offset; VM_OBJECT_WUNLOCK(object); object = backing_object; if (object->size < OFF_TO_IDX(offset + size)) size = IDX_TO_OFF(object->size) - offset; } /* * Flush pages if writing is allowed, invalidate them * if invalidation requested. Pages undergoing I/O * will be ignored by vm_object_page_remove(). * * We cannot lock the vnode and then wait for paging * to complete without deadlocking against vm_fault. * Instead we simply call vm_object_page_remove() and * allow it to block internally on a page-by-page * basis when it encounters pages undergoing async * I/O. */ if (object->type == OBJT_VNODE && (object->flags & OBJ_MIGHTBEDIRTY) != 0) { vp = object->handle; VM_OBJECT_WUNLOCK(object); (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (syncio && !invalidate && offset == 0 && OFF_TO_IDX(size) == object->size) { /* * If syncing the whole mapping of the file, * it is faster to schedule all the writes in * async mode, also allowing the clustering, * and then wait for i/o to complete. */ flags = 0; fsync_after = TRUE; } else { flags = (syncio || invalidate) ? OBJPC_SYNC : 0; flags |= invalidate ? 
(OBJPC_SYNC | OBJPC_INVAL) : 0; fsync_after = FALSE; } VM_OBJECT_WLOCK(object); res = vm_object_page_clean(object, offset, offset + size, flags); VM_OBJECT_WUNLOCK(object); if (fsync_after) error = VOP_FSYNC(vp, MNT_WAIT, curthread); VOP_UNLOCK(vp, 0); vn_finished_write(mp); if (error != 0) res = FALSE; VM_OBJECT_WLOCK(object); } if ((object->type == OBJT_VNODE || object->type == OBJT_DEVICE) && invalidate) { if (object->type == OBJT_DEVICE) /* * The option OBJPR_NOTMAPPED must be passed here * because vm_object_page_remove() cannot remove * unmanaged mappings. */ flags = OBJPR_NOTMAPPED; else if (old_msync) flags = 0; else flags = OBJPR_CLEANONLY; vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), flags); } VM_OBJECT_WUNLOCK(object); return (res); } /* * vm_object_madvise: * * Implements the madvise function at the object/page level. * * MADV_WILLNEED (any object) * * Activate the specified pages if they are resident. * * MADV_DONTNEED (any object) * * Deactivate the specified pages if they are resident. * * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, * OBJ_ONEMAPPING only) * * Deactivate and clean the specified pages if they are * resident. This permits the process to reuse the pages * without faulting or the kernel to reclaim the pages * without I/O. */ void vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end, int advise) { vm_pindex_t tpindex; vm_object_t backing_object, tobject; vm_page_t m; if (object == NULL) return; VM_OBJECT_WLOCK(object); /* * Locate and adjust resident pages */ for (; pindex < end; pindex += 1) { relookup: tobject = object; tpindex = pindex; shadowlookup: /* * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages * and those pages must be OBJ_ONEMAPPING. */ if (advise == MADV_FREE) { if ((tobject->type != OBJT_DEFAULT && tobject->type != OBJT_SWAP) || (tobject->flags & OBJ_ONEMAPPING) == 0) { goto unlock_tobject; } } else if ((tobject->flags & OBJ_UNMANAGED) != 0) goto unlock_tobject; m = vm_page_lookup(tobject, tpindex); if (m == NULL && advise == MADV_WILLNEED) { /* * If the page is cached, reactivate it. */ m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED | VM_ALLOC_NOBUSY); } if (m == NULL) { /* * There may be swap even if there is no backing page */ if (advise == MADV_FREE && tobject->type == OBJT_SWAP) swap_pager_freespace(tobject, tpindex, 1); /* * next object */ backing_object = tobject->backing_object; if (backing_object == NULL) goto unlock_tobject; VM_OBJECT_WLOCK(backing_object); tpindex += OFF_TO_IDX(tobject->backing_object_offset); if (tobject != object) VM_OBJECT_WUNLOCK(tobject); tobject = backing_object; goto shadowlookup; } else if (m->valid != VM_PAGE_BITS_ALL) goto unlock_tobject; /* * If the page is not in a normal state, skip it. */ vm_page_lock(m); if (m->hold_count != 0 || m->wire_count != 0) { vm_page_unlock(m); goto unlock_tobject; } KASSERT((m->flags & PG_FICTITIOUS) == 0, ("vm_object_madvise: page %p is fictitious", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_object_madvise: page %p is not managed", m)); if (vm_page_busied(m)) { if (advise == MADV_WILLNEED) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. 
*/ vm_page_aflag_set(m, PGA_REFERENCED); } if (object != tobject) VM_OBJECT_WUNLOCK(object); VM_OBJECT_WUNLOCK(tobject); vm_page_busy_sleep(m, "madvpo"); VM_OBJECT_WLOCK(object); goto relookup; } if (advise == MADV_WILLNEED) { vm_page_activate(m); } else { vm_page_advise(m, advise); } vm_page_unlock(m); if (advise == MADV_FREE && tobject->type == OBJT_SWAP) swap_pager_freespace(tobject, tpindex, 1); unlock_tobject: if (tobject != object) VM_OBJECT_WUNLOCK(tobject); } VM_OBJECT_WUNLOCK(object); } /* * vm_object_shadow: * * Create a new object which is backed by the * specified existing object range. The source * object reference is deallocated. * * The new object and offset into that object * are returned in the source parameters. */ void vm_object_shadow( vm_object_t *object, /* IN/OUT */ vm_ooffset_t *offset, /* IN/OUT */ vm_size_t length) { vm_object_t source; vm_object_t result; source = *object; /* * Don't create the new object if the old object isn't shared. */ if (source != NULL) { VM_OBJECT_WLOCK(source); if (source->ref_count == 1 && source->handle == NULL && (source->type == OBJT_DEFAULT || source->type == OBJT_SWAP)) { VM_OBJECT_WUNLOCK(source); return; } VM_OBJECT_WUNLOCK(source); } /* * Allocate a new object with the given length. */ result = vm_object_allocate(OBJT_DEFAULT, atop(length)); /* * The new object shadows the source object, adding a reference to it. * Our caller changes his reference to point to the new object, * removing a reference to the source object. Net result: no change * of reference count. * * Try to optimize the result object's page color when shadowing * in order to maintain page coloring consistency in the combined * shadowed object. */ result->backing_object = source; /* * Store the offset into the source object, and fix up the offset into * the new object. */ result->backing_object_offset = *offset; if (source != NULL) { VM_OBJECT_WLOCK(source); LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list); source->shadow_count++; #if VM_NRESERVLEVEL > 0 result->flags |= source->flags & OBJ_COLORED; result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) & ((1 << (VM_NFREEORDER - 1)) - 1); #endif VM_OBJECT_WUNLOCK(source); } /* * Return the new things */ *offset = 0; *object = result; } /* * vm_object_split: * * Split the pages in a map entry into a new object. This affords * easier removal of unused pages, and keeps object inheritance from * being a negative impact on memory usage. */ void vm_object_split(vm_map_entry_t entry) { vm_page_t m, m_next; vm_object_t orig_object, new_object, source; vm_pindex_t idx, offidxstart; vm_size_t size; orig_object = entry->object.vm_object; if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP) return; if (orig_object->ref_count <= 1) return; VM_OBJECT_WUNLOCK(orig_object); offidxstart = OFF_TO_IDX(entry->offset); size = atop(entry->end - entry->start); /* * If swap_pager_copy() is later called, it will convert new_object * into a swap object. */ new_object = vm_object_allocate(OBJT_DEFAULT, size); /* * At this point, the new object is still private, so the order in * which the original and new objects are locked does not matter. 
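	 * (No other thread can hold new_object's lock yet, so no lock
	 * order reversal is possible.)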
*/ VM_OBJECT_WLOCK(new_object); VM_OBJECT_WLOCK(orig_object); source = orig_object->backing_object; if (source != NULL) { VM_OBJECT_WLOCK(source); if ((source->flags & OBJ_DEAD) != 0) { VM_OBJECT_WUNLOCK(source); VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); vm_object_deallocate(new_object); VM_OBJECT_WLOCK(orig_object); return; } LIST_INSERT_HEAD(&source->shadow_head, new_object, shadow_list); source->shadow_count++; vm_object_reference_locked(source); /* for new_object */ vm_object_clear_flag(source, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(source); new_object->backing_object_offset = orig_object->backing_object_offset + entry->offset; new_object->backing_object = source; } if (orig_object->cred != NULL) { new_object->cred = orig_object->cred; crhold(orig_object->cred); new_object->charge = ptoa(size); KASSERT(orig_object->charge >= ptoa(size), ("orig_object->charge < 0")); orig_object->charge -= ptoa(size); } retry: m = vm_page_find_least(orig_object, offidxstart); for (; m != NULL && (idx = m->pindex - offidxstart) < size; m = m_next) { m_next = TAILQ_NEXT(m, listq); /* * We must wait for pending I/O to complete before we can * rename the page. * * We do not have to VM_PROT_NONE the page as mappings should * not be changed by this operation. */ if (vm_page_busied(m)) { VM_OBJECT_WUNLOCK(new_object); vm_page_lock(m); VM_OBJECT_WUNLOCK(orig_object); vm_page_busy_sleep(m, "spltwt"); VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } /* vm_page_rename() will handle dirty and cache. */ if (vm_page_rename(m, new_object, idx)) { VM_OBJECT_WUNLOCK(new_object); VM_OBJECT_WUNLOCK(orig_object); VM_WAIT; VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } #if VM_NRESERVLEVEL > 0 /* * If some of the reservation's allocated pages remain with * the original object, then transferring the reservation to * the new object is neither particularly beneficial nor * particularly harmful as compared to leaving the reservation * with the original object. If, however, all of the * reservation's allocated pages are transferred to the new * object, then transferring the reservation is typically * beneficial. Determining which of these two cases applies * would be more costly than unconditionally renaming the * reservation. */ vm_reserv_rename(m, new_object, orig_object, offidxstart); #endif if (orig_object->type == OBJT_SWAP) vm_page_xbusy(m); } if (orig_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case the orig_object's * and new_object's locks are released and reacquired. */ swap_pager_copy(orig_object, new_object, offidxstart, 0); TAILQ_FOREACH(m, &new_object->memq, listq) vm_page_xunbusy(m); /* * Transfer any cached pages from orig_object to new_object. * If swap_pager_copy() found swapped out pages within the * specified range of orig_object, then it changed * new_object's type to OBJT_SWAP when it transferred those * pages to new_object. Otherwise, new_object's type * should still be OBJT_DEFAULT and orig_object should not * contain any cached pages within the specified range. 
*/ if (__predict_false(!vm_object_cache_is_empty(orig_object))) vm_page_cache_transfer(orig_object, offidxstart, new_object); } VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); entry->object.vm_object = new_object; entry->offset = 0LL; vm_object_deallocate(orig_object); VM_OBJECT_WLOCK(new_object); } #define OBSC_COLLAPSE_NOWAIT 0x0002 #define OBSC_COLLAPSE_WAIT 0x0004 static vm_page_t vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next, int op) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(backing_object); KASSERT(p == NULL || vm_page_busied(p), ("unbusy page %p", p)); KASSERT(p == NULL || p->object == object || p->object == backing_object, ("invalid ownership %p %p %p", p, object, backing_object)); if ((op & OBSC_COLLAPSE_NOWAIT) != 0) return (next); if (p != NULL) vm_page_lock(p); VM_OBJECT_WUNLOCK(object); VM_OBJECT_WUNLOCK(backing_object); if (p == NULL) VM_WAIT; else vm_page_busy_sleep(p, "vmocol"); VM_OBJECT_WLOCK(object); VM_OBJECT_WLOCK(backing_object); return (TAILQ_FIRST(&backing_object->memq)); } static bool vm_object_scan_all_shadowed(vm_object_t object) { vm_object_t backing_object; vm_page_t p, pp; vm_pindex_t backing_offset_index, new_pindex; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; /* * Initial conditions: * * We do not want to have to test for the existence of cache or swap * pages in the backing object. XXX but with the new swapper this * would be pretty easy to do. */ if (backing_object->type != OBJT_DEFAULT) return (false); backing_offset_index = OFF_TO_IDX(object->backing_object_offset); for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = TAILQ_NEXT(p, listq)) { new_pindex = p->pindex - backing_offset_index; /* * Ignore pages outside the parent object's range and outside * the parent object's mapping of the backing object. */ if (p->pindex < backing_offset_index || new_pindex >= object->size) continue; /* * See if the parent has the page or if the parent's object * pager has the page. If the parent has the page but the page * is not valid, the parent's object pager must have the page. * * If this fails, the parent does not completely shadow the * object and we might as well give up now. 
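		 *
		 * Worked example (illustrative numbers): with a
		 * backing_object_offset of 4 pages, backing-object index 9
		 * maps to parent index 9 - 4 = 5; the parent shadows that
		 * page iff it has a valid resident page or pager-backed
		 * data at index 5.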
*/ pp = vm_page_lookup(object, new_pindex); if ((pp == NULL || pp->valid == 0) && !vm_pager_has_page(object, new_pindex, NULL, NULL)) return (false); } return (true); } static bool vm_object_collapse_scan(vm_object_t object, int op) { vm_object_t backing_object; vm_page_t next, p, pp; vm_pindex_t backing_offset_index, new_pindex; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); /* * Initial conditions */ if ((op & OBSC_COLLAPSE_WAIT) != 0) vm_object_set_flag(backing_object, OBJ_DEAD); /* * Our scan */ for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) { next = TAILQ_NEXT(p, listq); new_pindex = p->pindex - backing_offset_index; /* * Check for busy page */ if (vm_page_busied(p)) { next = vm_object_collapse_scan_wait(object, p, next, op); continue; } KASSERT(p->object == backing_object, ("vm_object_collapse_scan: object mismatch")); if (p->pindex < backing_offset_index || new_pindex >= object->size) { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, p->pindex, 1); /* * Page is out of the parent object's range, we can * simply destroy it. */ vm_page_lock(p); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (p->wire_count == 0) vm_page_free(p); else vm_page_remove(p); vm_page_unlock(p); continue; } pp = vm_page_lookup(object, new_pindex); if (pp != NULL && vm_page_busied(pp)) { /* * The page in the parent is busy and possibly not * (yet) valid. Until its state is finalized by the * busy bit owner, we can't tell whether it shadows the * original page. Therefore, we must either skip it * and the original (backing_object) page or wait for * its state to be finalized. * * This is due to a race with vm_fault() where we must * unbusy the original (backing_obj) page before we can * (re)lock the parent. Hence we can get here. */ next = vm_object_collapse_scan_wait(object, pp, next, op); continue; } KASSERT(pp == NULL || pp->valid != 0, ("unbusy invalid page %p", pp)); if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL, NULL)) { /* * The page already exists in the parent OR swap exists * for this location in the parent. Leave the parent's * page alone. Destroy the original page from the * backing object. */ if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, p->pindex, 1); vm_page_lock(p); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (p->wire_count == 0) vm_page_free(p); else vm_page_remove(p); vm_page_unlock(p); continue; } /* * Page does not exist in parent, rename the page from the * backing object to the main object. * * If the page was mapped to a process, it can remain mapped * through the rename. vm_page_rename() will handle dirty and * cache. */ if (vm_page_rename(p, object, new_pindex)) { next = vm_object_collapse_scan_wait(object, NULL, next, op); continue; } /* Use the old pindex to free the right page. */ if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, new_pindex + backing_offset_index, 1); #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. */ vm_reserv_rename(p, object, backing_object, backing_offset_index); #endif } return (true); } /* * this version of collapse allows the operation to occur earlier and * when paging_in_progress is true for an object... This is not a complete * operation, but should plug 99.9% of the rest of the leaks. 
*/ static void vm_object_qcollapse(vm_object_t object) { vm_object_t backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(backing_object); if (backing_object->ref_count != 1) return; vm_object_collapse_scan(object, OBSC_COLLAPSE_NOWAIT); } /* * vm_object_collapse: * * Collapse an object with the object backing it. * Pages in the backing object are moved into the * parent, and the backing object is deallocated. */ void vm_object_collapse(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); while (TRUE) { vm_object_t backing_object; /* * Verify that the conditions are right for collapse: * * The object exists and the backing object exists. */ if ((backing_object = object->backing_object) == NULL) break; /* * we check the backing object first, because it is most likely * not collapsable. */ VM_OBJECT_WLOCK(backing_object); if (backing_object->handle != NULL || (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) || (backing_object->flags & OBJ_DEAD) || object->handle != NULL || (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) || (object->flags & OBJ_DEAD)) { VM_OBJECT_WUNLOCK(backing_object); break; } if ( object->paging_in_progress != 0 || backing_object->paging_in_progress != 0 ) { vm_object_qcollapse(object); VM_OBJECT_WUNLOCK(backing_object); break; } /* * We know that we can either collapse the backing object (if * the parent is the only reference to it) or (perhaps) have * the parent bypass the object if the parent happens to shadow * all the resident pages in the entire backing object. * * This is ignoring pager-backed pages such as swap pages. * vm_object_collapse_scan fails the shadowing test in this * case. */ if (backing_object->ref_count == 1) { /* * If there is exactly one reference to the backing * object, we can collapse it into the parent. */ vm_object_collapse_scan(object, OBSC_COLLAPSE_WAIT); #if VM_NRESERVLEVEL > 0 /* * Break any reservations from backing_object. */ if (__predict_false(!LIST_EMPTY(&backing_object->rvq))) vm_reserv_break_all(backing_object); #endif /* * Move the pager from backing_object to object. */ if (backing_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case * the backing_object's and object's locks are * released and reacquired. * Since swap_pager_copy() is being asked to * destroy the source, it will change the * backing_object's type to OBJT_DEFAULT. */ swap_pager_copy( backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); /* * Free any cached pages from backing_object. */ if (__predict_false( !vm_object_cache_is_empty(backing_object))) vm_page_cache_free(backing_object, 0, 0); } /* * Object now shadows whatever backing_object did. * Note that the reference to * backing_object->backing_object moves from within * backing_object to within object. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; if (backing_object->backing_object) { VM_OBJECT_WLOCK(backing_object->backing_object); LIST_REMOVE(backing_object, shadow_list); LIST_INSERT_HEAD( &backing_object->backing_object->shadow_head, object, shadow_list); /* * The shadow_count has not changed. */ VM_OBJECT_WUNLOCK(backing_object->backing_object); } object->backing_object = backing_object->backing_object; object->backing_object_offset += backing_object->backing_object_offset; /* * Discard backing_object. * * Since the backing object has no pages, no pager left, * and no object references within it, all that is * necessary is to dispose of it. 
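			 *
			 * Chain shape, as a sketch:
			 *
			 *	before:	object -> backing_object -> X
			 *	after:	object -> X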
*/ KASSERT(backing_object->ref_count == 1, ( "backing_object %p was somehow re-referenced during collapse!", backing_object)); backing_object->type = OBJT_DEAD; backing_object->ref_count = 0; VM_OBJECT_WUNLOCK(backing_object); vm_object_destroy(backing_object); object_collapses++; } else { vm_object_t new_backing_object; /* * If we do not entirely shadow the backing object, * there is nothing we can do so we give up. */ if (object->resident_page_count != object->size && !vm_object_scan_all_shadowed(object)) { VM_OBJECT_WUNLOCK(backing_object); break; } /* * Make the parent shadow the next object in the * chain. Deallocating backing_object will not remove * it, since its reference count is at least 2. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; new_backing_object = backing_object->backing_object; if ((object->backing_object = new_backing_object) != NULL) { VM_OBJECT_WLOCK(new_backing_object); LIST_INSERT_HEAD( &new_backing_object->shadow_head, object, shadow_list ); new_backing_object->shadow_count++; vm_object_reference_locked(new_backing_object); VM_OBJECT_WUNLOCK(new_backing_object); object->backing_object_offset += backing_object->backing_object_offset; } /* * Drop the reference count on backing_object. Since * its ref_count was at least 2, it will not vanish. */ backing_object->ref_count--; VM_OBJECT_WUNLOCK(backing_object); object_bypasses++; } /* * Try again with this object's new backing object. */ } } /* * vm_object_page_remove: * * For the given object, either frees or invalidates each of the * specified pages. In general, a page is freed. However, if a page is * wired for any reason other than the existence of a managed, wired * mapping, then it may be invalidated but not removed from the object. * Pages are specified by the given range ["start", "end") and the option * OBJPR_CLEANONLY. As a special case, if "end" is zero, then the range * extends from "start" to the end of the object. If the option * OBJPR_CLEANONLY is specified, then only the non-dirty pages within the * specified range are affected. If the option OBJPR_NOTMAPPED is * specified, then the pages within the specified range must have no * mappings. Otherwise, if this option is not specified, any mappings to * the specified pages are removed before the pages are freed or * invalidated. * * In general, this operation should only be performed on objects that * contain managed pages. There are, however, two exceptions. First, it * is performed on the kernel and kmem objects by vm_map_entry_delete(). * Second, it is used by msync(..., MS_INVALIDATE) to invalidate device- * backed pages. In both of these cases, the option OBJPR_CLEANONLY must * not be specified and the option OBJPR_NOTMAPPED must be specified. * * The object must be locked. */ void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options) { vm_page_t p, next; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_UNMANAGED) == 0 || (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED, ("vm_object_page_remove: illegal options for object %p", object)); if (object->resident_page_count == 0) goto skipmemq; vm_object_pip_add(object, 1); again: p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. 
*/ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * If the page is wired for any reason besides the existence * of managed, wired mappings, then it cannot be freed. For * example, fictitious pages, which represent device memory, * are inherently wired and cannot be freed. They can, * however, be invalidated if the option OBJPR_CLEANONLY is * not specified. */ vm_page_lock(p); if (vm_page_xbusied(p)) { VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmopax"); VM_OBJECT_WLOCK(object); goto again; } if (p->wire_count != 0) { if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_all(p); if ((options & OBJPR_CLEANONLY) == 0) { p->valid = 0; vm_page_undirty(p); } goto next; } if (vm_page_busied(p)) { VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmopar"); VM_OBJECT_WLOCK(object); goto again; } KASSERT((p->flags & PG_FICTITIOUS) == 0, ("vm_object_page_remove: page %p is fictitious", p)); if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) { if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_write(p); if (p->dirty) goto next; } if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_all(p); vm_page_free(p); next: vm_page_unlock(p); } vm_object_pip_wakeup(object); skipmemq: if (__predict_false(!vm_object_cache_is_empty(object))) vm_page_cache_free(object, start, end); } /* * vm_object_page_noreuse: * * For the given object, attempt to move the specified pages to * the head of the inactive queue. This bypasses regular LRU * operation and allows the pages to be reused quickly under memory * pressure. If a page is wired for any reason, then it will not * be queued. Pages are specified by the range ["start", "end"). * As a special case, if "end" is zero, then the range extends from * "start" to the end of the object. * * This operation should only be performed on objects that * contain non-fictitious, managed pages. * * The object must be locked. */ void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { struct mtx *mtx, *new_mtx; vm_page_t p, next; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0, ("vm_object_page_noreuse: illegal object %p", object)); if (object->resident_page_count == 0) return; p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ mtx = NULL; for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(p); if (mtx != new_mtx) { if (mtx != NULL) mtx_unlock(mtx); mtx = new_mtx; mtx_lock(mtx); } vm_page_deactivate_noreuse(p); } if (mtx != NULL) mtx_unlock(mtx); } /* * Populate the specified range of the object with valid pages. Returns * TRUE if the range is successfully populated and FALSE otherwise. * * Note: This function should be optimized to pass a larger array of * pages to vm_pager_get_pages() before it is applied to a non- * OBJT_DEVICE object. * * The object must be locked. 
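 *
 * Usage sketch (illustrative; error handling elided):
 *
 *	VM_OBJECT_WLOCK(obj);
 *	if (!vm_object_populate(obj, 0, obj->size))
 *		... fail the request ...
 *	VM_OBJECT_WUNLOCK(obj);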
 */
boolean_t
vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
{
	vm_page_t m;
	vm_pindex_t pindex;
	int rv;

	VM_OBJECT_ASSERT_WLOCKED(object);
	for (pindex = start; pindex < end; pindex++) {
		m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
		if (m->valid != VM_PAGE_BITS_ALL) {
			rv = vm_pager_get_pages(object, &m, 1, NULL, NULL);
			if (rv != VM_PAGER_OK) {
				vm_page_lock(m);
				vm_page_free(m);
				vm_page_unlock(m);
				break;
			}
		}
		/*
		 * Keep "m" busy because a subsequent iteration may unlock
		 * the object.
		 */
	}
	if (pindex > start) {
		m = vm_page_lookup(object, start);
		while (m != NULL && m->pindex < pindex) {
			vm_page_xunbusy(m);
			m = TAILQ_NEXT(m, listq);
		}
	}
	return (pindex == end);
}

/*
 *	Routine:	vm_object_coalesce
 *	Function:	Coalesces two objects backing up adjoining
 *			regions of memory into a single object.
 *
 *	returns TRUE if objects were combined.
 *
 *	NOTE:	Only works at the moment if the second object is NULL -
 *		if it's not, which object do we lock first?
 *
 *	Parameters:
 *		prev_object	First object to coalesce
 *		prev_offset	Offset into prev_object
 *		prev_size	Size of reference to prev_object
 *		next_size	Size of reference to the second object
 *		reserved	Indicator that extension region has
 *				swap accounted for
 *
 *	Conditions:
 *	The object must *not* be locked.
 */
boolean_t
vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset,
    vm_size_t prev_size, vm_size_t next_size, boolean_t reserved)
{
	vm_pindex_t next_pindex;

	if (prev_object == NULL)
		return (TRUE);
	VM_OBJECT_WLOCK(prev_object);
	if ((prev_object->type != OBJT_DEFAULT &&
	    prev_object->type != OBJT_SWAP) ||
	    (prev_object->flags & OBJ_TMPFS_NODE) != 0) {
		VM_OBJECT_WUNLOCK(prev_object);
		return (FALSE);
	}

	/*
	 * Try to collapse the object first
	 */
	vm_object_collapse(prev_object);

	/*
	 * Can't coalesce if: . more than one reference . paged out . shadows
	 * another object . has a copy elsewhere (any of which mean that the
	 * pages not mapped to prev_entry may be in use anyway)
	 */
	if (prev_object->backing_object != NULL) {
		VM_OBJECT_WUNLOCK(prev_object);
		return (FALSE);
	}

	prev_size >>= PAGE_SHIFT;
	next_size >>= PAGE_SHIFT;
	next_pindex = OFF_TO_IDX(prev_offset) + prev_size;

	if ((prev_object->ref_count > 1) &&
	    (prev_object->size != next_pindex)) {
		VM_OBJECT_WUNLOCK(prev_object);
		return (FALSE);
	}

	/*
	 * Account for the charge.
	 */
	if (prev_object->cred != NULL) {

		/*
		 * If prev_object was charged, then this mapping,
		 * although not charged now, may become writable
		 * later.  A non-NULL cred in the object would prevent
		 * swap reservation during enabling of the write
		 * access, so reserve swap now.  A failed reservation
		 * causes allocation of a separate object for the map
		 * entry, and swap reservation for that entry is
		 * managed at the appropriate time.
		 */
		if (!reserved && !swap_reserve_by_cred(ptoa(next_size),
		    prev_object->cred)) {
			/* Unlock on failure, as for the other error returns. */
			VM_OBJECT_WUNLOCK(prev_object);
			return (FALSE);
		}
		prev_object->charge += ptoa(next_size);
	}

	/*
	 * Remove any pages that may still be in the object from a previous
	 * deallocation.
	 */
	if (next_pindex < prev_object->size) {
		vm_object_page_remove(prev_object, next_pindex, next_pindex +
		    next_size, 0);
		if (prev_object->type == OBJT_SWAP)
			swap_pager_freespace(prev_object,
					     next_pindex, next_size);
#if 0
		if (prev_object->cred != NULL) {
			KASSERT(prev_object->charge >=
			    ptoa(prev_object->size - next_pindex),
			    ("object %p overcharged 1 %jx %jx", prev_object,
				(uintmax_t)next_pindex, (uintmax_t)next_size));
			prev_object->charge -= ptoa(prev_object->size -
			    next_pindex);
		}
#endif
	}

	/*
	 * Extend the object if necessary.
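	 * E.g. (illustrative numbers): an 8-page object coalescing 4 new
	 * pages at next_pindex 8 grows to 12 pages; a range that already
	 * lies inside the object leaves the size unchanged.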
*/ if (next_pindex + next_size > prev_object->size) prev_object->size = next_pindex + next_size; VM_OBJECT_WUNLOCK(prev_object); return (TRUE); } void vm_object_set_writeable_dirty(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_VNODE) { if ((object->flags & OBJ_TMPFS_NODE) != 0) { KASSERT(object->type == OBJT_SWAP, ("non-swap tmpfs")); vm_object_set_flag(object, OBJ_TMPFS_DIRTY); } return; } object->generation++; if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) return; vm_object_set_flag(object, OBJ_MIGHTBEDIRTY); } /* * vm_object_unwire: * * For each page offset within the specified range of the given object, * find the highest-level page in the shadow chain and unwire it. A page * must exist at every page offset, and the highest-level page must be * wired. */ void vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue) { vm_object_t tobject; vm_page_t m, tm; vm_pindex_t end_pindex, pindex, tpindex; int depth, locked_depth; KASSERT((offset & PAGE_MASK) == 0, ("vm_object_unwire: offset is not page aligned")); KASSERT((length & PAGE_MASK) == 0, ("vm_object_unwire: length is not a multiple of PAGE_SIZE")); /* The wired count of a fictitious page never changes. */ if ((object->flags & OBJ_FICTITIOUS) != 0) return; pindex = OFF_TO_IDX(offset); end_pindex = pindex + atop(length); locked_depth = 1; VM_OBJECT_RLOCK(object); m = vm_page_find_least(object, pindex); while (pindex < end_pindex) { if (m == NULL || pindex < m->pindex) { /* * The first object in the shadow chain doesn't * contain a page at the current index. Therefore, * the page must exist in a backing object. */ tobject = object; tpindex = pindex; depth = 0; do { tpindex += OFF_TO_IDX(tobject->backing_object_offset); tobject = tobject->backing_object; KASSERT(tobject != NULL, ("vm_object_unwire: missing page")); if ((tobject->flags & OBJ_FICTITIOUS) != 0) goto next_page; depth++; if (depth == locked_depth) { locked_depth++; VM_OBJECT_RLOCK(tobject); } } while ((tm = vm_page_lookup(tobject, tpindex)) == NULL); } else { tm = m; m = TAILQ_NEXT(m, listq); } vm_page_lock(tm); vm_page_unwire(tm, queue); vm_page_unlock(tm); next_page: pindex++; } /* Release the accumulated object locks. */ for (depth = 0; depth < locked_depth; depth++) { tobject = object->backing_object; VM_OBJECT_RUNLOCK(object); object = tobject; } } struct vnode * vm_object_vnode(vm_object_t object) { VM_OBJECT_ASSERT_LOCKED(object); if (object->type == OBJT_VNODE) return (object->handle); if (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS) != 0) return (object->un_pager.swp.swp_tmpfs); return (NULL); } static int sysctl_vm_object_list(SYSCTL_HANDLER_ARGS) { struct kinfo_vmobject kvo; char *fullpath, *freepath; struct vnode *vp; struct vattr va; vm_object_t obj; vm_page_t m; int count, error; if (req->oldptr == NULL) { /* * If an old buffer has not been provided, generate an * estimate of the space needed for a subsequent call. */ mtx_lock(&vm_object_list_mtx); count = 0; TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; count++; } mtx_unlock(&vm_object_list_mtx); return (SYSCTL_OUT(req, NULL, sizeof(struct kinfo_vmobject) * count * 11 / 10)); } error = 0; /* * VM objects are type stable and are never removed from the * list once added. This allows us to safely read obj->object_list * after reacquiring the VM object lock. 
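	 * (The sizing pass above pads its estimate by 10%,
	 * count * 11 / 10, to absorb objects created between the
	 * sizing call and this output pass.)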
*/ mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; VM_OBJECT_RLOCK(obj); if (obj->type == OBJT_DEAD) { VM_OBJECT_RUNLOCK(obj); continue; } mtx_unlock(&vm_object_list_mtx); kvo.kvo_size = ptoa(obj->size); kvo.kvo_resident = obj->resident_page_count; kvo.kvo_ref_count = obj->ref_count; kvo.kvo_shadow_count = obj->shadow_count; kvo.kvo_memattr = obj->memattr; kvo.kvo_active = 0; kvo.kvo_inactive = 0; TAILQ_FOREACH(m, &obj->memq, listq) { /* * A page may belong to the object but be * dequeued and set to PQ_NONE while the * object lock is not held. This makes the * reads of m->queue below racy, and we do not * count pages set to PQ_NONE. However, this * sysctl is only meant to give an * approximation of the system anyway. */ if (m->queue == PQ_ACTIVE) kvo.kvo_active++; else if (m->queue == PQ_INACTIVE) kvo.kvo_inactive++; } kvo.kvo_vn_fileid = 0; kvo.kvo_vn_fsid = 0; freepath = NULL; fullpath = ""; vp = NULL; switch (obj->type) { case OBJT_DEFAULT: kvo.kvo_type = KVME_TYPE_DEFAULT; break; case OBJT_VNODE: kvo.kvo_type = KVME_TYPE_VNODE; vp = obj->handle; vref(vp); break; case OBJT_SWAP: kvo.kvo_type = KVME_TYPE_SWAP; break; case OBJT_DEVICE: kvo.kvo_type = KVME_TYPE_DEVICE; break; case OBJT_PHYS: kvo.kvo_type = KVME_TYPE_PHYS; break; case OBJT_DEAD: kvo.kvo_type = KVME_TYPE_DEAD; break; case OBJT_SG: kvo.kvo_type = KVME_TYPE_SG; break; case OBJT_MGTDEVICE: kvo.kvo_type = KVME_TYPE_MGTDEVICE; break; default: kvo.kvo_type = KVME_TYPE_UNKNOWN; break; } VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(curthread, vp, &fullpath, &freepath); vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, curthread->td_ucred) == 0) { kvo.kvo_vn_fileid = va.va_fileid; kvo.kvo_vn_fsid = va.va_fsid; } vput(vp); } strlcpy(kvo.kvo_path, fullpath, sizeof(kvo.kvo_path)); if (freepath != NULL) free(freepath, M_TEMP); /* Pack record size down */ kvo.kvo_structsize = offsetof(struct kinfo_vmobject, kvo_path) + strlen(kvo.kvo_path) + 1; kvo.kvo_structsize = roundup(kvo.kvo_structsize, sizeof(uint64_t)); error = SYSCTL_OUT(req, &kvo, kvo.kvo_structsize); mtx_lock(&vm_object_list_mtx); if (error) break; } mtx_unlock(&vm_object_list_mtx); return (error); } SYSCTL_PROC(_vm, OID_AUTO, objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject", "List of VM objects"); #include "opt_ddb.h" #ifdef DDB #include #include #include static int _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; int entcount; if (map == 0) return 0; if (entry == 0) { tmpe = map->header.next; entcount = map->nentries; while (entcount-- && (tmpe != &map->header)) { if (_vm_object_in_map(map, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { tmpm = entry->object.sub_map; tmpe = tmpm->header.next; entcount = tmpm->nentries; while (entcount-- && tmpe != &tmpm->header) { if (_vm_object_in_map(tmpm, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if ((obj = entry->object.vm_object) != NULL) { for (; obj; obj = obj->backing_object) if (obj == object) { return 1; } } return 0; } static int vm_object_in_map(vm_object_t object) { struct proc *p; /* sx_slock(&allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) { if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) { /* sx_sunlock(&allproc_lock); */ return 1; } } /* 
sx_sunlock(&allproc_lock); */ if (_vm_object_in_map(kernel_map, object, 0)) return 1; return 0; } DB_SHOW_COMMAND(vmochk, vm_object_check) { vm_object_t object; /* * make sure that internal objs are in a map somewhere * and none have zero ref counts. */ TAILQ_FOREACH(object, &vm_object_list, object_list) { if (object->handle == NULL && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { if (object->ref_count == 0) { db_printf("vmochk: internal obj has zero ref count: %ld\n", (long)object->size); } if (!vm_object_in_map(object)) { db_printf( "vmochk: internal obj is not in a map: " "ref: %d, size: %lu: 0x%lx, backing_object: %p\n", object->ref_count, (u_long)object->size, (u_long)object->size, (void *)object->backing_object); } } } } /* * vm_object_print: [ debug ] */ DB_SHOW_COMMAND(object, vm_object_print_static) { /* XXX convert args. */ vm_object_t object = (vm_object_t)addr; boolean_t full = have_addr; vm_page_t p; /* XXX count is an (unused) arg. Avoid shadowing it. */ #define count was_count int count; if (object == NULL) return; db_iprintf( "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x ruid %d charge %jx\n", object, (int)object->type, (uintmax_t)object->size, object->resident_page_count, object->ref_count, object->flags, object->cred ? object->cred->cr_ruid : -1, (uintmax_t)object->charge); db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n", object->shadow_count, object->backing_object ? object->backing_object->ref_count : 0, object->backing_object, (uintmax_t)object->backing_object_offset); if (!full) return; db_indent += 2; count = 0; TAILQ_FOREACH(p, &object->memq, listq) { if (count == 0) db_iprintf("memory:="); else if (count == 6) { db_printf("\n"); db_iprintf(" ..."); count = 0; } else db_printf(","); count++; db_printf("(off=0x%jx,page=0x%jx)", (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p)); } if (count != 0) db_printf("\n"); db_indent -= 2; } /* XXX. */ #undef count /* XXX need this non-static entry for calling from vm_map_print. 
*/ void vm_object_print( /* db_expr_t */ long addr, boolean_t have_addr, /* db_expr_t */ long count, char *modif) { vm_object_print_static(addr, have_addr, count, modif); } DB_SHOW_COMMAND(vmopag, vm_object_print_pages) { vm_object_t object; vm_pindex_t fidx; vm_paddr_t pa; vm_page_t m, prev_m; int rcount, nl, c; nl = 0; TAILQ_FOREACH(object, &vm_object_list, object_list) { db_printf("new object: %p\n", (void *)object); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; fidx = 0; pa = -1; TAILQ_FOREACH(m, &object->memq, listq) { if (m->pindex > 128) break; if ((prev_m = TAILQ_PREV(m, pglist, listq)) != NULL && prev_m->pindex + 1 != m->pindex) { if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; } } if (rcount && (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { ++rcount; continue; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } fidx = m->pindex; pa = VM_PAGE_TO_PHYS(m); rcount = 1; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } } } #endif /* DDB */ Index: head/sys/vm/vm_object.h =================================================================== --- head/sys/vm/vm_object.h (revision 296161) +++ head/sys/vm/vm_object.h (revision 296162) @@ -1,329 +1,334 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.h 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Virtual memory object module definitions. */ #ifndef _VM_OBJECT_ #define _VM_OBJECT_ #include #include #include #include #include /* * Types defined: * * vm_object_t Virtual memory object. * * The root of cached pages pool is protected by both the per-object lock * and the free pages queue mutex. * On insert in the cache radix trie, the per-object lock is expected * to be already held and the free pages queue mutex will be * acquired during the operation too. * On remove and lookup from the cache radix trie, only the free * pages queue mutex is expected to be locked. * These rules allow for reliably checking for the presence of cached * pages with only the per-object lock held, thereby reducing contention * for the free pages queue mutex. * * List of locks * (c) const until freed * (o) per-object lock * (f) free pages queue mutex * */ struct vm_object { struct rwlock lock; TAILQ_ENTRY(vm_object) object_list; /* list of all objects */ LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */ LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */ TAILQ_HEAD(respgs, vm_page) memq; /* list of resident pages */ struct vm_radix rtree; /* root of the resident page radix trie*/ vm_pindex_t size; /* Object size */ int generation; /* generation ID */ int ref_count; /* How many refs?? */ int shadow_count; /* how many objects that this is a shadow for */ vm_memattr_t memattr; /* default memory attribute for pages */ objtype_t type; /* type of pager */ u_short flags; /* see below */ u_short pg_color; /* (c) color of first page in obj */ u_int paging_in_progress; /* Paging (in or out) so don't collapse or destroy */ int resident_page_count; /* number of resident pages */ struct vm_object *backing_object; /* object that I'm a shadow of */ vm_ooffset_t backing_object_offset;/* Offset in backing object */ TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */ LIST_HEAD(, vm_reserv) rvq; /* list of reservations */ struct vm_radix cache; /* (o + f) root of the cache page radix trie */ void *handle; union { /* * VNode pager * * vnp_size - current size of file */ struct { off_t vnp_size; vm_ooffset_t writemappings; } vnp; /* * Device pager * * devp_pglist - list of allocated pages */ struct { TAILQ_HEAD(, vm_page) devp_pglist; struct cdev_pager_ops *ops; struct cdev *dev; } devp; /* * SG pager * * sgp_pglist - list of allocated pages */ struct { TAILQ_HEAD(, vm_page) sgp_pglist; } sgp; /* * Swap pager * * swp_tmpfs - back-pointer to the tmpfs vnode, * if any, which uses the vm object * as backing store. 
The handle * cannot be reused for linking, * because the vnode can be * reclaimed and recreated, making * the handle changed and hash-chain * invalid. * * swp_bcount - number of swap 'swblock' metablocks, each * contains up to 16 swapblk assignments. * see vm/swap_pager.h */ struct { void *swp_tmpfs; int swp_bcount; } swp; } un_pager; struct ucred *cred; vm_ooffset_t charge; + void *umtx_data; }; /* * Flags */ #define OBJ_FICTITIOUS 0x0001 /* (c) contains fictitious pages */ #define OBJ_UNMANAGED 0x0002 /* (c) contains unmanaged pages */ #define OBJ_ACTIVE 0x0004 /* active objects */ #define OBJ_DEAD 0x0008 /* dead objects (during rundown) */ #define OBJ_NOSPLIT 0x0010 /* dont split this object */ +#define OBJ_UMTXDEAD 0x0020 /* umtx pshared was terminated */ #define OBJ_PIPWNT 0x0040 /* paging in progress wanted */ #define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty, only for vnode */ #define OBJ_TMPFS_NODE 0x0200 /* object belongs to tmpfs VREG node */ #define OBJ_TMPFS_DIRTY 0x0400 /* dirty tmpfs obj */ #define OBJ_COLORED 0x1000 /* pg_color is defined */ #define OBJ_ONEMAPPING 0x2000 /* One USE (a single, non-forked) mapping flag */ #define OBJ_DISCONNECTWNT 0x4000 /* disconnect from vnode wanted */ #define OBJ_TMPFS 0x8000 /* has tmpfs vnode allocated */ #define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT) #define OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT)) #ifdef _KERNEL #define OBJPC_SYNC 0x1 /* sync I/O */ #define OBJPC_INVAL 0x2 /* invalidate */ #define OBJPC_NOSYNC 0x4 /* skip if VPO_NOSYNC */ /* * The following options are supported by vm_object_page_remove(). */ #define OBJPR_CLEANONLY 0x1 /* Don't remove dirty pages. */ #define OBJPR_NOTMAPPED 0x2 /* Don't unmap pages. */ TAILQ_HEAD(object_q, vm_object); extern struct object_q vm_object_list; /* list of allocated objects */ extern struct mtx vm_object_list_mtx; /* lock for object list and count */ extern struct vm_object kernel_object_store; extern struct vm_object kmem_object_store; #define kernel_object (&kernel_object_store) #define kmem_object (&kmem_object_store) #define VM_OBJECT_ASSERT_LOCKED(object) \ rw_assert(&(object)->lock, RA_LOCKED) #define VM_OBJECT_ASSERT_RLOCKED(object) \ rw_assert(&(object)->lock, RA_RLOCKED) #define VM_OBJECT_ASSERT_WLOCKED(object) \ rw_assert(&(object)->lock, RA_WLOCKED) #define VM_OBJECT_ASSERT_UNLOCKED(object) \ rw_assert(&(object)->lock, RA_UNLOCKED) #define VM_OBJECT_LOCK_DOWNGRADE(object) \ rw_downgrade(&(object)->lock) #define VM_OBJECT_RLOCK(object) \ rw_rlock(&(object)->lock) #define VM_OBJECT_RUNLOCK(object) \ rw_runlock(&(object)->lock) #define VM_OBJECT_SLEEP(object, wchan, pri, wmesg, timo) \ rw_sleep((wchan), &(object)->lock, (pri), (wmesg), (timo)) #define VM_OBJECT_TRYRLOCK(object) \ rw_try_rlock(&(object)->lock) #define VM_OBJECT_TRYWLOCK(object) \ rw_try_wlock(&(object)->lock) #define VM_OBJECT_TRYUPGRADE(object) \ rw_try_upgrade(&(object)->lock) #define VM_OBJECT_WLOCK(object) \ rw_wlock(&(object)->lock) #define VM_OBJECT_WOWNED(object) \ rw_wowned(&(object)->lock) #define VM_OBJECT_WUNLOCK(object) \ rw_wunlock(&(object)->lock) /* * The object must be locked or thread private. 
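The two additions in this hunk, the umtx_data pointer and the OBJ_UMTXDEAD flag, are the VM-side hooks for the new process-shared umtx support; their consumers are the umtx_shm_object_init()/umtx_shm_object_terminated() entry points declared a little further down. The implementation lives in kern_umtx.c, which is not part of this diff, so the following is only a sketch of the apparent locking protocol against the header above; the registry teardown step is a placeholder assumption:

        /*
         * Sketch only: a consumer of umtx_data/OBJ_UMTXDEAD.  The real
         * function in kern_umtx.c may differ; the cleanup is hypothetical.
         */
        void
        umtx_shm_object_terminated(vm_object_t object)
        {

                VM_OBJECT_ASSERT_WLOCKED(object);
                if ((object->flags & OBJ_UMTXDEAD) != 0)
                        return;         /* pshared state already torn down */
                vm_object_set_flag(object, OBJ_UMTXDEAD);
                if (object->umtx_data != NULL) {
                        /* ... wake waiters, drop registrations ... */
                        object->umtx_data = NULL;
                }
        }

The flag is what makes the teardown idempotent: the object write lock serializes the test-and-set, so the pshared state is released at most once no matter which path reaches object termination first.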
*/ static __inline void vm_object_set_flag(vm_object_t object, u_short bits) { object->flags |= bits; } /* * Conditionally set the object's color, which (1) enables the allocation * of physical memory reservations for anonymous objects and larger-than- * superpage-sized named objects and (2) determines the first page offset * within the object at which a reservation may be allocated. In other * words, the color determines the alignment of the object with respect * to the largest superpage boundary. When mapping named objects, like * files or POSIX shared memory objects, the color should be set to zero * before a virtual address is selected for the mapping. In contrast, * for anonymous objects, the color may be set after the virtual address * is selected. * * The object must be locked. */ static __inline void vm_object_color(vm_object_t object, u_short color) { if ((object->flags & OBJ_COLORED) == 0) { object->pg_color = color; object->flags |= OBJ_COLORED; } } void vm_object_clear_flag(vm_object_t object, u_short bits); void vm_object_pip_add(vm_object_t object, short i); void vm_object_pip_subtract(vm_object_t object, short i); void vm_object_pip_wakeup(vm_object_t object); void vm_object_pip_wakeupn(vm_object_t object, short i); void vm_object_pip_wait(vm_object_t object, char *waitid); static __inline boolean_t vm_object_cache_is_empty(vm_object_t object) { return (vm_radix_is_empty(&object->cache)); } + +void umtx_shm_object_init(vm_object_t object); +void umtx_shm_object_terminated(vm_object_t object); vm_object_t vm_object_allocate (objtype_t, vm_pindex_t); boolean_t vm_object_coalesce(vm_object_t, vm_ooffset_t, vm_size_t, vm_size_t, boolean_t); void vm_object_collapse (vm_object_t); void vm_object_deallocate (vm_object_t); void vm_object_destroy (vm_object_t); void vm_object_terminate (vm_object_t); void vm_object_set_writeable_dirty (vm_object_t); void vm_object_init (void); void vm_object_madvise(vm_object_t, vm_pindex_t, vm_pindex_t, int); boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags); void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end); void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options); boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t); void vm_object_print(long addr, boolean_t have_addr, long count, char *modif); void vm_object_reference (vm_object_t); void vm_object_reference_locked(vm_object_t); int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr); void vm_object_shadow (vm_object_t *, vm_ooffset_t *, vm_size_t); void vm_object_split(vm_map_entry_t); boolean_t vm_object_sync(vm_object_t, vm_ooffset_t, vm_size_t, boolean_t, boolean_t); void vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue); struct vnode *vm_object_vnode(vm_object_t object); #endif /* _KERNEL */ #endif /* _VM_OBJECT_ */ Index: head/usr.bin/limits/limits.c =================================================================== --- head/usr.bin/limits/limits.c (revision 296161) +++ head/usr.bin/limits/limits.c (revision 296162) @@ -1,773 +1,778 @@ /*- * Copyright (c) 1997 by * David L. Nugent * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, is permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. This work was done expressly for inclusion into FreeBSD. Other use * is permitted provided this notation is included. * 4. Absolutely no warranty of function or purpose is made by the authors. * 5. Modifications may be freely made to this file providing the above * conditions are met. * * Display/change(+runprogram)/eval resource limits. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include enum { SH_NONE, SH_SH, /* sh */ SH_CSH, /* csh */ SH_BASH, /* gnu bash */ SH_TCSH, /* tcsh */ SH_KSH, /* (pd)ksh */ SH_ZSH, /* zsh */ SH_RC, /* rc or es */ SH_NUMBER }; /* eval emitter for popular shells. * Why aren't there any standards here? Most shells support either * the csh 'limit' or sh 'ulimit' command, but each varies just * enough that they aren't very compatible from one to the other. */ static struct { const char * name; /* Name of shell */ const char * inf; /* Name used for 'unlimited' resource */ const char * cmd; /* Intro text */ const char * hard; /* Hard limit text */ const char * soft; /* Soft limit text */ const char * both; /* Hard+Soft limit text */ struct { const char * pfx; const char * sfx; int divisor; } lprm[RLIM_NLIMITS]; } shellparm[] = { { "", "infinity", "Resource limits%s%s:\n", "-max", "-cur", "", { { " cputime%-4s %8s", " secs\n", 1 }, { " filesize%-4s %8s", " kB\n", 1024 }, { " datasize%-4s %8s", " kB\n", 1024 }, { " stacksize%-4s %8s", " kB\n", 1024 }, { " coredumpsize%-4s %8s", " kB\n", 1024 }, { " memoryuse%-4s %8s", " kB\n", 1024 }, { " memorylocked%-4s %8s", " kB\n", 1024 }, { " maxprocesses%-4s %8s", "\n", 1 }, { " openfiles%-4s %8s", "\n", 1 }, { " sbsize%-4s %8s", " bytes\n", 1 }, { " vmemoryuse%-4s %8s", " kB\n", 1024 }, { " pseudo-terminals%-4s %8s", "\n", 1 }, { " swapuse%-4s %8s", " kB\n", 1024 }, { " kqueues%-4s %8s", "\n", 1 }, + { " umtxp%-4s %8s", "\n", 1 }, } }, { "sh", "unlimited", "", " -H", " -S", "", { { "ulimit%s -t %s", ";\n", 1 }, { "ulimit%s -f %s", ";\n", 512 }, { "ulimit%s -d %s", ";\n", 1024 }, { "ulimit%s -s %s", ";\n", 1024 }, { "ulimit%s -c %s", ";\n", 512 }, { "ulimit%s -m %s", ";\n", 1024 }, { "ulimit%s -l %s", ";\n", 1024 }, { "ulimit%s -u %s", ";\n", 1 }, { "ulimit%s -n %s", ";\n", 1 }, { "ulimit%s -b %s", ";\n", 1 }, { "ulimit%s -v %s", ";\n", 1024 }, { "ulimit%s -p %s", ";\n", 1 }, { "ulimit%s -w %s", ";\n", 1024 }, { "ulimit%s -k %s", ";\n", 1 }, + { "ulimit%s -o %s", ";\n", 1 }, } }, { "csh", "unlimited", "", " -h", "", NULL, { { "limit%s cputime %s", ";\n", 1 }, { "limit%s filesize %s", ";\n", 1024 }, { "limit%s datasize %s", ";\n", 1024 }, { "limit%s stacksize %s", ";\n", 1024 }, { "limit%s coredumpsize %s", ";\n", 1024 }, { "limit%s memoryuse %s", ";\n", 1024 }, { "limit%s memorylocked %s", ";\n", 1024 }, { "limit%s maxproc %s", ";\n", 1 }, { "limit%s openfiles %s", ";\n", 1 }, { "limit%s sbsize %s", ";\n", 1 }, { "limit%s vmemoryuse %s", ";\n", 1024 }, { "limit%s pseudoterminals %s", ";\n", 1 }, { "limit%s swapsize %s", ";\n", 1024 }, { "limit%s kqueues %s", ";\n", 1 }, + { "limit%s 
umtxp %s", ";\n", 1 }, } }, { "bash|bash2", "unlimited", "", " -H", " -S", "", { { "ulimit%s -t %s", ";\n", 1 }, { "ulimit%s -f %s", ";\n", 1024 }, { "ulimit%s -d %s", ";\n", 1024 }, { "ulimit%s -s %s", ";\n", 1024 }, { "ulimit%s -c %s", ";\n", 1024 }, { "ulimit%s -m %s", ";\n", 1024 }, { "ulimit%s -l %s", ";\n", 1024 }, { "ulimit%s -u %s", ";\n", 1 }, { "ulimit%s -n %s", ";\n", 1 }, { "ulimit%s -b %s", ";\n", 1 }, { "ulimit%s -v %s", ";\n", 1024 }, { "ulimit%s -p %s", ";\n", 1 }, { "ulimit%s -w %s", ";\n", 1024 } } }, { "tcsh", "unlimited", "", " -h", "", NULL, { { "limit%s cputime %s", ";\n", 1 }, { "limit%s filesize %s", ";\n", 1024 }, { "limit%s datasize %s", ";\n", 1024 }, { "limit%s stacksize %s", ";\n", 1024 }, { "limit%s coredumpsize %s", ";\n", 1024 }, { "limit%s memoryuse %s", ";\n", 1024 }, { "limit%s memorylocked %s", ";\n", 1024 }, { "limit%s maxproc %s", ";\n", 1 }, { "limit%s descriptors %s", ";\n", 1 }, { "limit%s sbsize %s", ";\n", 1 }, { "limit%s vmemoryuse %s", ";\n", 1024 }, { "limit%s pseudoterminals %s", ";\n", 1 }, { "limit%s swapsize %s", ";\n", 1024 }, { "limit%s kqueues %s", ";\n", 1 }, + { "limit%s umtxp %s", ";\n", 1 }, } }, { "ksh|pdksh", "unlimited", "", " -H", " -S", "", { { "ulimit%s -t %s", ";\n", 1 }, { "ulimit%s -f %s", ";\n", 512 }, { "ulimit%s -d %s", ";\n", 1024 }, { "ulimit%s -s %s", ";\n", 1024 }, { "ulimit%s -c %s", ";\n", 512 }, { "ulimit%s -m %s", ";\n", 1024 }, { "ulimit%s -l %s", ";\n", 1024 }, { "ulimit%s -p %s", ";\n", 1 }, { "ulimit%s -n %s", ";\n", 1 }, { "ulimit%s -b %s", ";\n", 1 }, { "ulimit%s -v %s", ";\n", 1024 }, { "ulimit%s -p %s", ";\n", 1 }, { "ulimit%s -w %s", ";\n", 1024 } } }, { "zsh", "unlimited", "", " -H", " -S", "", { { "ulimit%s -t %s", ";\n", 1 }, { "ulimit%s -f %s", ";\n", 512 }, { "ulimit%s -d %s", ";\n", 1024 }, { "ulimit%s -s %s", ";\n", 1024 }, { "ulimit%s -c %s", ";\n", 512 }, { "ulimit%s -m %s", ";\n", 1024 }, { "ulimit%s -l %s", ";\n", 1024 }, { "ulimit%s -u %s", ";\n", 1 }, { "ulimit%s -n %s", ";\n", 1 }, { "ulimit%s -b %s", ";\n", 1 }, { "ulimit%s -v %s", ";\n", 1024 }, { "ulimit%s -p %s", ";\n", 1 }, { "ulimit%s -w %s", ";\n", 1024 } } }, { "rc|es", "unlimited", "", " -h", "", NULL, { { "limit%s cputime %s", ";\n", 1 }, { "limit%s filesize %s", ";\n", 1024 }, { "limit%s datasize %s", ";\n", 1024 }, { "limit%s stacksize %s", ";\n", 1024 }, { "limit%s coredumpsize %s", ";\n", 1024 }, { "limit%s memoryuse %s", ";\n", 1024 }, { "limit%s lockedmemory %s", ";\n", 1024 }, { "limit%s processes %s", ";\n", 1 }, { "limit%s descriptors %s", ";\n", 1 }, { "limit%s sbsize %s", ";\n", 1 }, { "limit%s vmemoryuse %s", ";\n", 1024 }, { "limit%s pseudoterminals %s", ";\n", 1 }, { "limit%s swapuse %s", ";\n", 1024 } } }, { NULL, NULL, NULL, NULL, NULL, NULL, { } } }; static struct { const char * cap; rlim_t (*func)(login_cap_t *, const char *, rlim_t, rlim_t); } resources[RLIM_NLIMITS] = { { "cputime", login_getcaptime }, { "filesize", login_getcapsize }, { "datasize", login_getcapsize }, { "stacksize", login_getcapsize }, { "coredumpsize", login_getcapsize }, { "memoryuse", login_getcapsize }, { "memorylocked", login_getcapsize }, { "maxproc", login_getcapnum }, { "openfiles", login_getcapnum }, { "sbsize", login_getcapsize }, { "vmemoryuse", login_getcapsize }, { "pseudoterminals",login_getcapnum }, { "swapuse", login_getcapsize }, { "kqueues", login_getcapnum }, + { "umtxp", login_getcapnum }, }; /* * One letter for each resource levels. 
* NOTE: There is a dependency on the corresponding * letter index being equal to the resource number. * If sys/resource.h defines are changed, this needs * to be modified accordingly! */ #define RCS_STRING "tfdscmlunbvpwk" static rlim_t resource_num(int which, int ch, const char *str); static void usage(void); static int getshelltype(void); static void print_limit(rlim_t limit, unsigned divisor, const char *inf, const char *pfx, const char *sfx, const char *which); static void getrlimit_proc(pid_t pid, int resource, struct rlimit *rlp); static void setrlimit_proc(pid_t pid, int resource, const struct rlimit *rlp); extern char **environ; static const char rcs_string[] = RCS_STRING; int main(int argc, char *argv[]) { char *p, *cls = NULL; char *cleanenv[1]; struct passwd * pwd = NULL; int rcswhich, shelltype; int i, num_limits = 0; int ch, doeval = 0, doall = 0; int rtrn, setproc; login_cap_t * lc = NULL; enum { ANY=0, SOFT=1, HARD=2, BOTH=3, DISPLAYONLY=4 } type = ANY; enum { RCSUNKNOWN=0, RCSSET=1, RCSSEL=2 } todo = RCSUNKNOWN; int which_limits[RLIM_NLIMITS]; rlim_t set_limits[RLIM_NLIMITS]; struct rlimit limits[RLIM_NLIMITS]; pid_t pid; /* init resource tables */ for (i = 0; i < RLIM_NLIMITS; i++) { which_limits[i] = 0; /* Don't set/display any */ set_limits[i] = RLIM_INFINITY; } pid = -1; optarg = NULL; while ((ch = getopt(argc, argv, - ":EeC:U:BSHP:ab:c:d:f:l:m:n:s:t:u:v:p:w:k:")) != -1) { + ":EeC:U:BSHP:ab:c:d:f:l:m:n:s:t:u:v:p:w:k:o:")) != -1) { switch(ch) { case 'a': doall = 1; break; case 'E': environ = cleanenv; cleanenv[0] = NULL; break; case 'e': doeval = 1; break; case 'C': cls = optarg; break; case 'U': if ((pwd = getpwnam(optarg)) == NULL) { if (!isdigit(*optarg) || (pwd = getpwuid(atoi(optarg))) == NULL) { warnx("invalid user `%s'", optarg); usage(); } } break; case 'H': type = HARD; break; case 'S': type = SOFT; break; case 'B': type = SOFT|HARD; break; case 'P': if (!isdigit(*optarg) || (pid = atoi(optarg)) < 0) { warnx("invalid pid `%s'", optarg); usage(); } break; default: case ':': /* Without arg */ if ((p = strchr(rcs_string, optopt)) != NULL) { int rcswhich1 = p - rcs_string; if (optarg && *optarg == '-') { /* 'arg' is actually a switch */ --optind; /* back one arg, and make arg NULL */ optarg = NULL; } todo = optarg == NULL ? RCSSEL : RCSSET; if (type == ANY) type = BOTH; which_limits[rcswhich1] = optarg ? 
type : DISPLAYONLY; set_limits[rcswhich1] = resource_num(rcswhich1, optopt, optarg); num_limits++; break; } /* FALLTHRU */ case '?': usage(); } optarg = NULL; } if (pid != -1) { if (cls != NULL) { warnx("-C cannot be used with -P option"); usage(); } if (pwd != NULL) { warnx("-U cannot be used with -P option"); usage(); } } /* Get current resource values */ setproc = 0; for (i = 0; i < RLIM_NLIMITS; i++) { if (pid == -1) { getrlimit(i, &limits[i]); } else if (doall || num_limits == 0) { getrlimit_proc(pid, i, &limits[i]); } else if (which_limits[i] != 0) { getrlimit_proc(pid, i, &limits[i]); setproc = 1; } } /* If user was specified, get class from that */ if (pwd != NULL) lc = login_getpwclass(pwd); else if (cls != NULL && *cls != '\0') { lc = login_getclassbyname(cls, NULL); if (lc == NULL || strcmp(cls, lc->lc_class) != 0) fprintf(stderr, "login class '%s' non-existent, using %s\n", cls, lc?lc->lc_class:"current settings"); } /* If we have a login class, update resource table from that */ if (lc != NULL) { for (rcswhich = 0; rcswhich < RLIM_NLIMITS; rcswhich++) { char str[40]; rlim_t val; /* current value overridden by resourcename or resourcename-cur */ sprintf(str, "%s-cur", resources[rcswhich].cap); val = resources[rcswhich].func(lc, resources[rcswhich].cap, limits[rcswhich].rlim_cur, limits[rcswhich].rlim_cur); limits[rcswhich].rlim_cur = resources[rcswhich].func(lc, str, val, val); /* maximum value overridden by resourcename or resourcename-max */ sprintf(str, "%s-max", resources[rcswhich].cap); val = resources[rcswhich].func(lc, resources[rcswhich].cap, limits[rcswhich].rlim_max, limits[rcswhich].rlim_max); limits[rcswhich].rlim_max = resources[rcswhich].func(lc, str, val, val); } } /* now, let's determine what we wish to do with all this */ argv += optind; /* If we're setting limits or doing an eval (ie. we're not just * displaying), then check that hard limits are not lower than * soft limits, and force rasing the hard limit if we need to if * we are raising the soft limit, or lower the soft limit if we * are lowering the hard limit. */ if ((*argv || doeval) && getuid() == 0) { for (rcswhich = 0; rcswhich < RLIM_NLIMITS; rcswhich++) { if (limits[rcswhich].rlim_max != RLIM_INFINITY) { if (limits[rcswhich].rlim_cur == RLIM_INFINITY) { limits[rcswhich].rlim_max = RLIM_INFINITY; which_limits[rcswhich] |= HARD; } else if (limits[rcswhich].rlim_cur > limits[rcswhich].rlim_max) { if (which_limits[rcswhich] == SOFT) { limits[rcswhich].rlim_max = limits[rcswhich].rlim_cur; which_limits[rcswhich] |= HARD; } else if (which_limits[rcswhich] == HARD) { limits[rcswhich].rlim_cur = limits[rcswhich].rlim_max; which_limits[rcswhich] |= SOFT; } else { /* else.. if we're specifically setting both to * silly values, then let it error out. 
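The reconciliation pass above keeps the invariant rlim_cur <= rlim_max when root specifies only half of a pair: raising just the soft limit drags the hard limit up, lowering just the hard limit drags the soft limit down, and setting both inconsistently is left for setrlimit() to reject. A standalone sketch of that rule, simplified to the over/under case (the enum mirrors the one in main(); the values are illustrative):

        #include <stdint.h>
        #include <stdio.h>
        #include <sys/resource.h>

        enum { SOFT = 1, HARD = 2 };

        /* Drag the unspecified half of the pair along, as main() does. */
        static void
        reconcile(struct rlimit *r, int *which)
        {
                if (r->rlim_cur > r->rlim_max) {
                        if (*which == SOFT) {           /* only -S: raise hard */
                                r->rlim_max = r->rlim_cur;
                                *which |= HARD;
                        } else if (*which == HARD) {    /* only -H: lower soft */
                                r->rlim_cur = r->rlim_max;
                                *which |= SOFT;
                        }
                }
        }

        int
        main(void)
        {
                struct rlimit r = { .rlim_cur = 2048, .rlim_max = 1024 };
                int which = SOFT;

                reconcile(&r, &which);
                printf("cur=%jd max=%jd\n", (intmax_t)r.rlim_cur,
                    (intmax_t)r.rlim_max);      /* cur=2048 max=2048 */
                return (0);
        }

Relatedly, for the new -o flag to be accepted at all, RCS_STRING presumably grows a trailing 'o' in this revision to match the "o:" added to the getopt() string above: the option lookup is just strchr(rcs_string, optopt), with the letter's index doubling as the resource number, so 'o' lands on the new umtxp slot. The -bcdflmnstuvpwk list in usage() would grow the same letter.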
*/ } } } } } /* See if we've overridden anything specific on the command line */ if (num_limits && todo == RCSSET) { for (rcswhich = 0; rcswhich < RLIM_NLIMITS; rcswhich++) { if (which_limits[rcswhich] & HARD) limits[rcswhich].rlim_max = set_limits[rcswhich]; if (which_limits[rcswhich] & SOFT) limits[rcswhich].rlim_cur = set_limits[rcswhich]; } } /* If *argv is not NULL, then we are being asked to * (perhaps) set environment variables and run a program */ if (*argv) { if (doeval) { warnx("-e cannot be used with `cmd' option"); usage(); } if (pid != -1) { warnx("-P cannot be used with `cmd' option"); usage(); } login_close(lc); /* set leading environment variables, like eval(1) */ while (*argv && (p = strchr(*argv, '='))) { *p = '\0'; rtrn = setenv(*argv++, p + 1, 1); *p = '='; if (rtrn == -1) err(EXIT_FAILURE, "setenv %s", *argv); } /* Set limits */ for (rcswhich = 0; rcswhich < RLIM_NLIMITS; rcswhich++) { if (doall || num_limits == 0 || which_limits[rcswhich] != 0) if (setrlimit(rcswhich, &limits[rcswhich]) == -1) err(1, "setrlimit %s", resources[rcswhich].cap); } if (*argv == NULL) usage(); execvp(*argv, argv); err(1, "%s", *argv); } if (setproc) { for (rcswhich = 0; rcswhich < RLIM_NLIMITS; rcswhich++) { if (which_limits[rcswhich] != 0) setrlimit_proc(pid, rcswhich, &limits[rcswhich]); } exit(EXIT_SUCCESS); } shelltype = doeval ? getshelltype() : SH_NONE; if (type == ANY) /* Default to soft limits */ type = SOFT; /* Display limits */ printf(shellparm[shelltype].cmd, lc ? " for class " : " (current)", lc ? lc->lc_class : ""); for (rcswhich = 0; rcswhich < RLIM_NLIMITS; rcswhich++) { if (doall || num_limits == 0 || which_limits[rcswhich] != 0) { if (which_limits[rcswhich] == ANY || which_limits[rcswhich]) which_limits[rcswhich] = type; if (shellparm[shelltype].lprm[rcswhich].pfx) { if (shellparm[shelltype].both && limits[rcswhich].rlim_cur == limits[rcswhich].rlim_max) { print_limit(limits[rcswhich].rlim_max, shellparm[shelltype].lprm[rcswhich].divisor, shellparm[shelltype].inf, shellparm[shelltype].lprm[rcswhich].pfx, shellparm[shelltype].lprm[rcswhich].sfx, shellparm[shelltype].both); } else { if (which_limits[rcswhich] & HARD) { print_limit(limits[rcswhich].rlim_max, shellparm[shelltype].lprm[rcswhich].divisor, shellparm[shelltype].inf, shellparm[shelltype].lprm[rcswhich].pfx, shellparm[shelltype].lprm[rcswhich].sfx, shellparm[shelltype].hard); } if (which_limits[rcswhich] & SOFT) { print_limit(limits[rcswhich].rlim_cur, shellparm[shelltype].lprm[rcswhich].divisor, shellparm[shelltype].inf, shellparm[shelltype].lprm[rcswhich].pfx, shellparm[shelltype].lprm[rcswhich].sfx, shellparm[shelltype].soft); } } } } } login_close(lc); exit(EXIT_SUCCESS); } static void usage(void) { (void)fprintf(stderr, "usage: limits [-C class|-P pid|-U user] [-eaSHBE] " "[-bcdflmnstuvpwk [val]] [[name=val ...] 
cmd]\n"); exit(EXIT_FAILURE); } static void print_limit(rlim_t limit, unsigned divisor, const char * inf, const char * pfx, const char * sfx, const char * which) { char numbr[64]; if (limit == RLIM_INFINITY) strcpy(numbr, inf); else sprintf(numbr, "%jd", (intmax_t)((limit + divisor/2) / divisor)); printf(pfx, which, numbr); printf(sfx, which); } static rlim_t resource_num(int which, int ch, const char *str) { rlim_t res = RLIM_INFINITY; if (str != NULL && !(strcasecmp(str, "inf") == 0 || strcasecmp(str, "infinity") == 0 || strcasecmp(str, "unlimit") == 0 || strcasecmp(str, "unlimited") == 0)) { const char * s = str; char *e; switch (which) { case RLIMIT_CPU: /* time values */ errno = 0; res = 0; while (*s) { rlim_t tim = strtoq(s, &e, 0); if (e == NULL || e == s || errno) break; switch (*e++) { case 0: /* end of string */ e--; default: case 's': case 'S': /* seconds */ break; case 'm': case 'M': /* minutes */ tim *= 60L; break; case 'h': case 'H': /* hours */ tim *= (60L * 60L); break; case 'd': case 'D': /* days */ tim *= (60L * 60L * 24L); break; case 'w': case 'W': /* weeks */ tim *= (60L * 60L * 24L * 7L); case 'y': case 'Y': /* Years */ tim *= (60L * 60L * 24L * 365L); } s = e; res += tim; } break; case RLIMIT_FSIZE: /* Size values */ case RLIMIT_DATA: case RLIMIT_STACK: case RLIMIT_CORE: case RLIMIT_RSS: case RLIMIT_MEMLOCK: case RLIMIT_SBSIZE: case RLIMIT_VMEM: case RLIMIT_SWAP: errno = 0; res = 0; while (*s) { rlim_t mult, tim = strtoq(s, &e, 0); if (e == NULL || e == s || errno) break; switch (*e++) { case 0: /* end of string */ e--; default: mult = 1; break; case 'b': case 'B': /* 512-byte blocks */ mult = 512; break; case 'k': case 'K': /* 1024-byte Kilobytes */ mult = 1024; break; case 'm': case 'M': /* 1024-k kbytes */ mult = 1024 * 1024; break; case 'g': case 'G': /* 1Gbyte */ mult = 1024 * 1024 * 1024; break; case 't': case 'T': /* 1TBte */ mult = 1024LL * 1024LL * 1024LL * 1024LL; break; } s = e; res += (tim * mult); } break; case RLIMIT_NPROC: case RLIMIT_NOFILE: case RLIMIT_NPTS: case RLIMIT_KQUEUES: res = strtoq(s, &e, 0); s = e; break; } if (*s) { warnx("invalid value -%c `%s'", ch, str); usage(); } } return res; } static int getshellbyname(const char * shell) { int i; const char * q; const char * p = strrchr(shell, '/'); p = p ? p+1 : shell; for (i = 0; (q = shellparm[i].name) != NULL; i++) { while (*q) { int j = strcspn(q, "|"); if (j == 0) break; if (strncmp(p, q, j) == 0) return i; if (*(q += j)) ++q; } } return SH_SH; } /* * Determine the type of shell our parent process is * This is quite tricky, not 100% reliable and probably * not nearly as thorough as it should be. Basically, this * is a "best guess" only, but hopefully will work in * most cases. */ static int getshelltype(void) { pid_t ppid = getppid(); if (ppid != 1) { struct kinfo_proc kp; struct stat st; char path[MAXPATHLEN]; char * shell = getenv("SHELL"); int mib[4]; size_t len; mib[0] = CTL_KERN; mib[1] = KERN_PROC; mib[3] = ppid; if (shell != NULL && stat(shell, &st) != -1) { struct stat st1; mib[2] = KERN_PROC_PATHNAME; len = sizeof(path); if (sysctl(mib, 4, path, &len, NULL, 0) != -1) { /* $SHELL is actual shell? 
*/ if (stat(path, &st1) != -1 && memcmp(&st, &st1, sizeof st) == 0) return getshellbyname(shell); } } mib[2] = KERN_PROC_PID; len = sizeof(kp); if (sysctl(mib, 4, &kp, &len, NULL, 0) != -1) return getshellbyname(kp.ki_comm); } return SH_SH; } static void getrlimit_proc(pid_t pid, int resource, struct rlimit *rlp) { int error; int name[5]; size_t len; name[0] = CTL_KERN; name[1] = KERN_PROC; name[2] = KERN_PROC_RLIMIT; name[3] = pid; name[4] = resource; len = sizeof(*rlp); error = sysctl(name, 5, rlp, &len, NULL, 0); if (error == -1) err(EXIT_FAILURE, "sysctl: kern.proc.rlimit: %d", pid); if (len != sizeof(*rlp)) errx(EXIT_FAILURE, "sysctl() returns wrong size"); } static void setrlimit_proc(pid_t pid, int resource, const struct rlimit *rlp) { int error; int name[5]; name[0] = CTL_KERN; name[1] = KERN_PROC; name[2] = KERN_PROC_RLIMIT; name[3] = pid; name[4] = resource; error = sysctl(name, 5, NULL, 0, rlp, sizeof(*rlp)); if (error == -1) err(EXIT_FAILURE, "sysctl: kern.proc.rlimit: %d", pid); } Index: head/usr.bin/procstat/procstat_rlimit.c =================================================================== --- head/usr.bin/procstat/procstat_rlimit.c (revision 296161) +++ head/usr.bin/procstat/procstat_rlimit.c (revision 296162) @@ -1,126 +1,127 @@ /*- * Copyright (c) 2011 Mikolaj Golub * Copyright (c) 2015 Allan Jude * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "procstat.h" static struct { const char *name; const char *suffix; -} rlimit_param[14] = { +} rlimit_param[15] = { {"cputime", "sec"}, {"filesize", "B "}, {"datasize", "B "}, {"stacksize", "B "}, {"coredumpsize", "B "}, {"memoryuse", "B "}, {"memorylocked", "B "}, {"maxprocesses", " "}, {"openfiles", " "}, {"sbsize", "B "}, {"vmemoryuse", "B "}, {"pseudo-terminals", " "}, {"swapuse", "B "}, {"kqueues", " "}, + {"umtxp", " "}, }; -#if RLIM_NLIMITS > 14 +#if RLIM_NLIMITS > 15 #error "Resource limits have grown. Add new entries to rlimit_param[]." 
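getrlimit_proc() and setrlimit_proc() above are thin wrappers around the kern.proc.rlimit sysctl, which is what lets limits -P (and procstat below) read and write another process's limits without attaching to it. A minimal userland sketch of the read path (the pid and resource are illustrative; error handling is trimmed to the essentials):

        #include <sys/param.h>
        #include <sys/types.h>
        #include <sys/sysctl.h>
        #include <sys/resource.h>
        #include <err.h>
        #include <stdint.h>
        #include <stdio.h>
        #include <unistd.h>

        int
        main(void)
        {
                struct rlimit rl;
                int name[5];
                size_t len;

                name[0] = CTL_KERN;
                name[1] = KERN_PROC;
                name[2] = KERN_PROC_RLIMIT;
                name[3] = getpid();             /* any pid you may inspect */
                name[4] = RLIMIT_NOFILE;
                len = sizeof(rl);
                if (sysctl(name, 5, &rl, &len, NULL, 0) == -1)
                        err(1, "kern.proc.rlimit");
                printf("openfiles: cur=%jd max=%jd\n",
                    (intmax_t)rl.rlim_cur, (intmax_t)rl.rlim_max);
                return (0);
        }

Writing goes through the same MIB with the new limit passed as the sysctl newp argument, exactly as setrlimit_proc() does.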
#endif static const char * humanize_rlimit(int indx, rlim_t limit) { static char buf[14]; int scale; if (limit == RLIM_INFINITY) return ("infinity "); scale = humanize_number(buf, sizeof(buf) - 1, (int64_t)limit, rlimit_param[indx].suffix, HN_AUTOSCALE | HN_GETSCALE, HN_DECIMAL); (void)humanize_number(buf, sizeof(buf) - 1, (int64_t)limit, rlimit_param[indx].suffix, HN_AUTOSCALE, HN_DECIMAL); /* Pad with one space if there is no suffix prefix. */ if (scale == 0) sprintf(buf + strlen(buf), " "); return (buf); } void procstat_rlimit(struct procstat *prstat, struct kinfo_proc *kipp) { struct rlimit rlimit; int i; if (!hflag) { xo_emit("{T:/%5s %-16s %-16s %16s %16s}\n", "PID", "COMM", "RLIMIT", "SOFT ", "HARD "); } xo_emit("{ek:process_id/%5d}{e:command/%-16s/%s}", kipp->ki_pid, kipp->ki_comm); for (i = 0; i < RLIM_NLIMITS; i++) { if (procstat_getrlimit(prstat, kipp, i, &rlimit) == -1) return; xo_emit("{dk:process_id/%5d} {d:command/%-16s} " "{d:rlimit_param/%-16s} ", kipp->ki_pid, kipp->ki_comm, rlimit_param[i].name); xo_open_container(rlimit_param[i].name); if (rlimit.rlim_cur == RLIM_INFINITY) xo_emit("{e:soft_limit/infinity}"); else xo_emit("{e:soft_limit/%U}", rlimit.rlim_cur); if (rlimit.rlim_max == RLIM_INFINITY) xo_emit("{e:hard_limit/infinity}"); else xo_emit("{e:hard_limit/%U}", rlimit.rlim_max); xo_close_container(rlimit_param[i].name); xo_emit("{d:rlim_cur/%16s} ", humanize_rlimit(i, rlimit.rlim_cur)); xo_emit("{d:rlim_max/%16s}\n", humanize_rlimit(i, rlimit.rlim_max)); } }
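humanize_rlimit() above calls humanize_number(3) twice: a first call with HN_GETSCALE only reports which SI prefix would be chosen, and the second call actually formats, after which a single space is appended when no prefix was used so the columns stay aligned. A small standalone illustration of that double-call pattern (buffer size and value are arbitrary; link with -lutil on FreeBSD):

        #include <sys/types.h>
        #include <libutil.h>
        #include <stdio.h>
        #include <string.h>

        int
        main(void)
        {
                char buf[14];
                int scale;
                int64_t limit = 8589934592;     /* 8 GiB */

                scale = humanize_number(buf, sizeof(buf) - 1, limit, "B ",
                    HN_AUTOSCALE | HN_GETSCALE, HN_DECIMAL);
                (void)humanize_number(buf, sizeof(buf) - 1, limit, "B ",
                    HN_AUTOSCALE, HN_DECIMAL);
                if (scale == 0)                 /* no prefix: pad for alignment */
                        strlcat(buf, " ", sizeof(buf));
                printf("[%s] scale=%d\n", buf, scale);
                return (0);
        }

With HN_DECIMAL and an 8 GiB input this prints something like "8.0GB " with a nonzero scale; a small value such as 512 keeps scale 0 and picks up the padding space instead.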