diff --git a/contrib/jemalloc/src/pages.c b/contrib/jemalloc/src/pages.c
--- a/contrib/jemalloc/src/pages.c
+++ b/contrib/jemalloc/src/pages.c
@@ -12,6 +12,7 @@
 #include
 #ifdef __FreeBSD__
 #include
+#include
 #endif
 #endif
 
@@ -437,9 +438,14 @@
 #ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
 static bool
 os_overcommits_sysctl(void) {
-	int vm_overcommit;
+	int vm_overcommit, bsdflags;
 	size_t sz;
 
+#ifdef ELF_BSDF_VMNOOVERCOMMIT
+	if (_elf_aux_info(AT_BSDFLAGS, &bsdflags, sizeof(bsdflags)) == 0)
+		return ((bsdflags & ELF_BSDF_VMNOOVERCOMMIT) == 0);
+#endif
+
 	sz = sizeof(vm_overcommit);
 #if defined(__FreeBSD__) && defined(VM_OVERCOMMIT)
 	int mib[2];
diff --git a/lib/libc/gen/auxv.c b/lib/libc/gen/auxv.c
--- a/lib/libc/gen/auxv.c
+++ b/lib/libc/gen/auxv.c
@@ -73,6 +73,7 @@
 static void *ps_strings, *timekeep;
 static u_long hwcap, hwcap2;
 static void *fxrng_seed_version;
+static u_long usrstackbase, usrstacklim;
 
 #ifdef __powerpc__
 static int powerpc_new_auxv_format = 0;
@@ -144,6 +145,14 @@
 		case AT_FXRNG:
 			fxrng_seed_version = aux->a_un.a_ptr;
 			break;
+
+		case AT_USRSTACKBASE:
+			usrstackbase = aux->a_un.a_val;
+			break;
+
+		case AT_USRSTACKLIM:
+			usrstacklim = aux->a_un.a_val;
+			break;
 #ifdef __powerpc__
 		/*
 		 * Since AT_STACKPROT is always set, and the common
@@ -370,6 +379,20 @@
 		} else
 			res = EINVAL;
 		break;
+	case AT_USRSTACKBASE:
+		if (buflen == sizeof(u_long)) {
+			*(u_long *)buf = usrstackbase;
+			res = 0;
+		} else
+			res = EINVAL;
+		break;
+	case AT_USRSTACKLIM:
+		if (buflen == sizeof(u_long)) {
+			*(u_long *)buf = usrstacklim;
+			res = 0;
+		} else
+			res = EINVAL;
+		break;
 	default:
 		res = ENOENT;
 		break;
diff --git a/lib/libc/gen/elf_utils.c b/lib/libc/gen/elf_utils.c
--- a/lib/libc/gen/elf_utils.c
+++ b/lib/libc/gen/elf_utils.c
@@ -28,7 +28,8 @@
  * $FreeBSD$
  */
 
-#include
+#include
+#include
 #include
 #include
 #include
@@ -77,19 +78,25 @@
 {
 	int mib[2];
 	struct rlimit rlim;
-	u_long usrstack;
+	u_long usrstack, stacksz;
 	size_t len;
 
-	mib[0] = CTL_KERN;
-	mib[1] = KERN_USRSTACK;
-	len = sizeof(usrstack);
-	if (sysctl(mib, sizeof(mib) / sizeof(mib[0]), &usrstack, &len, NULL, 0)
-	    == -1)
-		return;
-	if (getrlimit(RLIMIT_STACK, &rlim) == -1)
-		return;
-	mprotect((void *)(uintptr_t)(usrstack - rlim.rlim_cur),
-	    rlim.rlim_cur, _rtld_get_stack_prot());
+	/* Take the stack base from the aux vector; else query KERN_USRSTACK. */
+	if (_elf_aux_info(AT_USRSTACKBASE, &usrstack, sizeof(usrstack)) != 0) {
+		mib[0] = CTL_KERN;
+		mib[1] = KERN_USRSTACK;
+		len = sizeof(usrstack);
+		if (sysctl(mib, nitems(mib), &usrstack, &len, NULL, 0) == -1)
+			return;
+	}
+	/* Likewise for the stack size, falling back to RLIMIT_STACK. */
+	if (_elf_aux_info(AT_USRSTACKLIM, &stacksz, sizeof(stacksz)) != 0) {
+		if (getrlimit(RLIMIT_STACK, &rlim) == -1)
+			return;
+		stacksz = rlim.rlim_cur;
+	}
+	mprotect((void *)(uintptr_t)(usrstack - stacksz), stacksz,
+	    _rtld_get_stack_prot());
 }
 
 #pragma weak __pthread_map_stacks_exec
diff --git a/lib/libthr/thread/thr_stack.c b/lib/libthr/thread/thr_stack.c
--- a/lib/libthr/thread/thr_stack.c
+++ b/lib/libthr/thread/thr_stack.c
@@ -30,7 +30,8 @@
 #include
 __FBSDID("$FreeBSD$");
 
-#include
+#include
+#include
 #include
 #include
 #include
@@ -149,19 +150,25 @@
 {
 	int mib[2];
 	struct rlimit rlim;
-	u_long usrstack;
+	u_long usrstack, stacksz;
 	size_t len;
 
-	mib[0] = CTL_KERN;
-	mib[1] = KERN_USRSTACK;
-	len = sizeof(usrstack);
-	if (sysctl(mib, sizeof(mib) / sizeof(mib[0]), &usrstack, &len, NULL, 0)
-	    == -1)
-		return;
-	if (getrlimit(RLIMIT_STACK, &rlim) == -1)
-		return;
-	mprotect((void *)(uintptr_t)(usrstack - rlim.rlim_cur),
-	    rlim.rlim_cur, _rtld_get_stack_prot());
+	/* Take the stack base from the aux vector; else query KERN_USRSTACK. */
+	if (elf_aux_info(AT_USRSTACKBASE, &usrstack, sizeof(usrstack)) != 0) {
+		mib[0] = CTL_KERN;
+		mib[1] = KERN_USRSTACK;
+		len = sizeof(usrstack);
+		if (sysctl(mib, nitems(mib), &usrstack, &len, NULL, 0) == -1)
+			return;
+	}
+	/* Likewise for the stack size, falling back to RLIMIT_STACK. */
+	if (elf_aux_info(AT_USRSTACKLIM, &stacksz, sizeof(stacksz)) != 0) {
+		if (getrlimit(RLIMIT_STACK, &rlim) == -1)
+			return;
+		stacksz = rlim.rlim_cur;
+	}
+	mprotect((void *)(uintptr_t)(usrstack - stacksz), stacksz,
+	    _rtld_get_stack_prot());
 }
 
 void
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -1448,7 +1448,8 @@
 	Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs;
 	Elf_Auxinfo *argarray, *pos;
 	struct vmspace *vmspace;
-	int error;
+	rlim_t stacksz;
+	int error, bsdflags, oc;
 
 	argarray = pos = malloc(AT_COUNT * sizeof(*pos), M_TEMP,
 	    M_WAITOK | M_ZERO);
@@ -1489,8 +1490,12 @@
 		AUXARGS_ENTRY(pos, AT_HWCAP, *imgp->sysent->sv_hwcap);
 	if (imgp->sysent->sv_hwcap2 != NULL)
 		AUXARGS_ENTRY(pos, AT_HWCAP2, *imgp->sysent->sv_hwcap2);
-	AUXARGS_ENTRY(pos, AT_BSDFLAGS, __elfN(sigfastblock) ?
-	    ELF_BSDF_SIGFASTBLK : 0);
+	bsdflags = 0;
+	bsdflags |= __elfN(sigfastblock) ? ELF_BSDF_SIGFASTBLK : 0;
+	oc = atomic_load_int(&vm_overcommit);
+	bsdflags |= (oc & (SWAP_RESERVE_FORCE_ON | SWAP_RESERVE_RLIMIT_ON)) !=
+	    0 ? ELF_BSDF_VMNOOVERCOMMIT : 0;
+	AUXARGS_ENTRY(pos, AT_BSDFLAGS, bsdflags);
 	AUXARGS_ENTRY(pos, AT_ARGC, imgp->args->argc);
 	AUXARGS_ENTRY_PTR(pos, AT_ARGV, imgp->argv);
 	AUXARGS_ENTRY(pos, AT_ENVC, imgp->args->envc);
@@ -1506,6 +1511,9 @@
 		AUXARGS_ENTRY(pos, AT_KPRELOAD, vmspace->vm_shp_base +
 		    imgp->sysent->sv_vdso_offset);
 	}
+	AUXARGS_ENTRY(pos, AT_USRSTACKBASE, round_page(vmspace->vm_stacktop));
+	stacksz = imgp->proc->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur;
+	AUXARGS_ENTRY(pos, AT_USRSTACKLIM, stacksz);
 	AUXARGS_ENTRY(pos, AT_NULL, 0);
 
 	free(imgp->auxargs, M_TEMP);
diff --git a/sys/sys/elf_common.h b/sys/sys/elf_common.h
--- a/sys/sys/elf_common.h
+++ b/sys/sys/elf_common.h
@@ -986,8 +986,10 @@
 #define	AT_PS_STRINGS	32	/* struct ps_strings */
 #define	AT_FXRNG	33	/* Pointer to root RNG seed version. */
 #define	AT_KPRELOAD	34	/* Base of vdso, preloaded by rtld */
+#define	AT_USRSTACKBASE	35	/* Page-rounded top of the user stack */
+#define	AT_USRSTACKLIM	36	/* Soft RLIMIT_STACK when auxv was built */
 
-#define	AT_COUNT	35	/* Count of defined aux entry types. */
+#define	AT_COUNT	37	/* Count of defined aux entry types. */
 
 /*
  * Relocation types.
@@ -1501,5 +1503,6 @@
 #define	R_X86_64_REX_GOTPCRELX	42
 
 #define	ELF_BSDF_SIGFASTBLK	0x0001	/* Kernel supports fast sigblock */
+#define	ELF_BSDF_VMNOOVERCOMMIT	0x0002	/* Swap reserve limits were on */
 
 #endif /* !_SYS_ELF_COMMON_H_ */
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -169,8 +169,8 @@
     &swap_total, 0, sysctl_page_shift, "QU",
     "Total amount of available swap storage.");
 
-static int overcommit = 0;
-SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &overcommit, 0,
+int vm_overcommit __read_mostly = 0;
+SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &vm_overcommit, 0,
     "Configure virtual memory overcommit behavior. See tuning(7) "
     "for details.");
 static unsigned long swzone;
@@ -190,11 +190,6 @@
     CTLFLAG_RD, &swap_free_completed,
     "Number of deferred frees completed");
 
-/* bits from overcommit */
-#define	SWAP_RESERVE_FORCE_ON		(1 << 0)
-#define	SWAP_RESERVE_RLIMIT_ON		(1 << 1)
-#define	SWAP_RESERVE_ALLOW_NONWIRED	(1 << 2)
-
 static int
 sysctl_page_shift(SYSCTL_HANDLER_ARGS)
 {
@@ -218,7 +213,8 @@
 	    prev + pincr > lim_cur(curthread, RLIMIT_SWAP) &&
 	    priv_check(curthread, PRIV_VM_SWAP_NORLIMIT) != 0) {
 		prev = atomic_fetchadd_long(&uip->ui_vmsize, -pincr);
-		KASSERT(prev >= pincr, ("negative vmsize for uid = %d\n", uip->ui_uid));
+		KASSERT(prev >= pincr,
+		    ("negative vmsize for uid %d\n", uip->ui_uid));
 		return (false);
 	}
 	return (true);
@@ -236,7 +232,8 @@
 
 #ifdef INVARIANTS
 	prev = atomic_fetchadd_long(&uip->ui_vmsize, -pdecr);
-	KASSERT(prev >= pdecr, ("negative vmsize for uid = %d\n", uip->ui_uid));
+	KASSERT(prev >= pdecr,
+	    ("negative vmsize for uid %d\n", uip->ui_uid));
 #else
 	atomic_subtract_long(&uip->ui_vmsize, pdecr);
 #endif
@@ -269,8 +266,8 @@
 	static int curfail;
 	static struct timeval lastfail;
 
-	KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__,
-	    (uintmax_t)incr));
+	KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK",
+	    __func__, (uintmax_t)incr));
 
 #ifdef RACCT
 	if (RACCT_ENABLED()) {
@@ -286,7 +283,7 @@
 	prev = atomic_fetchadd_long(&swap_reserved, pincr);
 	r = prev + pincr;
 	s = swap_total;
-	oc = atomic_load_int(&overcommit);
+	oc = atomic_load_int(&vm_overcommit);
 	if (r > s && (oc & SWAP_RESERVE_ALLOW_NONWIRED) != 0) {
 		s += vm_cnt.v_page_count - vm_cnt.v_free_reserved -
 		    vm_wire_count();
@@ -294,13 +291,15 @@
 	if ((oc & SWAP_RESERVE_FORCE_ON) != 0 && r > s &&
 	    priv_check(curthread, PRIV_VM_SWAP_NOQUOTA) != 0) {
 		prev = atomic_fetchadd_long(&swap_reserved, -pincr);
-		KASSERT(prev >= pincr, ("swap_reserved < incr on overcommit fail"));
+		KASSERT(prev >= pincr,
+		    ("swap_reserved < incr on overcommit fail"));
 		goto out_error;
 	}
 
 	if (!swap_reserve_by_cred_rlimit(pincr, cred, oc)) {
 		prev = atomic_fetchadd_long(&swap_reserved, -pincr);
-		KASSERT(prev >= pincr, ("swap_reserved < incr on overcommit fail"));
+		KASSERT(prev >= pincr,
+		    ("swap_reserved < incr on overcommit fail"));
 		goto out_error;
 	}
 
@@ -308,7 +307,8 @@
 
 out_error:
 	if (ppsratecheck(&lastfail, &curfail, 1)) {
-		printf("uid %d, pid %d: swap reservation for %jd bytes failed\n",
+		printf("uid %d, pid %d: swap reservation "
+		    "for %jd bytes failed\n",
 		    cred->cr_ruidinfo->ui_uid, curproc->p_pid, incr);
 	}
 #ifdef RACCT
@@ -327,8 +327,8 @@
 {
 	u_long pincr;
 
-	KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__,
-	    (uintmax_t)incr));
+	KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK",
+	    __func__, (uintmax_t)incr));
 
 #ifdef RACCT
 	if (RACCT_ENABLED()) {
@@ -361,8 +361,8 @@
 	u_long prev;
 #endif
 
-	KASSERT((decr & PAGE_MASK) == 0, ("%s: decr: %ju & PAGE_MASK", __func__,
-	    (uintmax_t)decr));
+	KASSERT((decr & PAGE_MASK) == 0, ("%s: decr: %ju & PAGE_MASK",
+	    __func__, (uintmax_t)decr));
 
 	pdecr = atop(decr);
 #ifdef INVARIANTS
diff --git a/sys/vm/vm.h b/sys/vm/vm.h
--- a/sys/vm/vm.h
+++ b/sys/vm/vm.h
@@ -165,6 +165,12 @@
 
 extern int vm_ndomains;
 
+/* bits from overcommit */
+#define	SWAP_RESERVE_FORCE_ON		(1 << 0)
+#define	SWAP_RESERVE_RLIMIT_ON		(1 << 1)
+#define	SWAP_RESERVE_ALLOW_NONWIRED	(1 << 2)
+extern int vm_overcommit;
+
 #ifdef _KERNEL
 struct ucred;
 bool swap_reserve(vm_ooffset_t incr);