Index: head/sys/kern/init_main.c =================================================================== --- head/sys/kern/init_main.c +++ head/sys/kern/init_main.c @@ -541,7 +541,10 @@ /* End hack. creds get properly set later with thread_cow_get_proc */ curthread->td_ucred = NULL; newcred->cr_prison = &prison0; + newcred->cr_users++; /* avoid assertion failure */ proc_set_cred_init(p, newcred); + newcred->cr_users--; + crfree(newcred); #ifdef AUDIT audit_cred_kproc0(newcred); #endif @@ -810,8 +813,9 @@ #endif proc_set_cred(initproc, newcred); td = FIRST_THREAD_IN_PROC(initproc); - crfree(td->td_ucred); - td->td_ucred = crhold(initproc->p_ucred); + crcowfree(td); + td->td_realucred = crcowget(initproc->p_ucred); + td->td_ucred = td->td_realucred; PROC_UNLOCK(initproc); sx_xunlock(&proctree_lock); crfree(oldcred); Index: head/sys/kern/kern_fork.c =================================================================== --- head/sys/kern/kern_fork.c +++ head/sys/kern/kern_fork.c @@ -961,7 +961,7 @@ * XXX: This is ugly; when we copy resource usage, we need to bump * per-cred resource counters. */ - proc_set_cred_init(newproc, crhold(td->td_ucred)); + proc_set_cred_init(newproc, td->td_ucred); /* * Initialize resource accounting for the child process. @@ -998,8 +998,7 @@ #endif racct_proc_exit(newproc); fail1: - crfree(newproc->p_ucred); - newproc->p_ucred = NULL; + proc_unset_cred(newproc); fail2: if (vm2 != NULL) vmspace_free(vm2); Index: head/sys/kern/kern_prot.c =================================================================== --- head/sys/kern/kern_prot.c +++ head/sys/kern/kern_prot.c @@ -1841,6 +1841,98 @@ } /* + * Credential management. + * + * struct ucred objects are rarely allocated but gain and lose references all + * the time (e.g., on struct file alloc/dealloc) turning refcount updates into + * a significant source of cache-line ping ponging. 
Common cases are worked + * around by modifying thread-local counter instead if the cred to operate on + * matches td_realucred. + * + * The counter is split into 2 parts: + * - cr_users -- total count of all struct proc and struct thread objects + * which have given cred in p_ucred and td_ucred respectively + * - cr_ref -- the actual ref count, only valid if cr_users == 0 + * + * If users == 0 then cr_ref behaves similarly to refcount(9), in particular if + * the count reaches 0 the object is freeable. + * If users > 0 and curthread->td_realucred == cred, then updates are performed + * against td_ucredref. + * In other cases updates are performed against cr_ref. + * + * Changing td_realucred into something else decrements cr_users and transfers + * accumulated updates. + */ +struct ucred * +crcowget(struct ucred *cr) +{ + + mtx_lock(&cr->cr_mtx); + KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", + __func__, cr->cr_users, cr)); + cr->cr_users++; + cr->cr_ref++; + mtx_unlock(&cr->cr_mtx); + return (cr); +} + +static struct ucred * +crunuse(struct thread *td) +{ + struct ucred *cr, *crold; + + cr = td->td_ucred; + mtx_lock(&cr->cr_mtx); + cr->cr_ref += td->td_ucredref; + td->td_ucredref = 0; + KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", + __func__, cr->cr_users, cr)); + cr->cr_users--; + if (cr->cr_users == 0) { + KASSERT(cr->cr_ref > 0, ("%s: ref %d not > 0 on cred %p", + __func__, cr->cr_ref, cr)); + crold = cr; + } else { + cr->cr_ref--; + crold = NULL; + } + mtx_unlock(&cr->cr_mtx); + return (crold); +} + +void +crcowfree(struct thread *td) +{ + struct ucred *cr; + + cr = crunuse(td); + if (cr != NULL) + crfree(cr); +} + +struct ucred * +crcowsync(void) +{ + struct thread *td; + struct proc *p; + struct ucred *crnew, *crold; + + td = curthread; + p = td->td_proc; + PROC_LOCK_ASSERT(p, MA_OWNED); + + MPASS(td->td_realucred == td->td_ucred); + if (td->td_realucred == p->p_ucred) + return (NULL); + + crnew = crcowget(p->p_ucred); + 
crold = crunuse(td); + td->td_realucred = crnew; + td->td_ucred = td->td_realucred; + return (crold); +} + +/* * Allocate a zeroed cred structure. */ struct ucred * @@ -1849,7 +1941,8 @@ struct ucred *cr; cr = malloc(sizeof(*cr), M_CRED, M_WAITOK | M_ZERO); - refcount_init(&cr->cr_ref, 1); + mtx_init(&cr->cr_mtx, "cred", NULL, MTX_DEF); + cr->cr_ref = 1; #ifdef AUDIT audit_cred_init(cr); #endif @@ -1868,8 +1961,18 @@ struct ucred * crhold(struct ucred *cr) { + struct thread *td; - refcount_acquire(&cr->cr_ref); + td = curthread; + if (__predict_true(td->td_realucred == cr)) { + KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", + __func__, cr->cr_users, cr)); + td->td_ucredref++; + return (cr); + } + mtx_lock(&cr->cr_mtx); + cr->cr_ref++; + mtx_unlock(&cr->cr_mtx); return (cr); } @@ -1879,36 +1982,51 @@ void crfree(struct ucred *cr) { + struct thread *td; - KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref)); - KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred")); - if (refcount_release(&cr->cr_ref)) { - /* - * Some callers of crget(), such as nfs_statfs(), - * allocate a temporary credential, but don't - * allocate a uidinfo structure. - */ - if (cr->cr_uidinfo != NULL) - uifree(cr->cr_uidinfo); - if (cr->cr_ruidinfo != NULL) - uifree(cr->cr_ruidinfo); - /* - * Free a prison, if any. 
- */ - if (cr->cr_prison != NULL) - prison_free(cr->cr_prison); - if (cr->cr_loginclass != NULL) - loginclass_free(cr->cr_loginclass); + td = curthread; + if (td->td_realucred == cr) { + KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", + __func__, cr->cr_users, cr)); + td->td_ucredref--; + return; + } + mtx_lock(&cr->cr_mtx); + KASSERT(cr->cr_users >= 0, ("%s: users %d not >= 0 on cred %p", + __func__, cr->cr_users, cr)); + cr->cr_ref--; + if (cr->cr_users > 0) { + mtx_unlock(&cr->cr_mtx); + return; + } + KASSERT(cr->cr_ref >= 0, ("%s: ref %d not >= 0 on cred %p", + __func__, cr->cr_ref, cr)); + if (cr->cr_ref > 0) { + mtx_unlock(&cr->cr_mtx); + return; + } + /* + * Some callers of crget(), such as nfs_statfs(), allocate a temporary + * credential, but don't allocate a uidinfo structure. + */ + if (cr->cr_uidinfo != NULL) + uifree(cr->cr_uidinfo); + if (cr->cr_ruidinfo != NULL) + uifree(cr->cr_ruidinfo); + if (cr->cr_prison != NULL) + prison_free(cr->cr_prison); + if (cr->cr_loginclass != NULL) + loginclass_free(cr->cr_loginclass); #ifdef AUDIT - audit_cred_destroy(cr); + audit_cred_destroy(cr); #endif #ifdef MAC - mac_cred_destroy(cr); + mac_cred_destroy(cr); #endif - if (cr->cr_groups != cr->cr_smallgroups) - free(cr->cr_groups, M_CRED); - free(cr, M_CRED); - } + mtx_destroy(&cr->cr_mtx); + if (cr->cr_groups != cr->cr_smallgroups) + free(cr->cr_groups, M_CRED); + free(cr, M_CRED); } /* @@ -1982,7 +2100,7 @@ proc_set_cred_init(struct proc *p, struct ucred *newcred) { - p->p_ucred = newcred; + p->p_ucred = crcowget(newcred); } /* @@ -1998,10 +2116,20 @@ void proc_set_cred(struct proc *p, struct ucred *newcred) { + struct ucred *cr; - MPASS(p->p_ucred != NULL); + cr = p->p_ucred; + MPASS(cr != NULL); PROC_LOCK_ASSERT(p, MA_OWNED); + KASSERT(newcred->cr_users == 0, ("%s: users %d not 0 on cred %p", + __func__, newcred->cr_users, newcred)); + mtx_lock(&cr->cr_mtx); + KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", + __func__, cr->cr_users, 
cr)); + cr->cr_users--; + mtx_unlock(&cr->cr_mtx); p->p_ucred = newcred; + newcred->cr_users = 1; PROC_UPDATE_COW(p); } @@ -2010,9 +2138,17 @@ { struct ucred *cr; - MPASS(p->p_state == PRS_ZOMBIE); + MPASS(p->p_state == PRS_ZOMBIE || p->p_state == PRS_NEW); cr = p->p_ucred; p->p_ucred = NULL; + KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", + __func__, cr->cr_users, cr)); + mtx_lock(&cr->cr_mtx); + cr->cr_users--; + if (cr->cr_users == 0) + KASSERT(cr->cr_ref > 0, ("%s: ref %d not > 0 on cred %p", + __func__, cr->cr_ref, cr)); + mtx_unlock(&cr->cr_mtx); crfree(cr); } Index: head/sys/kern/kern_thread.c =================================================================== --- head/sys/kern/kern_thread.c +++ head/sys/kern/kern_thread.c @@ -82,9 +82,9 @@ "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0x104, "struct thread KBI td_pflags"); -_Static_assert(offsetof(struct thread, td_frame) == 0x498, +_Static_assert(offsetof(struct thread, td_frame) == 0x4a8, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x6a0, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x6b0, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0xb0, "struct proc KBI p_flag"); @@ -102,9 +102,9 @@ "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0xa0, "struct thread KBI td_pflags"); -_Static_assert(offsetof(struct thread, td_frame) == 0x2fc, +_Static_assert(offsetof(struct thread, td_frame) == 0x304, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x340, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x348, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0x68, "struct proc KBI p_flag"); @@ -463,7 +463,8 @@ { PROC_LOCK_ASSERT(p, MA_OWNED); - newtd->td_ucred = crhold(p->p_ucred); + newtd->td_realucred = crcowget(p->p_ucred); + newtd->td_ucred = 
newtd->td_realucred; newtd->td_limit = lim_hold(p->p_limit); newtd->td_cowgen = p->p_cowgen; } @@ -472,7 +473,9 @@ thread_cow_get(struct thread *newtd, struct thread *td) { - newtd->td_ucred = crhold(td->td_ucred); + MPASS(td->td_realucred == td->td_ucred); + newtd->td_realucred = crcowget(td->td_realucred); + newtd->td_ucred = newtd->td_realucred; newtd->td_limit = lim_hold(td->td_limit); newtd->td_cowgen = td->td_cowgen; } @@ -481,8 +484,8 @@ thread_cow_free(struct thread *td) { - if (td->td_ucred != NULL) - crfree(td->td_ucred); + if (td->td_realucred != NULL) + crcowfree(td); if (td->td_limit != NULL) lim_free(td->td_limit); } @@ -495,13 +498,9 @@ struct plimit *oldlimit; p = td->td_proc; - oldcred = NULL; oldlimit = NULL; PROC_LOCK(p); - if (td->td_ucred != p->p_ucred) { - oldcred = td->td_ucred; - td->td_ucred = crhold(p->p_ucred); - } + oldcred = crcowsync(); if (td->td_limit != p->p_limit) { oldlimit = td->td_limit; td->td_limit = lim_hold(p->p_limit); Index: head/sys/sys/proc.h =================================================================== --- head/sys/sys/proc.h +++ head/sys/sys/proc.h @@ -267,7 +267,8 @@ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ int td_pinned; /* (k) Temporary cpu pin count. */ - struct ucred *td_ucred; /* (k) Reference to credentials. */ + struct ucred *td_realucred; /* (k) Reference to credentials. */ + struct ucred *td_ucred; /* (k) Used credentials, temporarily switchable. */ struct plimit *td_limit; /* (k) Resource limits. */ int td_slptick; /* (t) Time at sleep. */ int td_blktick; /* (t) Time spent blocked. */ @@ -305,6 +306,7 @@ int td_errno; /* (k) Error from last syscall. */ size_t td_vslock_sz; /* (k) amount of vslock-ed space */ struct kcov_info *td_kcov_info; /* (*) Kernel code coverage data */ + u_int td_ucredref; /* (k) references on td_realucred */ #define td_endzero td_sigmask /* Copied during fork1() or create_thread(). 
*/ Index: head/sys/sys/ucred.h =================================================================== --- head/sys/sys/ucred.h +++ head/sys/sys/ucred.h @@ -35,6 +35,10 @@ #ifndef _SYS_UCRED_H_ #define _SYS_UCRED_H_ +#if defined(_KERNEL) || defined(_WANT_UCRED) +#include <sys/_lock.h> +#include <sys/_mutex.h> +#endif #include <bsm/audit.h> struct loginclass; @@ -46,10 +50,19 @@ * * Please do not inspect cr_uid directly to determine superuserness. The * priv(9) interface should be used to check for privilege. + * + * Lock reference: + * c - cr_mtx + * + * Unmarked fields are constant after creation. + * + * See "Credential management" comment in kern_prot.c for more information. */ #if defined(_KERNEL) || defined(_WANT_UCRED) struct ucred { - u_int cr_ref; /* reference count */ + struct mtx cr_mtx; + u_int cr_ref; /* (c) reference count */ + u_int cr_users; /* (c) proc + thread using this cred */ #define cr_startcopy cr_uid uid_t cr_uid; /* effective user id */ uid_t cr_ruid; /* real user id */ @@ -115,8 +128,11 @@ void proc_set_cred(struct proc *p, struct ucred *cr); void proc_unset_cred(struct proc *p); void crfree(struct ucred *cr); +struct ucred *crcowsync(void); struct ucred *crget(void); struct ucred *crhold(struct ucred *cr); +struct ucred *crcowget(struct ucred *cr); +void crcowfree(struct thread *td); void cru2x(struct ucred *cr, struct xucred *xcr); void cru2xt(struct thread *td, struct xucred *xcr); void crsetgroups(struct ucred *cr, int n, gid_t *groups);