Index: sys/kern/init_main.c =================================================================== --- sys/kern/init_main.c +++ sys/kern/init_main.c @@ -541,7 +541,10 @@ /* End hack. creds get properly set later with thread_cow_get_proc */ curthread->td_ucred = NULL; newcred->cr_prison = &prison0; + newcred->cr_users++; /* avoid assertion failure */ proc_set_cred_init(p, newcred); + newcred->cr_users--; /* undo the bump above */ + crfree(newcred); /* drop crcowget()'s reference */ #ifdef AUDIT audit_cred_kproc0(newcred); #endif @@ -810,8 +813,9 @@ #endif proc_set_cred(initproc, newcred); td = FIRST_THREAD_IN_PROC(initproc); - crfree(td->td_ucred); - td->td_ucred = crhold(initproc->p_ucred); + crcowfree(td); /* unuse the old cred */ + td->td_realucred = crcowget(initproc->p_ucred); /* use the new one */ + td->td_ucred = td->td_realucred; /* active cred == real cred */ PROC_UNLOCK(initproc); sx_xunlock(&proctree_lock); crfree(oldcred); Index: sys/kern/kern_exit.c =================================================================== --- sys/kern/kern_exit.c +++ sys/kern/kern_exit.c @@ -952,8 +952,7 @@ /* * Free credentials, arguments, and sigacts. */ - crfree(p->p_ucred); - proc_set_cred(p, NULL); + proc_unset_cred(p); /* unuse the cred, then crfree() it */ pargs_drop(p->p_args); p->p_args = NULL; sigacts_free(p->p_sigacts); Index: sys/kern/kern_fork.c =================================================================== --- sys/kern/kern_fork.c +++ sys/kern/kern_fork.c @@ -969,7 +969,7 @@ * XXX: This is ugly; when we copy resource usage, we need to bump * per-cred resource counters. */ - proc_set_cred_init(newproc, crhold(td->td_ucred)); + proc_set_cred_init(newproc, td->td_ucred); /* callee takes the ref */ /* * Initialize resource accounting for the child process. Index: sys/kern/kern_prot.c =================================================================== --- sys/kern/kern_prot.c +++ sys/kern/kern_prot.c @@ -1841,6 +1841,98 @@ return (0); } +/* + * Credential management. 
+ * + * struct ucred objects are rarely allocated but gain and lose references all + * the time (e.g., on struct file alloc/dealloc) turning refcount updates into + * a significant source of cache-line ping-ponging. The problem is largely + * worked around by modifying a thread-local counter instead when the cred to + * operate on matches td_realucred. + * + * The counter is split into 2 parts: + * - cr_users -- total count of all struct proc and struct thread objects + * which have the given cred in p_ucred and td_ucred respectively + * - cr_ref -- the actual ref count + * + * If cr_users == 0 then cr_ref behaves similarly to refcount(9); in particular + * if the count reaches 0, the object is freeable. + * If cr_users > 0 and curthread->td_realucred == cred, then updates are + * performed against td_ucredref. + * Changing td_realucred into something else decrements cr_users and transfers + * the accumulated updates. Should the new cr_users count be 0, the resulting + * cr_ref represents the total count of all references. Otherwise cr_ref + * remains meaningless in isolation. 
+ */ +struct ucred * +crcowget(struct ucred *cr) +{ + + mtx_lock(&cr->cr_mtx); + KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", + __func__, cr->cr_users, cr)); + cr->cr_users++; /* one more proc/thread using cr */ + cr->cr_ref++; /* ... holding its own reference */ + mtx_unlock(&cr->cr_mtx); + return (cr); +} + +static struct ucred * +crunuse(struct thread *td) +{ + struct ucred *cr, *crold; + + cr = td->td_realucred; /* td_ucredref is relative to this */ + mtx_lock(&cr->cr_mtx); + cr->cr_ref += td->td_ucredref; /* settle the thread-local delta */ + td->td_ucredref = 0; + KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", + __func__, cr->cr_users, cr)); + cr->cr_users--; + if (cr->cr_users == 0) { + KASSERT(cr->cr_ref > 0, ("%s: ref %d not > 0 on cred %p", + __func__, cr->cr_ref, cr)); + crold = cr; /* last user: caller does the crfree() */ + } else { + cr->cr_ref--; /* drop td's reference right here */ + crold = NULL; + } + mtx_unlock(&cr->cr_mtx); + return (crold); +} + +void +crcowfree(struct thread *td) +{ + struct ucred *cr; + + cr = crunuse(td); /* non-NULL when td was the last user */ + if (cr != NULL) + crfree(cr); +} + +struct ucred * +crcowsync(void) +{ + struct thread *td; + struct proc *p; + struct ucred *crnew, *crold; + + td = curthread; + p = td->td_proc; + PROC_LOCK_ASSERT(p, MA_OWNED); + + MPASS(td->td_realucred == td->td_ucred); + if (td->td_realucred == p->p_ucred) + return (NULL); /* already in sync with the process */ + + crnew = crcowget(p->p_ucred); /* start using the new cred */ + crold = crunuse(td); /* stop using the old one */ + td->td_realucred = crnew; + td->td_ucred = td->td_realucred; + return (crold); /* cred the caller must crfree(), or NULL */ +} + /* * Allocate a zeroed cred structure. 
*/ @@ -1850,7 +1942,8 @@ struct ucred *cr; cr = malloc(sizeof(*cr), M_CRED, M_WAITOK | M_ZERO); - refcount_init(&cr->cr_ref, 1); + mtx_init(&cr->cr_mtx, "cred", NULL, MTX_DEF); + cr->cr_ref = 1; #ifdef AUDIT audit_cred_init(cr); #endif @@ -1869,8 +1962,18 @@ struct ucred * crhold(struct ucred *cr) { + struct thread *td; - refcount_acquire(&cr->cr_ref); + td = curthread; + if (__predict_true(td->td_realucred == cr)) { + KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", + __func__, cr->cr_users, cr)); + td->td_ucredref++; + return (cr); + } + mtx_lock(&cr->cr_mtx); + cr->cr_ref++; + mtx_unlock(&cr->cr_mtx); return (cr); } @@ -1880,36 +1983,51 @@ void crfree(struct ucred *cr) { + struct thread *td; - KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref)); - KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred")); - if (refcount_release(&cr->cr_ref)) { - /* - * Some callers of crget(), such as nfs_statfs(), - * allocate a temporary credential, but don't - * allocate a uidinfo structure. - */ - if (cr->cr_uidinfo != NULL) - uifree(cr->cr_uidinfo); - if (cr->cr_ruidinfo != NULL) - uifree(cr->cr_ruidinfo); - /* - * Free a prison, if any. - */ - if (cr->cr_prison != NULL) - prison_free(cr->cr_prison); - if (cr->cr_loginclass != NULL) - loginclass_free(cr->cr_loginclass); + td = curthread; + if (td->td_realucred == cr) { + KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", + __func__, cr->cr_users, cr)); + td->td_ucredref--; + return; + } + mtx_lock(&cr->cr_mtx); + KASSERT(cr->cr_users >= 0, ("%s: users %d not >= 0 on cred %p", + __func__, cr->cr_users, cr)); + cr->cr_ref--; + if (cr->cr_users > 0) { + mtx_unlock(&cr->cr_mtx); + return; + } + KASSERT(cr->cr_ref >= 0, ("%s: ref %d not >= 0 on cred %p", + __func__, cr->cr_ref, cr)); + if (cr->cr_ref > 0) { + mtx_unlock(&cr->cr_mtx); + return; + } + /* + * Some callers of crget(), such as nfs_statfs(), allocate a temporary + * credential, but don't allocate a uidinfo structure. 
+ */ + if (cr->cr_uidinfo != NULL) + uifree(cr->cr_uidinfo); + if (cr->cr_ruidinfo != NULL) + uifree(cr->cr_ruidinfo); + if (cr->cr_prison != NULL) + prison_free(cr->cr_prison); + if (cr->cr_loginclass != NULL) + loginclass_free(cr->cr_loginclass); #ifdef AUDIT - audit_cred_destroy(cr); + audit_cred_destroy(cr); #endif #ifdef MAC - mac_cred_destroy(cr); + mac_cred_destroy(cr); #endif - if (cr->cr_groups != cr->cr_smallgroups) - free(cr->cr_groups, M_CRED); - free(cr, M_CRED); - } + mtx_destroy(&cr->cr_mtx); + if (cr->cr_groups != cr->cr_smallgroups) + free(cr->cr_groups, M_CRED); + free(cr, M_CRED); } /* @@ -1983,7 +2101,7 @@ proc_set_cred_init(struct proc *p, struct ucred *newcred) { - p->p_ucred = newcred; + p->p_ucred = crcowget(newcred); } /* @@ -1999,16 +2117,39 @@ void proc_set_cred(struct proc *p, struct ucred *newcred) { + struct ucred *cr; - MPASS(p->p_ucred != NULL); - if (newcred == NULL) - MPASS(p->p_state == PRS_ZOMBIE); - else - PROC_LOCK_ASSERT(p, MA_OWNED); - + cr = p->p_ucred; + MPASS(cr != NULL); + PROC_LOCK_ASSERT(p, MA_OWNED); + KASSERT(newcred->cr_users == 0, ("%s: users %d not 0 on cred %p", + __func__, newcred->cr_users, newcred)); + mtx_lock(&cr->cr_mtx); + KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", + __func__, cr->cr_users, cr)); + cr->cr_users--; + mtx_unlock(&cr->cr_mtx); p->p_ucred = newcred; - if (newcred != NULL) - PROC_UPDATE_COW(p); + newcred->cr_users = 1; + PROC_UPDATE_COW(p); +} + +void +proc_unset_cred(struct proc *p) +{ + struct ucred *cr; + + MPASS(p->p_state == PRS_ZOMBIE); + cr = p->p_ucred; + KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", + __func__, cr->cr_users, cr)); + mtx_lock(&cr->cr_mtx); + cr->cr_users--; + if (cr->cr_users == 0) + KASSERT(cr->cr_ref > 0, ("%s: ref %d not > 0 on cred %p", + __func__, cr->cr_ref, cr)); + mtx_unlock(&cr->cr_mtx); + crfree(cr); } struct ucred * Index: sys/kern/kern_thread.c =================================================================== --- 
sys/kern/kern_thread.c +++ sys/kern/kern_thread.c @@ -465,7 +465,8 @@ { PROC_LOCK_ASSERT(p, MA_OWNED); - newtd->td_ucred = crhold(p->p_ucred); + newtd->td_realucred = crcowget(p->p_ucred); /* user + reference */ + newtd->td_ucred = newtd->td_realucred; /* active == real */ newtd->td_limit = lim_hold(p->p_limit); newtd->td_cowgen = p->p_cowgen; } @@ -474,7 +475,9 @@ thread_cow_get(struct thread *newtd, struct thread *td) { - newtd->td_ucred = crhold(td->td_ucred); + MPASS(td->td_realucred == td->td_ucred); /* no switched cred */ + newtd->td_realucred = crcowget(td->td_realucred); /* user + reference */ + newtd->td_ucred = newtd->td_realucred; /* active == real */ newtd->td_limit = lim_hold(td->td_limit); newtd->td_cowgen = td->td_cowgen; } @@ -483,8 +486,8 @@ thread_cow_free(struct thread *td) { - if (td->td_ucred != NULL) - crfree(td->td_ucred); + if (td->td_realucred != NULL) + crcowfree(td); /* drop td's use of its cred */ if (td->td_limit != NULL) lim_free(td->td_limit); } @@ -497,13 +500,9 @@ struct plimit *oldlimit; p = td->td_proc; - oldcred = NULL; oldlimit = NULL; PROC_LOCK(p); - if (td->td_ucred != p->p_ucred) { - oldcred = td->td_ucred; - td->td_ucred = crhold(p->p_ucred); - } + oldcred = crcowsync(); /* NULL when already in sync */ if (td->td_limit != p->p_limit) { oldlimit = td->td_limit; td->td_limit = lim_hold(p->p_limit); Index: sys/sys/proc.h =================================================================== --- sys/sys/proc.h +++ sys/sys/proc.h @@ -268,7 +268,8 @@ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ int td_pinned; /* (k) Temporary cpu pin count. */ - struct ucred *td_ucred; /* (k) Reference to credentials. */ + struct ucred *td_realucred; /* (k) Reference to credentials. */ + struct ucred *td_ucred; /* (k) Used credentials, temporarily switchable. */ struct plimit *td_limit; /* (k) Resource limits. */ int td_slptick; /* (t) Time at sleep. */ int td_blktick; /* (t) Time spent blocked. */ @@ -306,6 +307,7 @@ int td_errno; /* (k) Error from last syscall. 
 */ size_t td_vslock_sz; /* (k) amount of vslock-ed space */ struct kcov_info *td_kcov_info; /* (*) Kernel code coverage data */ + u_int td_ucredref; /* (k) references on td_realucred */ #define td_endzero td_sigmask /* Copied during fork1() or create_thread(). */ Index: sys/sys/ucred.h =================================================================== --- sys/sys/ucred.h +++ sys/sys/ucred.h @@ -35,6 +35,8 @@ #ifndef _SYS_UCRED_H_ #define _SYS_UCRED_H_ +#include +#include #include struct loginclass; @@ -49,7 +51,9 @@ */ #if defined(_KERNEL) || defined(_WANT_UCRED) struct ucred { + struct mtx cr_mtx; /* held around cr_ref/cr_users updates */ u_int cr_ref; /* reference count */ + u_int cr_users; /* proc + thread using this cred */ #define cr_startcopy cr_uid uid_t cr_uid; /* effective user id */ uid_t cr_ruid; /* real user id */ @@ -113,9 +117,13 @@ void crextend(struct ucred *cr, int n); void proc_set_cred_init(struct proc *p, struct ucred *cr); void proc_set_cred(struct proc *p, struct ucred *cr); +void proc_unset_cred(struct proc *p); void crfree(struct ucred *cr); +struct ucred *crcowsync(void); struct ucred *crget(void); struct ucred *crhold(struct ucred *cr); +struct ucred *crcowget(struct ucred *cr); +void crcowfree(struct thread *td); void cru2x(struct ucred *cr, struct xucred *xcr); void cru2xt(struct thread *td, struct xucred *xcr); void crsetgroups(struct ucred *cr, int n, gid_t *groups);