Index: sys/kern/kern_rmlock.c
===================================================================
--- sys/kern/kern_rmlock.c
+++ sys/kern/kern_rmlock.c
@@ -53,6 +53,7 @@
 #include
 #include
 #include
+#include
 
 #ifdef DDB
 #include
@@ -853,3 +854,231 @@
 	lc->lc_ddb_show(&rm->rm_wlock_object);
 }
 #endif
+
+/*
+ * Read-mostly sleepable locks.
+ *
+ * These primitives allow both readers and writers to sleep. However, neither
+ * readers nor writers are tracked and consequently there is no priority
+ * propagation.
+ *
+ * They are intended to be used only when write-locking is almost never needed
+ * (e.g., they can guard against unloading a kernel module) while read-locking
+ * happens all the time.
+ *
+ * Concurrent writers take turns taking the lock while going off CPU. If this
+ * is of concern for your use case, this is not the right primitive.
+ *
+ * Neither rms_rlock nor rms_runlock uses fences. Instead, compiler barriers
+ * are inserted to prevent reordering of the generated code. Execution
+ * ordering is provided by an IPI handler.
+ */
+
+void
+rms_init(struct rmslock *rms, const char *name)
+{
+
+	rms->writers = 0;
+	rms->readers = 0;
+	mtx_init(&rms->mtx, name, NULL, MTX_DEF | MTX_NEW);
+	rms->readers_pcpu = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO);
+	rms->readers_influx = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO);
+}
+
+void
+rms_destroy(struct rmslock *rms)
+{
+
+	MPASS(rms->writers == 0);
+	MPASS(rms->readers == 0);
+	mtx_destroy(&rms->mtx);
+	uma_zfree_pcpu(pcpu_zone_int, rms->readers_pcpu);
+	uma_zfree_pcpu(pcpu_zone_int, rms->readers_influx);
+}
+
+static void __noinline
+rms_rlock_fallback(struct rmslock *rms)
+{
+
+	(*(int *)zpcpu_get(rms->readers_influx)) = 0;
+	critical_exit();
+
+	mtx_lock(&rms->mtx);
+	MPASS((*(int *)zpcpu_get(rms->readers_pcpu)) == 0);
+	while (rms->writers > 0)
+		msleep(&rms->readers, &rms->mtx, PUSER - 1,
+		    mtx_name(&rms->mtx), 0);
+	(*(int *)zpcpu_get(rms->readers_pcpu))++;
+	mtx_unlock(&rms->mtx);
+}
+
+void
+rms_rlock(struct rmslock *rms)
+{
+
+	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
+
+	critical_enter();
+	/* Tell the wlock IPI to back off and retry if it interrupts us here. */
+	(*(int *)zpcpu_get(rms->readers_influx)) = 1;
+	__compiler_membar();
+	if (__predict_false(rms->writers > 0)) {
+		rms_rlock_fallback(rms);
+		return;
+	}
+	__compiler_membar();
+	(*(int *)zpcpu_get(rms->readers_pcpu))++;
+	__compiler_membar();
+	(*(int *)zpcpu_get(rms->readers_influx)) = 0;
+	critical_exit();
+}
+
+static void __noinline
+rms_runlock_fallback(struct rmslock *rms)
+{
+
+	(*(int *)zpcpu_get(rms->readers_influx)) = 0;
+	critical_exit();
+
+	mtx_lock(&rms->mtx);
+	MPASS(*(int *)zpcpu_get(rms->readers_pcpu) == 0);
+	MPASS(rms->writers > 0);
+	MPASS(rms->readers > 0);
+	rms->readers--;
+	if (rms->readers == 0)
+		wakeup_one(&rms->writers);
+	mtx_unlock(&rms->mtx);
+}
+
+void
+rms_runlock(struct rmslock *rms)
+{
+
+	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
+
+	critical_enter();
+	(*(int *)zpcpu_get(rms->readers_influx)) = 1;
+	__compiler_membar();
+	if (__predict_false(rms->writers > 0)) {
+		rms_runlock_fallback(rms);
+		return;
+	}
+	__compiler_membar();
+	(*(int *)zpcpu_get(rms->readers_pcpu))--;
+	__compiler_membar();
+	(*(int *)zpcpu_get(rms->readers_influx)) = 0;
+	critical_exit();
+}
+
+struct rmslock_ipi {
+	struct rmslock *rms;
+	int failed;
+};
+
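+/*
+ * IPI handler used by rms_wlock_switch(). If the interrupted CPU is inside
+ * the rlock/runlock fast path (readers_influx is set), report failure so
+ * that the caller retries. Otherwise fold this CPU's reader count into the
+ * lock-wide rms->readers counter.
+ */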
+static void
+rms_wlock_IPI(void *arg)
+{
+	struct rmslock_ipi *rmsipi;
+	struct rmslock *rms;
+	int readers;
+
+	rmsipi = arg;
+	rms = rmsipi->rms;
+
+	if ((*(int *)zpcpu_get(rms->readers_influx))) {
+		atomic_add_int(&rmsipi->failed, 1);
+		return;
+	}
+	readers = zpcpu_replace_cpu(rms->readers_pcpu, 0, curcpu);
+	if (readers != 0)
+		atomic_add_int(&rms->readers, readers);
+}
+
+static void
+rms_wlock_switch(struct rmslock *rms)
+{
+	struct rmslock_ipi rmsipi;
+	int *in_op;
+	int cpu;
+
+	MPASS(rms->readers == 0);
+	MPASS(rms->writers == 1);
+
+	rmsipi.rms = rms;
+
+	/*
+	 * Publishes rms->writers. rlock and runlock will get this ordered
+	 * via IPI in the worst case.
+	 */
+	atomic_thread_fence_rel();
+
+	/*
+	 * Collect reader counts from all CPUs using an IPI. The handler can
+	 * find itself running while the interrupted CPU was doing either
+	 * rlock or runlock, in which case it will fail and note that fact.
+	 *
+	 * In case of failure we wait until every CPU which had readers_influx
+	 * set has cleared it before making the next attempt. Note that threads
+	 * which have the variable set run with preemption disabled. Setting of
+	 * readers_influx only uses compiler barriers, making these loads
+	 * unreliable, which is fine -- the IPI handler will always see the
+	 * correct result.
+	 *
+	 * We retry until all counts are collected. Forward progress is
+	 * guaranteed by the fact that the total number of threads which can
+	 * be caught like this is finite and they all are going to block on
+	 * their own.
+	 */
+	for (;;) {
+		rmsipi.failed = 0;
+
+		smp_rendezvous(smp_no_rendezvous_barrier,
+		    rms_wlock_IPI,
+		    smp_no_rendezvous_barrier,
+		    &rmsipi);
+
+		if (rmsipi.failed == 0)
+			break;
+
+		CPU_FOREACH(cpu) {
+			in_op = zpcpu_get_cpu(rms->readers_influx, cpu);
+			while (atomic_load_int(in_op))
+				cpu_spinwait();
+		}
+	}
+}
+
+void
+rms_wlock(struct rmslock *rms)
+{
+
+	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
+
+	mtx_lock(&rms->mtx);
+	rms->writers++;
+	if (rms->writers > 1) {
+		msleep(&rms->writers, &rms->mtx, (PUSER - 1) | PDROP,
+		    mtx_name(&rms->mtx), 0);
+		MPASS(rms->readers == 0);
+		return;
+	}
+
+	rms_wlock_switch(rms);
+
+	if (rms->readers > 0)
+		msleep(&rms->writers, &rms->mtx, (PUSER - 1) | PDROP,
+		    mtx_name(&rms->mtx), 0);
+	else
+		mtx_unlock(&rms->mtx);
+	MPASS(rms->readers == 0);
+}
+
+void
+rms_wunlock(struct rmslock *rms)
+{
+
+	mtx_lock(&rms->mtx);
+	MPASS(rms->writers >= 1);
+	MPASS(rms->readers == 0);
+	rms->writers--;
+	if (rms->writers > 0)
+		wakeup_one(&rms->writers);
+	else
+		wakeup(&rms->readers);
+	mtx_unlock(&rms->mtx);
+}
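For illustration, a minimal consumer of the new interface could look like the
sketch below. This is not part of the patch: the frob_* names and struct
frob_conf are hypothetical, and only the rms_* functions added above (and
declared in sys/sys/rmlock.h further down) are taken from the change. The read
side runs constantly and is allowed to sleep, while the write side is expected
to be rare, e.g. at reconfiguration or module unload time.

/*
 * Illustrative sketch only: a read-mostly structure whose readers may sleep
 * (here, in copyout()) and whose writer runs only on rare reconfiguration.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/rmlock.h>

/* Hypothetical read-mostly data; not part of the patch. */
struct frob_conf {
	char	data[128];
	size_t	len;
};

static struct rmslock frob_lock;
static struct frob_conf *frob_conf;

static void
frob_init(void *dummy __unused)
{

	rms_init(&frob_lock, "frob_lock");
}
SYSINIT(frob, SI_SUB_LOCK, SI_ORDER_ANY, frob_init, NULL);

/* Read path: taken all the time, may sleep inside the section. */
static int
frob_read(void *ubuf, size_t len)
{
	int error;

	rms_rlock(&frob_lock);
	error = copyout(frob_conf->data, ubuf, MIN(len, frob_conf->len));
	rms_runlock(&frob_lock);
	return (error);
}

/* Write path: rare; serializes against all readers and other writers. */
static void
frob_replace(struct frob_conf *new)
{
	struct frob_conf *old;

	rms_wlock(&frob_lock);
	old = frob_conf;
	frob_conf = new;
	rms_wunlock(&frob_lock);
	free(old, M_TEMP);
}

Since rms_wlock provides no priority propagation and concurrent writers
serialize by sleeping, the write side should stay off any hot path. A matching
rms_destroy() would be called once no readers or writers can appear anymore,
e.g. from a module unload handler.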
Index: sys/security/mac/mac_framework.c
===================================================================
--- sys/security/mac/mac_framework.c
+++ sys/security/mac/mac_framework.c
@@ -176,6 +176,7 @@
 #ifndef MAC_STATIC
 static struct rmlock mac_policy_rm;	/* Non-sleeping entry points. */
 static struct sx mac_policy_sx;		/* Sleeping entry points. */
+static struct rmslock mac_policy_rms;
 #endif
 
 struct mac_policy_list_head mac_policy_list;
@@ -209,7 +210,7 @@
 	if (!mac_late)
 		return;
 
-	sx_slock(&mac_policy_sx);
+	rms_rlock(&mac_policy_rms);
 #endif
 }
@@ -233,7 +234,7 @@
 	if (!mac_late)
 		return;
 
-	sx_sunlock(&mac_policy_sx);
+	rms_runlock(&mac_policy_rms);
 #endif
 }
@@ -249,6 +250,7 @@
 		return;
 
 	sx_xlock(&mac_policy_sx);
+	rms_wlock(&mac_policy_rms);
 	rm_wlock(&mac_policy_rm);
 #endif
 }
@@ -262,6 +264,7 @@
 		return;
 
 	rm_wunlock(&mac_policy_rm);
+	rms_wunlock(&mac_policy_rms);
 	sx_xunlock(&mac_policy_sx);
 #endif
 }
@@ -294,6 +297,7 @@
 	rm_init_flags(&mac_policy_rm, "mac_policy_rm", RM_NOWITNESS | RM_RECURSE);
 	sx_init_flags(&mac_policy_sx, "mac_policy_sx", SX_NOWITNESS);
+	rms_init(&mac_policy_rms, "mac_policy_rms");
 #endif
 }
Index: sys/sys/_rmlock.h
===================================================================
--- sys/sys/_rmlock.h
+++ sys/sys/_rmlock.h
@@ -68,4 +68,14 @@
 	LIST_ENTRY(rm_priotracker) rmp_qentry;
 };
 
+#include
+
+struct rmslock {
+	struct mtx	mtx;
+	int		writers;
+	int		readers;
+	int		*readers_pcpu;
+	int		*readers_influx;
+};
+
 #endif /* !_SYS__RMLOCK_H_ */
Index: sys/sys/rmlock.h
===================================================================
--- sys/sys/rmlock.h
+++ sys/sys/rmlock.h
@@ -133,5 +133,12 @@
 #define	rm_assert(rm, what)
 #endif
 
+void rms_init(struct rmslock *rms, const char *name);
+void rms_destroy(struct rmslock *rms);
+void rms_rlock(struct rmslock *rms);
+void rms_runlock(struct rmslock *rms);
+void rms_wlock(struct rmslock *rms);
+void rms_wunlock(struct rmslock *rms);
+
 #endif /* _KERNEL */
 #endif /* !_SYS_RMLOCK_H_ */
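To make the resulting lock nesting in mac_framework.c explicit, the exclusive
(policy load/unload) path now looks roughly as follows. This is an illustrative
sketch, not part of the patch; the hunks above do not show the enclosing
function names, so the ones used here are hypothetical.

/*
 * Sketch of the exclusive path after this change; hypothetical names.
 */
static void
policy_exclusive_enter(void)
{

	if (!mac_late)
		return;

	sx_xlock(&mac_policy_sx);	/* serialize exclusive holders; may sleep */
	rms_wlock(&mac_policy_rms);	/* drain sleepable read sections */
	rm_wlock(&mac_policy_rm);	/* drain non-sleeping read sections */
}

static void
policy_exclusive_exit(void)
{

	if (!mac_late)
		return;

	rm_wunlock(&mac_policy_rm);
	rms_wunlock(&mac_policy_rms);
	sx_xunlock(&mac_policy_sx);
}

Read-side entry points keep picking the primitive that matches their sleeping
requirements: rm_rlock for non-sleeping hooks and, after this change, rms_rlock
instead of sx_slock for sleeping ones.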