Index: head/share/man/man9/rmlock.9
===================================================================
--- head/share/man/man9/rmlock.9
+++ head/share/man/man9/rmlock.9
@@ -26,7 +26,7 @@
 .\" $FreeBSD$
 .\"
 .\" Based on rwlock.9 man page
-.Dd November 11, 2017
+.Dd December 27, 2019
 .Dt RMLOCK 9
 .Os
 .Sh NAME
@@ -43,7 +43,13 @@
 .Nm rm_sleep ,
 .Nm rm_assert ,
 .Nm RM_SYSINIT ,
-.Nm RM_SYSINIT_FLAGS
+.Nm RM_SYSINIT_FLAGS ,
+.Nm rms_init ,
+.Nm rms_destroy ,
+.Nm rms_rlock ,
+.Nm rms_wlock ,
+.Nm rms_runlock ,
+.Nm rms_wunlock
 .Nd kernel reader/writer lock optimized for read-mostly access patterns
 .Sh SYNOPSIS
 .In sys/param.h
@@ -77,6 +83,18 @@
 .In sys/kernel.h
 .Fn RM_SYSINIT "name" "struct rmlock *rm" "const char *desc"
 .Fn RM_SYSINIT_FLAGS "name" "struct rmlock *rm" "const char *desc" "int flags"
+.Ft void
+.Fn rms_init "struct rmslock *rms" "const char *name"
+.Ft void
+.Fn rms_destroy "struct rmslock *rms"
+.Ft void
+.Fn rms_rlock "struct rmslock *rms"
+.Ft void
+.Fn rms_wlock "struct rmslock *rms"
+.Ft void
+.Fn rms_runlock "struct rmslock *rms"
+.Ft void
+.Fn rms_wunlock "struct rmslock *rms"
 .Sh DESCRIPTION
 Read-mostly locks allow shared access to protected data by multiple threads,
 or exclusive access by a single thread.
@@ -113,22 +131,22 @@
 option; however, writers are never allowed to recurse.
 .Pp
-Sleepable read-mostly locks are created by passing
+Writers can be allowed to sleep by passing
 .Dv RM_SLEEPABLE
 to
 .Fn rm_init_flags .
-Unlike normal read-mostly locks,
-sleepable read-mostly locks follow the same lock ordering rules as
+This changes the lock ordering rules to match those of
 .Xr sx 9
 locks.
-Sleepable read-mostly locks do not propagate priority to writers,
-but they do propagate priority to readers.
-Writers are permitted to sleep while holding a read-mostly lock,
-but readers are not.
-Unlike other sleepable locks such as
+Such locks do not propagate priority to writers,
+but they do propagate priority to readers.
+Note that readers are not permitted to sleep regardless of this flag.
+.Pp
+Sleepable read-mostly locks (created with
+.Fn rms_init )
+allow sleeping for both readers and writers,
+but do not propagate priority to either.
+They follow
 .Xr sx 9
-locks,
-readers must use try operations on other sleepable locks to avoid sleeping.
+lock ordering.
 .Ss Macros and Functions
 .Bl -tag -width indent
 .It Fn rm_init "struct rmlock *rm" "const char *name"
@@ -286,6 +304,43 @@
 .Fa rm .
 .El
 .El
+.Bl -tag -width indent
+.It Fn rms_init "struct rmslock *rms" "const char *name"
+Initialize the sleepable read-mostly lock
+.Fa rms .
+The
+.Fa name
+description is used as the
+.Fa wmesg
+parameter to the
+.Xr msleep 9
+routine.
+This function must be called before any other operations on the lock.
+.It Fn rms_rlock "struct rmslock *rms"
+Lock
+.Fa rms
+as a reader.
+If any thread holds this lock exclusively, the current thread blocks.
+.It Fn rms_wlock "struct rmslock *rms"
+Lock
+.Fa rms
+as a writer.
+If the lock is already taken, the current thread blocks.
+The
+.Fn rms_wlock
+function cannot be called recursively.
+.It Fn rms_runlock "struct rmslock *rms"
+This function releases a shared lock previously acquired by
+.Fn rms_rlock .
+.It Fn rms_wunlock "struct rmslock *rms"
+This function releases an exclusive lock previously acquired by
+.Fn rms_wlock .
+.It Fn rms_destroy "struct rmslock *rms"
+This function destroys a lock previously initialized with
+.Fn rms_init .
+The
+.Fa rms
+lock must be unlocked.
+.El
 .Sh SEE ALSO
 .Xr locking 9 ,
 .Xr mutex 9 ,
Index: head/sys/kern/kern_rmlock.c
===================================================================
--- head/sys/kern/kern_rmlock.c
+++ head/sys/kern/kern_rmlock.c
@@ -53,6 +53,7 @@
 #include
 #include
 #include
+#include <vm/uma.h>
 
 #ifdef DDB
 #include
@@ -853,3 +854,241 @@
 	lc->lc_ddb_show(&rm->rm_wlock_object);
 }
 #endif
+
+/*
+ * Read-mostly sleepable locks.
+ *
+ * These primitives allow both readers and writers to sleep.  However, neither
+ * readers nor writers are tracked and consequently there is no priority
+ * propagation.
+ *
+ * They are intended to be used only when write-locking is almost never needed
+ * (e.g., they can guard against unloading a kernel module) while read-locking
+ * happens all the time.
+ *
+ * Concurrent writers take turns taking the lock, going off CPU while they
+ * wait.  If this is of concern for your use case, this is not the right
+ * primitive.
+ *
+ * Neither rms_rlock nor rms_runlock uses fences.  Instead, compiler barriers
+ * are inserted to prevent reordering of the generated code.  Execution
+ * ordering is provided by an IPI handler.
+ */
+
+void
+rms_init(struct rmslock *rms, const char *name)
+{
+
+	rms->writers = 0;
+	rms->readers = 0;
+	mtx_init(&rms->mtx, name, NULL, MTX_DEF | MTX_NEW);
+	rms->readers_pcpu = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO);
+	rms->readers_influx = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO);
+}
+
+void
+rms_destroy(struct rmslock *rms)
+{
+
+	MPASS(rms->writers == 0);
+	MPASS(rms->readers == 0);
+	mtx_destroy(&rms->mtx);
+	uma_zfree_pcpu(pcpu_zone_int, rms->readers_pcpu);
+	uma_zfree_pcpu(pcpu_zone_int, rms->readers_influx);
+}
+
+static void __noinline
+rms_rlock_fallback(struct rmslock *rms)
+{
+
+	(*zpcpu_get(rms->readers_influx)) = 0;
+	critical_exit();
+
+	mtx_lock(&rms->mtx);
+	MPASS(*zpcpu_get(rms->readers_pcpu) == 0);
+	while (rms->writers > 0)
+		msleep(&rms->readers, &rms->mtx, PUSER - 1,
+		    mtx_name(&rms->mtx), 0);
+	(*zpcpu_get(rms->readers_pcpu))++;
+	mtx_unlock(&rms->mtx);
+}
+
+void
+rms_rlock(struct rmslock *rms)
+{
+	int *influx;
+
+	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
+
+	critical_enter();
+	influx = zpcpu_get(rms->readers_influx);
+	__compiler_membar();
+	*influx = 1;
+	__compiler_membar();
+	if (__predict_false(rms->writers > 0)) {
+		rms_rlock_fallback(rms);
+		return;
+	}
+	__compiler_membar();
+	(*zpcpu_get(rms->readers_pcpu))++;
+	__compiler_membar();
+	*influx = 0;
+	critical_exit();
+}
+
+static void __noinline
+rms_runlock_fallback(struct rmslock *rms)
+{
+
+	(*zpcpu_get(rms->readers_influx)) = 0;
+	critical_exit();
+
+	mtx_lock(&rms->mtx);
+	MPASS(*zpcpu_get(rms->readers_pcpu) == 0);
+	MPASS(rms->writers > 0);
+	MPASS(rms->readers > 0);
+	rms->readers--;
+	if (rms->readers == 0)
+		wakeup_one(&rms->writers);
+	mtx_unlock(&rms->mtx);
+}
+
+void
+rms_runlock(struct rmslock *rms)
+{
+	int *influx;
+
+	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
+
+	critical_enter();
+	influx = zpcpu_get(rms->readers_influx);
+	__compiler_membar();
+	*influx = 1;
+	__compiler_membar();
+	if (__predict_false(rms->writers > 0)) {
+		rms_runlock_fallback(rms);
+		return;
+	}
+	__compiler_membar();
+	(*zpcpu_get(rms->readers_pcpu))--;
+	__compiler_membar();
+	*influx = 0;
+	critical_exit();
+}
+
+struct rmslock_ipi {
+	struct rmslock *rms;
+	cpuset_t signal;
+};
+
+static void
+rms_wlock_IPI(void *arg)
+{
+	struct rmslock_ipi *rmsipi;
+	struct rmslock *rms;
+	int readers;
+
+	rmsipi = arg;
+	rms = rmsipi->rms;
+
+	if (*zpcpu_get(rms->readers_influx))
+		return;
+	readers = zpcpu_replace(rms->readers_pcpu, 0);
+	if (readers != 0)
+		atomic_add_int(&rms->readers, readers);
+	CPU_CLR_ATOMIC(curcpu, &rmsipi->signal);
+}
+
+static void
+rms_wlock_switch(struct rmslock *rms)
+{
+	struct rmslock_ipi rmsipi;
+	int *in_op;
+	int cpu;
+
+	MPASS(rms->readers == 0);
+	MPASS(rms->writers == 1);
+
+	rmsipi.rms = rms;
+
+	/*
+	 * Publishes rms->writers.  rlock and runlock will get this ordered
+	 * via IPI in the worst case.
+	 */
+	atomic_thread_fence_rel();
+
+	/*
+	 * Collect reader counts from all CPUs using an IPI.  The handler can
+	 * find itself running while the interrupted CPU was in the middle of
+	 * rlock or runlock, in which case the attempt fails.
+	 *
+	 * Successful attempts clear the cpu id in the bitmap.
+	 *
+	 * In case of failure we wait for each failing CPU to stop executing
+	 * the rlock/runlock code before making the next attempt.  Note that
+	 * threads which have readers_influx set run with preemption disabled.
+	 * Since readers_influx is only set with compiler barriers, these
+	 * loads are unreliable on their own, which is fine -- the IPI handler
+	 * will always see the correct result.
+	 *
+	 * We retry until all counts are collected.  Forward progress is
+	 * guaranteed by the fact that the total number of threads which can
+	 * be caught like this is finite and they are all going to block on
+	 * their own.
+	 */
+	CPU_COPY(&all_cpus, &rmsipi.signal);
+	for (;;) {
+		smp_rendezvous_cpus(
+		    rmsipi.signal,
+		    smp_no_rendezvous_barrier,
+		    rms_wlock_IPI,
+		    smp_no_rendezvous_barrier,
+		    &rmsipi);
+
+		if (CPU_EMPTY(&rmsipi.signal))
+			break;
+
+		CPU_FOREACH(cpu) {
+			if (!CPU_ISSET(cpu, &rmsipi.signal))
+				continue;
+			in_op = zpcpu_get_cpu(rms->readers_influx, cpu);
+			while (atomic_load_int(in_op))
+				cpu_spinwait();
+		}
+	}
+}
+
+void
+rms_wlock(struct rmslock *rms)
+{
+
+	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
+
+	mtx_lock(&rms->mtx);
+	rms->writers++;
+	if (rms->writers > 1) {
+		msleep(&rms->writers, &rms->mtx, (PUSER - 1) | PDROP,
+		    mtx_name(&rms->mtx), 0);
+		MPASS(rms->readers == 0);
+		return;
+	}
+
+	rms_wlock_switch(rms);
+
+	if (rms->readers > 0)
+		msleep(&rms->writers, &rms->mtx, (PUSER - 1) | PDROP,
+		    mtx_name(&rms->mtx), 0);
+	else
+		mtx_unlock(&rms->mtx);
+	MPASS(rms->readers == 0);
+}
+
+void
+rms_wunlock(struct rmslock *rms)
+{
+
+	mtx_lock(&rms->mtx);
+	MPASS(rms->writers >= 1);
+	MPASS(rms->readers == 0);
+	rms->writers--;
+	if (rms->writers > 0)
+		wakeup_one(&rms->writers);
+	else
+		wakeup(&rms->readers);
+	mtx_unlock(&rms->mtx);
+}
Index: head/sys/sys/_rmlock.h
===================================================================
--- head/sys/sys/_rmlock.h
+++ head/sys/sys/_rmlock.h
@@ -68,4 +68,14 @@
 	LIST_ENTRY(rm_priotracker) rmp_qentry;
 };
 
+#include
+
+struct rmslock {
+	struct mtx	 mtx;
+	int		 writers;
+	int		 readers;
+	int		*readers_pcpu;
+	int		*readers_influx;
+};
+
 #endif /* !_SYS__RMLOCK_H_ */
Index: head/sys/sys/rmlock.h
===================================================================
--- head/sys/sys/rmlock.h
+++ head/sys/sys/rmlock.h
@@ -133,5 +133,12 @@
 #define	rm_assert(rm, what)
 #endif
 
+void	rms_init(struct rmslock *rms, const char *name);
+void	rms_destroy(struct rmslock *rms);
+void	rms_rlock(struct rmslock *rms);
+void	rms_runlock(struct rmslock *rms);
+void	rms_wlock(struct rmslock *rms);
+void	rms_wunlock(struct rmslock *rms);
+
 #endif /* _KERNEL */
 #endif /* !_SYS_RMLOCK_H_ */
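
A minimal usage sketch of the new rms KPI follows, for illustration only and not part of the patch above.  All mymod_* names, the configuration structure, and the use of M_TEMP are hypothetical; the sketch assumes only the rms_init/rms_destroy/rms_rlock/rms_runlock/rms_wlock/rms_wunlock interface added here, with the read lock on the hot path and the write lock reserved for rare reconfiguration.

/*
 * Hypothetical consumer of the rms KPI (not part of this change); all
 * mymod_* names are made up for illustration.  A module publishes a
 * configuration structure and guards it with an rmslock: the hot lookup
 * path takes the read lock (and is allowed to sleep with it held), the
 * rare reconfiguration path takes the write lock.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/rmlock.h>

struct mymod_conf {
	int	limit;
};

static struct rmslock mymod_lock;
static struct mymod_conf *mymod_conf;

/* Called from the module's MOD_LOAD handler. */
static void
mymod_load(void)
{

	rms_init(&mymod_lock, "mymod");
	mymod_conf = malloc(sizeof(*mymod_conf), M_TEMP, M_WAITOK | M_ZERO);
}

/* Called from the module's MOD_UNLOAD handler; the lock must be unlocked. */
static void
mymod_unload(void)
{

	free(mymod_conf, M_TEMP);
	rms_destroy(&mymod_lock);
}

/* Hot path: runs all the time and only ever takes the read lock. */
static int
mymod_over_limit(int value)
{
	int over;

	rms_rlock(&mymod_lock);
	over = (value > mymod_conf->limit);
	rms_runlock(&mymod_lock);
	return (over);
}

/* Cold path: rms_wlock() returns only after all readers have drained. */
static void
mymod_set_limit(int limit)
{
	struct mymod_conf *new, *old;

	new = malloc(sizeof(*new), M_TEMP, M_WAITOK | M_ZERO);
	new->limit = limit;

	rms_wlock(&mymod_lock);
	old = mymod_conf;
	mymod_conf = new;
	rms_wunlock(&mymod_lock);

	free(old, M_TEMP);
}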
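
One ordering caveat worth illustrating, again not part of the patch: rms_rlock() and rms_runlock() may sleep (both are checked with WITNESS_WARN(WARN_SLEEPOK)), so they must not be acquired while a non-sleepable lock such as a mutex(9) is held, and ordering against other sleepable locks follows the sx(9) rules described in the man page change.  The obj_mtx and cfg_lock names below are hypothetical.

/*
 * Hypothetical ordering example (not part of the patch).  rms_rlock() may
 * sleep, so taking it with a mutex held is a bug and trips the
 * WITNESS_WARN(WARN_SLEEPOK) check; take the rms lock first instead.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rmlock.h>

static struct mtx obj_mtx;
static struct rmslock cfg_lock;

static void
wrong_order(void)
{

	mtx_lock(&obj_mtx);
	rms_rlock(&cfg_lock);	/* may sleep while obj_mtx is held: bug */
	rms_runlock(&cfg_lock);
	mtx_unlock(&obj_mtx);
}

static void
right_order(void)
{

	rms_rlock(&cfg_lock);	/* acquire the sleepable lock first */
	mtx_lock(&obj_mtx);
	/* ... access data protected by both locks ... */
	mtx_unlock(&obj_mtx);
	rms_runlock(&cfg_lock);
}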