Changeset View
Changeset View
Standalone View
Standalone View
sys/crypto/blake2/blake2_cryptodev.c
Show All 23 Lines | |||||
* SUCH DAMAGE. | * SUCH DAMAGE. | ||||
*/ | */ | ||||
#include <sys/cdefs.h> | #include <sys/cdefs.h> | ||||
__FBSDID("$FreeBSD$"); | __FBSDID("$FreeBSD$"); | ||||
#include <sys/param.h> | #include <sys/param.h> | ||||
#include <sys/bus.h> | #include <sys/bus.h> | ||||
#include <sys/domainset.h> | |||||
#include <sys/kernel.h> | #include <sys/kernel.h> | ||||
#include <sys/kobj.h> | #include <sys/kobj.h> | ||||
#include <sys/lock.h> | #include <sys/lock.h> | ||||
#include <sys/malloc.h> | #include <sys/malloc.h> | ||||
#include <sys/module.h> | #include <sys/module.h> | ||||
#include <sys/mutex.h> | #include <sys/mutex.h> | ||||
#include <sys/rwlock.h> | #include <sys/rwlock.h> | ||||
#include <sys/smp.h> | #include <sys/smp.h> | ||||
▲ Show 20 Lines • Show All 96 Lines • ▼ Show 20 Lines | blake2_attach(device_t dev) | ||||
} | } | ||||
ctx_mtx = malloc(sizeof(*ctx_mtx) * (mp_maxid + 1), M_BLAKE2, | ctx_mtx = malloc(sizeof(*ctx_mtx) * (mp_maxid + 1), M_BLAKE2, | ||||
M_WAITOK | M_ZERO); | M_WAITOK | M_ZERO); | ||||
ctx_fpu = malloc(sizeof(*ctx_fpu) * (mp_maxid + 1), M_BLAKE2, | ctx_fpu = malloc(sizeof(*ctx_fpu) * (mp_maxid + 1), M_BLAKE2, | ||||
M_WAITOK | M_ZERO); | M_WAITOK | M_ZERO); | ||||
CPU_FOREACH(i) { | CPU_FOREACH(i) { | ||||
ctx_fpu[i] = fpu_kern_alloc_ctx(0); | ctx_fpu[i] = fpu_kern_alloc_ctx_domainset( | ||||
DOMAINSET_PREF(pcpu_find(i)->pc_domain), FPU_KERN_NORMAL); | |||||
mtx_init(&ctx_mtx[i], "bl2fpumtx", NULL, MTX_DEF | MTX_NEW); | mtx_init(&ctx_mtx[i], "bl2fpumtx", NULL, MTX_DEF | MTX_NEW); | ||||
markj: You might want to use DPCPU for the mutexes while you're here, so that they automatically get… | |||||
cem (Author) · Unsubmitted · Done · Inline Actions — Rather than do that in N places, I'd rather get this in, centralize the whole 'accelerated code using PCPU FPU contexts and locks' idiom, and refactor aesni and blake2 to both use it. We don't need per-driver versions of these. cem: Rather than do that in N places, I'd rather get this in, centralize the whole 'accelerated code… | |||||
markj · Unsubmitted · Not Done · Inline Actions — Sounds reasonable to me. markj: Sounds reasonable to me. | |||||
} | } | ||||
rw_init(&sc->lock, "blake2_lock"); | rw_init(&sc->lock, "blake2_lock"); | ||||
return (0); | return (0); | ||||
} | } | ||||
static int | static int | ||||
▲ Show 20 Lines • Show All 259 Lines • Show Last 20 Lines |
You might want to use DPCPU for the mutexes while you're here, so that they automatically get placed on domain-local memory. I see now that this doesn't quite work for the context structures, since their size isn't known at compile time (at least on x86), so you'd have to define a maximum size for the FPU context.