Index: sys/dev/random/fortuna.h =================================================================== --- sys/dev/random/fortuna.h +++ sys/dev/random/fortuna.h @@ -36,12 +36,14 @@ #define RANDOM_RESEED_LOCK(x) mtx_lock(&fortuna_state.fs_mtx) #define RANDOM_RESEED_UNLOCK(x) mtx_unlock(&fortuna_state.fs_mtx) #define RANDOM_RESEED_ASSERT_LOCK_OWNED(x) mtx_assert(&fortuna_state.fs_mtx, MA_OWNED) +#define RANDOM_RESEED_ASSERT_LOCK_NOT_OWNED() mtx_assert(&fortuna_state.fs_mtx, MA_NOTOWNED) #else #define RANDOM_RESEED_INIT_LOCK(x) mtx_init(&fortuna_state.fs_mtx, mtx_plain) #define RANDOM_RESEED_DEINIT_LOCK(x) mtx_destroy(&fortuna_state.fs_mtx) #define RANDOM_RESEED_LOCK(x) mtx_lock(&fortuna_state.fs_mtx) #define RANDOM_RESEED_UNLOCK(x) mtx_unlock(&fortuna_state.fs_mtx) #define RANDOM_RESEED_ASSERT_LOCK_OWNED(x) +#define RANDOM_RESEED_ASSERT_LOCK_NOT_OWNED() #endif #endif /* SYS_DEV_RANDOM_FORTUNA_H_INCLUDED */ Index: sys/dev/random/fortuna.c =================================================================== --- sys/dev/random/fortuna.c +++ sys/dev/random/fortuna.c @@ -61,6 +61,7 @@ #include "unit_test.h" #endif /* _KERNEL */ +#include #include #include @@ -75,7 +76,10 @@ /* Defined in FS&K */ #define RANDOM_FORTUNA_NPOOLS 32 /* The number of accumulation pools */ #define RANDOM_FORTUNA_DEFPOOLSIZE 64 /* The default pool size/length for a (re)seed */ -#define RANDOM_FORTUNA_MAX_READ (1 << 20) /* Max bytes in a single read */ +#define RANDOM_FORTUNA_MAX_READ (1 << 20) /* Max bytes from AES before rekeying */ +#define RANDOM_FORTUNA_BLOCKS_PER_KEY (1 << 16) /* Max blocks from AES before rekeying */ +CTASSERT(RANDOM_FORTUNA_BLOCKS_PER_KEY * RANDOM_BLOCKSIZE == + RANDOM_FORTUNA_MAX_READ); /* * The allowable range of RANDOM_FORTUNA_DEFPOOLSIZE. The default value is above. @@ -120,6 +124,26 @@ mtx_t fs_mtx; } fortuna_state; +/* + * Experimental concurrent reads feature. For now, disabled by default. But + * we may enable it in the future. + * + * The benefit is improved concurrency in Fortuna. That is reflected in two + * related aspects: + * + * 1. Concurrent devrandom readers can achieve similar throughput to a single + * reader thread. + * + * 2. The rand_harvestq process spends much less time spinning when one or more + * readers is processing a large request. Partially this is due to + * rand_harvestq / ra_event_processor design, which only passes one event at + * a time to the underlying algorithm. Each time, Fortuna must take its + * global state mutex, potentially blocking on a reader. Our adaptive + * mutexes assume that a lock holder currently on CPU will release the lock + * quickly, and spin if the owning thread is currently running. + */ +static bool fortuna_concurrent_read __read_frequently = false; + #ifdef _KERNEL static struct sysctl_ctx_list random_clist; RANDOM_CHECK_UINT(fs_minpoolsize, RANDOM_FORTUNA_MINPOOLSIZE, RANDOM_FORTUNA_MAXPOOLSIZE); @@ -176,6 +200,11 @@ random_check_uint_fs_minpoolsize, "IU", "Minimum pool size necessary to cause a reseed"); KASSERT(fortuna_state.fs_minpoolsize > 0, ("random: Fortuna threshold must be > 0 at startup")); + + SYSCTL_ADD_BOOL(&random_clist, SYSCTL_CHILDREN(random_fortuna_o), + OID_AUTO, "concurrent_read", CTLFLAG_RDTUN, + &fortuna_concurrent_read, 0, "If non-zero, enable EXPERIMENTAL " + "feature to improve concurrent Fortuna performance."); #endif /*- @@ -305,48 +334,6 @@ uint128_increment(&fortuna_state.fs_counter); } -/*- - * FS&K - PseudoRandomData() - * - * If Chacha20 is used, output size is unrestricted. If AES-CTR is used, - * output size MUST be <= 1MB and a multiple of RANDOM_BLOCKSIZE. The - * reasoning for this is discussed in FS&K 9.4; the significant distinction - * between the two ciphers is that AES has a *block* size of 128 bits while - * Chacha has a *block* size of 512 bits. - */ -static __inline void -random_fortuna_genrandom(uint8_t *buf, size_t bytecount) -{ - uint8_t newkey[RANDOM_KEYSIZE]; - - RANDOM_RESEED_ASSERT_LOCK_OWNED(); - - /*- - * FS&K - assert(n < 2^20 (== 1 MB)) when 128-bit block cipher is used - * - r = first-n-bytes(GenerateBlocks(ceil(n/16))) - * - K = GenerateBlocks(2) - */ - KASSERT(random_chachamode || bytecount <= RANDOM_FORTUNA_MAX_READ, - ("%s: invalid large read request: %zu bytes", __func__, - bytecount)); - - /* - * This is where FS&K would invoke GenerateBlocks(). GenerateBlocks() - * doesn't make a lot of sense or have much value if we use bytecount - * for the API (which is useful for ciphers that do not require - * block-sized output, like Chacha20). - * - * Just invoke our PRF abstraction directly, which is responsible for - * updating fs_counter ('C'). - */ - randomdev_keystream(&fortuna_state.fs_key, &fortuna_state.fs_counter, - buf, bytecount); - randomdev_keystream(&fortuna_state.fs_key, &fortuna_state.fs_counter, - newkey, sizeof(newkey)); - randomdev_encrypt_init(&fortuna_state.fs_key, newkey); - explicit_bzero(newkey, sizeof(newkey)); -} - /*- * FS&K - RandomData() (Part 1) * Used to return processed entropy from the PRNG. There is a pre_read @@ -433,70 +420,144 @@ explicit_bzero(temp, sizeof(temp)); } -/*- - * FS&K - RandomData() (Part 2) - * Main read from Fortuna, continued. May be called multiple times after - * the random_fortuna_pre_read() above. +/* + * This is basically GenerateBlocks() from FS&K. * - * The supplied buf MAY not be a multiple of RANDOM_BLOCKSIZE in size; it is - * the responsibility of the algorithm to accommodate partial block reads, if a - * block output mode is used. + * It differs in two ways: + * + * 1. Chacha20 is tolerant of non-block-multiple request sizes, so we do not + * need to handle any remainder bytes specially and can just pass the length + * directly to the PRF construction; and + * + * 2. Chacha20 is a 512-bit block size cipher (whereas AES has 128-bit block + * size, regardless of key size). This means Chacha does not require re-keying + * every 1MiB. This is implied by the math in FS&K 9.4 and mentioned + * explicitly in the conclusion, "If we had a block cipher with a 256-bit [or + * greater] block size, then the collisions would not have been an issue at + * all" (p. 144). + * + * 3. In conventional ("locked") mode, we produce a maximum of PAGE_SIZE output + * at a time before dropping the lock, to not bully the lock especially. This + * has been the status quo since 2015 (r284959). + * + * The upstream caller random_fortuna_read is responsible for zeroing out + * sensitive buffers provided as parameters to this routine. */ -void -random_fortuna_read(uint8_t *buf, size_t bytecount) +enum { + FORTUNA_UNLOCKED = false, + FORTUNA_LOCKED = true +}; +static void +random_fortuna_genbytes(uint8_t *buf, size_t bytecount, + uint8_t newkey[static RANDOM_KEYSIZE], uint128_t *p_counter, + union randomdev_key *p_key, bool locked) { uint8_t remainder_buf[RANDOM_BLOCKSIZE]; - size_t read_directly_len, read_chunk; + size_t chunk_size; - /* - * The underlying AES generator expects multiples of RANDOM_BLOCKSIZE. - */ - if (random_chachamode) - read_directly_len = bytecount; + if (locked) + RANDOM_RESEED_ASSERT_LOCK_OWNED(); else - read_directly_len = rounddown(bytecount, RANDOM_BLOCKSIZE); + RANDOM_RESEED_ASSERT_LOCK_NOT_OWNED(); - RANDOM_RESEED_LOCK(); - KASSERT(!uint128_is_zero(fortuna_state.fs_counter), ("FS&K: C != 0")); + /* + * Easy case: don't have to worry about bullying the global mutex, + * don't have to worry about rekeying Chacha; API is byte-oriented. + */ + if (!locked && random_chachamode) { + randomdev_keystream(p_key, p_counter, buf, bytecount); + return; + } - while (read_directly_len > 0) { + if (locked) { /* - * 128-bit block ciphers like AES must be re-keyed at 1MB - * intervals to avoid unacceptable statistical differentiation - * from true random data. - * - * 512-bit block ciphers like Chacha20 do not have this - * problem. (FS&K 9.4) + * While holding the global lock, limit PRF generation to + * mitigate, but not eliminate, bullying symptoms. */ - if (random_chachamode) - read_chunk = read_directly_len; - else - read_chunk = MIN(read_directly_len, - RANDOM_FORTUNA_MAX_READ); - + chunk_size = PAGE_SIZE; + } else { /* - * For now, we hold the global Fortuna mutex, so yield - * periodically to provide vague availability to other lock - * users. PAGE_SIZE is chosen to match existing behavior. - */ - read_chunk = MIN(read_chunk, PAGE_SIZE); + * 128-bit block ciphers like AES must be re-keyed at 1MB + * intervals to avoid unacceptable statistical differentiation + * from true random data (FS&K 9.4, p. 143-144). + */ + MPASS(!random_chachamode); + chunk_size = RANDOM_FORTUNA_MAX_READ; + } - random_fortuna_genrandom(buf, read_chunk); - buf += read_chunk; - read_directly_len -= read_chunk; - bytecount -= read_chunk; + chunk_size = MIN(bytecount, chunk_size); + if (!random_chachamode) + chunk_size = rounddown(chunk_size, RANDOM_BLOCKSIZE); - /* Perform the actual yield. */ - if (read_directly_len != 0) { - RANDOM_RESEED_UNLOCK(); - RANDOM_RESEED_LOCK(); + while (bytecount >= chunk_size) { + randomdev_keystream(p_key, p_counter, buf, chunk_size); + + buf += chunk_size; + bytecount -= chunk_size; + + /* We have to rekey if there is any data remaining to be + * generated, in two scenarios: + * + * locked: we need to rekey before we unlock and release the + * global state to another consumer; or + * + * unlocked: we need to rekey because we're in AES mode and are + * required to rekey at chunk_size==1MB. But we do not need to + * rekey during the last trailing <1MB chunk. + */ + if (bytecount > 0) { + if (locked || chunk_size == RANDOM_FORTUNA_MAX_READ) { + randomdev_keystream(p_key, p_counter, newkey, + RANDOM_KEYSIZE); + randomdev_encrypt_init(p_key, newkey); + } + + /* + * If we're holding the global lock, yield it briefly + * now. + */ + if (locked) { + RANDOM_RESEED_UNLOCK(); + RANDOM_RESEED_LOCK(); + } + + /* + * At the trailing end, scale down chunk_size from 1MB or + * PAGE_SIZE to all remaining full blocks (AES) or all + * remaining bytes (Chacha). + */ + if (bytecount < chunk_size) { + if (random_chachamode) + chunk_size = bytecount; + else if (bytecount >= RANDOM_BLOCKSIZE) + chunk_size = rounddown(bytecount, + RANDOM_BLOCKSIZE); + else + break; + } } } - if (bytecount > 0) - random_fortuna_genrandom(remainder_buf, sizeof(remainder_buf)); + /* + * Generate any partial AES block remaining into a temporary buffer and + * copy the desired substring out. + */ + if (bytecount > 0) { + MPASS(!random_chachamode); - RANDOM_RESEED_UNLOCK(); + randomdev_keystream(p_key, p_counter, remainder_buf, + sizeof(remainder_buf)); + } + + /* + * In locked mode, re-key global K before dropping the lock, which we + * don't need for memcpy/bzero below. + */ + if (locked) { + randomdev_keystream(p_key, p_counter, newkey, RANDOM_KEYSIZE); + randomdev_encrypt_init(p_key, newkey); + RANDOM_RESEED_UNLOCK(); + } if (bytecount > 0) { memcpy(buf, remainder_buf, bytecount); @@ -504,6 +565,124 @@ } } + +/* + * Handle only "concurrency-enabled" Fortuna reads to simplify logic. + * + * Caller (random_fortuna_read) is responsible for zeroing out sensitive + * buffers provided as parameters to this routine. + */ +static void +random_fortuna_read_concurrent(uint8_t *buf, size_t bytecount, + uint8_t newkey[static RANDOM_KEYSIZE]) +{ + union randomdev_key key_copy; + uint128_t counter_copy; + size_t blockcount; + + MPASS(fortuna_concurrent_read); + + /* + * Compute number of blocks required for the PRF request ('delta C'). + * We will step the global counter 'C' by this number under lock, and + * then actually consume the counter values outside the lock. + * + * This ensures that contemporaneous but independent requests for + * randomness receive distinct 'C' values and thus independent PRF + * results. + */ + if (random_chachamode) { + blockcount = howmany(bytecount, CHACHA_BLOCKLEN); + } else { + blockcount = howmany(bytecount, RANDOM_BLOCKSIZE); + + /* + * Need to account for the additional blocks generated by + * rekeying when updating the global fs_counter. + */ + blockcount += RANDOM_KEYS_PER_BLOCK * + (blockcount / RANDOM_FORTUNA_BLOCKS_PER_KEY); + } + + RANDOM_RESEED_LOCK(); + KASSERT(!uint128_is_zero(fortuna_state.fs_counter), ("FS&K: C != 0")); + /* + * Technically, we only need mutual exclusion to update shared state + * appropriately. Nothing about updating the shared internal state + * requires that we perform (most) expensive cryptographic keystream + * generation under lock. (We still need to generate 256 bits of + * keystream to re-key between consumers.) + * + * Save the original counter and key values that will be used as the + * PRF for this particular consumer. + */ + memcpy(&counter_copy, &fortuna_state.fs_counter, sizeof(counter_copy)); + memcpy(&key_copy, &fortuna_state.fs_key, sizeof(key_copy)); + + /* + * Step the counter as if we had generated 'bytecount' blocks for this + * consumer. I.e., ensure that the next consumer gets an independent + * range of counter values once we drop the global lock. + */ + uint128_add(&fortuna_state.fs_counter, blockcount); + + /* + * We still need to Rekey the global 'K' between independent calls; + * this is no different from conventional Fortuna. Note that + * 'randomdev_keystream()' will step the fs_counter 'C' appropriately + * for the blocks needed for the 'newkey'. + * + * (This is part of PseudoRandomData() in FS&K, 9.4.4.) + */ + randomdev_keystream(&fortuna_state.fs_key, &fortuna_state.fs_counter, + newkey, RANDOM_KEYSIZE); + randomdev_encrypt_init(&fortuna_state.fs_key, newkey); + + /* + * We have everything we need to generate a unique PRF for this + * consumer without touching global state. + */ + RANDOM_RESEED_UNLOCK(); + + random_fortuna_genbytes(buf, bytecount, newkey, &counter_copy, + &key_copy, FORTUNA_UNLOCKED); + RANDOM_RESEED_ASSERT_LOCK_NOT_OWNED(); + + explicit_bzero(&counter_copy, sizeof(counter_copy)); + explicit_bzero(&key_copy, sizeof(key_copy)); +} + +/*- + * FS&K - RandomData() (Part 2) + * Main read from Fortuna, continued. May be called multiple times after + * the random_fortuna_pre_read() above. + * + * The supplied buf MAY not be a multiple of RANDOM_BLOCKSIZE in size; it is + * the responsibility of the algorithm to accommodate partial block reads, if a + * block output mode is used. + */ +void +random_fortuna_read(uint8_t *buf, size_t bytecount) +{ + uint8_t newkey[RANDOM_KEYSIZE]; + + if (fortuna_concurrent_read) { + random_fortuna_read_concurrent(buf, bytecount, newkey); + goto out; + } + + RANDOM_RESEED_LOCK(); + KASSERT(!uint128_is_zero(fortuna_state.fs_counter), ("FS&K: C != 0")); + + random_fortuna_genbytes(buf, bytecount, newkey, + &fortuna_state.fs_counter, &fortuna_state.fs_key, FORTUNA_LOCKED); + /* Returns unlocked */ + RANDOM_RESEED_ASSERT_LOCK_NOT_OWNED(); + +out: + explicit_bzero(newkey, sizeof(newkey)); +} + #ifdef _KERNEL static bool block_seeded_status = false; SYSCTL_BOOL(_kern_random, OID_AUTO, block_seeded_status, CTLFLAG_RWTUN, Index: sys/dev/random/hash.h =================================================================== --- sys/dev/random/hash.h +++ sys/dev/random/hash.h @@ -32,6 +32,10 @@ #include #include +#ifndef _KERNEL +#define __read_frequently +#endif + /* Keys are formed from cipher blocks */ #define RANDOM_KEYSIZE 32 /* (in bytes) == 256 bits */ #define RANDOM_KEYSIZE_WORDS (RANDOM_KEYSIZE/sizeof(uint32_t)) Index: sys/dev/random/hash.c =================================================================== --- sys/dev/random/hash.c +++ sys/dev/random/hash.c @@ -74,7 +74,7 @@ * Benefits include somewhat faster keystream generation compared with * unaccelerated AES-ICM. */ -bool random_chachamode = false; +bool random_chachamode __read_frequently = false; #ifdef _KERNEL SYSCTL_BOOL(_kern_random, OID_AUTO, use_chacha20_cipher, CTLFLAG_RDTUN, &random_chachamode, 0, Index: sys/dev/random/uint128.h =================================================================== --- sys/dev/random/uint128.h +++ sys/dev/random/uint128.h @@ -65,6 +65,21 @@ #endif } +static __inline void +uint128_add(uint128_t *big_uintp, uint64_t add) +{ +#ifdef USE_REAL_UINT128_T + (*big_uintp) += add; +#else + uint64_t word0p; + + word0p = big_uintp->u128t_word0 + add; + if (word0p < big_uintp->u128t_word0) + big_uintp->u128t_word1++; + big_uintp->u128t_word0 = word0p; +#endif +} + static __inline bool uint128_equals(uint128_t a, uint128_t b) { Index: tests/sys/devrandom/uint128_test.c =================================================================== --- tests/sys/devrandom/uint128_test.c +++ tests/sys/devrandom/uint128_test.c @@ -152,6 +152,62 @@ } } +ATF_TC_WITHOUT_HEAD(uint128_add64); +ATF_TC_BODY(uint128_add64, tc) +{ + static const struct u128_add64_tc { + uint32_t input[4]; + uint64_t addend; + uint32_t expected[4]; + const char *descr; + } tests[] = { + { + .input = { 0, 0, 0, 0 }, + .addend = 1, + .expected = { 1, 0, 0, 0 }, + .descr = "0 + 1 -> 1", + }, + { + .input = { 1, 0, 0, 0 }, + .addend = UINT32_MAX, + .expected = { 0, 1, 0, 0 }, + .descr = "1 + (2^32 - 1) -> 2^32 (word carry)", + }, + { + .input = { 1, 0, 0, 0 }, + .addend = UINT64_MAX, + .expected = { 0, 0, 1, 0 }, + .descr = "1 + (2^64 - 1) -> 2^64 (u128t_word0 carry)", + }, + { + .input = { 0x11111111, 0x11111111, 0, 0 }, + .addend = 0xf0123456789abcdeULL, + .expected = { 0x89abcdef, 0x01234567, 1, 0 }, + .descr = "0x1111_1111_1111_1111 +" + "0xf012_3456_789a_bcde ->" + "0x1_0123_4567_89ab_cdef", + }, + { + .input = { 1, 0, UINT32_MAX, 0 }, + .addend = UINT64_MAX, + .expected = { 0, 0, 0, 1 }, + .descr = "Carry ~2^96", + }, + }; + uint8_t inputle[16], expectedle[16]; + uint128_t a; + size_t i; + + for (i = 0; i < nitems(tests); i++) { + vec_u32_tole128(inputle, tests[i].input); + vec_u32_tole128(expectedle, tests[i].expected); + + a = le128dec(inputle); + uint128_add(&a, tests[i].addend); + u128_check_equality(le128dec(expectedle), a, tests[i].descr); + } +} + /* * Test assumptions about Chacha incrementing counter in the same way as * uint128.h @@ -219,6 +275,7 @@ { ATF_TP_ADD_TC(tp, uint128_inc); + ATF_TP_ADD_TC(tp, uint128_add64); ATF_TP_ADD_TC(tp, uint128_chacha_ctr); return (atf_no_error()); }