Index: sys/geom/eli/g_eli.h
===================================================================
--- sys/geom/eli/g_eli.h
+++ sys/geom/eli/g_eli.h
@@ -123,7 +123,15 @@
 /* Provider uses IV-Key for encryption key generation. */
 #define	G_ELI_FLAG_ENC_IVKEY	0x00400000
 
-#define	G_ELI_NEW_BIO	255
+/* BIO pflag values. */
+#define	G_ELI_WORKER(pflags)	((pflags) & 0xff)
+#define	G_ELI_MAX_WORKERS	255
+#define	G_ELI_NEW_BIO		G_ELI_MAX_WORKERS
+#define	G_ELI_SETWORKER(pflags, w)	\
+	(pflags) = ((pflags) & 0xff00) | ((w) & 0xff)
+#define	G_ELI_SET_NEW_BIO(pflags)	G_ELI_SETWORKER((pflags), G_ELI_NEW_BIO)
+#define	G_ELI_IS_NEW_BIO(pflags)	(G_ELI_WORKER(pflags) == G_ELI_NEW_BIO)
+#define	G_ELI_UMA_ALLOC	0x100	/* bio_driver2 alloc came from UMA */
 
 #define	SHA512_MDLEN		64
 #define	G_ELI_AUTH_SECKEYLEN	SHA256_DIGEST_LENGTH
@@ -692,6 +700,9 @@
 void g_eli_read_done(struct bio *bp);
 void g_eli_write_done(struct bio *bp);
 int g_eli_crypto_rerun(struct cryptop *crp);
+
+bool g_eli_alloc_data(struct bio *bp, int sz);
+void g_eli_free_data(struct bio *bp);
 void g_eli_crypto_read(struct g_eli_softc *sc, struct bio *bp,
     boolean_t fromworker);
 void g_eli_crypto_run(struct g_eli_worker *wr, struct bio *bp);
Index: sys/geom/eli/g_eli.c
===================================================================
--- sys/geom/eli/g_eli.c
+++ sys/geom/eli/g_eli.c
@@ -50,6 +50,8 @@
 #include
 #include
 
+#include
+#include
 
 #include
 #include
@@ -87,8 +89,50 @@
 u_int g_eli_batch = 0;
 SYSCTL_UINT(_kern_geom_eli, OID_AUTO, batch, CTLFLAG_RWTUN, &g_eli_batch, 0,
     "Use crypto operations batching");
 
+static u_int g_eli_minbufs = 16;
+static int sysctl_g_eli_minbufs(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_kern_geom_eli, OID_AUTO, minbufs, CTLTYPE_UINT | CTLFLAG_RW |
+    CTLFLAG_MPSAFE, NULL, 0, sysctl_g_eli_minbufs, "IU",
+    "Number of GELI bufs reserved for swap transactions");
+static struct sx g_eli_umalock;	/* Controls changes to UMA zone. */
+SX_SYSINIT(g_eli_umalock, &g_eli_umalock, "GELI UMA");
+static uma_zone_t g_eli_uma = NULL;
+static int g_eli_alloc_sz;
+static volatile int g_eli_umaoutstanding;
+static volatile int g_eli_devs;
+static bool g_eli_blocking_malloc = true;
+SYSCTL_BOOL(_kern_geom_eli, OID_AUTO, blocking_malloc, CTLFLAG_RWTUN,
+    &g_eli_blocking_malloc, 0, "Use blocking malloc calls for GELI buffers");
 /*
+ * Control the number of reserved entries in the GELI zone.
+ * If the GELI zone has already been allocated, update the zone. Otherwise,
+ * simply update the variable for use the next time the zone is created.
+ */
+static int
+sysctl_g_eli_minbufs(SYSCTL_HANDLER_ARGS)
+{
+	int error;
+	u_int new;
+
+	new = g_eli_minbufs;
+	error = sysctl_handle_int(oidp, &new, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	sx_xlock(&g_eli_umalock);
+	if (g_eli_uma != NULL) {
+		if (new != g_eli_minbufs)
+			uma_zone_reserve(g_eli_uma, new);
+		if (new > g_eli_minbufs)
+			uma_prealloc(g_eli_uma, new - g_eli_minbufs);
+	}
+	if (new != g_eli_minbufs)
+		g_eli_minbufs = new;
+	sx_xunlock(&g_eli_umalock);
+	return (0);
+}
+
+/*
  * Passphrase cached during boot, in order to be more user-friendly if
  * there are multiple providers using the same passphrase.
  */
@@ -200,10 +244,11 @@
 	bp = (struct bio *)crp->crp_opaque;
 	sc = bp->bio_to->geom->softc;
 	LIST_FOREACH(wr, &sc->sc_workers, w_next) {
-		if (wr->w_number == bp->bio_pflags)
+		if (wr->w_number == G_ELI_WORKER(bp->bio_pflags))
 			break;
 	}
-	KASSERT(wr != NULL, ("Invalid worker (%u).", bp->bio_pflags));
+	KASSERT(wr != NULL, ("Invalid worker (%u).",
+	    G_ELI_WORKER(bp->bio_pflags)));
 	G_ELI_DEBUG(1, "Rerunning crypto %s request (sid: %p -> %p).",
 	    bp->bio_cmd == BIO_READ ? "READ" : "WRITE", wr->w_sid,
 	    crp->crp_session);
@@ -254,10 +299,7 @@
 		G_ELI_LOGREQ(0, pbp, "%s() failed (error=%d)", __func__,
 		    pbp->bio_error);
 		pbp->bio_completed = 0;
-		if (pbp->bio_driver2 != NULL) {
-			free(pbp->bio_driver2, M_ELI);
-			pbp->bio_driver2 = NULL;
-		}
+		g_eli_free_data(pbp);
 		g_io_deliver(pbp, pbp->bio_error);
 		if (sc != NULL)
 			atomic_subtract_int(&sc->sc_inflight, 1);
@@ -291,8 +333,8 @@
 	pbp->bio_inbed++;
 	if (pbp->bio_inbed < pbp->bio_children)
 		return;
-	free(pbp->bio_driver2, M_ELI);
-	pbp->bio_driver2 = NULL;
+	sc = pbp->bio_to->geom->softc;
+	g_eli_free_data(pbp);
 	if (pbp->bio_error != 0) {
 		G_ELI_LOGREQ(0, pbp, "%s() failed (error=%d)", __func__,
 		    pbp->bio_error);
@@ -303,7 +345,6 @@
 	/*
 	 * Write is finished, send it up.
 	 */
-	sc = pbp->bio_to->geom->softc;
 	g_io_deliver(pbp, pbp->bio_error);
 	if (sc != NULL)
 		atomic_subtract_int(&sc->sc_inflight, 1);
@@ -453,7 +494,8 @@
 		return;
 	}
 	bp->bio_driver1 = cbp;
-	bp->bio_pflags = G_ELI_NEW_BIO;
+	bp->bio_pflags = 0;
+	G_ELI_SET_NEW_BIO(bp->bio_pflags);
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 		if (!(sc->sc_flags & G_ELI_FLAG_AUTH)) {
@@ -576,7 +618,7 @@
 	mtx_assert(&sc->sc_queue_mtx, MA_OWNED);
 
 	while ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) {
-		KASSERT(bp->bio_pflags == G_ELI_NEW_BIO,
+		KASSERT(G_ELI_IS_NEW_BIO(bp->bio_pflags),
 		    ("Not new bio when canceling (bp=%p).", bp));
 		g_io_deliver(bp, ENXIO);
 	}
@@ -595,7 +637,7 @@
 	 * Device suspended, so we skip new I/O requests.
 	 */
 	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
-		if (bp->bio_pflags != G_ELI_NEW_BIO)
+		if (!G_ELI_IS_NEW_BIO(bp->bio_pflags))
 			break;
 	}
 	if (bp != NULL)
@@ -688,11 +730,11 @@
 			msleep(sc, &sc->sc_queue_mtx, PDROP, "geli:w", 0);
 			continue;
 		}
-		if (bp->bio_pflags == G_ELI_NEW_BIO)
+		if (G_ELI_IS_NEW_BIO(bp->bio_pflags))
 			atomic_add_int(&sc->sc_inflight, 1);
 		mtx_unlock(&sc->sc_queue_mtx);
-		if (bp->bio_pflags == G_ELI_NEW_BIO) {
-			bp->bio_pflags = 0;
+		if (G_ELI_IS_NEW_BIO(bp->bio_pflags)) {
+			G_ELI_SETWORKER(bp->bio_pflags, 0);
 			if (sc->sc_flags & G_ELI_FLAG_AUTH) {
 				if (bp->bio_cmd == BIO_READ)
 					g_eli_auth_read(sc, bp);
@@ -834,6 +876,132 @@
 #endif
 }
 
+static void
+g_eli_init_uma(void)
+{
+
+	atomic_add_int(&g_eli_devs, 1);
+	sx_xlock(&g_eli_umalock);
+	if (g_eli_uma == NULL) {
+		/*
+		 * Calculate the maximum-sized swap buffer we are
+		 * likely to see.
+		 */
+		g_eli_alloc_sz = roundup2((PAGE_SIZE + sizeof(int) +
+		    G_ELI_AUTH_SECKEYLEN) * nsw_cluster_max +
+		    sizeof(uintptr_t), PAGE_SIZE);
+
+		/*
+		 * Create the zone, setting UMA_ZONE_NOFREE so we won't
+		 * drain the zone in a memory shortage.
+		 */
+		g_eli_uma = uma_zcreate("GELI buffers", g_eli_alloc_sz,
+		    NULL, NULL, NULL, NULL,
+		    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+
+		/* Reserve and pre-allocate pages, as appropriate. */
+		uma_zone_reserve(g_eli_uma, g_eli_minbufs);
+		uma_prealloc(g_eli_uma, g_eli_minbufs);
+	}
+	sx_xunlock(&g_eli_umalock);
+}
+
+/*
+ * Try to destroy the UMA pool. This will do nothing if there are existing
+ * GELI devices or existing UMA allocations.
+ */
+static void
+g_eli_destroy_uma(void)
+{
+	uma_zone_t oldzone;
+
+	sx_xlock(&g_eli_umalock);
+	/* Ensure we really should be destroying this. */
+	if (atomic_load_int(&g_eli_devs) == 0 &&
+	    atomic_load_int(&g_eli_umaoutstanding) == 0) {
+		oldzone = g_eli_uma;
+		g_eli_uma = NULL;
+	} else
+		oldzone = NULL;
+	sx_xunlock(&g_eli_umalock);
+
+	if (oldzone != NULL)
+		uma_zdestroy(oldzone);
+}
+
+static void
+g_eli_fini_uma(void)
+{
+
+	/*
+	 * If this is the last outstanding GELI device, try to
+	 * destroy the UMA pool.
+	 */
+	if (atomic_fetchadd_int(&g_eli_devs, -1) == 1)
+		g_eli_destroy_uma();
+}
+
+/*
+ * Allocate a data buffer. If the size fits within our swap-sized buffers,
+ * try to allocate a swap-sized buffer from the UMA pool. Otherwise, fall
+ * back to using malloc.
+ *
+ * Swap-related requests are special: they can only use the UMA pool, they
+ * use M_USE_RESERVE to let them dip farther into system resources, and
+ * they always use M_NOWAIT to prevent swap operations from deadlocking.
+ */
+bool
+g_eli_alloc_data(struct bio *bp, int sz)
+{
+
+	KASSERT(sz <= g_eli_alloc_sz || (bp->bio_flags & BIO_SWAP) == 0,
+	    ("BIO_SWAP request for %d bytes exceeds the precalculated buffer"
+	    " size (%d)", sz, g_eli_alloc_sz));
+	if (sz <= g_eli_alloc_sz) {
+		bp->bio_driver2 = uma_zalloc(g_eli_uma, M_NOWAIT |
+		    ((bp->bio_flags & BIO_SWAP) != 0 ? M_USE_RESERVE : 0));
+		if (bp->bio_driver2 != NULL) {
+			bp->bio_pflags |= G_ELI_UMA_ALLOC;
+			atomic_add_int(&g_eli_umaoutstanding, 1);
+		}
+		if (bp->bio_driver2 != NULL || (bp->bio_flags & BIO_SWAP) != 0)
+			return (bp->bio_driver2 != NULL);
+	}
+	bp->bio_pflags &= ~(G_ELI_UMA_ALLOC);
+	bp->bio_driver2 = malloc(sz, M_ELI, g_eli_blocking_malloc ? M_WAITOK :
+	    M_NOWAIT);
+	return (bp->bio_driver2 != NULL);
+}
+
+/*
+ * Free a buffer from bp->bio_driver2 which was allocated with
+ * g_eli_alloc_data(). This function makes sure that the memory is freed
+ * to the correct place.
+ *
+ * Additionally, if this function frees the last outstanding UMA request
+ * and there are no open GELI devices, this will destroy the UMA pool.
+ */
+void
+g_eli_free_data(struct bio *bp)
+{
+
+	/*
+	 * Mimic the free(9) behavior of allowing a NULL pointer to be
+	 * freed.
+	 */
+	if (bp->bio_driver2 == NULL)
+		return;
+
+	if ((bp->bio_pflags & G_ELI_UMA_ALLOC) != 0) {
+		uma_zfree(g_eli_uma, bp->bio_driver2);
+		if (atomic_fetchadd_int(&g_eli_umaoutstanding, -1) == 1 &&
+		    atomic_load_int(&g_eli_devs) == 0)
+			g_eli_destroy_uma();
+	} else
+		free(bp->bio_driver2, M_ELI);
+	bp->bio_driver2 = NULL;
+}
+
 struct g_geom *
 g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
     const struct g_eli_metadata *md, const u_char *mkey, int nkey)
@@ -922,6 +1090,7 @@
 	if (threads == 0)
 		threads = mp_ncpus;
 	sc->sc_cpubind = (mp_ncpus > 1 && threads == mp_ncpus);
+	g_eli_init_uma();
 	for (i = 0; i < threads; i++) {
 		if (g_eli_cpu_is_disabled(i)) {
 			G_ELI_DEBUG(1, "%s: CPU %u disabled, skipping.",
@@ -1017,6 +1186,7 @@
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	g_eli_key_destroy(sc);
+	g_eli_fini_uma();
 	bzero(sc, sizeof(*sc));
 	free(sc, M_ELI);
 	return (NULL);
@@ -1061,6 +1231,7 @@
 	mtx_destroy(&sc->sc_queue_mtx);
 	gp->softc = NULL;
 	g_eli_key_destroy(sc);
+	g_eli_fini_uma();
 	bzero(sc, sizeof(*sc));
 	free(sc, M_ELI);
 
Index: sys/geom/eli/g_eli_integrity.c
===================================================================
--- sys/geom/eli/g_eli_integrity.c
+++ sys/geom/eli/g_eli_integrity.c
@@ -38,7 +38,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -104,8 +103,6 @@
  * g_eli_start -> g_eli_auth_run -> g_eli_auth_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
  */
 
-MALLOC_DECLARE(M_ELI);
-
 /*
  * Here we generate key for HMAC. Every sector has its own HMAC key, so it is
  * not possible to copy sectors.
@@ -268,8 +265,7 @@
 			    sc->sc_name, (intmax_t)corsize, (intmax_t)coroff);
 		}
 	}
-	free(bp->bio_driver2, M_ELI);
-	bp->bio_driver2 = NULL;
+	g_eli_free_data(bp);
 	if (bp->bio_error != 0) {
 		if (bp->bio_error != EINTEGRITY) {
 			G_ELI_LOGREQ(0, bp,
@@ -326,8 +322,7 @@
 	if (bp->bio_error != 0) {
 		G_ELI_LOGREQ(0, bp, "Crypto WRITE request failed (error=%d).",
 		    bp->bio_error);
-		free(bp->bio_driver2, M_ELI);
-		bp->bio_driver2 = NULL;
+		g_eli_free_data(bp);
 		cbp = bp->bio_driver1;
 		bp->bio_driver1 = NULL;
 		g_destroy_bio(cbp);
@@ -386,7 +381,7 @@
 	size_t size;
 	off_t nsec;
 
-	bp->bio_pflags = 0;
+	G_ELI_SETWORKER(bp->bio_pflags, 0);
 
 	cp = LIST_FIRST(&sc->sc_geom->consumer);
 	cbp = bp->bio_driver1;
@@ -404,7 +399,14 @@
 	size += sizeof(int) * nsec;
 	size += G_ELI_AUTH_SECKEYLEN * nsec;
 	cbp->bio_offset = (bp->bio_offset / bp->bio_to->sectorsize) * sc->sc_bytes_per_sector;
-	bp->bio_driver2 = malloc(size, M_ELI, M_WAITOK);
+	if (!g_eli_alloc_data(bp, size)) {
+		G_ELI_LOGREQ(0, bp, "Crypto auth read request failed (ENOMEM)");
+		g_destroy_bio(cbp);
+		bp->bio_error = ENOMEM;
+		g_io_deliver(bp, bp->bio_error);
+		atomic_subtract_int(&sc->sc_inflight, 1);
+		return;
+	}
 	cbp->bio_data = bp->bio_driver2;
 
 	/* Clear the error array. */
@@ -457,7 +459,7 @@
 
 	G_ELI_LOGREQ(3, bp, "%s", __func__);
 
-	bp->bio_pflags = wr->w_number;
+	G_ELI_SETWORKER(bp->bio_pflags, wr->w_number);
 	sc = wr->w_softc;
 	/* Sectorsize of decrypted provider eg. 4096. */
 	decr_secsize = bp->bio_to->sectorsize;
@@ -485,8 +487,19 @@
 		size = encr_secsize * nsec;
 		size += G_ELI_AUTH_SECKEYLEN * nsec;
 		size += sizeof(uintptr_t);	/* Space for alignment. */
-		data = malloc(size, M_ELI, M_WAITOK);
-		bp->bio_driver2 = data;
+		if (!g_eli_alloc_data(bp, size)) {
+			G_ELI_LOGREQ(0, bp, "Crypto request failed (ENOMEM)");
+			if (bp->bio_driver1 != NULL) {
+				g_destroy_bio(bp->bio_driver1);
+				bp->bio_driver1 = NULL;
+			}
+			bp->bio_error = ENOMEM;
+			g_io_deliver(bp, bp->bio_error);
+			if (sc != NULL)
+				atomic_subtract_int(&sc->sc_inflight, 1);
+			return;
+		}
+		data = bp->bio_driver2;
 		p = data + encr_secsize * nsec;
 	}
 	bp->bio_inbed = 0;
Index: sys/geom/eli/g_eli_privacy.c
===================================================================
--- sys/geom/eli/g_eli_privacy.c
+++ sys/geom/eli/g_eli_privacy.c
@@ -38,7 +38,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -60,8 +59,6 @@
  * g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
  */
 
-MALLOC_DECLARE(M_ELI);
-
 /*
  * The function is called after we read and decrypt data.
 *
@@ -98,8 +95,7 @@
 	 */
 	if (bp->bio_inbed < bp->bio_children)
 		return (0);
-	free(bp->bio_driver2, M_ELI);
-	bp->bio_driver2 = NULL;
+	g_eli_free_data(bp);
 	if (bp->bio_error != 0) {
 		G_ELI_LOGREQ(0, bp, "Crypto READ request failed (error=%d).",
 		    bp->bio_error);
@@ -159,8 +155,7 @@
 	if (bp->bio_error != 0) {
 		G_ELI_LOGREQ(0, bp, "Crypto WRITE request failed (error=%d).",
 		    bp->bio_error);
-		free(bp->bio_driver2, M_ELI);
-		bp->bio_driver2 = NULL;
+		g_eli_free_data(bp);
 		g_destroy_bio(cbp);
 		g_io_deliver(bp, bp->bio_error);
 		atomic_subtract_int(&sc->sc_inflight, 1);
@@ -209,7 +204,7 @@
 		atomic_add_int(&sc->sc_inflight, 1);
 		mtx_unlock(&sc->sc_queue_mtx);
 	}
-	bp->bio_pflags = 0;
+	G_ELI_SETWORKER(bp->bio_pflags, 0);
 	bp->bio_driver2 = NULL;
 	cbp = bp->bio_driver1;
 	cbp->bio_done = g_eli_read_done;
@@ -243,7 +238,7 @@
 
 	G_ELI_LOGREQ(3, bp, "%s", __func__);
 
-	bp->bio_pflags = wr->w_number;
+	G_ELI_SETWORKER(bp->bio_pflags, wr->w_number);
 	sc = wr->w_softc;
 	secsize = LIST_FIRST(&sc->sc_geom->provider)->sectorsize;
 	nsec = bp->bio_length / secsize;
@@ -255,9 +250,20 @@
 	 * If we write the data we cannot destroy current bio_data content,
 	 * so we need to allocate more memory for encrypted data.
 	 */
+	if (bp->bio_cmd == BIO_WRITE && !g_eli_alloc_data(bp, bp->bio_length)) {
+		G_ELI_LOGREQ(0, bp, "Crypto request failed (ENOMEM).");
+		if (bp->bio_driver1 != NULL) {
+			g_destroy_bio(bp->bio_driver1);
+			bp->bio_driver1 = NULL;
+		}
+		bp->bio_error = ENOMEM;
+		g_io_deliver(bp, bp->bio_error);
+		if (sc != NULL)
+			atomic_subtract_int(&sc->sc_inflight, 1);
+		return;
+	}
 	if (bp->bio_cmd == BIO_WRITE) {
-		data = malloc(bp->bio_length, M_ELI, M_WAITOK);
-		bp->bio_driver2 = data;
+		data = bp->bio_driver2;
 		bcopy(bp->bio_data, data, bp->bio_length);
 	} else
 		data = bp->bio_data;
Index: sys/geom/geom_io.c
===================================================================
--- sys/geom/geom_io.c
+++ sys/geom/geom_io.c
@@ -199,12 +199,12 @@
 	/*
 	 * BIO_ORDERED flag may be used by disk drivers to enforce
 	 * ordering restrictions, so this flag needs to be cloned.
-	 * BIO_UNMAPPED and BIO_VLIST should be inherited, to properly
-	 * indicate which way the buffer is passed.
+	 * BIO_UNMAPPED, BIO_VLIST, and BIO_SWAP should be inherited,
+	 * to properly indicate which way the buffer is passed.
 	 * Other bio flags are not suitable for cloning.
 	 */
 	bp2->bio_flags = bp->bio_flags &
-	    (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST);
+	    (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST | BIO_SWAP);
 	bp2->bio_length = bp->bio_length;
 	bp2->bio_offset = bp->bio_offset;
 	bp2->bio_data = bp->bio_data;
@@ -238,7 +238,7 @@
 	struct bio *bp2;
 
 	bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO);
-	bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST);
+	bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST | BIO_SWAP);
 	bp2->bio_parent = bp;
 	bp2->bio_cmd = bp->bio_cmd;
 	bp2->bio_length = bp->bio_length;
Index: sys/sys/bio.h
===================================================================
--- sys/sys/bio.h
+++ sys/sys/bio.h
@@ -67,8 +67,9 @@
 #define	BIO_UNMAPPED		0x10
 #define	BIO_TRANSIENT_MAPPING	0x20
 #define	BIO_VLIST		0x40
+#define	BIO_SWAP		0x200	/* Swap-related I/O */
 
-#define	PRINT_BIO_FLAGS "\20\7vlist\6transient_mapping\5unmapped" \
+#define	PRINT_BIO_FLAGS "\20\12swap\7vlist\6transient_mapping\5unmapped" \
 	"\4ordered\3onqueue\2done\1error"
 
 #define	BIO_SPEEDUP_WRITE	0x4000	/* Resource shortage at upper layers */
Index: sys/vm/swap_pager.h
===================================================================
--- sys/vm/swap_pager.h
+++ sys/vm/swap_pager.h
@@ -71,6 +71,7 @@
 #ifdef _KERNEL
 
 extern int swap_pager_avail;
+extern int nsw_cluster_max;
 
 struct xswdev;
 int swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len);
Index: sys/vm/swap_pager.c
===================================================================
--- sys/vm/swap_pager.c
+++ sys/vm/swap_pager.c
@@ -340,7 +340,7 @@
 static struct mtx swbuf_mtx;	/* to sync nsw_wcount_async */
 static int nsw_wcount_async;	/* limit async write buffers */
 static int nsw_wcount_async_max;/* assigned maximum */
-static int nsw_cluster_max;	/* maximum VOP I/O allowed */
+int nsw_cluster_max = MIN(MAXPHYS / PAGE_SIZE, MAX_PAGEOUT_CLUSTER);	/* maximum VOP I/O allowed */
 
 static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW |
@@ -560,8 +560,6 @@
 	 * have one NFS swap device due to the command/ack latency over NFS.
 	 * So it all works out pretty well.
 	 */
-	nsw_cluster_max = min(MAXPHYS / PAGE_SIZE, MAX_PAGEOUT_CLUSTER);
-
 	nsw_wcount_async = 4;
 	nsw_wcount_async_max = nsw_wcount_async;
 	mtx_init(&swbuf_mtx, "async swbuf mutex", NULL, MTX_DEF);
@@ -2824,6 +2822,7 @@
 	bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
 	bio->bio_length = bp->b_bcount;
 	bio->bio_done = swapgeom_done;
+	bio->bio_flags |= BIO_SWAP;
 	if (!buf_mapped(bp)) {
 		bio->bio_ma = bp->b_pages;
 		bio->bio_data = unmapped_buf;