diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
--- a/sys/kern/uipc_ktls.c
+++ b/sys/kern/uipc_ktls.c
@@ -78,6 +78,7 @@
 #include
 #include
 #include
+#include
 
 struct ktls_wq {
 	struct mtx	mtx;
@@ -87,9 +88,17 @@
 	int		lastallocfail;
 } __aligned(CACHE_LINE_SIZE);
 
+struct ktls_alloc_thread {
+	uint64_t wakeups;
+	uint64_t allocs;
+	struct thread *td;
+	int running;
+};
+
 struct ktls_domain_info {
 	int count;
 	int cpu[MAXCPU];
+	struct ktls_alloc_thread alloc_td;
 };
 
 struct ktls_domain_info ktls_domains[MAXMEMDOM];
@@ -142,6 +151,11 @@
     &ktls_sw_buffer_cache, 1,
     "Enable caching of output buffers for SW encryption");
 
+static int ktls_max_alloc = 128;
+SYSCTL_INT(_kern_ipc_tls, OID_AUTO, max_alloc, CTLFLAG_RWTUN,
+    &ktls_max_alloc, 128,
+    "Max number of 16k buffers to allocate in thread context");
+
 static COUNTER_U64_DEFINE_EARLY(ktls_tasks_active);
 SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD,
     &ktls_tasks_active, "Number of active tasks");
@@ -278,6 +292,7 @@
 static void ktls_reset_send_tag(void *context, int pending);
 #endif
 static void ktls_work_thread(void *ctx);
+static void ktls_alloc_thread(void *ctx);
 
 #if defined(INET) || defined(INET6)
 static u_int
@@ -418,6 +433,32 @@
 		ktls_number_threads++;
 	}
 
+	/*
+	 * Start an allocation thread per-domain to perform blocking allocations
+	 * of 16k physically contiguous TLS crypto destination buffers.
+	 */
+	if (ktls_sw_buffer_cache) {
+		for (domain = 0; domain < vm_ndomains; domain++) {
+			if (VM_DOMAIN_EMPTY(domain))
+				continue;
+			if (CPU_EMPTY(&cpuset_domain[domain]))
+				continue;
+			error = kproc_kthread_add(ktls_alloc_thread,
+			    &ktls_domains[domain], &ktls_proc,
+			    &ktls_domains[domain].alloc_td.td,
+			    0, 0, "KTLS", "alloc_%d", domain);
+			if (error)
+				panic("Can't add KTLS alloc thread %d error %d",
+				    domain, error);
+			CPU_COPY(&cpuset_domain[domain], &mask);
+			error = cpuset_setthread(ktls_domains[domain].alloc_td.td->td_tid,
+			    &mask);
+			if (error)
+				panic("Unable to bind KTLS alloc %d error %d",
+				    domain, error);
+		}
+	}
+
 	/*
 	 * If we somehow have an empty domain, fall back to choosing
 	 * among all KTLS threads.
@@ -1946,6 +1987,7 @@
 ktls_buffer_alloc(struct ktls_wq *wq, struct mbuf *m)
 {
 	void *buf;
+	int domain, running;
 
 	if (m->m_epg_npgs <= 2)
 		return (NULL);
@@ -1961,8 +2003,23 @@
 		return (NULL);
 	}
 	buf = uma_zalloc(ktls_buffer_zone, M_NOWAIT | M_NORECLAIM);
-	if (buf == NULL)
+	if (buf == NULL) {
+		domain = PCPU_GET(domain);
 		wq->lastallocfail = ticks;
+
+		/*
+		 * Note that this check is "racy", but the races are
+		 * harmless, and are either a spurious wakeup if
+		 * multiple threads fail allocations before the alloc
+		 * thread wakes, or waiting an extra second in case we
+		 * see an old value of running == true.
+		 */
+		if (!VM_DOMAIN_EMPTY(domain)) {
+			running = atomic_load_int(&ktls_domains[domain].alloc_td.running);
+			if (!running)
+				wakeup(&ktls_domains[domain].alloc_td);
+		}
+	}
 	return (buf);
 }
 
@@ -2154,6 +2211,68 @@
 	CURVNET_RESTORE();
 }
 
+static void
+ktls_alloc_thread(void *ctx)
+{
+	struct ktls_domain_info *ktls_domain = ctx;
+	struct ktls_alloc_thread *sc = &ktls_domain->alloc_td;
+	void **buf;
+	struct sysctl_oid *oid;
+	char name[80];
+	int i, nbufs;
+
+	curthread->td_domain.dr_policy =
+	    DOMAINSET_PREF(PCPU_GET(domain));
+	snprintf(name, sizeof(name), "domain%d", PCPU_GET(domain));
+	if (bootverbose)
+		printf("Starting KTLS alloc thread for domain %d\n",
+		    PCPU_GET(domain));
+	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_kern_ipc_tls), OID_AUTO,
+	    name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
+	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "allocs",
+	    CTLFLAG_RD, &sc->allocs, 0, "buffers allocated");
+	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "wakeups",
+	    CTLFLAG_RD, &sc->wakeups, 0, "thread wakeups");
+	SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "running",
+	    CTLFLAG_RD, &sc->running, 0, "thread running");
+
+	buf = NULL;
+	nbufs = 0;
+	for (;;) {
+		atomic_store_int(&sc->running, 0);
+		tsleep(sc, PZERO, "waiting for work", 0);
+		atomic_store_int(&sc->running, 1);
+		sc->wakeups++;
+		if (nbufs != ktls_max_alloc) {
+			free(buf, M_KTLS);
+			nbufs = atomic_load_int(&ktls_max_alloc);
+			buf = malloc(sizeof(void *) * nbufs, M_KTLS,
+			    M_WAITOK | M_ZERO);
+		}
+		/*
+		 * Below we allocate nbufs with different allocation
+		 * flags than we use when allocating normally during
+		 * encryption in the ktls worker thread. We specify
+		 * M_NORECLAIM in the worker thread. However, we omit
+		 * that flag here and add M_WAITOK so that the VM
+		 * system is permitted to perform expensive work to
+		 * defragment memory. We do this here, as it does not
+		 * matter if this thread blocks. If we block a ktls
+		 * worker thread, we risk developing backlogs of
+		 * buffers to be encrypted, leading to surges of
+		 * traffic and potential NIC output drops.
+		 */
+		for (i = 0; i < nbufs; i++) {
+			buf[i] = uma_zalloc(ktls_buffer_zone, M_WAITOK);
+			sc->allocs++;
+		}
+		for (i = 0; i < nbufs; i++) {
+			uma_zfree(ktls_buffer_zone, buf[i]);
+			buf[i] = NULL;
+		}
+	}
+}
+
 static void
 ktls_work_thread(void *ctx)
 {
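
As an aside, the sleep/wakeup handshake above (an atomic "running" flag plus tsleep()/wakeup() on the alloc_td address) can be illustrated with a minimal userland analogue. The sketch below substitutes pthreads and C11 atomics for the kernel primitives, and every name in it (refill_loop, alloc_failed, and so on) is hypothetical. It only shows why the unlocked check is "racy but harmless": a stale running value costs at most one spurious signal or one missed wakeup.

/*
 * Userland analogue (hypothetical, pthreads instead of tsleep()/wakeup())
 * of the handshake used in the patch.  The unlocked "running" check can
 * miss or duplicate a signal; both outcomes are benign, which is the same
 * argument the comment in ktls_buffer_alloc() makes.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static atomic_int running;	/* 1 while a refill pass is in progress */
static atomic_ulong wakeups;

static void *
refill_loop(void *arg)
{
	(void)arg;
	for (;;) {
		atomic_store(&running, 0);
		pthread_mutex_lock(&mtx);
		pthread_cond_wait(&cv, &mtx);	/* like tsleep() */
		pthread_mutex_unlock(&mtx);
		atomic_store(&running, 1);
		printf("refill pass %lu\n", atomic_fetch_add(&wakeups, 1) + 1);
		/* Allocate and free a batch of buffers here (elided). */
	}
	return (NULL);
}

/* Fast path: a non-blocking allocation just failed. */
static void
alloc_failed(void)
{
	if (atomic_load(&running) == 0)
		pthread_cond_signal(&cv);	/* like wakeup() */
}

int
main(void)
{
	pthread_t td;

	pthread_create(&td, NULL, refill_loop, NULL);
	sleep(1);		/* let the thread reach cond_wait */
	alloc_failed();		/* simulate an allocation failure */
	sleep(1);
	return (0);
}

In the kernel version the same tolerance for a lost wakeup comes from the rate limiting in ktls_buffer_alloc(): failed allocations within one second (hz ticks) of wq->lastallocfail return early, so a missed wakeup is simply retried by the next failure after that window.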
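
The patch also exposes tunables and counters under the existing kern.ipc.tls sysctl tree: the writable max_alloc knob and, per memory domain, the allocs, wakeups, and running nodes created in ktls_alloc_thread(). The example below is a hypothetical userland reader/writer for those nodes using sysctlbyname(3); "domain0" assumes memory domain 0 exists on the machine, and the value 256 is an arbitrary illustration.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t allocs, wakeups;
	size_t len;
	int newmax = 256;

	/* Read the per-domain counters added by the alloc thread. */
	len = sizeof(allocs);
	if (sysctlbyname("kern.ipc.tls.domain0.allocs", &allocs, &len,
	    NULL, 0) == -1)
		perror("allocs");
	len = sizeof(wakeups);
	if (sysctlbyname("kern.ipc.tls.domain0.wakeups", &wakeups, &len,
	    NULL, 0) == -1)
		perror("wakeups");
	printf("domain0: %ju buffers allocated over %ju wakeups\n",
	    (uintmax_t)allocs, (uintmax_t)wakeups);

	/* Allow the alloc thread to refill up to 256 buffers per pass. */
	if (sysctlbyname("kern.ipc.tls.max_alloc", NULL, NULL,
	    &newmax, sizeof(newmax)) == -1)
		perror("max_alloc");
	return (0);
}

Because max_alloc is declared CTLFLAG_RWTUN, it can also be set as a loader tunable at boot; the alloc thread picks up a new value on its next wakeup, when it reallocates its pointer array to the new size.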