diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
--- a/sys/kern/uipc_ktls.c
+++ b/sys/kern/uipc_ktls.c
@@ -87,9 +87,17 @@
 	int		lastallocfail;
 } __aligned(CACHE_LINE_SIZE);
 
+struct ktls_alloc_thread {
+	uint64_t wakeups;
+	uint64_t allocs;
+	struct thread *td;
+	bool running;
+};
+
 struct ktls_domain_info {
 	int count;
 	int cpu[MAXCPU];
+	struct ktls_alloc_thread alloc_td;
 };
 
 struct ktls_domain_info ktls_domains[MAXMEMDOM];
@@ -142,6 +150,11 @@
     &ktls_sw_buffer_cache, 1,
     "Enable caching of output buffers for SW encryption");
 
+static int ktls_max_alloc = 128;
+SYSCTL_INT(_kern_ipc_tls, OID_AUTO, max_alloc, CTLFLAG_RWTUN,
+    &ktls_max_alloc, 128,
+    "Max number of 16k buffers to allocate in thread context");
+
 static COUNTER_U64_DEFINE_EARLY(ktls_tasks_active);
 SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD,
     &ktls_tasks_active, "Number of active tasks");
@@ -278,6 +291,7 @@
 static void ktls_reset_send_tag(void *context, int pending);
 #endif
 static void ktls_work_thread(void *ctx);
+static void ktls_alloc_thread(void *ctx);
 
 #if defined(INET) || defined(INET6)
 static u_int
@@ -397,10 +411,10 @@
 		 * Bind threads to cores.  If ktls_bind_threads is >
 		 * 1, then we bind to the NUMA domain.
 		 */
+		pc = pcpu_find(i);
+		domain = pc->pc_domain;
 		if (ktls_bind_threads) {
 			if (ktls_bind_threads > 1) {
-				pc = pcpu_find(i);
-				domain = pc->pc_domain;
 				CPU_COPY(&cpuset_domain[domain], &mask);
 				count = ktls_domains[domain].count;
 				ktls_domains[domain].cpu[count] = i;
@@ -414,6 +428,23 @@
 			    "Unable to bind KTLS thread for CPU %d error %d",
 			     i, error);
 		}
+		if (ktls_sw_buffer_cache &&
+		    ktls_domains[domain].alloc_td.td == NULL) {
+			error = kproc_kthread_add(ktls_alloc_thread,
+			    &ktls_domains[domain], &ktls_proc,
+			    &ktls_domains[domain].alloc_td.td,
+			    0, 0, "KTLS_alloc", "thr_%d", domain);
+			if (error)
+				panic("Can't add KTLS alloc thread %d error %d",
+				    domain, error);
+			CPU_COPY(&cpuset_domain[domain], &mask);
+			error = cpuset_setthread(ktls_domains[domain].alloc_td.td->td_tid,
+			    &mask);
+			if (error)
+				panic("Unable to bind KTLS alloc %d error %d",
+				    domain, error);
+
+		}
 		ktls_cpuid_lookup[ktls_number_threads] = i;
 		ktls_number_threads++;
 	}
@@ -1946,6 +1977,7 @@
 ktls_buffer_alloc(struct ktls_wq *wq, struct mbuf *m)
 {
 	void *buf;
+	int domain;
 
 	if (m->m_epg_npgs <= 2)
 		return (NULL);
@@ -1961,8 +1993,20 @@
 		return (NULL);
 	}
 	buf = uma_zalloc(ktls_buffer_zone, M_NOWAIT | M_NORECLAIM);
-	if (buf == NULL)
+	if (buf == NULL) {
+		domain = PCPU_GET(domain);
 		wq->lastallocfail = ticks;
+
+		/*
+		 * Note that this check is "racy", but the races are
+		 * harmless, and are either a spurious wakeup if
+		 * multiple threads fail allocations before the alloc
+		 * thread wakes, or waiting an extra second in case we
+		 * see an old value of running == true.
+		 */
+		if (ktls_domains[domain].alloc_td.running == false)
+			wakeup(&ktls_domains[domain].alloc_td);
+	}
 	return (buf);
 }
 
@@ -2154,6 +2198,56 @@
 	CURVNET_RESTORE();
 }
 
+static void
+ktls_alloc_thread(void *ctx)
+{
+	struct ktls_domain_info *ktls_domain = ctx;
+	struct ktls_alloc_thread *sc = &ktls_domain->alloc_td;
+	void **buf;
+	struct sysctl_oid *oid;
+	char name[80];
+	int i, nbufs;
+
+	curthread->td_domain.dr_policy =
+	    DOMAINSET_PREF(PCPU_GET(domain));
+	snprintf(name, sizeof(name), "domain%d", PCPU_GET(domain));
+
+
+	printf("Starting KTLS alloc thread for domain %d\n",
+	    PCPU_GET(domain));
+	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_kern_ipc_tls), OID_AUTO,
+	    name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
+	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "allocs",
+	    CTLFLAG_RD, &sc->allocs, 0, "buffers allocated");
+	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "wakeups",
+	    CTLFLAG_RD, &sc->wakeups, 0, "thread wakeups");
+	SYSCTL_ADD_BOOL(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "running",
+	    CTLFLAG_RD, &sc->running, 0, "thread running");
+
+	buf = NULL;
+	nbufs = 0;
+	for (;;) {
+		sc->running = false;
+		tsleep(sc, PZERO, "waiting for work", 0);
+		sc->wakeups++;
+		sc->running = true;
+		if (nbufs != ktls_max_alloc) {
+			free(buf, M_KTLS);
+			nbufs = ktls_max_alloc;
+			buf = malloc(sizeof(void *) * nbufs, M_KTLS,
+			    M_WAITOK | M_ZERO);
+		}
+		for (i = 0; i < nbufs; i++) {
+			buf[i] = uma_zalloc(ktls_buffer_zone, M_WAITOK);
+			sc->allocs++;
+		}
+		for (i = 0; i < nbufs; i++) {
+			uma_zfree(ktls_buffer_zone, buf[i]);
+			buf[i] = NULL;
+		}
+	}
+}
+
 static void
 ktls_work_thread(void *ctx)
 {
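
For anyone who wants to experiment with the wakeup protocol above outside the
kernel, the sketch below is a minimal userland analogue: POSIX threads stand
in for tsleep()/wakeup(), and malloc()/free() of a batch of 16k buffers stands
in for populating the UMA zone. Every name in it (alloc_loop, alloc_failed,
wakeup_posted, and so on) is illustrative only and is not part of this patch.

/*
 * Userland sketch of the patch's wakeup protocol: a consumer whose
 * fast-path allocation fails pokes a refill thread, but only when it
 * believes that thread is idle.  Compile with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static atomic_bool running;	/* analogue of alloc_td.running */
static bool wakeup_posted;	/* latched wakeup; protected by lock */

/*
 * Refill thread: sleep until poked, then allocate a batch with a
 * blocking allocator and free it again, leaving the buffers in the
 * allocator's cache (the role uma_zfree() plays in the patch).
 */
static void *
alloc_loop(void *arg)
{
	int i, nbufs = 128;	/* analogue of ktls_max_alloc */
	void **buf = calloc(nbufs, sizeof(void *));

	(void)arg;
	for (;;) {
		atomic_store(&running, false);
		pthread_mutex_lock(&lock);
		while (!wakeup_posted)
			pthread_cond_wait(&cv, &lock);	/* tsleep() */
		wakeup_posted = false;
		pthread_mutex_unlock(&lock);
		atomic_store(&running, true);

		for (i = 0; i < nbufs; i++)
			buf[i] = malloc(16 * 1024);	/* "M_WAITOK" */
		for (i = 0; i < nbufs; i++) {
			free(buf[i]);
			buf[i] = NULL;
		}
		fprintf(stderr, "refilled %d buffers\n", nbufs);
	}
	return (NULL);
}

/*
 * Fast path on allocation failure: the unlocked read of "running" is
 * racy in exactly the way the patch's comment describes; the worst
 * outcomes are one spurious wakeup or one delayed refill.
 */
static void
alloc_failed(void)
{
	if (!atomic_load(&running)) {
		pthread_mutex_lock(&lock);
		wakeup_posted = true;
		pthread_cond_signal(&cv);	/* wakeup() */
		pthread_mutex_unlock(&lock);
	}
}

int
main(void)
{
	pthread_t td;

	pthread_create(&td, NULL, alloc_loop, NULL);
	alloc_failed();		/* simulate one failed fast-path allocation */
	sleep(1);		/* let the refill thread run once */
	return (0);
}

One deliberate difference from the kernel code: wakeup() on an ident with no
sleeper is simply lost, and the patch tolerates that because the next
allocation failure re-posts it (the "extra second" mentioned in the comment in
ktls_buffer_alloc()). pthread_cond_signal() behaves the same way, so the
sketch instead latches the request in wakeup_posted under the mutex.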