Page MenuHomeFreeBSD

D44567.diff
No OneTemporary

D44567.diff

diff --git a/usr.sbin/bhyve/acpi.h b/usr.sbin/bhyve/acpi.h
--- a/usr.sbin/bhyve/acpi.h
+++ b/usr.sbin/bhyve/acpi.h
@@ -56,6 +56,7 @@
int acpi_build(struct vmctx *ctx, int ncpu);
void acpi_raise_gpe(struct vmctx *ctx, unsigned bit);
int acpi_tables_add_device(const struct acpi_device *const dev);
+int acpi_add_vcpu_affinity(int vcpuid, int domain);
void dsdt_line(const char *fmt, ...);
void dsdt_fixed_ioport(uint16_t iobase, uint16_t length);
void dsdt_fixed_irq(uint8_t irq);
diff --git a/usr.sbin/bhyve/acpi.c b/usr.sbin/bhyve/acpi.c
--- a/usr.sbin/bhyve/acpi.c
+++ b/usr.sbin/bhyve/acpi.c
@@ -37,9 +37,12 @@
*/
#include <sys/param.h>
+#include <sys/cpuset.h>
+#include <sys/domainset.h>
#include <sys/endian.h>
#include <sys/errno.h>
#include <sys/stat.h>
+#include <sys/tree.h>
#include <err.h>
#include <paths.h>
@@ -50,7 +53,9 @@
#include <string.h>
#include <unistd.h>
+#include <dev/vmm/vmm_mem.h>
#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
#include <vmmapi.h>
#include "bhyverun.h"
@@ -78,6 +83,22 @@
static char basl_template[MAXPATHLEN];
static char basl_stemplate[MAXPATHLEN];
+/*
+ * SRAT vCPU affinity info.
+ */
+struct acpi_vcpu_affinity_entry {
+ RB_ENTRY(acpi_vcpu_affinity_entry) entry; /* red-black tree linkage */
+ int vcpuid; /* tree key, see vcpu_affinity_cmp(); emitted as ApicId */
+ int domain; /* emitted as ProximityDomainLo by build_srat() */
+};
+
+/*
+ * The entries live in a file-local red-black tree ordered by vcpuid.
+ * acpi_add_vcpu_affinity() inserts into it and build_srat() walks it.
+ */
+static int vcpu_affinity_cmp(struct acpi_vcpu_affinity_entry *const a1,
+ struct acpi_vcpu_affinity_entry *const a2);
+static RB_HEAD(vcpu_affinities,
+ acpi_vcpu_affinity_entry) aff_head = RB_INITIALIZER(&aff_head);
+RB_GENERATE_STATIC(vcpu_affinities, acpi_vcpu_affinity_entry, entry,
+ vcpu_affinity_cmp);
+
/*
* State for dsdt_line(), dsdt_indent(), and dsdt_unindent().
*/
@@ -121,6 +142,31 @@
return (0);
}
+/*
+ * Ordering function for the vCPU affinity tree: three-way comparison of the
+ * two entries' vCPU ids, yielding -1, 0 or 1.
+ */
+static int
+vcpu_affinity_cmp(struct acpi_vcpu_affinity_entry *a1,
+    struct acpi_vcpu_affinity_entry *a2)
+{
+	if (a1->vcpuid < a2->vcpuid)
+		return (-1);
+	if (a1->vcpuid > a2->vcpuid)
+		return (1);
+	return (0);
+}
+
+/*
+ * Record that vCPU 'vcpuid' belongs to guest NUMA domain 'domain' so that
+ * build_srat() can later emit a processor affinity structure for it.
+ *
+ * Returns 0 on success, ENOMEM if the entry cannot be allocated and EEXIST
+ * if an entry for 'vcpuid' is already present.
+ */
+int
+acpi_add_vcpu_affinity(int vcpuid, int domain)
+{
+	struct acpi_vcpu_affinity_entry *node;
+
+	node = calloc(1, sizeof(*node));
+	if (node == NULL)
+		return (ENOMEM);
+	node->vcpuid = vcpuid;
+	node->domain = domain;
+
+	/* A non-NULL result means the tree already holds this vCPU id. */
+	if (RB_INSERT(vcpu_affinities, &aff_head, node) == NULL)
+		return (0);
+
+	free(node);
+	return (EEXIST);
+}
+
/*
* Helper routines for writing to the DSDT from other modules.
*/
@@ -726,6 +772,83 @@
return (0);
}
+/*
+ * Build the SRAT, which tells the guest which NUMA domain each range of
+ * guest memory and each vCPU belongs to.
+ *
+ * One memory affinity structure is emitted per guest memory mapping and one
+ * processor affinity structure per entry registered through
+ * acpi_add_vcpu_affinity().  If no vCPU affinity was registered, no table is
+ * built at all.
+ *
+ * Returns 0 on success.  Failures of the basl_*() calls are handled by
+ * BASL_EXEC, which is defined outside this diff (presumably it returns the
+ * error code to the caller -- confirm in basl.h).
+ */
+static int
+build_srat(struct vmctx *const ctx)
+{
+	ACPI_TABLE_SRAT srat;
+	ACPI_SRAT_MEM_AFFINITY srat_mem_affinity;
+	ACPI_SRAT_CPU_AFFINITY srat_cpu_affinity;
+	struct acpi_vcpu_affinity_entry *ep;
+	struct basl_table *table;
+	int segid, domain;
+	int _flags, _prot;
+	vm_ooffset_t _off;
+	size_t maplen;
+	uint64_t gpa;
+
+	if (RB_EMPTY(&aff_head))
+		return (0);
+
+	memset(&srat, 0, sizeof(srat));
+	BASL_EXEC(basl_table_create(&table, ctx, ACPI_SIG_SRAT,
+	    BASL_TABLE_ALIGNMENT));
+	BASL_EXEC(basl_table_append_header(table, ACPI_SIG_SRAT, 1, 1));
+	srat.TableRevision = 1;
+	BASL_EXEC(basl_table_append_content(table, &srat, sizeof(srat)));
+
+	/*
+	 * Iterate over the VM's memory maps and add
+	 * a 'Memory Affinity Structure' for each mapping.  The walk starts at
+	 * guest physical address 0 and ends when vm_mmap_getnext() fails.
+	 */
+	for (gpa = 0; vm_mmap_getnext(ctx, &gpa, &segid, &_off, &maplen,
+	    &_prot, &_flags) == 0; gpa += maplen) {
+		if (segid >= VM_SYSMEM && segid < VM_BOOTROM) {
+			/* System memory segment ids map 1:1 onto domains. */
+			domain = segid - VM_SYSMEM;
+		} else {
+			/* Treat devmem segs as domain 0. */
+			domain = 0;
+		}
+
+		/* Multi-byte fields are stored little-endian. */
+		memset(&srat_mem_affinity, 0, sizeof(srat_mem_affinity));
+		srat_mem_affinity.Header.Type = ACPI_SRAT_TYPE_MEMORY_AFFINITY;
+		srat_mem_affinity.Header.Length = sizeof(srat_mem_affinity);
+		srat_mem_affinity.ProximityDomain = htole32(domain);
+		srat_mem_affinity.BaseAddress = htole64(gpa);
+		srat_mem_affinity.Length = htole64(maplen);
+		srat_mem_affinity.Flags = htole32(ACPI_SRAT_MEM_ENABLED);
+		BASL_EXEC(basl_table_append_bytes(table, &srat_mem_affinity,
+		    sizeof(srat_mem_affinity)));
+	}
+
+	/*
+	 * Iterate over each "vCPUid to domain id" mapping and emit a
+	 * 'Processor Local APIC/SAPIC Affinity Structure' for each entry.
+	 * The casts truncate both the domain and the vCPU id to 8 bits.
+	 */
+	RB_FOREACH(ep, vcpu_affinities, &aff_head) {
+		memset(&srat_cpu_affinity, 0, sizeof(srat_cpu_affinity));
+		srat_cpu_affinity.Header.Type = ACPI_SRAT_TYPE_CPU_AFFINITY;
+		srat_cpu_affinity.Header.Length = sizeof(srat_cpu_affinity);
+		srat_cpu_affinity.ProximityDomainLo = (uint8_t)ep->domain;
+		srat_cpu_affinity.ApicId = (uint8_t)ep->vcpuid;
+		srat_cpu_affinity.Flags = htole32(ACPI_SRAT_CPU_USE_AFFINITY);
+		BASL_EXEC(basl_table_append_bytes(table, &srat_cpu_affinity,
+		    sizeof(srat_cpu_affinity)));
+	}
+
+	BASL_EXEC(basl_table_register_to_rsdt(table));
+
+	return (0);
+}
+
int
acpi_build(struct vmctx *ctx, int ncpu)
{
@@ -765,6 +888,7 @@
BASL_EXEC(build_mcfg(ctx));
BASL_EXEC(build_facs(ctx));
BASL_EXEC(build_spcr(ctx));
+ BASL_EXEC(build_srat(ctx));
/* Build ACPI device-specific tables such as a TPM2 table. */
const struct acpi_device_list_entry *entry;
diff --git a/usr.sbin/bhyve/amd64/bhyverun_machdep.c b/usr.sbin/bhyve/amd64/bhyverun_machdep.c
--- a/usr.sbin/bhyve/amd64/bhyverun_machdep.c
+++ b/usr.sbin/bhyve/amd64/bhyverun_machdep.c
@@ -91,6 +91,7 @@
" -K: PS2 keyboard layout\n"
" -l: LPC device configuration\n"
" -m: memory size\n"
+ " -n: NUMA domain specification\n"
" -o: set config 'var' to 'value'\n"
" -P: vmexit from the guest on pause\n"
" -p: pin 'vcpu' to 'hostcpu'\n"
@@ -117,9 +118,9 @@
int c;
#ifdef BHYVE_SNAPSHOT
- optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:r:";
+ optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:n:l:K:U:r:";
#else
- optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:";
+ optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:n:l:K:U:";
#endif
while ((c = getopt(argc, argv, optstr)) != -1) {
switch (c) {
@@ -194,6 +195,15 @@
case 'm':
set_config_value("memory.size", optarg);
break;
+ case 'n':
+ if (bhyve_numa_parse(optarg) != 0)
+ errx(EX_USAGE,
+ "invalid NUMA configuration "
+ "'%s'",
+ optarg);
+ if (!get_config_bool("acpi_tables"))
+ errx(EX_USAGE, "NUMA emulation requires ACPI");
+ break;
case 'o':
if (!bhyve_parse_config_option(optarg)) {
errx(EX_USAGE,
diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8
--- a/usr.sbin/bhyve/bhyve.8
+++ b/usr.sbin/bhyve/bhyve.8
@@ -269,8 +269,56 @@
(either upper or lower case)
to indicate a multiple of kilobytes, megabytes, gigabytes, or terabytes.
If no suffix is given, the value is assumed to be in megabytes.
-.Pp
The default is 256M.
+.Pp
+.It Fl n Ar id Ns Cm \&, Ns Ar size Ns Cm \&, Ns Ar cpus Ns Op Cm \&, Ns Ar domain_policy
+Configure guest NUMA domains.
+This option applies only to the amd64 platform.
+.Pp
+The
+.Fl n
+option allows the guest physical address space to be partitioned into domains.
+The layout of each domain is encoded in an ACPI table
+visible to the guest operating system.
+The
+.Fl n
+option also allows the specification of a
+.Xr domainset 9
+memory allocation policy for the host memory backing a given NUMA domain.
+A guest can have up to 8 NUMA domains.
+This feature requires that the guest use a boot ROM, and in
+particular cannot be used if the guest was initialized using
+.Xr bhyveload 8 .
+.Pp
+Each domain is identified by a numerical
+.Em id .
+The domain memory
+.Em size
+is specified using the same format as the
+.Fl m
+flag.
+The sum of all
+.Em size
+parameters overrides the total VM memory size specified by the
+.Fl m
+flag.
+However, if at least one domain memory size parameter is
+missing, the total VM memory size will be equally distributed across
+all emulated domains.
+The
+.Em cpus
+parameter specifies the set of CPUs that are part of the domain.
+The
+.Em domain_policy
+parameter may be optionally used to configure the
+.Xr domainset 9
+host NUMA memory allocation policy for an emulated
+domain.
+See the
+.Fl n
+flag in
+.Xr cpuset 1
+for a list of valid NUMA memory allocation policies and their formats.
.It Fl o Ar var Ns Cm = Ns Ar value
Set the configuration variable
.Ar var
@@ -1202,6 +1250,33 @@
.Bd -literal -offset indent
/usr/sbin/bhyve -k configfile vm0
.Ed
+.Pp
+Run a UEFI virtual machine with four CPUs and two emulated NUMA domains:
+.Bd -literal -offset indent
+bhyve -c 4 -w -H \\
+ -s 0,hostbridge \\
+ -s 4,ahci-hd,disk.img \\
+ -s 31,lpc -l com1,stdio \\
+ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\
+ -n id=0,size=4G,cpus=0-1 \\
+ -n id=1,size=4G,cpus=2-3 \\
+ numavm
+.Ed
+.Pp
+Assuming a host machine with two NUMA domains,
+run a UEFI virtual machine with two CPUs using a
+.Ar prefer
+.Xr domainset 9
+policy to preferentially allocate guest memory from the first host NUMA domain.
+.Bd -literal -offset indent
+bhyve -c 2 -w -H \\
+ -s 0,hostbridge \\
+ -s 4,ahci-hd,disk.img \\
+ -s 31,lpc -l com1,stdio \\
+ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\
+ -n id=0,size=4G,cpus=0-1,domain_policy=prefer:0 \\
+ numavm
+.Ed
.Sh SEE ALSO
.Xr bhyve 4 ,
.Xr netgraph 4 ,
@@ -1211,7 +1286,8 @@
.Xr bhyve_config 5 ,
.Xr ethers 5 ,
.Xr bhyvectl 8 ,
-.Xr bhyveload 8
+.Xr bhyveload 8 ,
+.Xr domainset 9
.Pp
.Rs
.%A Intel
diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h
--- a/usr.sbin/bhyve/bhyverun.h
+++ b/usr.sbin/bhyve/bhyverun.h
@@ -73,6 +73,7 @@
#endif
int bhyve_pincpu_parse(const char *opt);
int bhyve_topology_parse(const char *opt);
+int bhyve_numa_parse(const char *opt);
void bhyve_init_vcpu(struct vcpu *vcpu);
void bhyve_start_vcpu(struct vcpu *vcpu, bool bsp);
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -30,6 +30,8 @@
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
+#include <sys/cpuset.h>
+#include <sys/domainset.h>
#include <sys/mman.h>
#ifdef BHYVE_SNAPSHOT
#include <sys/socket.h>
@@ -54,6 +56,7 @@
#include <fcntl.h>
#endif
#include <libgen.h>
+#include <libutil.h>
#include <unistd.h>
#include <assert.h>
#include <pthread.h>
@@ -68,6 +71,7 @@
#include <libxo/xo.h>
#endif
+#include <dev/vmm/vmm_mem.h>
#include <vmmapi.h>
#include "acpi.h"
@@ -108,6 +112,9 @@
static cpuset_t cpumask;
+static struct vm_mem_domain guest_domains[VM_MAXMEMDOM];
+static int guest_ndomains = 0;
+
static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu);
static struct vcpu_info {
@@ -179,6 +186,118 @@
return (lval);
}
+/*
+ * Parse one '-n' NUMA domain specification, a comma-separated list of
+ * "id=", "size=", "cpus=" and "domain_policy=" fields, and store the given
+ * values under the "domains.<id>" config node, creating it if needed.
+ *
+ * Returns 0 on success.  Returns -1 if the specification is empty, has no
+ * "id=" field or contains an unrecognized field.  The id is validated by
+ * parse_int_value(), defined above this hunk (presumably fatal on an
+ * out-of-range value -- confirm).
+ */
+int
+bhyve_numa_parse(const char *opt)
+{
+	int id = -1, rc = -1;
+	nvlist_t *nvl;
+	char *cp, *str, *tofree;
+	char pathbuf[64] = { 0 };
+	char *size = NULL, *cpus = NULL, *domain_policy = NULL;
+
+	if (*opt == '\0')
+		return (-1);
+
+	tofree = str = strdup(opt);
+	if (str == NULL)
+		errx(4, "Failed to allocate memory");
+
+	while ((cp = strsep(&str, ",")) != NULL) {
+		if (*cp == '\0') {
+			/* Tolerate empty fields such as a trailing comma. */
+			continue;
+		} else if (strncmp(cp, "id=", strlen("id=")) == 0) {
+			/*
+			 * calc_mem_affinity() only looks at domain ids below
+			 * VM_MAXMEMDOM, so a larger id would be silently
+			 * ignored later; refuse it here instead.
+			 */
+			id = parse_int_value("id", cp + strlen("id="), 0,
+			    VM_MAXMEMDOM - 1);
+		} else if (strncmp(cp, "size=", strlen("size=")) == 0) {
+			size = cp + strlen("size=");
+		} else if (strncmp(cp, "domain_policy=",
+		    strlen("domain_policy=")) == 0) {
+			domain_policy = cp + strlen("domain_policy=");
+		} else if (strncmp(cp, "cpus=", strlen("cpus=")) == 0) {
+			cpus = cp + strlen("cpus=");
+		} else {
+			/* Do not let a misspelled key vanish silently. */
+			EPRINTLN("Unknown NUMA domain option '%s'", cp);
+			goto out;
+		}
+	}
+
+	if (id == -1) {
+		EPRINTLN("Missing NUMA domain ID in '%s'", opt);
+		goto out;
+	}
+
+	snprintf(pathbuf, sizeof(pathbuf), "domains.%d", id);
+	nvl = find_config_node(pathbuf);
+	if (nvl == NULL)
+		nvl = create_config_node(pathbuf);
+
+	/* The values point into 'tofree'; store them before it is freed. */
+	if (size != NULL)
+		set_config_value_node(nvl, "size", size);
+	if (domain_policy != NULL)
+		set_config_value_node(nvl, "domain_policy", domain_policy);
+	if (cpus != NULL)
+		set_config_value_node(nvl, "cpus", cpus);
+	rc = 0;
+
+out:
+	free(tofree);
+	return (rc);
+}
+
+/*
+ * Fill guest_domains[] and guest_ndomains from the "domains.<N>" config
+ * nodes, scanning ids upward from 0 and stopping at the first id that has no
+ * node.
+ *
+ * For each domain the "size" value (if any) is parsed into the domain size
+ * and a domainset mask is allocated; "domain_policy" (if any) is parsed into
+ * the mask and policy, otherwise the policy is marked invalid and the mask
+ * cleared.  With no configured domain, a single domain of 'vm_memsize' bytes
+ * is set up.  If any configured domain lacks a size, every domain gets an
+ * equal share of 'vm_memsize'.
+ *
+ * Invalid sizes or policies and allocation failures terminate the process.
+ */
+static void
+calc_mem_affinity(size_t vm_memsize)
+{
+	struct vm_mem_domain *md;
+	const char *val;
+	nvlist_t *node;
+	char path[64] = { 0 };
+	bool size_missing = false;
+	int n;
+
+	for (n = 0; n < VM_MAXMEMDOM; n++) {
+		snprintf(path, sizeof(path), "domains.%d", n);
+		node = find_config_node(path);
+		if (node == NULL)
+			break;
+		md = &guest_domains[n];
+
+		val = get_config_value_node(node, "size");
+		if (val == NULL)
+			size_missing = true;
+		else if (vm_parse_memsize(val, &md->size))
+			errx(EX_USAGE, "invalid memsize for domain %d: '%s'", n,
+			    val);
+
+		md->ds_mask = calloc(1, sizeof(domainset_t));
+		if (md->ds_mask == NULL)
+			errx(EX_OSERR, "Failed to allocate domainset mask");
+		md->ds_size = sizeof(domainset_t);
+
+		val = get_config_value_node(node, "domain_policy");
+		if (val != NULL) {
+			if (domainset_parselist(val, md->ds_mask,
+			    &md->ds_policy) != CPUSET_PARSE_OK)
+				errx(EX_USAGE,
+				    "failed to parse domain policy '%s'", val);
+		} else {
+			md->ds_policy = DOMAINSET_POLICY_INVALID;
+			DOMAINSET_ZERO(md->ds_mask);
+		}
+	}
+
+	guest_ndomains = n;
+	if (n == 0) {
+		/*
+		 * Nothing was configured: fall back to one domain that
+		 * holds all CPUs and memory.
+		 */
+		guest_ndomains = 1;
+		guest_domains[0].size = vm_memsize;
+		return;
+	}
+	if (!size_missing)
+		return;
+
+	warnx("At least one domain memory size was not specified, distributing"
+	    " total VM memory size across all domains");
+	for (n = 0; n < guest_ndomains; n++)
+		guest_domains[n].size = vm_memsize / guest_ndomains;
+}
+
/*
* Set the sockets, cores, threads, and guest_cpus variables based on
* the configured topology.
@@ -340,6 +459,56 @@
}
}
+/*
+ * Register vCPU 'cpu' as a member of guest NUMA domain 'dom', or report the
+ * failure and exit.
+ */
+static void
+vcpu_affinity_add_or_exit(int cpu, int dom)
+{
+	int error;
+
+	error = acpi_add_vcpu_affinity(cpu, dom);
+	if (error != 0) {
+		/*
+		 * The failure reason is the return value; errno is not
+		 * reliably set by acpi_add_vcpu_affinity().
+		 */
+		EPRINTLN("Unable to set vCPU %d affinity for domain %d: %s",
+		    cpu, dom, strerror(error));
+		exit(4);
+	}
+}
+
+/*
+ * Register the vCPU-to-domain assignments of every configured guest NUMA
+ * domain, taken from the "cpus" value of each "domains.<N>" config node.
+ *
+ * A configured domain without a "cpus" value is a fatal error.  If there is
+ * a single domain and it has no config node, all guest vCPUs are assigned to
+ * domain 0.
+ */
+static void
+set_vcpu_affinities(void)
+{
+	int cpu;
+	nvlist_t *nvl = NULL;
+	cpuset_t cpus;
+	const char *value;
+	char pathbuf[64] = { 0 };
+
+	for (int dom = 0; dom < guest_ndomains; dom++) {
+		snprintf(pathbuf, sizeof(pathbuf), "domains.%d", dom);
+		nvl = find_config_node(pathbuf);
+		if (nvl == NULL)
+			break;
+
+		value = get_config_value_node(nvl, "cpus");
+		if (value == NULL) {
+			EPRINTLN("Missing CPU set for domain %d", dom);
+			exit(4);
+		}
+
+		parse_cpuset(dom, value, &cpus);
+		CPU_FOREACH_ISSET(cpu, &cpus) {
+			vcpu_affinity_add_or_exit(cpu, dom);
+		}
+	}
+
+	/* 'nvl' is still NULL only if the first lookup found no node. */
+	if (guest_ndomains > 1 || nvl != NULL)
+		return;
+
+	/*
+	 * If we're dealing with one domain and no cpuset was provided, create a
+	 * default one holding all cpus.
+	 */
+	for (cpu = 0; cpu < guest_ncpus; cpu++)
+		vcpu_affinity_add_or_exit(cpu, 0);
+}
+
void *
paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
{
@@ -713,18 +882,21 @@
vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid);
}
+ calc_mem_affinity(memsize);
memflags = 0;
if (get_config_bool_default("memory.wired", false))
memflags |= VM_MEM_F_WIRED;
if (get_config_bool_default("memory.guest_in_core", false))
memflags |= VM_MEM_F_INCORE;
vm_set_memflags(ctx, memflags);
- error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
+ error = vm_setup_memory_domains(ctx, VM_MMAP_ALL, guest_domains,
+ guest_ndomains);
if (error) {
fprintf(stderr, "Unable to setup memory (%d)\n", errno);
exit(4);
}
+ set_vcpu_affinities();
init_mem(guest_ncpus);
init_bootrom(ctx);
if (bhyve_init_platform(ctx, bsp) != 0)
diff --git a/usr.sbin/bhyve/bootrom.c b/usr.sbin/bhyve/bootrom.c
--- a/usr.sbin/bhyve/bootrom.c
+++ b/usr.sbin/bhyve/bootrom.c
@@ -31,6 +31,7 @@
#include <sys/mman.h>
#include <sys/stat.h>
+#include <dev/vmm/vmm_mem.h>
#include <machine/vmm.h>
#include <err.h>
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
--- a/usr.sbin/bhyve/pci_emul.c
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -42,6 +42,7 @@
#include <stdbool.h>
#include <sysexits.h>
+#include <dev/vmm/vmm_mem.h>
#include <machine/vmm.h>
#include <machine/vmm_snapshot.h>
#include <vmmapi.h>
diff --git a/usr.sbin/bhyve/pci_fbuf.c b/usr.sbin/bhyve/pci_fbuf.c
--- a/usr.sbin/bhyve/pci_fbuf.c
+++ b/usr.sbin/bhyve/pci_fbuf.c
@@ -29,6 +29,7 @@
#include <sys/types.h>
#include <sys/mman.h>
+#include <dev/vmm/vmm_mem.h>
#include <machine/vmm.h>
#include <machine/vmm_snapshot.h>
#include <vmmapi.h>
diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c
--- a/usr.sbin/bhyve/pci_passthru.c
+++ b/usr.sbin/bhyve/pci_passthru.c
@@ -38,6 +38,7 @@
#include <dev/io/iodev.h>
#include <dev/pci/pcireg.h>
+#include <dev/vmm/vmm_mem.h>
#include <vm/vm.h>

File Metadata

Mime Type
text/plain
Expires
Sun, Jan 18, 4:29 AM (11 h, 45 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
27704491
Default Alt Text
D44567.diff (16 KB)

Event Timeline