Support any of rdtscp and/or rdpid that are supported by Intel-based hosts
that support the "enable RDTSCP" VM-execution control.
Details
- Reviewers
chuck jhb grehan - Group Reviewers
bhyve - Commits
- rS364340: Support guest rdtscp and rdpid instructions on Intel VT-x
I've only lightly tested this so far and have only been able to test rdtscp
since I don't currently have Intel HW recent enough to support rdpid. I also
don't currently have AMD HW so I wasn't yet able to sanity-check that, on AMD,
these changes have no effect and calls to the libvmmapi
vm_{get,set}_capability() API have the desired outcome (returning the expected
errno values).
That said, in light of the recent request for this feature on
freebsd-virtualization, I thought it'd still be useful to post this with what testing
I've gotten to so far.
My simple test has just been running a 4-CPU RELEASE guest on a patched 4-CPU
CURRENT host with the guest's vCPUs pinned to the host's CPUs in opposing CPU
numbering order, and then running the below simple test program on both the host
and the guest (usually via gnu-watch). (The script I used to launch this guest
is also below).
Since, in this scenario, both the host and the guest are writing
PCPU_GET(cpuid) to each TSC_AUX register, I figure the reverse pinning should
ensure that the guest writes a different value to all of its TSC_AUX registers.
This seemed like a simple way to confirm that the guest and host do, indeed,
effectively have their own TSC_AUX registers---guest values do show up on
the guest, guest values don't show up on the host, and vice versa from the
host's perspective.
I also made the following one-off change to the host's initcpu.c to give each
host TSC a visibly different value so that I could eyeball that the guest's
vCPUs were, indeed, running on host CPUs with different TSC_AUX values:
@@ -284,6 +284,8 @@ initializecpu(void) if ((amd_feature & AMDID_RDTSCP) != 0 || (cpu_stdext_feature2 & CPUID_STDEXT2_RDPID) != 0) wrmsr(MSR_TSC_AUX, PCPU_GET(cpuid)); + + wrmsr(MSR_TSC, ((uint64_t)PCPU_GET(cpuid)) << 48); }
(Many thanks to @grehan for his help with this work)!
#include <sys/types.h> #include <sys/sysctl.h> #include <assert.h> #include <stdbool.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <strings.h> #include <pthread.h> #include <pthread_np.h> struct testthrcb { pthread_t thr; #ifdef HAVE_RDPID uint64_t aux_rdpid; #endif uint32_t aux_rdtscp; uint64_t tsc_rdtsc; uint64_t tsc_rdtscp; }; static bool testthrs_enabled; static pthread_mutex_t testthrs_enabled_lock; static pthread_cond_t testthrs_enabled_cond; static void *run_testthr(void *arg); #ifdef HAVE_RDPID static inline void rdpid(uint64_t *pid) { asm volatile ("rdpid %0" : "=r" (*pid)); } #endif static inline void rdtsc(uint64_t *tsc) { uint32_t *tsc_high, *tsc_low; tsc_high = ((uint32_t *)tsc) + 1; tsc_low = (uint32_t *)tsc; asm volatile ("rdtsc" : "=a" (*tsc_low), "=d" (*tsc_high)); } static inline void rdtscp(uint64_t *tsc, uint32_t *pid) { uint32_t *tsc_high, *tsc_low; tsc_high = ((uint32_t *)tsc) + 1; tsc_low = (uint32_t *)tsc; asm volatile ("rdtscp" : "=a" (*tsc_low), "=d" (*tsc_high), "=c" (*pid)); } static void * run_testthr(void *arg) { struct testthrcb *tcb; int err; err = pthread_mutex_lock(&testthrs_enabled_lock); assert(err == 0); while (!testthrs_enabled) { err = pthread_cond_wait(&testthrs_enabled_cond, &testthrs_enabled_lock); assert(err == 0); } err = pthread_mutex_unlock(&testthrs_enabled_lock); assert(err == 0); tcb = (struct testthrcb *)arg; #ifdef HAVE_RDPID rdpid(&tcb->aux_rdpid); #endif rdtscp(&tcb->tsc_rdtscp, &tcb->aux_rdtscp); rdtsc(&tcb->tsc_rdtsc); asm volatile ("mfence"); return (NULL); } int main(int argc, const char **argv) { pthread_attr_t attr; cpuset_t cpuset; struct testthrcb *tcbs; size_t len; int err, i, ncpus; testthrs_enabled = false; err = pthread_mutex_init(&testthrs_enabled_lock, NULL); assert(err == 0); err = pthread_cond_init(&testthrs_enabled_cond, NULL); assert(err == 0); len = sizeof(ncpus); err = sysctlbyname("hw.ncpu", &ncpus, &len, NULL, 0); assert(err == 0); tcbs = (struct testthrcb *)malloc(ncpus * sizeof(*tcbs)); assert(tcbs != NULL); bzero(tcbs, ncpus * sizeof(*tcbs)); for (i = 0; i < ncpus; i++) { err = pthread_attr_init(&attr); assert(err == 0); CPU_ZERO(&cpuset); CPU_SET(i, &cpuset); err = pthread_attr_setaffinity_np(&attr, sizeof(cpuset), &cpuset); assert(err == 0); err = pthread_create(&tcbs[i].thr, &attr, run_testthr, &tcbs[i]); assert(err == 0); err = pthread_attr_destroy(&attr); assert(err == 0); } err = pthread_mutex_lock(&testthrs_enabled_lock); assert(err == 0); testthrs_enabled = true; err = pthread_cond_broadcast(&testthrs_enabled_cond); assert(err == 0); err = pthread_mutex_unlock(&testthrs_enabled_lock); assert(err == 0); for (i = 0; i < ncpus; i++) { err = pthread_join(tcbs[i].thr, NULL); assert(err == 0); } for (i = 0; i < ncpus; i++) { #ifdef HAVE_RDPID printf("CPU %d: aux_rdpid %ju, aux_rdtscp %u, tsc_rdtscp " "%#0jx, tsc_rdtsc %#0jx, delta %jd\n", i, tcbs[i].aux_rdpid, tcbs[i].aux_rdtscp, tcbs[i].tsc_rdtscp, tcbs[i].tsc_rdtsc, (int64_t)(tcbs[i].tsc_rdtsc - tcbs[i].tsc_rdtscp)); #else printf("CPU %d: aux %u, tsc_rdtscp %#0jx, tsc_rdtsc " "%#0jx, delta %jd\n", i, tcbs[i].aux_rdtscp, tcbs[i].tsc_rdtscp, tcbs[i].tsc_rdtsc, (int64_t)(tcbs[i].tsc_rdtsc - tcbs[i].tsc_rdtscp)); #endif } printf("\n"); for (i = 0; i < ncpus; i++) { #ifdef HAVE_RDPID printf("CPU %d: aux_rdpid %ju, aux_rdtscp %u, tsc_rdtscp " "%ju, tsc_rdtsc %ju, delta %jd\n", i, tcbs[i].aux_rdpid, tcbs[i].aux_rdtscp, tcbs[i].tsc_rdtscp, tcbs[i].tsc_rdtsc, (int64_t)(tcbs[i].tsc_rdtsc - tcbs[i].tsc_rdtscp)); #else printf("CPU %d: aux %u, tsc_rdtscp %ju, tsc_rdtsc %ju, " "delta %jd\n", i, tcbs[i].aux_rdtscp, tcbs[i].tsc_rdtscp, tcbs[i].tsc_rdtsc, (int64_t)(tcbs[i].tsc_rdtsc - tcbs[i].tsc_rdtscp)); #endif } free(tcbs); err = pthread_cond_destroy(&testthrs_enabled_cond); assert(err == 0); err = pthread_mutex_destroy(&testthrs_enabled_lock); assert(err == 0); return (0); }
#!/bin/sh # References: # - 'bhyve(8)' # - 'bhyvectl(8)' # - '/usr/share/examples/bhyve/vmrun.sh' VMNAME=rdtscp NCPUS=1 RAMSIZE=128M DISKIMAGE="${HOME}/rdtscp.raw" TAP_DEV=tap0 TAP_MAC="00:a0:98:df:46:01" CONS_DEV=stdio sudo bhyvectl --vm=${VMNAME} --destroy > /dev/null 2>&1 sudo bhyveload \ -c ${CONS_DEV} \ -m ${RAMSIZE} \ -d ${DISKIMAGE} \ ${VMNAME} sudo bhyve \ -A \ -H \ -P \ -c 4 \ -p 0:3 \ -p 1:2 \ -p 2:1 \ -p 3:0 \ -m ${RAMSIZE} \ -s 0,hostbridge \ -s 1,lpc \ -s 2,virtio-blk,${DISKIMAGE} \ -s 3,virtio-net,${TAP_DEV},mac=${TAP_MAC} \ -l com1,${CONS_DEV} \ ${VMNAME} BHYVE_EXIT=$? echo -n "bhyve exited: " case ${BHYVE_EXIT} in 0) echo "reboot (0)" ;; 1) echo "powered off (1)" ;; 2) echo "halted (2)" ;; 3) echo "TRIPLE-FAULT (3)" ;; 4) echo "an error occurred (4)" ;; *) echo "UNKNOWN REASON: ${BHYVE_EXIT}" ;; esac case ${BHYVE_EXIT} in 0|1|2) # Cleanup /dev/vmm entry when bhyve did not exit # due to an error. echo -e Destroying VM... sudo bhyvectl --vm=${VMNAME} --destroy > /dev/null 2>&1 echo done ;; esac exit ${BHYVE_EXIT}
Diff Detail
- Repository
- rS FreeBSD src repository - subversion
- Lint
Lint Not Applicable - Unit
Tests Not Applicable
Event Timeline
We've successfully tested a slightly modified version this patch with a CentOS-based VM that used RDTSCP on 12.1-RELEASE. Without the patch, the appliance image core dumps. With this patch, it runs correctly. Checked that module unload works. Thank you for fixing this!
sys/amd64/vmm/intel/vmx.c | ||
---|---|---|
3091 ↗ | (On Diff #75594) | I'm still undecided as to whether the debug register save/restore should be closer to the enter_guest call than the tsx_aux save/restore. jhb, any thoughts ? |
In general looks ok to me.
sys/amd64/vmm/intel/vmx.c | ||
---|---|---|
3091 ↗ | (On Diff #75594) |
Hmmm, I think the only argument for doing it before debug is that it means in theory you could use DDB to single step through vmx_guest_enter_tsc_aux(). In practice I'm not sure it really matters all that much? I think though I might lean towards doing the tsc_aux save/restore "outside" of the debug registers. I would perhaps even do the save before vmx_run_trace(). |
I agree btw with not having these be per-vCPU capabilities that can be toggled. It would be really surprising to a guest for rdtscp to transition from "working" to "not-working". OS's expect rdtscp to either work or not work for the duration of a given boot.
As a point of reference, in illumos bhyve, we're moving capabilities like that to being set per-VM rather than per-vCPU and in many cases, making their value fixed as of VM creation time.