Authored By
gusev.vitaliy_gmail.com
Dec 17 2020, 4:00 PM

multiple-devices-v1.patch

commit 3b241811a0b0bc3da140b321cc1a53f850535d40
Author: Vitaliy Gusev <gusev.vitaliy@gmail.com>
Date: Thu Nov 19 12:07:18 2020 +0300
bhyve multiple devices
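(Editorial note: snapshot.h itself is not part of this excerpt. From the call
sites in the diff -- the snapshot_ops table in atkbdc.c and the
register_snapshot_dev() call in atkbdc_init() -- the interface the patch
assumes looks roughly like the following sketch; the names and signatures are
inferred from usage, not confirmed by the excerpt:

struct vm_snapshot_meta;

struct snapshot_ops {
	int (*snapshot_cb)(struct vm_snapshot_meta *meta, void *cbdata);
};

void	register_snapshot_dev(const char *name, struct snapshot_ops *ops,
	    void *cbdata);

Passing the softc as cbdata lets each registered callback receive its own
device instance, which is what allows the diff below to drop the file-scope
atkbdc_sc singleton and its assert.)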
diff --git a/usr.sbin/bhyve/atkbdc.c b/usr.sbin/bhyve/atkbdc.c
index a08f58f84b22..fea0b1386fe2 100644
--- a/usr.sbin/bhyve/atkbdc.c
+++ b/usr.sbin/bhyve/atkbdc.c
@@ -1,632 +1,638 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* Copyright (c) 2015 Nahanni Systems Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/vmm.h>
#include <machine/vmm_snapshot.h>
#include <vmmapi.h>
#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include <pthread_np.h>
#include "acpi.h"
#include "atkbdc.h"
#include "inout.h"
#include "pci_emul.h"
#include "pci_irq.h"
#include "pci_lpc.h"
#include "ps2kbd.h"
#include "ps2mouse.h"
+#ifdef BHYVE_SNAPSHOT
+#include "snapshot.h"
+#endif
+
#define KBD_DATA_PORT 0x60
#define KBD_STS_CTL_PORT 0x64
#define KBDC_RESET 0xfe
#define KBD_DEV_IRQ 1
#define AUX_DEV_IRQ 12
/* controller commands */
#define KBDC_SET_COMMAND_BYTE 0x60
#define KBDC_GET_COMMAND_BYTE 0x20
#define KBDC_DISABLE_AUX_PORT 0xa7
#define KBDC_ENABLE_AUX_PORT 0xa8
#define KBDC_TEST_AUX_PORT 0xa9
#define KBDC_TEST_CTRL 0xaa
#define KBDC_TEST_KBD_PORT 0xab
#define KBDC_DISABLE_KBD_PORT 0xad
#define KBDC_ENABLE_KBD_PORT 0xae
#define KBDC_READ_INPORT 0xc0
#define KBDC_READ_OUTPORT 0xd0
#define KBDC_WRITE_OUTPORT 0xd1
#define KBDC_WRITE_KBD_OUTBUF 0xd2
#define KBDC_WRITE_AUX_OUTBUF 0xd3
#define KBDC_WRITE_TO_AUX 0xd4
/* controller command byte (set by KBDC_SET_COMMAND_BYTE) */
#define KBD_TRANSLATION 0x40
#define KBD_SYS_FLAG_BIT 0x04
#define KBD_DISABLE_KBD_PORT 0x10
#define KBD_DISABLE_AUX_PORT 0x20
#define KBD_ENABLE_AUX_INT 0x02
#define KBD_ENABLE_KBD_INT 0x01
#define KBD_KBD_CONTROL_BITS (KBD_DISABLE_KBD_PORT | KBD_ENABLE_KBD_INT)
#define KBD_AUX_CONTROL_BITS (KBD_DISABLE_AUX_PORT | KBD_ENABLE_AUX_INT)
/* controller status bits */
#define KBDS_KBD_BUFFER_FULL 0x01
#define KBDS_SYS_FLAG 0x04
#define KBDS_CTRL_FLAG 0x08
#define KBDS_AUX_BUFFER_FULL 0x20
/* controller output port */
#define KBDO_KBD_OUTFULL 0x10
#define KBDO_AUX_OUTFULL 0x20
#define RAMSZ 32
#define FIFOSZ 15
#define CTRL_CMD_FLAG 0x8000
struct kbd_dev {
bool irq_active;
int irq;
uint8_t buffer[FIFOSZ];
int brd, bwr;
int bcnt;
};
struct aux_dev {
bool irq_active;
int irq;
};
struct atkbdc_softc {
struct vmctx *ctx;
pthread_mutex_t mtx;
struct ps2kbd_softc *ps2kbd_sc;
struct ps2mouse_softc *ps2mouse_sc;
uint8_t status; /* status register */
uint8_t outport; /* controller output port */
uint8_t ram[RAMSZ]; /* byte0 = controller config */
uint32_t curcmd; /* current command for next byte */
uint32_t ctrlbyte;
struct kbd_dev kbd;
struct aux_dev aux;
};
-#ifdef BHYVE_SNAPSHOT
-static struct atkbdc_softc *atkbdc_sc = NULL;
-#endif
-
static void
atkbdc_assert_kbd_intr(struct atkbdc_softc *sc)
{
if ((sc->ram[0] & KBD_ENABLE_KBD_INT) != 0) {
sc->kbd.irq_active = true;
vm_isa_pulse_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq);
}
}
static void
atkbdc_assert_aux_intr(struct atkbdc_softc *sc)
{
if ((sc->ram[0] & KBD_ENABLE_AUX_INT) != 0) {
sc->aux.irq_active = true;
vm_isa_pulse_irq(sc->ctx, sc->aux.irq, sc->aux.irq);
}
}
static int
atkbdc_kbd_queue_data(struct atkbdc_softc *sc, uint8_t val)
{
assert(pthread_mutex_isowned_np(&sc->mtx));
if (sc->kbd.bcnt < FIFOSZ) {
sc->kbd.buffer[sc->kbd.bwr] = val;
sc->kbd.bwr = (sc->kbd.bwr + 1) % FIFOSZ;
sc->kbd.bcnt++;
sc->status |= KBDS_KBD_BUFFER_FULL;
sc->outport |= KBDO_KBD_OUTFULL;
} else {
printf("atkbd data buffer full\n");
}
return (sc->kbd.bcnt < FIFOSZ);
}
static void
atkbdc_kbd_read(struct atkbdc_softc *sc)
{
const uint8_t translation[256] = {
0xff, 0x43, 0x41, 0x3f, 0x3d, 0x3b, 0x3c, 0x58,
0x64, 0x44, 0x42, 0x40, 0x3e, 0x0f, 0x29, 0x59,
0x65, 0x38, 0x2a, 0x70, 0x1d, 0x10, 0x02, 0x5a,
0x66, 0x71, 0x2c, 0x1f, 0x1e, 0x11, 0x03, 0x5b,
0x67, 0x2e, 0x2d, 0x20, 0x12, 0x05, 0x04, 0x5c,
0x68, 0x39, 0x2f, 0x21, 0x14, 0x13, 0x06, 0x5d,
0x69, 0x31, 0x30, 0x23, 0x22, 0x15, 0x07, 0x5e,
0x6a, 0x72, 0x32, 0x24, 0x16, 0x08, 0x09, 0x5f,
0x6b, 0x33, 0x25, 0x17, 0x18, 0x0b, 0x0a, 0x60,
0x6c, 0x34, 0x35, 0x26, 0x27, 0x19, 0x0c, 0x61,
0x6d, 0x73, 0x28, 0x74, 0x1a, 0x0d, 0x62, 0x6e,
0x3a, 0x36, 0x1c, 0x1b, 0x75, 0x2b, 0x63, 0x76,
0x55, 0x56, 0x77, 0x78, 0x79, 0x7a, 0x0e, 0x7b,
0x7c, 0x4f, 0x7d, 0x4b, 0x47, 0x7e, 0x7f, 0x6f,
0x52, 0x53, 0x50, 0x4c, 0x4d, 0x48, 0x01, 0x45,
0x57, 0x4e, 0x51, 0x4a, 0x37, 0x49, 0x46, 0x54,
0x80, 0x81, 0x82, 0x41, 0x54, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
};
uint8_t val;
uint8_t release = 0;
assert(pthread_mutex_isowned_np(&sc->mtx));
if (sc->ram[0] & KBD_TRANSLATION) {
while (ps2kbd_read(sc->ps2kbd_sc, &val) != -1) {
if (val == 0xf0) {
release = 0x80;
continue;
} else {
val = translation[val] | release;
}
atkbdc_kbd_queue_data(sc, val);
break;
}
} else {
while (sc->kbd.bcnt < FIFOSZ) {
if (ps2kbd_read(sc->ps2kbd_sc, &val) != -1)
atkbdc_kbd_queue_data(sc, val);
else
break;
}
}
if (((sc->ram[0] & KBD_DISABLE_AUX_PORT) ||
ps2mouse_fifocnt(sc->ps2mouse_sc) == 0) && sc->kbd.bcnt > 0)
atkbdc_assert_kbd_intr(sc);
}
static void
atkbdc_aux_poll(struct atkbdc_softc *sc)
{
if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0) {
sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL;
sc->outport |= KBDO_AUX_OUTFULL;
atkbdc_assert_aux_intr(sc);
}
}
static void
atkbdc_kbd_poll(struct atkbdc_softc *sc)
{
assert(pthread_mutex_isowned_np(&sc->mtx));
atkbdc_kbd_read(sc);
}
static void
atkbdc_poll(struct atkbdc_softc *sc)
{
atkbdc_aux_poll(sc);
atkbdc_kbd_poll(sc);
}
static void
atkbdc_dequeue_data(struct atkbdc_softc *sc, uint8_t *buf)
{
assert(pthread_mutex_isowned_np(&sc->mtx));
if (ps2mouse_read(sc->ps2mouse_sc, buf) == 0) {
if (ps2mouse_fifocnt(sc->ps2mouse_sc) == 0) {
if (sc->kbd.bcnt == 0)
sc->status &= ~(KBDS_AUX_BUFFER_FULL |
KBDS_KBD_BUFFER_FULL);
else
sc->status &= ~(KBDS_AUX_BUFFER_FULL);
sc->outport &= ~KBDO_AUX_OUTFULL;
}
atkbdc_poll(sc);
return;
}
if (sc->kbd.bcnt > 0) {
*buf = sc->kbd.buffer[sc->kbd.brd];
sc->kbd.brd = (sc->kbd.brd + 1) % FIFOSZ;
sc->kbd.bcnt--;
if (sc->kbd.bcnt == 0) {
sc->status &= ~KBDS_KBD_BUFFER_FULL;
sc->outport &= ~KBDO_KBD_OUTFULL;
}
atkbdc_poll(sc);
}
if (ps2mouse_fifocnt(sc->ps2mouse_sc) == 0 && sc->kbd.bcnt == 0) {
sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL);
}
}
static int
atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
struct atkbdc_softc *sc;
uint8_t buf;
int retval;
if (bytes != 1)
return (-1);
sc = arg;
retval = 0;
pthread_mutex_lock(&sc->mtx);
if (in) {
sc->curcmd = 0;
if (sc->ctrlbyte != 0) {
*eax = sc->ctrlbyte & 0xff;
sc->ctrlbyte = 0;
} else {
/* read device buffer; includes kbd cmd responses */
atkbdc_dequeue_data(sc, &buf);
*eax = buf;
}
sc->status &= ~KBDS_CTRL_FLAG;
pthread_mutex_unlock(&sc->mtx);
return (retval);
}
if (sc->status & KBDS_CTRL_FLAG) {
/*
* Command byte for the controller.
*/
switch (sc->curcmd) {
case KBDC_SET_COMMAND_BYTE:
sc->ram[0] = *eax;
if (sc->ram[0] & KBD_SYS_FLAG_BIT)
sc->status |= KBDS_SYS_FLAG;
else
sc->status &= ~KBDS_SYS_FLAG;
break;
case KBDC_WRITE_OUTPORT:
sc->outport = *eax;
break;
case KBDC_WRITE_TO_AUX:
ps2mouse_write(sc->ps2mouse_sc, *eax, 0);
atkbdc_poll(sc);
break;
case KBDC_WRITE_KBD_OUTBUF:
atkbdc_kbd_queue_data(sc, *eax);
break;
case KBDC_WRITE_AUX_OUTBUF:
ps2mouse_write(sc->ps2mouse_sc, *eax, 1);
sc->status |= (KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL);
atkbdc_aux_poll(sc);
break;
default:
/* write to particular RAM byte */
if (sc->curcmd >= 0x61 && sc->curcmd <= 0x7f) {
int byten;
byten = (sc->curcmd - 0x60) & 0x1f;
sc->ram[byten] = *eax & 0xff;
}
break;
}
sc->curcmd = 0;
sc->status &= ~KBDS_CTRL_FLAG;
pthread_mutex_unlock(&sc->mtx);
return (retval);
}
/*
* Data byte for the device.
*/
ps2kbd_write(sc->ps2kbd_sc, *eax);
atkbdc_poll(sc);
pthread_mutex_unlock(&sc->mtx);
return (retval);
}
static int
atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port,
int bytes, uint32_t *eax, void *arg)
{
struct atkbdc_softc *sc;
int error, retval;
if (bytes != 1)
return (-1);
sc = arg;
retval = 0;
pthread_mutex_lock(&sc->mtx);
if (in) {
/* read status register */
*eax = sc->status;
pthread_mutex_unlock(&sc->mtx);
return (retval);
}
sc->curcmd = 0;
sc->status |= KBDS_CTRL_FLAG;
sc->ctrlbyte = 0;
switch (*eax) {
case KBDC_GET_COMMAND_BYTE:
sc->ctrlbyte = CTRL_CMD_FLAG | sc->ram[0];
break;
case KBDC_TEST_CTRL:
sc->ctrlbyte = CTRL_CMD_FLAG | 0x55;
break;
case KBDC_TEST_AUX_PORT:
case KBDC_TEST_KBD_PORT:
sc->ctrlbyte = CTRL_CMD_FLAG | 0;
break;
case KBDC_READ_INPORT:
sc->ctrlbyte = CTRL_CMD_FLAG | 0;
break;
case KBDC_READ_OUTPORT:
sc->ctrlbyte = CTRL_CMD_FLAG | sc->outport;
break;
case KBDC_SET_COMMAND_BYTE:
case KBDC_WRITE_OUTPORT:
case KBDC_WRITE_KBD_OUTBUF:
case KBDC_WRITE_AUX_OUTBUF:
sc->curcmd = *eax;
break;
case KBDC_DISABLE_KBD_PORT:
sc->ram[0] |= KBD_DISABLE_KBD_PORT;
break;
case KBDC_ENABLE_KBD_PORT:
sc->ram[0] &= ~KBD_DISABLE_KBD_PORT;
if (sc->kbd.bcnt > 0)
sc->status |= KBDS_KBD_BUFFER_FULL;
atkbdc_poll(sc);
break;
case KBDC_WRITE_TO_AUX:
sc->curcmd = *eax;
break;
case KBDC_DISABLE_AUX_PORT:
sc->ram[0] |= KBD_DISABLE_AUX_PORT;
ps2mouse_toggle(sc->ps2mouse_sc, 0);
sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL);
sc->outport &= ~KBDS_AUX_BUFFER_FULL;
break;
case KBDC_ENABLE_AUX_PORT:
sc->ram[0] &= ~KBD_DISABLE_AUX_PORT;
ps2mouse_toggle(sc->ps2mouse_sc, 1);
if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0)
sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL;
break;
case KBDC_RESET: /* Pulse "reset" line */
error = vm_suspend(ctx, VM_SUSPEND_RESET);
assert(error == 0 || errno == EALREADY);
break;
default:
if (*eax >= 0x21 && *eax <= 0x3f) {
/* read "byte N" from RAM */
int byten;
byten = (*eax - 0x20) & 0x1f;
sc->ctrlbyte = CTRL_CMD_FLAG | sc->ram[byten];
}
break;
}
pthread_mutex_unlock(&sc->mtx);
if (sc->ctrlbyte != 0) {
sc->status |= KBDS_KBD_BUFFER_FULL;
sc->status &= ~KBDS_AUX_BUFFER_FULL;
atkbdc_assert_kbd_intr(sc);
} else if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0 &&
(sc->ram[0] & KBD_DISABLE_AUX_PORT) == 0) {
sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL;
atkbdc_assert_aux_intr(sc);
} else if (sc->kbd.bcnt > 0 && (sc->ram[0] & KBD_DISABLE_KBD_PORT) == 0) {
sc->status |= KBDS_KBD_BUFFER_FULL;
atkbdc_assert_kbd_intr(sc);
}
return (retval);
}
void
atkbdc_event(struct atkbdc_softc *sc, int iskbd)
{
pthread_mutex_lock(&sc->mtx);
if (iskbd)
atkbdc_kbd_poll(sc);
else
atkbdc_aux_poll(sc);
pthread_mutex_unlock(&sc->mtx);
}
+#ifdef BHYVE_SNAPSHOT
+extern int atkbdc_snapshot(struct vm_snapshot_meta *, void *);
+
+static struct snapshot_ops atkbdc_snapshot_ops = {
+ .snapshot_cb = atkbdc_snapshot,
+};
+#endif
+
void
atkbdc_init(struct vmctx *ctx)
{
struct inout_port iop;
struct atkbdc_softc *sc;
int error;
sc = calloc(1, sizeof(struct atkbdc_softc));
sc->ctx = ctx;
pthread_mutex_init(&sc->mtx, NULL);
bzero(&iop, sizeof(struct inout_port));
iop.name = "atkdbc";
iop.port = KBD_STS_CTL_PORT;
iop.size = 1;
iop.flags = IOPORT_F_INOUT;
iop.handler = atkbdc_sts_ctl_handler;
iop.arg = sc;
error = register_inout(&iop);
assert(error == 0);
bzero(&iop, sizeof(struct inout_port));
iop.name = "atkdbc";
iop.port = KBD_DATA_PORT;
iop.size = 1;
iop.flags = IOPORT_F_INOUT;
iop.handler = atkbdc_data_handler;
iop.arg = sc;
error = register_inout(&iop);
assert(error == 0);
pci_irq_reserve(KBD_DEV_IRQ);
sc->kbd.irq = KBD_DEV_IRQ;
pci_irq_reserve(AUX_DEV_IRQ);
sc->aux.irq = AUX_DEV_IRQ;
sc->ps2kbd_sc = ps2kbd_init(sc);
sc->ps2mouse_sc = ps2mouse_init(sc);
#ifdef BHYVE_SNAPSHOT
- assert(atkbdc_sc == NULL);
- atkbdc_sc = sc;
+ register_snapshot_dev("atkbdc", &atkbdc_snapshot_ops, sc);
#endif
}
#ifdef BHYVE_SNAPSHOT
int
-atkbdc_snapshot(struct vm_snapshot_meta *meta)
+atkbdc_snapshot(struct vm_snapshot_meta *meta, void *cbdata)
{
int ret;
-
- SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->status, meta, ret, done);
- SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->outport, meta, ret, done);
- SNAPSHOT_BUF_OR_LEAVE(atkbdc_sc->ram,
- sizeof(atkbdc_sc->ram), meta, ret, done);
- SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->curcmd, meta, ret, done);
- SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->ctrlbyte, meta, ret, done);
- SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd, meta, ret, done);
-
- SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.irq_active, meta, ret, done);
- SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.irq, meta, ret, done);
- SNAPSHOT_BUF_OR_LEAVE(atkbdc_sc->kbd.buffer,
- sizeof(atkbdc_sc->kbd.buffer), meta, ret, done);
- SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.brd, meta, ret, done);
- SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.bwr, meta, ret, done);
- SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.bcnt, meta, ret, done);
-
- SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->aux.irq_active, meta, ret, done);
- SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->aux.irq, meta, ret, done);
-
- ret = ps2kbd_snapshot(atkbdc_sc->ps2kbd_sc, meta);
- if (ret != 0)
- goto done;
-
- ret = ps2mouse_snapshot(atkbdc_sc->ps2mouse_sc, meta);
-
+ struct atkbdc_softc *sc = (struct atkbdc_softc *)cbdata;
+
+ assert(sc != NULL);
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->status, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->outport, meta, ret, done);
+ SNAPSHOT_BUF_OR_LEAVE(sc->ram, sizeof (sc->ram), meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->curcmd, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->ctrlbyte, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->kbd, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->kbd.irq_active, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->kbd.irq, meta, ret, done);
+ SNAPSHOT_BUF_OR_LEAVE(sc->kbd.buffer,
+ sizeof (sc->kbd.buffer), meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->kbd.brd, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->kbd.bwr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->kbd.bcnt, meta, ret, done);
+
+ SNAPSHOT_VAR_OR_LEAVE(sc->aux.irq_active, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(sc->aux.irq, meta, ret, done);
+
+ ret = ps2kbd_snapshot(sc->ps2kbd_sc, meta);
+ if (ret == 0)
+ ret = ps2mouse_snapshot(sc->ps2mouse_sc, meta);
done:
return (ret);
}
#endif
static void
atkbdc_dsdt(void)
{
dsdt_line("");
dsdt_line("Device (KBD)");
dsdt_line("{");
dsdt_line(" Name (_HID, EisaId (\"PNP0303\"))");
dsdt_line(" Name (_CRS, ResourceTemplate ()");
dsdt_line(" {");
dsdt_indent(2);
dsdt_fixed_ioport(KBD_DATA_PORT, 1);
dsdt_fixed_ioport(KBD_STS_CTL_PORT, 1);
dsdt_fixed_irq(1);
dsdt_unindent(2);
dsdt_line(" })");
dsdt_line("}");
dsdt_line("");
dsdt_line("Device (MOU)");
dsdt_line("{");
dsdt_line(" Name (_HID, EisaId (\"PNP0F13\"))");
dsdt_line(" Name (_CRS, ResourceTemplate ()");
dsdt_line(" {");
dsdt_indent(2);
dsdt_fixed_ioport(KBD_DATA_PORT, 1);
dsdt_fixed_ioport(KBD_STS_CTL_PORT, 1);
dsdt_fixed_irq(12);
dsdt_unindent(2);
dsdt_line(" })");
dsdt_line("}");
}
LPC_DSDT(atkbdc_dsdt);
diff --git a/usr.sbin/bhyve/atkbdc.h b/usr.sbin/bhyve/atkbdc.h
index 14c00ed9ae88..85c8a7141eb2 100644
--- a/usr.sbin/bhyve/atkbdc.h
+++ b/usr.sbin/bhyve/atkbdc.h
@@ -1,43 +1,38 @@
/*-
* Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _ATKBDC_H_
#define _ATKBDC_H_
struct atkbdc_softc;
-struct vm_snapshot_meta;
struct vmctx;
void atkbdc_init(struct vmctx *ctx);
void atkbdc_event(struct atkbdc_softc *sc, int iskbd);
-#ifdef BHYVE_SNAPSHOT
-int atkbdc_snapshot(struct vm_snapshot_meta *meta);
-#endif
-
#endif /* _ATKBDC_H_ */
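(Editorial note: the registry behind register_snapshot_dev() lives in
snapshot.c, which is not shown in this excerpt. A minimal sketch of one
plausible implementation, assuming a simple singly-linked list and
registration only from the single-threaded init path, could be:

#include <sys/queue.h>
#include <assert.h>
#include <stdlib.h>

struct snapshot_dev {
	const char *name;		/* lookup key in the restore file */
	struct snapshot_ops *ops;	/* device snapshot callbacks */
	void *cbdata;			/* per-instance softc */
	SLIST_ENTRY(snapshot_dev) link;
};

static SLIST_HEAD(, snapshot_dev) snapshot_devs =
    SLIST_HEAD_INITIALIZER(snapshot_devs);

void
register_snapshot_dev(const char *name, struct snapshot_ops *ops,
    void *cbdata)
{
	struct snapshot_dev *dev;

	dev = calloc(1, sizeof(*dev));
	assert(dev != NULL);
	dev->name = name;
	dev->ops = ops;
	dev->cbdata = cbdata;
	SLIST_INSERT_HEAD(&snapshot_devs, dev, link);
}

This is only an illustration of the shape of the mechanism; the actual
snapshot.c may differ.)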
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
index bca5f44c2b40..20dc4e53cf13 100644
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -348,1113 +348,1113 @@ topology_parse(const char *opt)
if (ncpus > UINT16_MAX || (ns && n != ncpus))
return (-1);
guest_ncpus = ncpus;
sockets = s;
cores = c;
threads = t;
return(0);
out:
free(str);
return (-1);
}
static int
pincpu_parse(const char *opt)
{
int vcpu, pcpu;
if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) {
fprintf(stderr, "invalid format: %s\n", opt);
return (-1);
}
if (vcpu < 0 || vcpu >= VM_MAXCPU) {
fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n",
vcpu, VM_MAXCPU - 1);
return (-1);
}
if (pcpu < 0 || pcpu >= CPU_SETSIZE) {
fprintf(stderr, "hostcpu '%d' outside valid range from "
"0 to %d\n", pcpu, CPU_SETSIZE - 1);
return (-1);
}
if (vcpumap[vcpu] == NULL) {
if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) {
perror("malloc");
return (-1);
}
CPU_ZERO(vcpumap[vcpu]);
}
CPU_SET(pcpu, vcpumap[vcpu]);
return (0);
}
void
vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
int errcode)
{
struct vmctx *ctx;
int error, restart_instruction;
ctx = arg;
restart_instruction = 1;
error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode,
restart_instruction);
assert(error == 0);
}
void *
paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
{
return (vm_map_gpa(ctx, gaddr, len));
}
#ifdef BHYVE_SNAPSHOT
uintptr_t
paddr_host2guest(struct vmctx *ctx, void *addr)
{
return (vm_rev_map_gpa(ctx, addr));
}
#endif
int
fbsdrun_vmexit_on_pause(void)
{
return (guest_vmexit_on_pause);
}
int
fbsdrun_vmexit_on_hlt(void)
{
return (guest_vmexit_on_hlt);
}
int
fbsdrun_virtio_msix(void)
{
return (virtio_msix);
}
static void *
fbsdrun_start_thread(void *param)
{
char tname[MAXCOMLEN + 1];
struct mt_vmm_info *mtp;
int vcpu;
mtp = param;
vcpu = mtp->mt_vcpu;
snprintf(tname, sizeof(tname), "vcpu %d", vcpu);
pthread_set_name_np(mtp->mt_thr, tname);
#ifdef BHYVE_SNAPSHOT
checkpoint_cpu_add(vcpu);
#endif
if (gdb_port != 0)
gdb_cpu_add(vcpu);
vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);
/* not reached */
exit(1);
return (NULL);
}
void
fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip)
{
int error;
assert(fromcpu == BSP);
/*
* The 'newcpu' must be activated in the context of 'fromcpu'. If
* vm_activate_cpu() is delayed until newcpu's pthread starts running
* then vmm.ko is out-of-sync with bhyve and this can create a race
* with vm_suspend().
*/
error = vm_activate_cpu(ctx, newcpu);
if (error != 0)
err(EX_OSERR, "could not activate CPU %d", newcpu);
CPU_SET_ATOMIC(newcpu, &cpumask);
/*
* Set up the vmexit struct to allow execution to start
* at the given RIP
*/
vmexit[newcpu].rip = rip;
vmexit[newcpu].inst_length = 0;
mt_vmm_info[newcpu].mt_ctx = ctx;
mt_vmm_info[newcpu].mt_vcpu = newcpu;
error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL,
fbsdrun_start_thread, &mt_vmm_info[newcpu]);
assert(error == 0);
}
static int
fbsdrun_deletecpu(struct vmctx *ctx, int vcpu)
{
if (!CPU_ISSET(vcpu, &cpumask)) {
fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu);
exit(4);
}
CPU_CLR_ATOMIC(vcpu, &cpumask);
return (CPU_EMPTY(&cpumask));
}
static int
vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
uint32_t eax)
{
#if BHYVE_DEBUG
/*
* put guest-driven debug here
*/
#endif
return (VMEXIT_CONTINUE);
}
static int
vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
int error;
int bytes, port, in, out;
int vcpu;
vcpu = *pvcpu;
port = vme->u.inout.port;
bytes = vme->u.inout.bytes;
in = vme->u.inout.in;
out = !in;
/* Extra-special case of host notifications */
if (out && port == GUEST_NIO_PORT) {
error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax);
return (error);
}
error = emulate_inout(ctx, vcpu, vme, strictio);
if (error) {
fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n",
in ? "in" : "out",
bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'),
port, vme->rip);
return (VMEXIT_ABORT);
} else {
return (VMEXIT_CONTINUE);
}
}
static int
vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
uint64_t val;
uint32_t eax, edx;
int error;
val = 0;
error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val);
if (error != 0) {
fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
vme->u.msr.code, *pvcpu);
if (strictmsr) {
vm_inject_gp(ctx, *pvcpu);
return (VMEXIT_CONTINUE);
}
}
eax = val;
error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax);
assert(error == 0);
edx = val >> 32;
error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx);
assert(error == 0);
return (VMEXIT_CONTINUE);
}
static int
vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
int error;
error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval);
if (error != 0) {
fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
vme->u.msr.code, vme->u.msr.wval, *pvcpu);
if (strictmsr) {
vm_inject_gp(ctx, *pvcpu);
return (VMEXIT_CONTINUE);
}
}
return (VMEXIT_CONTINUE);
}
static int
vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
(void)spinup_ap(ctx, *pvcpu,
vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip);
return (VMEXIT_CONTINUE);
}
#define DEBUG_EPT_MISCONFIG
#ifdef DEBUG_EPT_MISCONFIG
#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4];
static int ept_misconfig_ptenum;
#endif
static const char *
vmexit_vmx_desc(uint32_t exit_reason)
{
if (exit_reason >= nitems(vmx_exit_reason_desc) ||
vmx_exit_reason_desc[exit_reason] == NULL)
return ("Unknown");
return (vmx_exit_reason_desc[exit_reason]);
}
static int
vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
fprintf(stderr, "vm exit[%d]\n", *pvcpu);
fprintf(stderr, "\treason\t\tVMX\n");
fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status);
fprintf(stderr, "\texit_reason\t%u (%s)\n", vmexit->u.vmx.exit_reason,
vmexit_vmx_desc(vmexit->u.vmx.exit_reason));
fprintf(stderr, "\tqualification\t0x%016lx\n",
vmexit->u.vmx.exit_qualification);
fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);
fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error);
#ifdef DEBUG_EPT_MISCONFIG
if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) {
vm_get_register(ctx, *pvcpu,
VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS),
&ept_misconfig_gpa);
vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte,
&ept_misconfig_ptenum);
fprintf(stderr, "\tEPT misconfiguration:\n");
fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa);
fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n",
ept_misconfig_ptenum, ept_misconfig_pte[0],
ept_misconfig_pte[1], ept_misconfig_pte[2],
ept_misconfig_pte[3]);
}
#endif /* DEBUG_EPT_MISCONFIG */
return (VMEXIT_ABORT);
}
static int
vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
fprintf(stderr, "vm exit[%d]\n", *pvcpu);
fprintf(stderr, "\treason\t\tSVM\n");
fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode);
fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1);
fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2);
return (VMEXIT_ABORT);
}
static int
vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
assert(vmexit->inst_length == 0);
stats.vmexit_bogus++;
return (VMEXIT_CONTINUE);
}
static int
vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
assert(vmexit->inst_length == 0);
stats.vmexit_reqidle++;
return (VMEXIT_CONTINUE);
}
static int
vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
stats.vmexit_hlt++;
/*
* Just continue execution with the next instruction. We use
* the HLT VM exit as a way to be friendly with the host
* scheduler.
*/
return (VMEXIT_CONTINUE);
}
static int
vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
stats.vmexit_pause++;
return (VMEXIT_CONTINUE);
}
static int
vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
assert(vmexit->inst_length == 0);
stats.vmexit_mtrap++;
#ifdef BHYVE_SNAPSHOT
checkpoint_cpu_suspend(*pvcpu);
#endif
if (gdb_port != 0)
gdb_cpu_mtrap(*pvcpu);
#ifdef BHYVE_SNAPSHOT
checkpoint_cpu_resume(*pvcpu);
#endif
return (VMEXIT_CONTINUE);
}
static int
vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
int err, i, cs_d;
struct vie *vie;
enum vm_cpu_mode mode;
stats.vmexit_inst_emul++;
vie = &vmexit->u.inst_emul.vie;
if (!vie->decoded) {
/*
* Attempt to decode in userspace as a fallback. This allows
* updating instruction decode in bhyve without rebooting the
* kernel (rapid prototyping), albeit with much slower
* emulation.
*/
vie_restart(vie);
mode = vmexit->u.inst_emul.paging.cpu_mode;
cs_d = vmexit->u.inst_emul.cs_d;
if (vmm_decode_instruction(mode, cs_d, vie) != 0)
goto fail;
if (vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RIP,
vmexit->rip + vie->num_processed) != 0)
goto fail;
}
err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa,
vie, &vmexit->u.inst_emul.paging);
if (err) {
if (err == ESRCH) {
EPRINTLN("Unhandled memory access to 0x%lx\n",
vmexit->u.inst_emul.gpa);
}
goto fail;
}
return (VMEXIT_CONTINUE);
fail:
fprintf(stderr, "Failed to emulate instruction sequence [ ");
for (i = 0; i < vie->num_valid; i++)
fprintf(stderr, "%02x", vie->inst[i]);
FPRINTLN(stderr, " ] at 0x%lx", vmexit->rip);
return (VMEXIT_ABORT);
}
static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;
static int
vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
enum vm_suspend_how how;
how = vmexit->u.suspended.how;
fbsdrun_deletecpu(ctx, *pvcpu);
if (*pvcpu != BSP) {
pthread_mutex_lock(&resetcpu_mtx);
pthread_cond_signal(&resetcpu_cond);
pthread_mutex_unlock(&resetcpu_mtx);
pthread_exit(NULL);
}
pthread_mutex_lock(&resetcpu_mtx);
while (!CPU_EMPTY(&cpumask)) {
pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx);
}
pthread_mutex_unlock(&resetcpu_mtx);
switch (how) {
case VM_SUSPEND_RESET:
exit(0);
case VM_SUSPEND_POWEROFF:
if (destroy_on_poweroff)
vm_destroy(ctx);
exit(1);
case VM_SUSPEND_HALT:
exit(2);
case VM_SUSPEND_TRIPLEFAULT:
exit(3);
default:
fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
exit(100);
}
return (0); /* NOTREACHED */
}
static int
vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
#ifdef BHYVE_SNAPSHOT
checkpoint_cpu_suspend(*pvcpu);
#endif
if (gdb_port != 0)
gdb_cpu_suspend(*pvcpu);
#ifdef BHYVE_SNAPSHOT
checkpoint_cpu_resume(*pvcpu);
#endif
return (VMEXIT_CONTINUE);
}
static int
vmexit_breakpoint(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
if (gdb_port == 0) {
fprintf(stderr, "vm_loop: unexpected VMEXIT_DEBUG\n");
exit(4);
}
gdb_cpu_breakpoint(*pvcpu, vmexit);
return (VMEXIT_CONTINUE);
}
static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
[VM_EXITCODE_INOUT] = vmexit_inout,
[VM_EXITCODE_INOUT_STR] = vmexit_inout,
[VM_EXITCODE_VMX] = vmexit_vmx,
[VM_EXITCODE_SVM] = vmexit_svm,
[VM_EXITCODE_BOGUS] = vmexit_bogus,
[VM_EXITCODE_REQIDLE] = vmexit_reqidle,
[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
[VM_EXITCODE_WRMSR] = vmexit_wrmsr,
[VM_EXITCODE_MTRAP] = vmexit_mtrap,
[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
[VM_EXITCODE_SUSPENDED] = vmexit_suspend,
[VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
[VM_EXITCODE_DEBUG] = vmexit_debug,
[VM_EXITCODE_BPT] = vmexit_breakpoint,
};
static void
vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip)
{
int error, rc;
enum vm_exitcode exitcode;
cpuset_t active_cpus;
if (vcpumap[vcpu] != NULL) {
error = pthread_setaffinity_np(pthread_self(),
sizeof(cpuset_t), vcpumap[vcpu]);
assert(error == 0);
}
error = vm_active_cpus(ctx, &active_cpus);
assert(CPU_ISSET(vcpu, &active_cpus));
error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip);
assert(error == 0);
while (1) {
error = vm_run(ctx, vcpu, &vmexit[vcpu]);
if (error != 0)
break;
exitcode = vmexit[vcpu].exitcode;
if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) {
fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n",
exitcode);
exit(4);
}
rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu);
switch (rc) {
case VMEXIT_CONTINUE:
break;
case VMEXIT_ABORT:
abort();
default:
exit(4);
}
}
fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
}
static int
num_vcpus_allowed(struct vmctx *ctx)
{
int tmp, error;
error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp);
/*
* The guest is allowed to spinup more than one processor only if the
* UNRESTRICTED_GUEST capability is available.
*/
if (error == 0)
return (VM_MAXCPU);
else
return (1);
}
void
fbsdrun_set_capabilities(struct vmctx *ctx, int cpu)
{
int err, tmp;
if (fbsdrun_vmexit_on_hlt()) {
err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp);
if (err < 0) {
fprintf(stderr, "VM exit on HLT not supported\n");
exit(4);
}
vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1);
if (cpu == BSP)
handler[VM_EXITCODE_HLT] = vmexit_hlt;
}
if (fbsdrun_vmexit_on_pause()) {
/*
* pause exit support required for this mode
*/
err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp);
if (err < 0) {
fprintf(stderr,
"SMP mux requested, no pause support\n");
exit(4);
}
vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1);
if (cpu == BSP)
handler[VM_EXITCODE_PAUSE] = vmexit_pause;
}
if (x2apic_mode)
err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED);
else
err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED);
if (err) {
fprintf(stderr, "Unable to set x2apic state (%d)\n", err);
exit(4);
}
vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1);
}
static struct vmctx *
do_open(const char *vmname)
{
struct vmctx *ctx;
int error;
bool reinit, romboot;
#ifndef WITHOUT_CAPSICUM
cap_rights_t rights;
const cap_ioctl_t *cmds;
size_t ncmds;
#endif
reinit = romboot = false;
if (lpc_bootrom())
romboot = true;
error = vm_create(vmname);
if (error) {
if (errno == EEXIST) {
if (romboot) {
reinit = true;
} else {
/*
* The virtual machine has been setup by the
* userspace bootloader.
*/
}
} else {
perror("vm_create");
exit(4);
}
} else {
if (!romboot) {
/*
* If the virtual machine was just created then a
* bootrom must be configured to boot it.
*/
fprintf(stderr, "virtual machine cannot be booted\n");
exit(4);
}
}
ctx = vm_open(vmname);
if (ctx == NULL) {
perror("vm_open");
exit(4);
}
#ifndef WITHOUT_CAPSICUM
cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
if (caph_rights_limit(vm_get_device_fd(ctx), &rights) == -1)
errx(EX_OSERR, "Unable to apply rights for sandbox");
vm_get_ioctls(&ncmds);
cmds = vm_get_ioctls(NULL);
if (cmds == NULL)
errx(EX_OSERR, "out of memory");
if (caph_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1)
errx(EX_OSERR, "Unable to apply rights for sandbox");
free((cap_ioctl_t *)cmds);
#endif
if (reinit) {
error = vm_reinit(ctx);
if (error) {
perror("vm_reinit");
exit(4);
}
}
error = vm_set_topology(ctx, sockets, cores, threads, maxcpus);
if (error)
errx(EX_OSERR, "vm_set_topology");
return (ctx);
}
void
spinup_vcpu(struct vmctx *ctx, int vcpu)
{
int error;
uint64_t rip;
error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
assert(error == 0);
fbsdrun_set_capabilities(ctx, vcpu);
error = vm_set_capability(ctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
assert(error == 0);
fbsdrun_addcpu(ctx, BSP, vcpu, rip);
}
int
main(int argc, char *argv[])
{
int c, error, dbg_port, err, bvmcons;
int max_vcpus, mptgen, memflags;
int rtc_localtime;
bool gdb_stop;
struct vmctx *ctx;
uint64_t rip;
size_t memsize;
char *optstr;
#ifdef BHYVE_SNAPSHOT
char *restore_file;
struct restore_state rstate;
int vcpu;
restore_file = NULL;
#endif
bvmcons = 0;
progname = basename(argv[0]);
dbg_port = 0;
gdb_stop = false;
guest_ncpus = 1;
sockets = cores = threads = 1;
maxcpus = 0;
memsize = 256 * MB;
mptgen = 1;
rtc_localtime = 1;
memflags = 0;
#ifdef BHYVE_SNAPSHOT
optstr = "abehuwxACDHIPSWYp:g:G:c:s:m:l:U:r:";
#else
optstr = "abehuwxACDHIPSWYp:g:G:c:s:m:l:U:";
#endif
while ((c = getopt(argc, argv, optstr)) != -1) {
switch (c) {
case 'a':
x2apic_mode = 0;
break;
case 'A':
acpi = 1;
break;
case 'b':
warnx("-b flag is deprecated and will be removed in FreeBSD 13.0");
bvmcons = 1;
break;
case 'D':
destroy_on_poweroff = 1;
break;
case 'p':
if (pincpu_parse(optarg) != 0) {
errx(EX_USAGE, "invalid vcpu pinning "
"configuration '%s'", optarg);
}
break;
case 'c':
if (topology_parse(optarg) != 0) {
errx(EX_USAGE, "invalid cpu topology "
"'%s'", optarg);
}
break;
case 'C':
memflags |= VM_MEM_F_INCORE;
break;
case 'g':
warnx("-g flag is deprecated and will be removed in FreeBSD 13.0");
dbg_port = atoi(optarg);
break;
case 'G':
if (optarg[0] == 'w') {
gdb_stop = true;
optarg++;
}
gdb_port = atoi(optarg);
break;
case 'l':
if (strncmp(optarg, "help", strlen(optarg)) == 0) {
lpc_print_supported_devices();
exit(0);
} else if (lpc_device_parse(optarg) != 0) {
errx(EX_USAGE, "invalid lpc device "
"configuration '%s'", optarg);
}
break;
#ifdef BHYVE_SNAPSHOT
case 'r':
restore_file = optarg;
break;
#endif
case 's':
if (strncmp(optarg, "help", strlen(optarg)) == 0) {
pci_print_supported_devices();
exit(0);
} else if (pci_parse_slot(optarg) != 0)
exit(4);
else
break;
case 'S':
memflags |= VM_MEM_F_WIRED;
break;
case 'm':
error = vm_parse_memsize(optarg, &memsize);
if (error)
errx(EX_USAGE, "invalid memsize '%s'", optarg);
break;
case 'H':
guest_vmexit_on_hlt = 1;
break;
case 'I':
/*
* The "-I" option was used to add an ioapic to the
* virtual machine.
*
* An ioapic is now provided unconditionally for each
* virtual machine and this option is now deprecated.
*/
break;
case 'P':
guest_vmexit_on_pause = 1;
break;
case 'e':
strictio = 1;
break;
case 'u':
rtc_localtime = 0;
break;
case 'U':
guest_uuid_str = optarg;
break;
case 'w':
strictmsr = 0;
break;
case 'W':
virtio_msix = 0;
break;
case 'x':
x2apic_mode = 1;
break;
case 'Y':
mptgen = 0;
break;
case 'h':
usage(0);
default:
usage(1);
}
}
argc -= optind;
argv += optind;
#ifdef BHYVE_SNAPSHOT
if (argc > 1 || (argc == 0 && restore_file == NULL))
usage(1);
if (restore_file != NULL) {
error = load_restore_file(restore_file, &rstate);
if (error) {
fprintf(stderr, "Failed to read checkpoint info from "
"file: '%s'.\n", restore_file);
exit(1);
}
}
if (argc == 1) {
vmname = argv[0];
} else {
vmname = lookup_vmname(&rstate);
if (vmname == NULL) {
fprintf(stderr, "Cannot find VM name in restore file. "
"Please specify one.\n");
exit(1);
}
}
#else
if (argc != 1)
usage(1);
vmname = argv[0];
#endif
ctx = do_open(vmname);
#ifdef BHYVE_SNAPSHOT
if (restore_file != NULL) {
guest_ncpus = lookup_guest_ncpus(&rstate);
memflags = lookup_memflags(&rstate);
memsize = lookup_memsize(&rstate);
}
if (guest_ncpus < 1) {
fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus);
exit(1);
}
#endif
max_vcpus = num_vcpus_allowed(ctx);
if (guest_ncpus > max_vcpus) {
fprintf(stderr, "%d vCPUs requested but only %d available\n",
guest_ncpus, max_vcpus);
exit(4);
}
fbsdrun_set_capabilities(ctx, BSP);
vm_set_memflags(ctx, memflags);
err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
if (err) {
fprintf(stderr, "Unable to setup memory (%d)\n", errno);
exit(4);
}
error = init_msr();
if (error) {
fprintf(stderr, "init_msr error %d", error);
exit(4);
}
init_mem();
init_inout();
kernemu_dev_init();
init_bootrom(ctx);
atkbdc_init(ctx);
pci_irq_init(ctx);
ioapic_init(ctx);
rtc_init(ctx, rtc_localtime);
sci_init(ctx);
/*
* Exit if a device emulation finds an error in its initialization
*/
if (init_pci(ctx) != 0) {
perror("device emulation initialization error");
exit(4);
}
/*
* Initialize after PCI, to allow a bootrom file to reserve the high
* region.
*/
if (acpi)
vmgenc_init(ctx);
if (dbg_port != 0)
init_dbgport(dbg_port);
if (gdb_port != 0)
init_gdb(ctx, gdb_port, gdb_stop);
if (bvmcons)
init_bvmcons();
if (lpc_bootrom()) {
if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) {
fprintf(stderr, "ROM boot failed: unrestricted guest "
"capability not available\n");
exit(4);
}
error = vcpu_reset(ctx, BSP);
assert(error == 0);
}
#ifdef BHYVE_SNAPSHOT
if (restore_file != NULL) {
- fprintf(stdout, "Pausing pci devs...\r\n");
- if (vm_pause_user_devs(ctx) != 0) {
- fprintf(stderr, "Failed to pause PCI device state.\n");
+ fprintf(stdout, "Pausing devices...\r\n");
+ if (vm_pause_devices(ctx) != 0) {
+ fprintf(stderr, "Failed to pause device state.\n");
exit(1);
}
fprintf(stdout, "Restoring vm mem...\r\n");
if (restore_vm_mem(ctx, &rstate) != 0) {
fprintf(stderr, "Failed to restore VM memory.\n");
exit(1);
}
- fprintf(stdout, "Restoring pci devs...\r\n");
- if (vm_restore_user_devs(ctx, &rstate) != 0) {
+ fprintf(stdout, "Restoring devices...\r\n");
+ if (vm_restore_devices(ctx, &rstate) != 0) {
fprintf(stderr, "Failed to restore PCI device state.\n");
exit(1);
}
fprintf(stdout, "Restoring kernel structs...\r\n");
if (vm_restore_kern_structs(ctx, &rstate) != 0) {
fprintf(stderr, "Failed to restore kernel structs.\n");
exit(1);
}
- fprintf(stdout, "Resuming pci devs...\r\n");
- if (vm_resume_user_devs(ctx) != 0) {
+ fprintf(stdout, "Resuming devices...\r\n");
+ if (vm_resume_devices(ctx) != 0) {
fprintf(stderr, "Failed to resume PCI device state.\n");
exit(1);
}
}
#endif
error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
assert(error == 0);
/*
* build the guest tables, MP etc.
*/
if (mptgen) {
error = mptable_build(ctx, guest_ncpus);
if (error) {
perror("error to build the guest tables");
exit(4);
}
}
error = smbios_build(ctx);
assert(error == 0);
if (acpi) {
error = acpi_build(ctx, guest_ncpus);
assert(error == 0);
}
if (lpc_bootrom())
fwctl_init();
/*
* Change the proc title to include the VM name.
*/
setproctitle("%s", vmname);
#ifndef WITHOUT_CAPSICUM
caph_cache_catpages();
if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1)
errx(EX_OSERR, "Unable to apply rights for sandbox");
if (caph_enter() == -1)
errx(EX_OSERR, "cap_enter() failed");
#endif
#ifdef BHYVE_SNAPSHOT
if (restore_file != NULL)
destroy_restore_state(&rstate);
/*
* checkpointing thread for communication with bhyvectl
*/
if (init_checkpoint_thread(ctx) < 0)
printf("Failed to start checkpoint thread!\r\n");
if (restore_file != NULL)
vm_restore_time(ctx);
#endif
/*
* Add CPU 0
*/
fbsdrun_addcpu(ctx, BSP, BSP, rip);
#ifdef BHYVE_SNAPSHOT
/*
* If we restore a VM, start all vCPUs now (including APs); otherwise,
* let the guest OS spin them up later via vmexits.
*/
if (restore_file != NULL) {
for (vcpu = 0; vcpu < guest_ncpus; vcpu++) {
if (vcpu == BSP)
continue;
fprintf(stdout, "spinning up vcpu no %d...\r\n", vcpu);
spinup_vcpu(ctx, vcpu);
}
}
#endif
/*
* Head off to the main event dispatch loop
*/
mevent_dispatch();
exit(4);
}
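(Editorial note: vm_pause_devices(), vm_restore_devices() and
vm_resume_devices() above replace the PCI-only vm_*_user_devs() variants, but
their bodies are outside this excerpt. Reusing the hypothetical registry
sketched earlier, the snapshot-side walker might simply iterate every
registered device and stop on the first error:

int
vm_snapshot_devices(struct vm_snapshot_meta *meta)
{
	struct snapshot_dev *dev;
	int ret;

	SLIST_FOREACH(dev, &snapshot_devs, link) {
		ret = dev->ops->snapshot_cb(meta, dev->cbdata);
		if (ret != 0)
			return (ret);
	}
	return (0);
}

Again, this is an inferred sketch, not code from the patch.)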
diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c
index 64c1fe0b90ca..24cf58f6bf92 100644
--- a/usr.sbin/bhyve/pci_ahci.c
+++ b/usr.sbin/bhyve/pci_ahci.c
@@ -1,1043 +1,1046 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2013 Zhixiang Yu <zcore@freebsd.org>
* Copyright (c) 2015-2016 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <sys/stat.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <sys/disk.h>
#include <sys/ata.h>
#include <sys/endian.h>
#include <machine/vmm_snapshot.h>
+#ifdef BHYVE_SNAPSHOT
+#include "snapshot.h"
+#endif
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <inttypes.h>
#include <md5.h>
#include "bhyverun.h"
#include "pci_emul.h"
#include "ahci.h"
#include "block_if.h"
#define DEF_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */
#define MAX_PORTS 32 /* AHCI supports 32 ports */
#define PxSIG_ATA 0x00000101 /* ATA drive */
#define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */
enum sata_fis_type {
FIS_TYPE_REGH2D = 0x27, /* Register FIS - host to device */
FIS_TYPE_REGD2H = 0x34, /* Register FIS - device to host */
FIS_TYPE_DMAACT = 0x39, /* DMA activate FIS - device to host */
FIS_TYPE_DMASETUP = 0x41, /* DMA setup FIS - bidirectional */
FIS_TYPE_DATA = 0x46, /* Data FIS - bidirectional */
FIS_TYPE_BIST = 0x58, /* BIST activate FIS - bidirectional */
FIS_TYPE_PIOSETUP = 0x5F, /* PIO setup FIS - device to host */
FIS_TYPE_SETDEVBITS = 0xA1, /* Set dev bits FIS - device to host */
};
/*
* SCSI opcodes
*/
#define TEST_UNIT_READY 0x00
#define REQUEST_SENSE 0x03
#define INQUIRY 0x12
#define START_STOP_UNIT 0x1B
#define PREVENT_ALLOW 0x1E
#define READ_CAPACITY 0x25
#define READ_10 0x28
#define POSITION_TO_ELEMENT 0x2B
#define READ_TOC 0x43
#define GET_EVENT_STATUS_NOTIFICATION 0x4A
#define MODE_SENSE_10 0x5A
#define REPORT_LUNS 0xA0
#define READ_12 0xA8
#define READ_CD 0xBE
/*
* SCSI mode page codes
*/
#define MODEPAGE_RW_ERROR_RECOVERY 0x01
#define MODEPAGE_CD_CAPABILITIES 0x2A
/*
* ATA commands
*/
#define ATA_SF_ENAB_SATA_SF 0x10
#define ATA_SATA_SF_AN 0x05
#define ATA_SF_DIS_SATA_SF 0x90
/*
* Debug printf
*/
#ifdef AHCI_DEBUG
static FILE *dbg;
#define DPRINTF(format, arg...) do{fprintf(dbg, format, ##arg);fflush(dbg);}while(0)
#else
#define DPRINTF(format, arg...)
#endif
#define WPRINTF(format, arg...) printf(format, ##arg)
#define AHCI_PORT_IDENT 20 + 1
struct ahci_ioreq {
struct blockif_req io_req;
struct ahci_port *io_pr;
STAILQ_ENTRY(ahci_ioreq) io_flist;
TAILQ_ENTRY(ahci_ioreq) io_blist;
uint8_t *cfis;
uint32_t len;
uint32_t done;
int slot;
int more;
int readop;
};
struct ahci_port {
struct blockif_ctxt *bctx;
struct pci_ahci_softc *pr_sc;
struct ata_params ata_ident;
uint8_t *cmd_lst;
uint8_t *rfis;
int port;
int atapi;
int reset;
int waitforclear;
int mult_sectors;
uint8_t xfermode;
uint8_t err_cfis[20];
uint8_t sense_key;
uint8_t asc;
u_int ccs;
uint32_t pending;
uint32_t clb;
uint32_t clbu;
uint32_t fb;
uint32_t fbu;
uint32_t is;
uint32_t ie;
uint32_t cmd;
uint32_t unused0;
uint32_t tfd;
uint32_t sig;
uint32_t ssts;
uint32_t sctl;
uint32_t serr;
uint32_t sact;
uint32_t ci;
uint32_t sntf;
uint32_t fbs;
/*
* i/o request info
*/
struct ahci_ioreq *ioreq;
int ioqsz;
STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd;
TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd;
};
struct ahci_cmd_hdr {
uint16_t flags;
uint16_t prdtl;
uint32_t prdbc;
uint64_t ctba;
uint32_t reserved[4];
};
struct ahci_prdt_entry {
uint64_t dba;
uint32_t reserved;
#define DBCMASK 0x3fffff
uint32_t dbc;
};
struct pci_ahci_softc {
struct pci_devinst *asc_pi;
pthread_mutex_t mtx;
int ports;
uint32_t cap;
uint32_t ghc;
uint32_t is;
uint32_t pi;
uint32_t vs;
uint32_t ccc_ctl;
uint32_t ccc_pts;
uint32_t em_loc;
uint32_t em_ctl;
uint32_t cap2;
uint32_t bohc;
uint32_t lintr;
struct ahci_port port[MAX_PORTS];
};
#define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx)
static void ahci_handle_port(struct ahci_port *p);
static inline void lba_to_msf(uint8_t *buf, int lba)
{
lba += 150;
buf[0] = (lba / 75) / 60;
buf[1] = (lba / 75) % 60;
buf[2] = lba % 75;
}
/*
* Generate HBA interrupts on global IS register write.
*/
static void
ahci_generate_intr(struct pci_ahci_softc *sc, uint32_t mask)
{
struct pci_devinst *pi = sc->asc_pi;
struct ahci_port *p;
int i, nmsg;
uint32_t mmask;
/* Update global IS from PxIS/PxIE. */
for (i = 0; i < sc->ports; i++) {
p = &sc->port[i];
if (p->is & p->ie)
sc->is |= (1 << i);
}
DPRINTF("%s(%08x) %08x", __func__, mask, sc->is);
/* If there is nothing enabled -- clear legacy interrupt and exit. */
if (sc->is == 0 || (sc->ghc & AHCI_GHC_IE) == 0) {
if (sc->lintr) {
pci_lintr_deassert(pi);
sc->lintr = 0;
}
return;
}
/* If there is anything and no MSI -- assert legacy interrupt. */
nmsg = pci_msi_maxmsgnum(pi);
if (nmsg == 0) {
if (!sc->lintr) {
sc->lintr = 1;
pci_lintr_assert(pi);
}
return;
}
/* Assert respective MSIs for ports that were touched. */
for (i = 0; i < nmsg; i++) {
if (sc->ports <= nmsg || i < nmsg - 1)
mmask = 1 << i;
else
mmask = 0xffffffff << i;
if (sc->is & mask && mmask & mask)
pci_generate_msi(pi, i);
}
}
/*
* Generate HBA interrupt on specific port event.
*/
static void
ahci_port_intr(struct ahci_port *p)
{
struct pci_ahci_softc *sc = p->pr_sc;
struct pci_devinst *pi = sc->asc_pi;
int nmsg;
DPRINTF("%s(%d) %08x/%08x %08x", __func__,
p->port, p->is, p->ie, sc->is);
/* If there is nothing enabled -- we are done. */
if ((p->is & p->ie) == 0)
return;
/* In case of non-shared MSI always generate interrupt. */
nmsg = pci_msi_maxmsgnum(pi);
if (sc->ports <= nmsg || p->port < nmsg - 1) {
sc->is |= (1 << p->port);
if ((sc->ghc & AHCI_GHC_IE) == 0)
return;
pci_generate_msi(pi, p->port);
return;
}
/* If IS for this port is already set -- do nothing. */
if (sc->is & (1 << p->port))
return;
sc->is |= (1 << p->port);
/* If interrupts are enabled -- generate one. */
if ((sc->ghc & AHCI_GHC_IE) == 0)
return;
if (nmsg > 0) {
pci_generate_msi(pi, nmsg - 1);
} else if (!sc->lintr) {
sc->lintr = 1;
pci_lintr_assert(pi);
}
}
static void
ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis)
{
int offset, len, irq;
if (p->rfis == NULL || !(p->cmd & AHCI_P_CMD_FRE))
return;
switch (ft) {
case FIS_TYPE_REGD2H:
offset = 0x40;
len = 20;
irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_DHR : 0;
break;
case FIS_TYPE_SETDEVBITS:
offset = 0x58;
len = 8;
irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_SDB : 0;
break;
case FIS_TYPE_PIOSETUP:
offset = 0x20;
len = 20;
irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_PS : 0;
break;
default:
WPRINTF("unsupported fis type %d", ft);
return;
}
if (fis[2] & ATA_S_ERROR) {
p->waitforclear = 1;
irq |= AHCI_P_IX_TFE;
}
memcpy(p->rfis + offset, fis, len);
if (irq) {
if (~p->is & irq) {
p->is |= irq;
ahci_port_intr(p);
}
}
}
static void
ahci_write_fis_piosetup(struct ahci_port *p)
{
uint8_t fis[20];
memset(fis, 0, sizeof(fis));
fis[0] = FIS_TYPE_PIOSETUP;
ahci_write_fis(p, FIS_TYPE_PIOSETUP, fis);
}
static void
ahci_write_fis_sdb(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd)
{
uint8_t fis[8];
uint8_t error;
error = (tfd >> 8) & 0xff;
tfd &= 0x77;
memset(fis, 0, sizeof(fis));
fis[0] = FIS_TYPE_SETDEVBITS;
fis[1] = (1 << 6);
fis[2] = tfd;
fis[3] = error;
if (fis[2] & ATA_S_ERROR) {
p->err_cfis[0] = slot;
p->err_cfis[2] = tfd;
p->err_cfis[3] = error;
memcpy(&p->err_cfis[4], cfis + 4, 16);
} else {
*(uint32_t *)(fis + 4) = (1 << slot);
p->sact &= ~(1 << slot);
}
p->tfd &= ~0x77;
p->tfd |= tfd;
ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis);
}
static void
ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd)
{
uint8_t fis[20];
uint8_t error;
error = (tfd >> 8) & 0xff;
memset(fis, 0, sizeof(fis));
fis[0] = FIS_TYPE_REGD2H;
fis[1] = (1 << 6);
fis[2] = tfd & 0xff;
fis[3] = error;
fis[4] = cfis[4];
fis[5] = cfis[5];
fis[6] = cfis[6];
fis[7] = cfis[7];
fis[8] = cfis[8];
fis[9] = cfis[9];
fis[10] = cfis[10];
fis[11] = cfis[11];
fis[12] = cfis[12];
fis[13] = cfis[13];
if (fis[2] & ATA_S_ERROR) {
p->err_cfis[0] = 0x80;
p->err_cfis[2] = tfd & 0xff;
p->err_cfis[3] = error;
memcpy(&p->err_cfis[4], cfis + 4, 16);
} else
p->ci &= ~(1 << slot);
p->tfd = tfd;
ahci_write_fis(p, FIS_TYPE_REGD2H, fis);
}
static void
ahci_write_fis_d2h_ncq(struct ahci_port *p, int slot)
{
uint8_t fis[20];
p->tfd = ATA_S_READY | ATA_S_DSC;
memset(fis, 0, sizeof(fis));
fis[0] = FIS_TYPE_REGD2H;
fis[1] = 0; /* No interrupt */
fis[2] = p->tfd; /* Status */
fis[3] = 0; /* No error */
p->ci &= ~(1 << slot);
ahci_write_fis(p, FIS_TYPE_REGD2H, fis);
}
static void
ahci_write_reset_fis_d2h(struct ahci_port *p)
{
uint8_t fis[20];
memset(fis, 0, sizeof(fis));
fis[0] = FIS_TYPE_REGD2H;
fis[3] = 1;
fis[4] = 1;
if (p->atapi) {
fis[5] = 0x14;
fis[6] = 0xeb;
}
fis[12] = 1;
ahci_write_fis(p, FIS_TYPE_REGD2H, fis);
}
static void
ahci_check_stopped(struct ahci_port *p)
{
/*
* If we are no longer processing the command list and nothing
* is in-flight, clear the running bit, the current command
* slot, the command issue and active bits.
*/
if (!(p->cmd & AHCI_P_CMD_ST)) {
if (p->pending == 0) {
p->ccs = 0;
p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK);
p->ci = 0;
p->sact = 0;
p->waitforclear = 0;
}
}
}
static void
ahci_port_stop(struct ahci_port *p)
{
struct ahci_ioreq *aior;
uint8_t *cfis;
int slot;
int error;
assert(pthread_mutex_isowned_np(&p->pr_sc->mtx));
TAILQ_FOREACH(aior, &p->iobhd, io_blist) {
/*
* Try to cancel the outstanding blockif request.
*/
error = blockif_cancel(p->bctx, &aior->io_req);
if (error != 0)
continue;
slot = aior->slot;
cfis = aior->cfis;
if (cfis[2] == ATA_WRITE_FPDMA_QUEUED ||
cfis[2] == ATA_READ_FPDMA_QUEUED ||
cfis[2] == ATA_SEND_FPDMA_QUEUED)
p->sact &= ~(1 << slot); /* NCQ */
else
p->ci &= ~(1 << slot);
/*
* This command is now done.
*/
p->pending &= ~(1 << slot);
/*
* Delete the blockif request from the busy list
*/
TAILQ_REMOVE(&p->iobhd, aior, io_blist);
/*
* Move the blockif request back to the free list
*/
STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist);
}
ahci_check_stopped(p);
}
static void
ahci_port_reset(struct ahci_port *pr)
{
pr->serr = 0;
pr->sact = 0;
pr->xfermode = ATA_UDMA6;
pr->mult_sectors = 128;
if (!pr->bctx) {
pr->ssts = ATA_SS_DET_NO_DEVICE;
pr->sig = 0xFFFFFFFF;
pr->tfd = 0x7F;
return;
}
pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_IPM_ACTIVE;
if (pr->sctl & ATA_SC_SPD_MASK)
pr->ssts |= (pr->sctl & ATA_SC_SPD_MASK);
else
pr->ssts |= ATA_SS_SPD_GEN3;
pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA;
if (!pr->atapi) {
pr->sig = PxSIG_ATA;
pr->tfd |= ATA_S_READY;
} else
pr->sig = PxSIG_ATAPI;
ahci_write_reset_fis_d2h(pr);
}
static void
ahci_reset(struct pci_ahci_softc *sc)
{
int i;
sc->ghc = AHCI_GHC_AE;
sc->is = 0;
if (sc->lintr) {
pci_lintr_deassert(sc->asc_pi);
sc->lintr = 0;
}
for (i = 0; i < sc->ports; i++) {
sc->port[i].ie = 0;
sc->port[i].is = 0;
sc->port[i].cmd = (AHCI_P_CMD_SUD | AHCI_P_CMD_POD);
if (sc->port[i].bctx)
sc->port[i].cmd |= AHCI_P_CMD_CPS;
sc->port[i].sctl = 0;
ahci_port_reset(&sc->port[i]);
}
}
static void
ata_string(uint8_t *dest, const char *src, int len)
{
int i;
for (i = 0; i < len; i++) {
if (*src)
dest[i ^ 1] = *src++;
else
dest[i ^ 1] = ' ';
}
}
static void
atapi_string(uint8_t *dest, const char *src, int len)
{
int i;
for (i = 0; i < len; i++) {
if (*src)
dest[i] = *src++;
else
dest[i] = ' ';
}
}
/*
* Build up the iovec based on the PRDT, 'done' and 'len'.
*/
static void
ahci_build_iov(struct ahci_port *p, struct ahci_ioreq *aior,
struct ahci_prdt_entry *prdt, uint16_t prdtl)
{
struct blockif_req *breq = &aior->io_req;
int i, j, skip, todo, left, extra;
uint32_t dbcsz;
/* Copy part of PRDT between 'done' and 'len' bytes into the iov. */
skip = aior->done;
left = aior->len - aior->done;
todo = 0;
for (i = 0, j = 0; i < prdtl && j < BLOCKIF_IOV_MAX && left > 0;
i++, prdt++) {
dbcsz = (prdt->dbc & DBCMASK) + 1;
/* Skip already done part of the PRDT */
if (dbcsz <= skip) {
skip -= dbcsz;
continue;
}
dbcsz -= skip;
if (dbcsz > left)
dbcsz = left;
breq->br_iov[j].iov_base = paddr_guest2host(ahci_ctx(p->pr_sc),
prdt->dba + skip, dbcsz);
breq->br_iov[j].iov_len = dbcsz;
todo += dbcsz;
left -= dbcsz;
skip = 0;
j++;
}
/* If we got limited by IOV length, round I/O down to sector size. */
if (j == BLOCKIF_IOV_MAX) {
extra = todo % blockif_sectsz(p->bctx);
todo -= extra;
assert(todo > 0);
while (extra > 0) {
if (breq->br_iov[j - 1].iov_len > extra) {
breq->br_iov[j - 1].iov_len -= extra;
break;
}
extra -= breq->br_iov[j - 1].iov_len;
j--;
}
}
breq->br_iovcnt = j;
breq->br_resid = todo;
aior->done += todo;
aior->more = (aior->done < aior->len && i < prdtl);
}
static void
ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done)
{
struct ahci_ioreq *aior;
struct blockif_req *breq;
struct ahci_prdt_entry *prdt;
struct ahci_cmd_hdr *hdr;
uint64_t lba;
uint32_t len;
int err, first, ncq, readop;
prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
ncq = 0;
readop = 1;
first = (done == 0);
if (cfis[2] == ATA_WRITE || cfis[2] == ATA_WRITE48 ||
cfis[2] == ATA_WRITE_MUL || cfis[2] == ATA_WRITE_MUL48 ||
cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 ||
cfis[2] == ATA_WRITE_FPDMA_QUEUED)
readop = 0;
if (cfis[2] == ATA_WRITE_FPDMA_QUEUED ||
cfis[2] == ATA_READ_FPDMA_QUEUED) {
lba = ((uint64_t)cfis[10] << 40) |
((uint64_t)cfis[9] << 32) |
((uint64_t)cfis[8] << 24) |
((uint64_t)cfis[6] << 16) |
((uint64_t)cfis[5] << 8) |
cfis[4];
len = cfis[11] << 8 | cfis[3];
if (!len)
len = 65536;
ncq = 1;
} else if (cfis[2] == ATA_READ48 || cfis[2] == ATA_WRITE48 ||
cfis[2] == ATA_READ_MUL48 || cfis[2] == ATA_WRITE_MUL48 ||
cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) {
lba = ((uint64_t)cfis[10] << 40) |
((uint64_t)cfis[9] << 32) |
((uint64_t)cfis[8] << 24) |
((uint64_t)cfis[6] << 16) |
((uint64_t)cfis[5] << 8) |
cfis[4];
len = cfis[13] << 8 | cfis[12];
if (!len)
len = 65536;
} else {
lba = ((cfis[7] & 0xf) << 24) | (cfis[6] << 16) |
(cfis[5] << 8) | cfis[4];
len = cfis[12];
if (!len)
len = 256;
}
lba *= blockif_sectsz(p->bctx);
len *= blockif_sectsz(p->bctx);
/* Pull request off free list */
aior = STAILQ_FIRST(&p->iofhd);
assert(aior != NULL);
STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
aior->cfis = cfis;
aior->slot = slot;
aior->len = len;
aior->done = done;
aior->readop = readop;
breq = &aior->io_req;
breq->br_offset = lba + done;
ahci_build_iov(p, aior, prdt, hdr->prdtl);
/* Mark this command in-flight. */
p->pending |= 1 << slot;
/* Stuff request onto busy list. */
TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
if (ncq && first)
ahci_write_fis_d2h_ncq(p, slot);
if (readop)
err = blockif_read(p->bctx, breq);
else
err = blockif_write(p->bctx, breq);
assert(err == 0);
}
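/*
 * Example LBA decode (illustrative values): for a 48-bit command with
 * cfis[5] = 0x10 and the other LBA bytes zero, lba = 0x1000 sectors,
 * i.e. byte offset 0x200000 with 512-byte sectors.  A zero sector count
 * in cfis[13]/cfis[12] means the architectural maximum of 65536.
 */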
static void
ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis)
{
struct ahci_ioreq *aior;
struct blockif_req *breq;
int err;
/*
* Pull request off free list
*/
aior = STAILQ_FIRST(&p->iofhd);
assert(aior != NULL);
STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
aior->cfis = cfis;
aior->slot = slot;
aior->len = 0;
aior->done = 0;
aior->more = 0;
breq = &aior->io_req;
/*
* Mark this command in-flight.
*/
p->pending |= 1 << slot;
/*
* Stuff request onto busy list
*/
TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
err = blockif_flush(p->bctx, breq);
assert(err == 0);
}
static inline void
read_prdt(struct ahci_port *p, int slot, uint8_t *cfis,
void *buf, int size)
{
struct ahci_cmd_hdr *hdr;
struct ahci_prdt_entry *prdt;
void *to;
int i, len;
hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
len = size;
to = buf;
prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
for (i = 0; i < hdr->prdtl && len; i++) {
uint8_t *ptr;
uint32_t dbcsz;
int sublen;
dbcsz = (prdt->dbc & DBCMASK) + 1;
ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz);
sublen = MIN(len, dbcsz);
memcpy(to, ptr, sublen);
len -= sublen;
to += sublen;
prdt++;
}
}
static void
ahci_handle_dsm_trim(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done)
{
struct ahci_ioreq *aior;
struct blockif_req *breq;
uint8_t *entry;
uint64_t elba;
uint32_t len, elen;
int err, first, ncq;
uint8_t buf[512];
first = (done == 0);
if (cfis[2] == ATA_DATA_SET_MANAGEMENT) {
len = (uint16_t)cfis[13] << 8 | cfis[12];
len *= 512;
ncq = 0;
} else { /* ATA_SEND_FPDMA_QUEUED */
len = (uint16_t)cfis[11] << 8 | cfis[3];
len *= 512;
ncq = 1;
}
read_prdt(p, slot, cfis, buf, sizeof(buf));
next:
entry = &buf[done];
elba = ((uint64_t)entry[5] << 40) |
((uint64_t)entry[4] << 32) |
((uint64_t)entry[3] << 24) |
((uint64_t)entry[2] << 16) |
((uint64_t)entry[1] << 8) |
entry[0];
elen = (uint16_t)entry[7] << 8 | entry[6];
done += 8;
if (elen == 0) {
if (done >= len) {
if (ncq) {
if (first)
ahci_write_fis_d2h_ncq(p, slot);
ahci_write_fis_sdb(p, slot, cfis,
ATA_S_READY | ATA_S_DSC);
} else {
ahci_write_fis_d2h(p, slot, cfis,
ATA_S_READY | ATA_S_DSC);
}
p->pending &= ~(1 << slot);
ahci_check_stopped(p);
if (!first)
ahci_handle_port(p);
return;
}
goto next;
}
/*
* Pull request off free list
*/
aior = STAILQ_FIRST(&p->iofhd);
assert(aior != NULL);
STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
aior->cfis = cfis;
aior->slot = slot;
aior->len = len;
aior->done = done;
aior->more = (len != done);
breq = &aior->io_req;
breq->br_offset = elba * blockif_sectsz(p->bctx);
breq->br_resid = elen * blockif_sectsz(p->bctx);
/*
* Mark this command in-flight.
*/
p->pending |= 1 << slot;
/*
* Stuff request onto busy list
*/
TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
if (ncq && first)
ahci_write_fis_d2h_ncq(p, slot);
err = blockif_delete(p->bctx, breq);
assert(err == 0);
}
static inline void
write_prdt(struct ahci_port *p, int slot, uint8_t *cfis,
void *buf, int size)
{
struct ahci_cmd_hdr *hdr;
struct ahci_prdt_entry *prdt;
void *from;
int i, len;
hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
len = size;
from = buf;
prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
for (i = 0; i < hdr->prdtl && len; i++) {
uint8_t *ptr;
uint32_t dbcsz;
int sublen;
dbcsz = (prdt->dbc & DBCMASK) + 1;
ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz);
sublen = MIN(len, dbcsz);
memcpy(ptr, from, sublen);
len -= sublen;
from += sublen;
prdt++;
}
hdr->prdbc = size - len;
}
static void
ahci_checksum(uint8_t *buf, int size)
{
int i;
uint8_t sum = 0;
for (i = 0; i < size - 1; i++)
sum += buf[i];
buf[size - 1] = 0x100 - sum;
}
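/*
 * The final byte makes the whole buffer sum to zero modulo 256.  For
 * example, if the first size - 1 bytes sum to 0x34 (mod 256), the last
 * byte is set to 0xcc, and 0x34 + 0xcc == 0x100 == 0 (mod 256).
 */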
static void
ahci_handle_read_log(struct ahci_port *p, int slot, uint8_t *cfis)
{
struct ahci_cmd_hdr *hdr;
uint32_t buf[128];
uint8_t *buf8 = (uint8_t *)buf;
uint16_t *buf16 = (uint16_t *)buf;
hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
if (p->atapi || hdr->prdtl == 0 || cfis[5] != 0 ||
cfis[9] != 0 || cfis[12] != 1 || cfis[13] != 0) {
ahci_write_fis_d2h(p, slot, cfis,
(ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
return;
}
memset(buf, 0, sizeof(buf));
if (cfis[4] == 0x00) { /* Log directory */
buf16[0x00] = 1; /* Version -- 1 */
buf16[0x10] = 1; /* NCQ Command Error Log -- 1 page */
buf16[0x13] = 1; /* SATA NCQ Send and Receive Log -- 1 page */
} else if (cfis[4] == 0x10) { /* NCQ Command Error Log */
memcpy(buf8, p->err_cfis, sizeof(p->err_cfis));
ahci_checksum(buf8, sizeof(buf));
} else if (cfis[4] == 0x13) { /* SATA NCQ Send and Receive Log */
if (blockif_candelete(p->bctx) && !blockif_is_ro(p->bctx)) {
buf[0x00] = 1; /* SFQ DSM supported */
buf[0x01] = 1; /* SFQ DSM TRIM supported */
}
} else {
ahci_write_fis_d2h(p, slot, cfis,
(ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
return;
}
if (cfis[2] == ATA_READ_LOG_EXT)
ahci_write_fis_piosetup(p);
write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));
ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY);
}
static void
handle_identify(struct ahci_port *p, int slot, uint8_t *cfis)
{
struct ahci_cmd_hdr *hdr;
hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
if (p->atapi || hdr->prdtl == 0) {
ahci_write_fis_d2h(p, slot, cfis,
(ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
} else {
ahci_write_fis_piosetup(p);
write_prdt(p, slot, cfis, (void*)&p->ata_ident, sizeof(struct ata_params));
ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY);
}
}
static void
ata_identify_init(struct ahci_port* p, int atapi)
{
struct ata_params* ata_ident = &p->ata_ident;
if (atapi) {
ata_ident->config = ATA_PROTO_ATAPI | ATA_ATAPI_TYPE_CDROM |
ATA_ATAPI_REMOVABLE | ATA_DRQ_FAST;
ata_ident->capabilities1 = ATA_SUPPORT_LBA |
ATA_SUPPORT_DMA;
ata_ident->capabilities2 = (1 << 14 | 1);
ata_ident->atavalid = ATA_FLAG_64_70 | ATA_FLAG_88;
ata_ident->obsolete62 = 0x3f;
ata_ident->mwdmamodes = 7;
if (p->xfermode & ATA_WDMA0)
ata_ident->mwdmamodes |= (1 << ((p->xfermode & 7) + 8));
ata_ident->apiomodes = 3;
ata_ident->mwdmamin = 0x0078;
ata_ident->mwdmarec = 0x0078;
ata_ident->pioblind = 0x0078;
ata_ident->pioiordy = 0x0078;
ata_ident->satacapabilities = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3);
ata_ident->satacapabilities2 = ((p->ssts & ATA_SS_SPD_MASK) >> 3);
ata_ident->satasupport = ATA_SUPPORT_NCQ_STREAM;
ata_ident->version_major = 0x3f0;
ata_ident->support.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET |
ATA_SUPPORT_RESET | ATA_SUPPORT_NOP);
ata_ident->support.command2 = (1 << 14);
ata_ident->support.extension = (1 << 14);
ata_ident->enabled.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET |
ATA_SUPPORT_RESET | ATA_SUPPORT_NOP);
ata_ident->enabled.extension = (1 << 14);
ata_ident->udmamodes = 0x7f;
if (p->xfermode & ATA_UDMA0)
ata_ident->udmamodes |= (1 << ((p->xfermode & 7) + 8));
ata_ident->transport_major = 0x1020;
ata_ident->integrity = 0x00a5;
} else {
uint64_t sectors;
int sectsz, psectsz, psectoff, candelete, ro;
uint16_t cyl;
uint8_t sech, heads;
ro = blockif_is_ro(p->bctx);
candelete = blockif_candelete(p->bctx);
sectsz = blockif_sectsz(p->bctx);
sectors = blockif_size(p->bctx) / sectsz;
blockif_chs(p->bctx, &cyl, &heads, &sech);
diff --git a/usr.sbin/bhyve/pci_e82545.c b/usr.sbin/bhyve/pci_e82545.c
index 2d09c024f258..8b2c5117bab9 100644
--- a/usr.sbin/bhyve/pci_e82545.c
+++ b/usr.sbin/bhyve/pci_e82545.c
@@ -1,1049 +1,1053 @@
/*
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
* Copyright (c) 2015 Peter Grehan <grehan@freebsd.org>
* Copyright (c) 2013 Jeremiah Lott, Avere Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/limits.h>
#include <sys/ioctl.h>
#include <sys/uio.h>
#include <net/ethernet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <machine/vmm_snapshot.h>
+#ifdef BHYVE_SNAPSHOT
+#include "snapshot.h"
+#endif
+
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <md5.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>
#include <unistd.h>
#include <pthread.h>
#include <pthread_np.h>
#include "e1000_regs.h"
#include "e1000_defines.h"
#include "mii.h"
#include "bhyverun.h"
#include "debug.h"
#include "pci_emul.h"
#include "mevent.h"
#include "net_utils.h"
#include "net_backends.h"
/* Hardware/register definitions XXX: move some to common code. */
#define E82545_VENDOR_ID_INTEL 0x8086
#define E82545_DEV_ID_82545EM_COPPER 0x100F
#define E82545_SUBDEV_ID 0x1008
#define E82545_REVISION_4 4
#define E82545_MDIC_DATA_MASK 0x0000FFFF
#define E82545_MDIC_OP_MASK 0x0c000000
#define E82545_MDIC_IE 0x20000000
#define E82545_EECD_FWE_DIS 0x00000010 /* Flash writes disabled */
#define E82545_EECD_FWE_EN 0x00000020 /* Flash writes enabled */
#define E82545_EECD_FWE_MASK 0x00000030 /* Flash writes mask */
#define E82545_BAR_REGISTER 0
#define E82545_BAR_REGISTER_LEN (128*1024)
#define E82545_BAR_FLASH 1
#define E82545_BAR_FLASH_LEN (64*1024)
#define E82545_BAR_IO 2
#define E82545_BAR_IO_LEN 8
#define E82545_IOADDR 0x00000000
#define E82545_IODATA 0x00000004
#define E82545_IO_REGISTER_MAX 0x0001FFFF
#define E82545_IO_FLASH_BASE 0x00080000
#define E82545_IO_FLASH_MAX 0x000FFFFF
#define E82545_ARRAY_ENTRY(reg, offset) (reg + (offset<<2))
#define E82545_RAR_MAX 15
#define E82545_MTA_MAX 127
#define E82545_VFTA_MAX 127
/* Slightly modified from the driver versions, hardcoded for 3 opcode bits,
* followed by 6 address bits.
* TODO: make opcode bits and addr bits configurable?
* NVM Commands - Microwire */
#define E82545_NVM_OPCODE_BITS 3
#define E82545_NVM_ADDR_BITS 6
#define E82545_NVM_DATA_BITS 16
#define E82545_NVM_OPADDR_BITS (E82545_NVM_OPCODE_BITS + E82545_NVM_ADDR_BITS)
#define E82545_NVM_ADDR_MASK ((1 << E82545_NVM_ADDR_BITS)-1)
#define E82545_NVM_OPCODE_MASK \
(((1 << E82545_NVM_OPCODE_BITS) - 1) << E82545_NVM_ADDR_BITS)
#define E82545_NVM_OPCODE_READ (0x6 << E82545_NVM_ADDR_BITS) /* read */
#define E82545_NVM_OPCODE_WRITE (0x5 << E82545_NVM_ADDR_BITS) /* write */
#define E82545_NVM_OPCODE_ERASE (0x7 << E82545_NVM_ADDR_BITS) /* erase */
#define E82545_NVM_OPCODE_EWEN (0x4 << E82545_NVM_ADDR_BITS) /* wr-enable */
#define E82545_NVM_EEPROM_SIZE 64 /* 64 * 16-bit values == 128 bytes */
#define E1000_ICR_SRPD 0x00010000
/* This is an arbitrary number. There is no hard limit on the chip. */
#define I82545_MAX_TXSEGS 64
/* Legacy receive descriptor */
struct e1000_rx_desc {
uint64_t buffer_addr; /* Address of the descriptor's data buffer */
uint16_t length; /* Length of data DMAed into data buffer */
uint16_t csum; /* Packet checksum */
uint8_t status; /* Descriptor status */
uint8_t errors; /* Descriptor Errors */
uint16_t special;
};
/* Transmit descriptor types */
#define E1000_TXD_MASK (E1000_TXD_CMD_DEXT | 0x00F00000)
#define E1000_TXD_TYP_L (0)
#define E1000_TXD_TYP_C (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_C)
#define E1000_TXD_TYP_D (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D)
/* Legacy transmit descriptor */
struct e1000_tx_desc {
uint64_t buffer_addr; /* Address of the descriptor's data buffer */
union {
uint32_t data;
struct {
uint16_t length; /* Data buffer length */
uint8_t cso; /* Checksum offset */
uint8_t cmd; /* Descriptor control */
} flags;
} lower;
union {
uint32_t data;
struct {
uint8_t status; /* Descriptor status */
uint8_t css; /* Checksum start */
uint16_t special;
} fields;
} upper;
};
/* Context descriptor */
struct e1000_context_desc {
union {
uint32_t ip_config;
struct {
uint8_t ipcss; /* IP checksum start */
uint8_t ipcso; /* IP checksum offset */
uint16_t ipcse; /* IP checksum end */
} ip_fields;
} lower_setup;
union {
uint32_t tcp_config;
struct {
uint8_t tucss; /* TCP checksum start */
uint8_t tucso; /* TCP checksum offset */
uint16_t tucse; /* TCP checksum end */
} tcp_fields;
} upper_setup;
uint32_t cmd_and_length;
union {
uint32_t data;
struct {
uint8_t status; /* Descriptor status */
uint8_t hdr_len; /* Header length */
uint16_t mss; /* Maximum segment size */
} fields;
} tcp_seg_setup;
};
/* Data descriptor */
struct e1000_data_desc {
uint64_t buffer_addr; /* Address of the descriptor's buffer address */
union {
uint32_t data;
struct {
uint16_t length; /* Data buffer length */
uint8_t typ_len_ext;
uint8_t cmd;
} flags;
} lower;
union {
uint32_t data;
struct {
uint8_t status; /* Descriptor status */
uint8_t popts; /* Packet Options */
uint16_t special;
} fields;
} upper;
};
union e1000_tx_udesc {
struct e1000_tx_desc td;
struct e1000_context_desc cd;
struct e1000_data_desc dd;
};
/* Tx checksum info for a packet. */
struct ck_info {
int ck_valid; /* ck_info is valid */
uint8_t ck_start; /* start byte of cksum calculation */
uint8_t ck_off; /* offset of cksum insertion */
uint16_t ck_len; /* length of cksum calc: 0 is to packet-end */
};
/*
* Debug printf
*/
static int e82545_debug = 0;
#define WPRINTF(msg,params...) PRINTLN("e82545: " msg, params)
#define DPRINTF(msg,params...) if (e82545_debug) WPRINTF(msg, params)
#define MIN(a,b) (((a)<(b))?(a):(b))
#define MAX(a,b) (((a)>(b))?(a):(b))
/* s/w representation of the RAL/RAH regs */
struct eth_uni {
int eu_valid;
int eu_addrsel;
struct ether_addr eu_eth;
};
struct e82545_softc {
struct pci_devinst *esc_pi;
struct vmctx *esc_ctx;
struct mevent *esc_mevpitr;
pthread_mutex_t esc_mtx;
struct ether_addr esc_mac;
net_backend_t *esc_be;
/* General */
uint32_t esc_CTRL; /* x0000 device ctl */
uint32_t esc_FCAL; /* x0028 flow ctl addr lo */
uint32_t esc_FCAH; /* x002C flow ctl addr hi */
uint32_t esc_FCT; /* x0030 flow ctl type */
uint32_t esc_VET; /* x0038 VLAN eth type */
uint32_t esc_FCTTV; /* x0170 flow ctl tx timer */
uint32_t esc_LEDCTL; /* x0E00 LED control */
uint32_t esc_PBA; /* x1000 pkt buffer allocation */
/* Interrupt control */
int esc_irq_asserted;
uint32_t esc_ICR; /* x00C0 cause read/clear */
uint32_t esc_ITR; /* x00C4 intr throttling */
uint32_t esc_ICS; /* x00C8 cause set */
uint32_t esc_IMS; /* x00D0 mask set/read */
uint32_t esc_IMC; /* x00D8 mask clear */
/* Transmit */
union e1000_tx_udesc *esc_txdesc;
struct e1000_context_desc esc_txctx;
pthread_t esc_tx_tid;
pthread_cond_t esc_tx_cond;
int esc_tx_enabled;
int esc_tx_active;
uint32_t esc_TXCW; /* x0178 transmit config */
uint32_t esc_TCTL; /* x0400 transmit ctl */
uint32_t esc_TIPG; /* x0410 inter-packet gap */
uint16_t esc_AIT; /* x0458 Adaptive Interframe Throttle */
uint64_t esc_tdba; /* verified 64-bit desc table addr */
uint32_t esc_TDBAL; /* x3800 desc table addr, low bits */
uint32_t esc_TDBAH; /* x3804 desc table addr, hi 32-bits */
uint32_t esc_TDLEN; /* x3808 # descriptors in bytes */
uint16_t esc_TDH; /* x3810 desc table head idx */
uint16_t esc_TDHr; /* internal read version of TDH */
uint16_t esc_TDT; /* x3818 desc table tail idx */
uint32_t esc_TIDV; /* x3820 intr delay */
uint32_t esc_TXDCTL; /* x3828 desc control */
uint32_t esc_TADV; /* x382C intr absolute delay */
/* L2 frame acceptance */
struct eth_uni esc_uni[16]; /* 16 x unicast MAC addresses */
uint32_t esc_fmcast[128]; /* Multicast filter bit-match */
uint32_t esc_fvlan[128]; /* VLAN 4096-bit filter */
/* Receive */
struct e1000_rx_desc *esc_rxdesc;
pthread_cond_t esc_rx_cond;
int esc_rx_enabled;
int esc_rx_active;
int esc_rx_loopback;
uint32_t esc_RCTL; /* x0100 receive ctl */
uint32_t esc_FCRTL; /* x2160 flow cntl thresh, low */
uint32_t esc_FCRTH; /* x2168 flow cntl thresh, hi */
uint64_t esc_rdba; /* verified 64-bit desc table addr */
uint32_t esc_RDBAL; /* x2800 desc table addr, low bits */
uint32_t esc_RDBAH; /* x2804 desc table addr, hi 32-bits*/
uint32_t esc_RDLEN; /* x2808 #descriptors */
uint16_t esc_RDH; /* x2810 desc table head idx */
uint16_t esc_RDT; /* x2818 desc table tail idx */
uint32_t esc_RDTR; /* x2820 intr delay */
uint32_t esc_RXDCTL; /* x2828 desc control */
uint32_t esc_RADV; /* x282C intr absolute delay */
uint32_t esc_RSRPD; /* x2C00 recv small packet detect */
uint32_t esc_RXCSUM; /* x5000 receive cksum ctl */
/* IO Port register access */
uint32_t io_addr;
/* Shadow copy of MDIC */
uint32_t mdi_control;
/* Shadow copy of EECD */
uint32_t eeprom_control;
/* Latest NVM in/out */
uint16_t nvm_data;
uint16_t nvm_opaddr;
/* stats */
uint32_t missed_pkt_count; /* dropped for no room in rx queue */
uint32_t pkt_rx_by_size[6];
uint32_t pkt_tx_by_size[6];
uint32_t good_pkt_rx_count;
uint32_t bcast_pkt_rx_count;
uint32_t mcast_pkt_rx_count;
uint32_t good_pkt_tx_count;
uint32_t bcast_pkt_tx_count;
uint32_t mcast_pkt_tx_count;
uint32_t oversize_rx_count;
uint32_t tso_tx_count;
uint64_t good_octets_rx;
uint64_t good_octets_tx;
uint64_t missed_octets; /* counts missed and oversized */
uint8_t nvm_bits:6; /* number of bits remaining in/out */
uint8_t nvm_mode:2;
#define E82545_NVM_MODE_OPADDR 0x0
#define E82545_NVM_MODE_DATAIN 0x1
#define E82545_NVM_MODE_DATAOUT 0x2
/* EEPROM data */
uint16_t eeprom_data[E82545_NVM_EEPROM_SIZE];
};
static void e82545_reset(struct e82545_softc *sc, int dev);
static void e82545_rx_enable(struct e82545_softc *sc);
static void e82545_rx_disable(struct e82545_softc *sc);
static void e82545_rx_callback(int fd, enum ev_type type, void *param);
static void e82545_tx_start(struct e82545_softc *sc);
static void e82545_tx_enable(struct e82545_softc *sc);
static void e82545_tx_disable(struct e82545_softc *sc);
static inline int
e82545_size_stat_index(uint32_t size)
{
if (size <= 64) {
return 0;
} else if (size >= 1024) {
return 5;
} else {
/* should be 1-4 */
return (fls(size) - 6);
}
}
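/*
 * The buckets mirror the hardware's packet-size counters (64, 65-127,
 * 128-255, 256-511, 512-1023, 1024+ bytes).  E.g. size = 200 has its
 * highest set bit at position 8, so fls(200) - 6 = 2, the 128-255
 * bucket.
 */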
static void
e82545_init_eeprom(struct e82545_softc *sc)
{
uint16_t checksum, i;
/* mac addr */
sc->eeprom_data[NVM_MAC_ADDR] = ((uint16_t)sc->esc_mac.octet[0]) |
(((uint16_t)sc->esc_mac.octet[1]) << 8);
sc->eeprom_data[NVM_MAC_ADDR+1] = ((uint16_t)sc->esc_mac.octet[2]) |
(((uint16_t)sc->esc_mac.octet[3]) << 8);
sc->eeprom_data[NVM_MAC_ADDR+2] = ((uint16_t)sc->esc_mac.octet[4]) |
(((uint16_t)sc->esc_mac.octet[5]) << 8);
/* pci ids */
sc->eeprom_data[NVM_SUB_DEV_ID] = E82545_SUBDEV_ID;
sc->eeprom_data[NVM_SUB_VEN_ID] = E82545_VENDOR_ID_INTEL;
sc->eeprom_data[NVM_DEV_ID] = E82545_DEV_ID_82545EM_COPPER;
sc->eeprom_data[NVM_VEN_ID] = E82545_VENDOR_ID_INTEL;
/* fill in the checksum */
checksum = 0;
for (i = 0; i < NVM_CHECKSUM_REG; i++) {
checksum += sc->eeprom_data[i];
}
checksum = NVM_SUM - checksum;
sc->eeprom_data[NVM_CHECKSUM_REG] = checksum;
DPRINTF("eeprom checksum: 0x%x", checksum);
}
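/*
 * Checksum example: the e1000 EEPROM is valid when the first 0x40 words
 * sum to NVM_SUM (0xBABA in the e1000 definitions), so if words
 * 0 .. 0x3e sum to 0x1234, the checksum word is set to
 * 0xbaba - 0x1234 = 0xa886.
 */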
static void
e82545_write_mdi(struct e82545_softc *sc, uint8_t reg_addr,
uint8_t phy_addr, uint32_t data)
{
DPRINTF("Write mdi reg:0x%x phy:0x%x data: 0x%x", reg_addr, phy_addr, data);
}
static uint32_t
e82545_read_mdi(struct e82545_softc *sc, uint8_t reg_addr,
uint8_t phy_addr)
{
//DPRINTF("Read mdi reg:0x%x phy:0x%x", reg_addr, phy_addr);
switch (reg_addr) {
case PHY_STATUS:
return (MII_SR_LINK_STATUS | MII_SR_AUTONEG_CAPS |
MII_SR_AUTONEG_COMPLETE);
case PHY_AUTONEG_ADV:
return NWAY_AR_SELECTOR_FIELD;
case PHY_LP_ABILITY:
return 0;
case PHY_1000T_STATUS:
return (SR_1000T_LP_FD_CAPS | SR_1000T_REMOTE_RX_STATUS |
SR_1000T_LOCAL_RX_STATUS);
case PHY_ID1:
return (M88E1011_I_PHY_ID >> 16) & 0xFFFF;
case PHY_ID2:
return (M88E1011_I_PHY_ID | E82545_REVISION_4) & 0xFFFF;
default:
DPRINTF("Unknown mdi read reg:0x%x phy:0x%x", reg_addr, phy_addr);
return 0;
}
/* not reached */
}
static void
e82545_eecd_strobe(struct e82545_softc *sc)
{
/* Microwire state machine */
/*
DPRINTF("eeprom state machine srtobe "
"0x%x 0x%x 0x%x 0x%x",
sc->nvm_mode, sc->nvm_bits,
sc->nvm_opaddr, sc->nvm_data);*/
if (sc->nvm_bits == 0) {
DPRINTF("eeprom state machine not expecting data! "
"0x%x 0x%x 0x%x 0x%x",
sc->nvm_mode, sc->nvm_bits,
sc->nvm_opaddr, sc->nvm_data);
return;
}
sc->nvm_bits--;
if (sc->nvm_mode == E82545_NVM_MODE_DATAOUT) {
/* shifting out */
if (sc->nvm_data & 0x8000) {
sc->eeprom_control |= E1000_EECD_DO;
} else {
sc->eeprom_control &= ~E1000_EECD_DO;
}
sc->nvm_data <<= 1;
if (sc->nvm_bits == 0) {
/* read done, back to opcode mode. */
sc->nvm_opaddr = 0;
sc->nvm_mode = E82545_NVM_MODE_OPADDR;
sc->nvm_bits = E82545_NVM_OPADDR_BITS;
}
} else if (sc->nvm_mode == E82545_NVM_MODE_DATAIN) {
/* shifting in */
sc->nvm_data <<= 1;
if (sc->eeprom_control & E1000_EECD_DI) {
sc->nvm_data |= 1;
}
if (sc->nvm_bits == 0) {
/* eeprom write */
uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK;
uint16_t addr = sc->nvm_opaddr & E82545_NVM_ADDR_MASK;
if (op != E82545_NVM_OPCODE_WRITE) {
DPRINTF("Illegal eeprom write op 0x%x",
sc->nvm_opaddr);
} else if (addr >= E82545_NVM_EEPROM_SIZE) {
DPRINTF("Illegal eeprom write addr 0x%x",
sc->nvm_opaddr);
} else {
DPRINTF("eeprom write eeprom[0x%x] = 0x%x",
addr, sc->nvm_data);
sc->eeprom_data[addr] = sc->nvm_data;
}
/* back to opcode mode */
sc->nvm_opaddr = 0;
sc->nvm_mode = E82545_NVM_MODE_OPADDR;
sc->nvm_bits = E82545_NVM_OPADDR_BITS;
}
} else if (sc->nvm_mode == E82545_NVM_MODE_OPADDR) {
sc->nvm_opaddr <<= 1;
if (sc->eeprom_control & E1000_EECD_DI) {
sc->nvm_opaddr |= 1;
}
if (sc->nvm_bits == 0) {
uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK;
switch (op) {
case E82545_NVM_OPCODE_EWEN:
DPRINTF("eeprom write enable: 0x%x",
sc->nvm_opaddr);
/* back to opcode mode */
sc->nvm_opaddr = 0;
sc->nvm_mode = E82545_NVM_MODE_OPADDR;
sc->nvm_bits = E82545_NVM_OPADDR_BITS;
break;
case E82545_NVM_OPCODE_READ:
{
uint16_t addr = sc->nvm_opaddr &
E82545_NVM_ADDR_MASK;
sc->nvm_mode = E82545_NVM_MODE_DATAOUT;
sc->nvm_bits = E82545_NVM_DATA_BITS;
if (addr < E82545_NVM_EEPROM_SIZE) {
sc->nvm_data = sc->eeprom_data[addr];
DPRINTF("eeprom read: eeprom[0x%x] = 0x%x",
addr, sc->nvm_data);
} else {
DPRINTF("eeprom illegal read: 0x%x",
sc->nvm_opaddr);
sc->nvm_data = 0;
}
break;
}
case E82545_NVM_OPCODE_WRITE:
sc->nvm_mode = E82545_NVM_MODE_DATAIN;
sc->nvm_bits = E82545_NVM_DATA_BITS;
sc->nvm_data = 0;
break;
default:
DPRINTF("eeprom unknown op: 0x%x",
sc->nvm_opaddr);
/* back to opcode mode */
sc->nvm_opaddr = 0;
sc->nvm_mode = E82545_NVM_MODE_OPADDR;
sc->nvm_bits = E82545_NVM_OPADDR_BITS;
}
}
} else {
DPRINTF("eeprom state machine wrong state! "
"0x%x 0x%x 0x%x 0x%x",
sc->nvm_mode, sc->nvm_bits,
sc->nvm_opaddr, sc->nvm_data);
}
}
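/*
 * Example transaction: to read EEPROM word 2, the guest clocks in the
 * nine opcode/address bits 110 000010 (E82545_NVM_OPCODE_READ plus the
 * address), after which the state machine enters DATAOUT mode and
 * shifts the 16 data bits out MSB-first on E1000_EECD_DO.
 */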
static void
e82545_itr_callback(int fd, enum ev_type type, void *param)
{
uint32_t new;
struct e82545_softc *sc = param;
pthread_mutex_lock(&sc->esc_mtx);
new = sc->esc_ICR & sc->esc_IMS;
if (new && !sc->esc_irq_asserted) {
DPRINTF("itr callback: lintr assert %x", new);
sc->esc_irq_asserted = 1;
pci_lintr_assert(sc->esc_pi);
} else {
mevent_delete(sc->esc_mevpitr);
sc->esc_mevpitr = NULL;
}
pthread_mutex_unlock(&sc->esc_mtx);
}
static void
e82545_icr_assert(struct e82545_softc *sc, uint32_t bits)
{
uint32_t new;
DPRINTF("icr assert: 0x%x", bits);
/*
* An interrupt is only generated if bits are set that
* aren't already in the ICR, these bits are unmasked,
* and there isn't an interrupt already pending.
*/
new = bits & ~sc->esc_ICR & sc->esc_IMS;
sc->esc_ICR |= bits;
if (new == 0) {
DPRINTF("icr assert: masked %x, ims %x", new, sc->esc_IMS);
} else if (sc->esc_mevpitr != NULL) {
DPRINTF("icr assert: throttled %x, ims %x", new, sc->esc_IMS);
} else if (!sc->esc_irq_asserted) {
DPRINTF("icr assert: lintr assert %x", new);
sc->esc_irq_asserted = 1;
pci_lintr_assert(sc->esc_pi);
if (sc->esc_ITR != 0) {
sc->esc_mevpitr = mevent_add(
(sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */
EVF_TIMER, e82545_itr_callback, sc);
}
}
}
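/*
 * The ITR register counts in 256ns units while mevent timers tick in
 * milliseconds; (ITR + 3905) / 3906 rounds up, since 1ms is roughly
 * 3906 * 256ns.  E.g. ITR = 500 requests a 128us moderation interval,
 * which becomes the minimum 1ms timer.
 */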
static void
e82545_ims_change(struct e82545_softc *sc, uint32_t bits)
{
uint32_t new;
/*
* Changing the mask may allow previously asserted
* but masked interrupt requests to generate an interrupt.
*/
new = bits & sc->esc_ICR & ~sc->esc_IMS;
sc->esc_IMS |= bits;
if (new == 0) {
DPRINTF("ims change: masked %x, ims %x", new, sc->esc_IMS);
} else if (sc->esc_mevpitr != NULL) {
DPRINTF("ims change: throttled %x, ims %x", new, sc->esc_IMS);
} else if (!sc->esc_irq_asserted) {
DPRINTF("ims change: lintr assert %x", new);
sc->esc_irq_asserted = 1;
pci_lintr_assert(sc->esc_pi);
if (sc->esc_ITR != 0) {
sc->esc_mevpitr = mevent_add(
(sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */
EVF_TIMER, e82545_itr_callback, sc);
}
}
}
static void
e82545_icr_deassert(struct e82545_softc *sc, uint32_t bits)
{
DPRINTF("icr deassert: 0x%x", bits);
sc->esc_ICR &= ~bits;
/*
* If there are no longer any interrupt sources and there
* was an asserted interrupt, clear it
*/
if (sc->esc_irq_asserted && !(sc->esc_ICR & sc->esc_IMS)) {
DPRINTF("icr deassert: lintr deassert %x", bits);
pci_lintr_deassert(sc->esc_pi);
sc->esc_irq_asserted = 0;
}
}
static void
e82545_intr_write(struct e82545_softc *sc, uint32_t offset, uint32_t value)
{
DPRINTF("intr_write: off %x, val %x", offset, value);
switch (offset) {
case E1000_ICR:
e82545_icr_deassert(sc, value);
break;
case E1000_ITR:
sc->esc_ITR = value;
break;
case E1000_ICS:
sc->esc_ICS = value; /* not used: store for debug */
e82545_icr_assert(sc, value);
break;
case E1000_IMS:
e82545_ims_change(sc, value);
break;
case E1000_IMC:
sc->esc_IMC = value; /* for debug */
sc->esc_IMS &= ~value;
// XXX clear interrupts if all ICR bits now masked
// and interrupt was pending ?
break;
default:
break;
}
}
static uint32_t
e82545_intr_read(struct e82545_softc *sc, uint32_t offset)
{
uint32_t retval;
retval = 0;
DPRINTF("intr_read: off %x", offset);
switch (offset) {
case E1000_ICR:
retval = sc->esc_ICR;
sc->esc_ICR = 0;
e82545_icr_deassert(sc, ~0);
break;
case E1000_ITR:
retval = sc->esc_ITR;
break;
case E1000_ICS:
/* write-only register */
break;
case E1000_IMS:
retval = sc->esc_IMS;
break;
case E1000_IMC:
/* write-only register */
break;
default:
break;
}
return (retval);
}
static void
e82545_devctl(struct e82545_softc *sc, uint32_t val)
{
sc->esc_CTRL = val & ~E1000_CTRL_RST;
if (val & E1000_CTRL_RST) {
DPRINTF("e1k: s/w reset, ctl %x", val);
e82545_reset(sc, 1);
}
/* XXX check for phy reset ? */
}
static void
e82545_rx_update_rdba(struct e82545_softc *sc)
{
/* XXX verify desc base/len within phys mem range */
sc->esc_rdba = (uint64_t)sc->esc_RDBAH << 32 |
sc->esc_RDBAL;
/* Cache host mapping of guest descriptor array */
sc->esc_rxdesc = paddr_guest2host(sc->esc_ctx,
sc->esc_rdba, sc->esc_RDLEN);
}
static void
e82545_rx_ctl(struct e82545_softc *sc, uint32_t val)
{
int on;
on = ((val & E1000_RCTL_EN) == E1000_RCTL_EN);
/* Save RCTL after stripping reserved bits 31:27,24,21,14,11:10,0 */
sc->esc_RCTL = val & ~0xF9204c01;
DPRINTF("rx_ctl - %s RCTL %x, val %x",
on ? "on" : "off", sc->esc_RCTL, val);
/* state change requested */
if (on != sc->esc_rx_enabled) {
if (on) {
/* Catch disallowed/unimplemented settings */
//assert(!(val & E1000_RCTL_LBM_TCVR));
if (sc->esc_RCTL & E1000_RCTL_LBM_TCVR) {
sc->esc_rx_loopback = 1;
} else {
sc->esc_rx_loopback = 0;
}
e82545_rx_update_rdba(sc);
e82545_rx_enable(sc);
} else {
e82545_rx_disable(sc);
sc->esc_rx_loopback = 0;
sc->esc_rdba = 0;
sc->esc_rxdesc = NULL;
}
}
}
static void
e82545_tx_update_tdba(struct e82545_softc *sc)
{
/* XXX verify desc base/len within phys mem range */
sc->esc_tdba = (uint64_t)sc->esc_TDBAH << 32 | sc->esc_TDBAL;
/* Cache host mapping of guest descriptor array */
sc->esc_txdesc = paddr_guest2host(sc->esc_ctx, sc->esc_tdba,
sc->esc_TDLEN);
}
static void
e82545_tx_ctl(struct e82545_softc *sc, uint32_t val)
{
int on;
on = ((val & E1000_TCTL_EN) == E1000_TCTL_EN);
/* ignore TCTL_EN settings that don't change state */
if (on == sc->esc_tx_enabled)
return;
if (on) {
e82545_tx_update_tdba(sc);
e82545_tx_enable(sc);
} else {
e82545_tx_disable(sc);
sc->esc_tdba = 0;
sc->esc_txdesc = NULL;
}
/* Save TCTL value after stripping reserved bits 31:25,23,2,0 */
sc->esc_TCTL = val & ~0xFE800005;
}
int
e82545_bufsz(uint32_t rctl)
{
switch (rctl & (E1000_RCTL_BSEX | E1000_RCTL_SZ_256)) {
case (E1000_RCTL_SZ_2048): return (2048);
case (E1000_RCTL_SZ_1024): return (1024);
case (E1000_RCTL_SZ_512): return (512);
case (E1000_RCTL_SZ_256): return (256);
case (E1000_RCTL_BSEX|E1000_RCTL_SZ_16384): return (16384);
case (E1000_RCTL_BSEX|E1000_RCTL_SZ_8192): return (8192);
case (E1000_RCTL_BSEX|E1000_RCTL_SZ_4096): return (4096);
}
return (256); /* Forbidden value. */
}
/* XXX one packet at a time until this is debugged */
static void
e82545_rx_callback(int fd, enum ev_type type, void *param)
{
struct e82545_softc *sc = param;
struct e1000_rx_desc *rxd;
struct iovec vec[64];
int left, len, lim, maxpktsz, maxpktdesc, bufsz, i, n, size;
uint32_t cause = 0;
uint16_t *tp, tag, head;
pthread_mutex_lock(&sc->esc_mtx);
DPRINTF("rx_run: head %x, tail %x", sc->esc_RDH, sc->esc_RDT);
if (!sc->esc_rx_enabled || sc->esc_rx_loopback) {
DPRINTF("rx disabled (!%d || %d) -- packet(s) dropped",
sc->esc_rx_enabled, sc->esc_rx_loopback);
while (netbe_rx_discard(sc->esc_be) > 0) {
}
goto done1;
}
bufsz = e82545_bufsz(sc->esc_RCTL);
maxpktsz = (sc->esc_RCTL & E1000_RCTL_LPE) ? 16384 : 1522;
maxpktdesc = (maxpktsz + bufsz - 1) / bufsz;
size = sc->esc_RDLEN / 16;
head = sc->esc_RDH;
left = (size + sc->esc_RDT - head) % size;
if (left < maxpktdesc) {
DPRINTF("rx overflow (%d < %d) -- packet(s) dropped",
left, maxpktdesc);
while (netbe_rx_discard(sc->esc_be) > 0) {
}
goto done1;
}
sc->esc_rx_active = 1;
pthread_mutex_unlock(&sc->esc_mtx);
for (lim = size / 4; lim > 0 && left >= maxpktdesc; lim -= n) {
/* Grab rx descriptor pointed to by the head pointer */
for (i = 0; i < maxpktdesc; i++) {
rxd = &sc->esc_rxdesc[(head + i) % size];
vec[i].iov_base = paddr_guest2host(sc->esc_ctx,
rxd->buffer_addr, bufsz);
vec[i].iov_len = bufsz;
}
len = netbe_recv(sc->esc_be, vec, maxpktdesc);
if (len <= 0) {
DPRINTF("netbe_recv() returned %d", len);
goto done;
}
/*
* Adjust the packet length based on whether the CRC needs
* to be stripped or if the packet is less than the minimum
* eth packet size.
*/
if (len < ETHER_MIN_LEN - ETHER_CRC_LEN)
len = ETHER_MIN_LEN - ETHER_CRC_LEN;
if (!(sc->esc_RCTL & E1000_RCTL_SECRC))
len += ETHER_CRC_LEN;
n = (len + bufsz - 1) / bufsz;
DPRINTF("packet read %d bytes, %d segs, head %d",
len, n, head);
/* Apply VLAN filter. */
tp = (uint16_t *)vec[0].iov_base + 6;
if ((sc->esc_RCTL & E1000_RCTL_VFE) &&
(ntohs(tp[0]) == sc->esc_VET)) {
tag = ntohs(tp[1]) & 0x0fff;
if ((sc->esc_fvlan[tag >> 5] &
(1 << (tag & 0x1f))) != 0) {
DPRINTF("known VLAN %d", tag);
} else {
DPRINTF("unknown VLAN %d", tag);
n = 0;
continue;
}
}
/* Update all consumed descriptors. */
for (i = 0; i < n - 1; i++) {
rxd = &sc->esc_rxdesc[(head + i) % size];
rxd->length = bufsz;
rxd->csum = 0;
rxd->errors = 0;
rxd->special = 0;
rxd->status = E1000_RXD_STAT_DD;
}
rxd = &sc->esc_rxdesc[(head + i) % size];
rxd->length = len % bufsz;
rxd->csum = 0;
rxd->errors = 0;
rxd->special = 0;
/* XXX signal no checksum for now */
rxd->status = E1000_RXD_STAT_PIF | E1000_RXD_STAT_IXSM |
E1000_RXD_STAT_EOP | E1000_RXD_STAT_DD;
/* Schedule receive interrupts. */
if (len <= sc->esc_RSRPD) {
cause |= E1000_ICR_SRPD | E1000_ICR_RXT0;
} else {
/* XXX: RDRT and RADV timers should be here. */
cause |= E1000_ICR_RXT0;
}
head = (head + n) % size;
left -= n;
}
done:
pthread_mutex_lock(&sc->esc_mtx);
sc->esc_rx_active = 0;
if (sc->esc_rx_enabled == 0)
pthread_cond_signal(&sc->esc_rx_cond);
sc->esc_RDH = head;
/* Respect E1000_RCTL_RDMTS */
left = (size + sc->esc_RDT - head) % size;
if (left < (size >> (((sc->esc_RCTL >> 8) & 3) + 1)))
cause |= E1000_ICR_RXDMT0;
/* Assert all accumulated interrupts. */
if (cause != 0)
e82545_icr_assert(sc, cause);
done1:
DPRINTF("rx_run done: head %x, tail %x", sc->esc_RDH, sc->esc_RDT);
pthread_mutex_unlock(&sc->esc_mtx);
}
static uint16_t
e82545_carry(uint32_t sum)
{
sum = (sum & 0xFFFF) + (sum >> 16);
if (sum > 0xFFFF)
sum -= 0xFFFF;
return (sum);
}
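/*
 * End-around carry fold for the ones'-complement Internet checksum:
 * e.g. a 32-bit partial sum of 0x12345 folds to 0x2345 + 0x1 = 0x2346.
 */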
static uint16_t
e82545_buf_checksum(uint8_t *buf, int len)
{
int i;
uint32_t sum = 0;
/* Checksum all the pairs of bytes first... */
for (i = 0; i < (len & ~1U); i += 2)
sum += *((u_int16_t *)(buf + i));
/*
* If there's a single byte left over, checksum it, too.
* Network byte order is big-endian, so the remaining byte is
* the high byte.
*/
if (i < len)
sum += htons(buf[i] << 8);
return (e82545_carry(sum));
}
static uint16_t
e82545_iov_checksum(struct iovec *iov, int iovcnt, int off, int len)
{
int now, odd;
uint32_t sum = 0, s;
/* Skip completely unneeded vectors. */
while (iovcnt > 0 && iov->iov_len <= off && off > 0) {
off -= iov->iov_len;
iov++;
iovcnt--;
}
/* Calculate checksum of requested range. */
odd = 0;
while (len > 0 && iovcnt > 0) {
now = MIN(len, iov->iov_len - off);
s = e82545_buf_checksum(iov->iov_base + off, now);
sum += odd ? (s << 8) : s;
odd ^= (now & 1);
len -= now;
off = 0;
iov++;
iovcnt--;
}
return (e82545_carry(sum));
}
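/*
 * In ones'-complement arithmetic, (s << 8) folded by e82545_carry()
 * equals s with its bytes swapped, which is exactly the correction
 * needed when the previous chunk ended on an odd byte boundary.
 */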
/*
* Return the transmit descriptor type.
*/
int
e82545_txdesc_type(uint32_t lower)
{
int type;
type = 0;
if (lower & E1000_TXD_CMD_DEXT)
type = lower & E1000_TXD_MASK;
return (type);
}
static void
e82545_transmit_checksum(struct iovec *iov, int iovcnt, struct ck_info *ck)
{
uint16_t cksum;
int cklen;
DPRINTF("tx cksum: iovcnt/s/off/len %d/%d/%d/%d",
iovcnt, ck->ck_start, ck->ck_off, ck->ck_len);
cklen = ck->ck_len ? ck->ck_len - ck->ck_start + 1 : INT_MAX;
cksum = e82545_iov_checksum(iov, iovcnt, ck->ck_start, cklen);
*(uint16_t *)((uint8_t *)iov[0].iov_base + ck->ck_off) = ~cksum;
}
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
index 803ab0be38bb..b044b43c2f99 100644
--- a/usr.sbin/bhyve/pci_emul.c
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -1,2345 +1,2302 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <ctype.h>
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <assert.h>
#include <stdbool.h>
#include <machine/vmm.h>
#include <machine/vmm_snapshot.h>
#include <machine/cpufunc.h>
#include <machine/specialreg.h>
#include <vmmapi.h>
#include "acpi.h"
#include "bhyverun.h"
#include "debug.h"
#include "inout.h"
#include "ioapic.h"
#include "mem.h"
#include "pci_emul.h"
#include "pci_irq.h"
#include "pci_lpc.h"
+#ifdef BHYVE_SNAPSHOT
+#include "snapshot.h"
+#endif
+
#define CONF1_ADDR_PORT 0x0cf8
#define CONF1_DATA_PORT 0x0cfc
#define CONF1_ENABLE 0x80000000ul
#define MAXBUSES (PCI_BUSMAX + 1)
#define MAXSLOTS (PCI_SLOTMAX + 1)
#define MAXFUNCS (PCI_FUNCMAX + 1)
struct funcinfo {
char *fi_name;
char *fi_param;
struct pci_devinst *fi_devi;
};
struct intxinfo {
int ii_count;
int ii_pirq_pin;
int ii_ioapic_irq;
};
struct slotinfo {
struct intxinfo si_intpins[4];
struct funcinfo si_funcs[MAXFUNCS];
};
struct businfo {
uint16_t iobase, iolimit; /* I/O window */
uint32_t membase32, memlimit32; /* mmio window below 4GB */
uint64_t membase64, memlimit64; /* mmio window above 4GB */
struct slotinfo slotinfo[MAXSLOTS];
};
static struct businfo *pci_businfo[MAXBUSES];
SET_DECLARE(pci_devemu_set, struct pci_devemu);
static uint64_t pci_emul_iobase;
static uint64_t pci_emul_membase32;
static uint64_t pci_emul_membase64;
static uint64_t pci_emul_memlim64;
#define PCI_EMUL_IOBASE 0x2000
#define PCI_EMUL_IOLIMIT 0x10000
#define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */
#define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */
SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE);
#define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE
static struct pci_devemu *pci_emul_finddev(char *name);
static void pci_lintr_route(struct pci_devinst *pi);
static void pci_lintr_update(struct pci_devinst *pi);
static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot,
int func, int coff, int bytes, uint32_t *val);
static __inline void
CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes)
{
if (bytes == 1)
pci_set_cfgdata8(pi, coff, val);
else if (bytes == 2)
pci_set_cfgdata16(pi, coff, val);
else
pci_set_cfgdata32(pi, coff, val);
}
static __inline uint32_t
CFGREAD(struct pci_devinst *pi, int coff, int bytes)
{
if (bytes == 1)
return (pci_get_cfgdata8(pi, coff));
else if (bytes == 2)
return (pci_get_cfgdata16(pi, coff));
else
return (pci_get_cfgdata32(pi, coff));
}
/*
* I/O access
*/
/*
* Slot options are in the form:
*
* <bus>:<slot>:<func>,<emul>[,<config>]
* <slot>[:<func>],<emul>[,<config>]
*
* slot is 0..31
* func is 0..7
* emul is a string describing the type of PCI device e.g. virtio-net
* config is an optional string, depending on the device, that can be
* used for configuration.
* Examples are:
* 1,virtio-net,tap0
* 3:0,dummy
*/
static void
pci_parse_slot_usage(char *aopt)
{
EPRINTLN("Invalid PCI slot info field \"%s\"", aopt);
}
int
pci_parse_slot(char *opt)
{
struct businfo *bi;
struct slotinfo *si;
char *emul, *config, *str, *cp;
int error, bnum, snum, fnum;
error = -1;
str = strdup(opt);
emul = config = NULL;
if ((cp = strchr(str, ',')) != NULL) {
*cp = '\0';
emul = cp + 1;
if ((cp = strchr(emul, ',')) != NULL) {
*cp = '\0';
config = cp + 1;
}
} else {
pci_parse_slot_usage(opt);
goto done;
}
/* <bus>:<slot>:<func> */
if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) {
bnum = 0;
/* <slot>:<func> */
if (sscanf(str, "%d:%d", &snum, &fnum) != 2) {
fnum = 0;
/* <slot> */
if (sscanf(str, "%d", &snum) != 1) {
snum = -1;
}
}
}
if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS ||
fnum < 0 || fnum >= MAXFUNCS) {
pci_parse_slot_usage(opt);
goto done;
}
if (pci_businfo[bnum] == NULL)
pci_businfo[bnum] = calloc(1, sizeof(struct businfo));
bi = pci_businfo[bnum];
si = &bi->slotinfo[snum];
if (si->si_funcs[fnum].fi_name != NULL) {
EPRINTLN("pci slot %d:%d already occupied!",
snum, fnum);
goto done;
}
if (pci_emul_finddev(emul) == NULL) {
EPRINTLN("pci slot %d:%d: unknown device \"%s\"",
snum, fnum, emul);
goto done;
}
error = 0;
si->si_funcs[fnum].fi_name = emul;
si->si_funcs[fnum].fi_param = config;
done:
if (error)
free(str);
return (error);
}
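/*
 * Parsing examples (hypothetical option strings):
 * "0:3:0,virtio-blk,disk.img" yields bnum = 0, snum = 3, fnum = 0,
 * emul = "virtio-blk" and config = "disk.img"; "4,virtio-net" yields
 * snum = 4 with bus and function defaulting to 0 and no config string.
 */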
void
pci_print_supported_devices()
{
struct pci_devemu **pdpp, *pdp;
SET_FOREACH(pdpp, pci_devemu_set) {
pdp = *pdpp;
printf("%s\n", pdp->pe_emu);
}
}
static int
pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset)
{
if (offset < pi->pi_msix.pba_offset)
return (0);
if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
return (0);
}
return (1);
}
int
pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size,
uint64_t value)
{
int msix_entry_offset;
int tab_index;
char *dest;
/* support only 4 or 8 byte writes */
if (size != 4 && size != 8)
return (-1);
/*
* Return if table index is beyond what device supports
*/
tab_index = offset / MSIX_TABLE_ENTRY_SIZE;
if (tab_index >= pi->pi_msix.table_count)
return (-1);
msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
/* support only aligned writes */
if ((msix_entry_offset % size) != 0)
return (-1);
dest = (char *)(pi->pi_msix.table + tab_index);
dest += msix_entry_offset;
if (size == 4)
*((uint32_t *)dest) = value;
else
*((uint64_t *)dest) = value;
return (0);
}
uint64_t
pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size)
{
char *dest;
int msix_entry_offset;
int tab_index;
uint64_t retval = ~0;
/*
* The PCI standard only allows 4 and 8 byte accesses to the MSI-X
* table but we also allow 1 byte access to accommodate reads from
* ddb.
*/
if (size != 1 && size != 4 && size != 8)
return (retval);
msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
/* support only aligned reads */
if ((msix_entry_offset % size) != 0) {
return (retval);
}
tab_index = offset / MSIX_TABLE_ENTRY_SIZE;
if (tab_index < pi->pi_msix.table_count) {
/* valid MSI-X Table access */
dest = (char *)(pi->pi_msix.table + tab_index);
dest += msix_entry_offset;
if (size == 1)
retval = *((uint8_t *)dest);
else if (size == 4)
retval = *((uint32_t *)dest);
else
retval = *((uint64_t *)dest);
} else if (pci_valid_pba_offset(pi, offset)) {
/* return 0 for PBA access */
retval = 0;
}
return (retval);
}
int
pci_msix_table_bar(struct pci_devinst *pi)
{
if (pi->pi_msix.table != NULL)
return (pi->pi_msix.table_bar);
else
return (-1);
}
int
pci_msix_pba_bar(struct pci_devinst *pi)
{
if (pi->pi_msix.table != NULL)
return (pi->pi_msix.pba_bar);
else
return (-1);
}
static int
pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
struct pci_devinst *pdi = arg;
struct pci_devemu *pe = pdi->pi_d;
uint64_t offset;
int i;
for (i = 0; i <= PCI_BARMAX; i++) {
if (pdi->pi_bar[i].type == PCIBAR_IO &&
port >= pdi->pi_bar[i].addr &&
port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) {
offset = port - pdi->pi_bar[i].addr;
if (in)
*eax = (*pe->pe_barread)(ctx, vcpu, pdi, i,
offset, bytes);
else
(*pe->pe_barwrite)(ctx, vcpu, pdi, i, offset,
bytes, *eax);
return (0);
}
}
return (-1);
}
static int
pci_emul_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
int size, uint64_t *val, void *arg1, long arg2)
{
struct pci_devinst *pdi = arg1;
struct pci_devemu *pe = pdi->pi_d;
uint64_t offset;
int bidx = (int) arg2;
assert(bidx <= PCI_BARMAX);
assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 ||
pdi->pi_bar[bidx].type == PCIBAR_MEM64);
assert(addr >= pdi->pi_bar[bidx].addr &&
addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size);
offset = addr - pdi->pi_bar[bidx].addr;
if (dir == MEM_F_WRITE) {
if (size == 8) {
(*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset,
4, *val & 0xffffffff);
(*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset + 4,
4, *val >> 32);
} else {
(*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset,
size, *val);
}
} else {
if (size == 8) {
*val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx,
offset, 4);
*val |= (*pe->pe_barread)(ctx, vcpu, pdi, bidx,
offset + 4, 4) << 32;
} else {
*val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx,
offset, size);
}
}
return (0);
}
static int
pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size,
uint64_t *addr)
{
uint64_t base;
assert((size & (size - 1)) == 0); /* must be a power of 2 */
base = roundup2(*baseptr, size);
if (base + size <= limit) {
*addr = base;
*baseptr = base + size;
return (0);
} else
return (-1);
}
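/*
 * BARs must be naturally aligned to their (power-of-2) size, hence the
 * roundup2().  Example: with *baseptr = 0x2100, a request for 0x1000
 * bytes is placed at 0x3000 and advances *baseptr to 0x4000.
 */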
/*
* Register (or unregister) the MMIO or I/O region associated with the BAR
* register 'idx' of an emulated pci device.
*/
static void
modify_bar_registration(struct pci_devinst *pi, int idx, int registration)
{
int error;
struct inout_port iop;
struct mem_range mr;
switch (pi->pi_bar[idx].type) {
case PCIBAR_IO:
bzero(&iop, sizeof(struct inout_port));
iop.name = pi->pi_name;
iop.port = pi->pi_bar[idx].addr;
iop.size = pi->pi_bar[idx].size;
if (registration) {
iop.flags = IOPORT_F_INOUT;
iop.handler = pci_emul_io_handler;
iop.arg = pi;
error = register_inout(&iop);
} else
error = unregister_inout(&iop);
break;
case PCIBAR_MEM32:
case PCIBAR_MEM64:
bzero(&mr, sizeof(struct mem_range));
mr.name = pi->pi_name;
mr.base = pi->pi_bar[idx].addr;
mr.size = pi->pi_bar[idx].size;
if (registration) {
mr.flags = MEM_F_RW;
mr.handler = pci_emul_mem_handler;
mr.arg1 = pi;
mr.arg2 = idx;
error = register_mem(&mr);
} else
error = unregister_mem(&mr);
break;
default:
error = EINVAL;
break;
}
assert(error == 0);
}
static void
unregister_bar(struct pci_devinst *pi, int idx)
{
modify_bar_registration(pi, idx, 0);
}
static void
register_bar(struct pci_devinst *pi, int idx)
{
modify_bar_registration(pi, idx, 1);
}
/* Are we decoding i/o port accesses for the emulated pci device? */
static int
porten(struct pci_devinst *pi)
{
uint16_t cmd;
cmd = pci_get_cfgdata16(pi, PCIR_COMMAND);
return (cmd & PCIM_CMD_PORTEN);
}
/* Are we decoding memory accesses for the emulated pci device? */
static int
memen(struct pci_devinst *pi)
{
uint16_t cmd;
cmd = pci_get_cfgdata16(pi, PCIR_COMMAND);
return (cmd & PCIM_CMD_MEMEN);
}
/*
* Update the MMIO or I/O address that is decoded by the BAR register.
*
* If the pci device has enabled the address space decoding then intercept
* the address range decoded by the BAR register.
*/
static void
update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type)
{
int decode;
if (pi->pi_bar[idx].type == PCIBAR_IO)
decode = porten(pi);
else
decode = memen(pi);
if (decode)
unregister_bar(pi, idx);
switch (type) {
case PCIBAR_IO:
case PCIBAR_MEM32:
pi->pi_bar[idx].addr = addr;
break;
case PCIBAR_MEM64:
pi->pi_bar[idx].addr &= ~0xffffffffUL;
pi->pi_bar[idx].addr |= addr;
break;
case PCIBAR_MEMHI64:
pi->pi_bar[idx].addr &= 0xffffffff;
pi->pi_bar[idx].addr |= addr;
break;
default:
assert(0);
}
if (decode)
register_bar(pi, idx);
}
int
pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type,
uint64_t size)
{
int error;
uint64_t *baseptr, limit, addr, mask, lobits, bar;
uint16_t cmd, enbit;
assert(idx >= 0 && idx <= PCI_BARMAX);
if ((size & (size - 1)) != 0)
size = 1UL << flsl(size); /* round up to a power of 2 */
/* Enforce minimum BAR sizes required by the PCI standard */
if (type == PCIBAR_IO) {
if (size < 4)
size = 4;
} else {
if (size < 16)
size = 16;
}
switch (type) {
case PCIBAR_NONE:
baseptr = NULL;
addr = mask = lobits = enbit = 0;
break;
case PCIBAR_IO:
baseptr = &pci_emul_iobase;
limit = PCI_EMUL_IOLIMIT;
mask = PCIM_BAR_IO_BASE;
lobits = PCIM_BAR_IO_SPACE;
enbit = PCIM_CMD_PORTEN;
break;
case PCIBAR_MEM64:
/*
* XXX
* Some drivers do not work well if the 64-bit BAR is allocated
* above 4GB. Allow for this by allocating small requests under
* 4GB unless the allocation size is larger than some arbitrary
* number (128MB currently).
*/
if (size > 128 * 1024 * 1024) {
baseptr = &pci_emul_membase64;
limit = pci_emul_memlim64;
mask = PCIM_BAR_MEM_BASE;
lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
PCIM_BAR_MEM_PREFETCH;
} else {
baseptr = &pci_emul_membase32;
limit = PCI_EMUL_MEMLIMIT32;
mask = PCIM_BAR_MEM_BASE;
lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64;
}
enbit = PCIM_CMD_MEMEN;
break;
case PCIBAR_MEM32:
baseptr = &pci_emul_membase32;
limit = PCI_EMUL_MEMLIMIT32;
mask = PCIM_BAR_MEM_BASE;
lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
enbit = PCIM_CMD_MEMEN;
break;
default:
printf("pci_emul_alloc_base: invalid bar type %d\n", type);
assert(0);
}
if (baseptr != NULL) {
error = pci_emul_alloc_resource(baseptr, limit, size, &addr);
if (error != 0)
return (error);
}
pdi->pi_bar[idx].type = type;
pdi->pi_bar[idx].addr = addr;
pdi->pi_bar[idx].size = size;
/* Initialize the BAR register in config space */
bar = (addr & mask) | lobits;
pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar);
if (type == PCIBAR_MEM64) {
assert(idx + 1 <= PCI_BARMAX);
pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64;
pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32);
}
cmd = pci_get_cfgdata16(pdi, PCIR_COMMAND);
if ((cmd & enbit) != enbit)
pci_set_cfgdata16(pdi, PCIR_COMMAND, cmd | enbit);
register_bar(pdi, idx);
return (0);
}
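/*
 * For PCIBAR_MEM64 the value is split across two consecutive BAR
 * registers.  Example (illustrative address): addr = 0x240000000 yields
 * BAR[idx] = 0x4000000c (MEM_64 | PREFETCH in the low bits) and
 * BAR[idx + 1] = 0x2, the high dword.
 */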
#define CAP_START_OFFSET 0x40
static int
pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen)
{
int i, capoff, reallen;
uint16_t sts;
assert(caplen > 0);
reallen = roundup2(caplen, 4); /* dword aligned */
sts = pci_get_cfgdata16(pi, PCIR_STATUS);
if ((sts & PCIM_STATUS_CAPPRESENT) == 0)
capoff = CAP_START_OFFSET;
else
capoff = pi->pi_capend + 1;
/* Check if we have enough space */
if (capoff + reallen > PCI_REGMAX + 1)
return (-1);
/* Set the previous capability pointer */
if ((sts & PCIM_STATUS_CAPPRESENT) == 0) {
pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff);
pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT);
} else
pci_set_cfgdata8(pi, pi->pi_prevcap + 1, capoff);
/* Copy the capability */
for (i = 0; i < caplen; i++)
pci_set_cfgdata8(pi, capoff + i, capdata[i]);
/* Set the next capability pointer */
pci_set_cfgdata8(pi, capoff + 1, 0);
pi->pi_prevcap = capoff;
pi->pi_capend = capoff + reallen - 1;
return (0);
}
static struct pci_devemu *
pci_emul_finddev(char *name)
{
struct pci_devemu **pdpp, *pdp;
SET_FOREACH(pdpp, pci_devemu_set) {
pdp = *pdpp;
if (!strcmp(pdp->pe_emu, name)) {
return (pdp);
}
}
return (NULL);
}
static int
pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot,
int func, struct funcinfo *fi)
{
struct pci_devinst *pdi;
int err;
pdi = calloc(1, sizeof(struct pci_devinst));
pdi->pi_vmctx = ctx;
pdi->pi_bus = bus;
pdi->pi_slot = slot;
pdi->pi_func = func;
pthread_mutex_init(&pdi->pi_lintr.lock, NULL);
pdi->pi_lintr.pin = 0;
pdi->pi_lintr.state = IDLE;
pdi->pi_lintr.pirq_pin = 0;
pdi->pi_lintr.ioapic_irq = 0;
pdi->pi_d = pde;
- snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot);
+ snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d-%d-%d", pde->pe_emu, bus, slot,
+ func);
/* Disable legacy interrupts */
pci_set_cfgdata8(pdi, PCIR_INTLINE, 255);
pci_set_cfgdata8(pdi, PCIR_INTPIN, 0);
pci_set_cfgdata8(pdi, PCIR_COMMAND, PCIM_CMD_BUSMASTEREN);
err = (*pde->pe_init)(ctx, pdi, fi->fi_param);
if (err == 0)
fi->fi_devi = pdi;
else
free(pdi);
return (err);
}
void
pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr)
{
int mmc;
/* Number of msi messages must be a power of 2 between 1 and 32 */
assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32);
mmc = ffs(msgnum) - 1;
bzero(msicap, sizeof(struct msicap));
msicap->capid = PCIY_MSI;
msicap->nextptr = nextptr;
msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1);
}
int
pci_emul_add_msicap(struct pci_devinst *pi, int msgnum)
{
struct msicap msicap;
pci_populate_msicap(&msicap, msgnum, 0);
return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap)));
}
static void
pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum,
uint32_t msix_tab_size)
{
assert(msix_tab_size % 4096 == 0);
bzero(msixcap, sizeof(struct msixcap));
msixcap->capid = PCIY_MSIX;
/*
* Message Control Register, all fields set to
* zero except for the Table Size.
* Note: Table size N is encoded as N-1
*/
msixcap->msgctrl = msgnum - 1;
/*
* MSI-X BAR setup:
* - MSI-X table start at offset 0
* - PBA table starts at a 4K aligned offset after the MSI-X table
*/
msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK;
msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK);
}
static void
pci_msix_table_init(struct pci_devinst *pi, int table_entries)
{
int i, table_size;
assert(table_entries > 0);
assert(table_entries <= MAX_MSIX_TABLE_ENTRIES);
table_size = table_entries * MSIX_TABLE_ENTRY_SIZE;
pi->pi_msix.table = calloc(1, table_size);
/* set mask bit of vector control register */
for (i = 0; i < table_entries; i++)
pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK;
}
int
pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum)
{
uint32_t tab_size;
struct msixcap msixcap;
assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES);
assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0);
tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE;
/* Align table size to nearest 4K */
tab_size = roundup2(tab_size, 4096);
pi->pi_msix.table_bar = barnum;
pi->pi_msix.pba_bar = barnum;
pi->pi_msix.table_offset = 0;
pi->pi_msix.table_count = msgnum;
pi->pi_msix.pba_offset = tab_size;
pi->pi_msix.pba_size = PBA_SIZE(msgnum);
pci_msix_table_init(pi, msgnum);
pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size);
/* allocate memory for MSI-X Table and PBA */
pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32,
tab_size + pi->pi_msix.pba_size);
return (pci_emul_add_capability(pi, (u_char *)&msixcap,
sizeof(msixcap)));
}
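/*
 * Layout example: msgnum = 64 gives a 1024-byte table rounded up to
 * 4096 bytes, an 8-byte PBA (one pending bit per vector) at offset
 * 4096, and hence a BAR request of 4104 bytes that pci_emul_alloc_bar()
 * rounds up to the next power of 2, 8192.
 */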
static void
msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
int bytes, uint32_t val)
{
uint16_t msgctrl, rwmask;
int off;
off = offset - capoff;
/* Message Control Register */
if (off == 2 && bytes == 2) {
rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK;
msgctrl = pci_get_cfgdata16(pi, offset);
msgctrl &= ~rwmask;
msgctrl |= val & rwmask;
val = msgctrl;
pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE;
pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK;
pci_lintr_update(pi);
}
CFGWRITE(pi, offset, val, bytes);
}
static void
msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
int bytes, uint32_t val)
{
uint16_t msgctrl, rwmask, msgdata, mme;
uint32_t addrlo;
/*
* If guest is writing to the message control register make sure
* we do not overwrite read-only fields.
*/
if ((offset - capoff) == 2 && bytes == 2) {
rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE;
msgctrl = pci_get_cfgdata16(pi, offset);
msgctrl &= ~rwmask;
msgctrl |= val & rwmask;
val = msgctrl;
}
CFGWRITE(pi, offset, val, bytes);
msgctrl = pci_get_cfgdata16(pi, capoff + 2);
addrlo = pci_get_cfgdata32(pi, capoff + 4);
if (msgctrl & PCIM_MSICTRL_64BIT)
msgdata = pci_get_cfgdata16(pi, capoff + 12);
else
msgdata = pci_get_cfgdata16(pi, capoff + 8);
mme = msgctrl & PCIM_MSICTRL_MME_MASK;
pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0;
if (pi->pi_msi.enabled) {
pi->pi_msi.addr = addrlo;
pi->pi_msi.msg_data = msgdata;
pi->pi_msi.maxmsgnum = 1 << (mme >> 4);
} else {
pi->pi_msi.maxmsgnum = 0;
}
pci_lintr_update(pi);
}
void
pciecap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
int bytes, uint32_t val)
{
/* XXX don't write to the readonly parts */
CFGWRITE(pi, offset, val, bytes);
}
#define PCIECAP_VERSION 0x2
int
pci_emul_add_pciecap(struct pci_devinst *pi, int type)
{
int err;
struct pciecap pciecap;
bzero(&pciecap, sizeof(pciecap));
/*
* Use the integrated endpoint type for endpoints on a root complex bus.
*
* NB: bhyve currently only supports a single PCI bus that is the root
* complex bus, so all endpoints are integrated.
*/
if ((type == PCIEM_TYPE_ENDPOINT) && (pi->pi_bus == 0))
type = PCIEM_TYPE_ROOT_INT_EP;
pciecap.capid = PCIY_EXPRESS;
pciecap.pcie_capabilities = PCIECAP_VERSION | type;
if (type != PCIEM_TYPE_ROOT_INT_EP) {
pciecap.link_capabilities = 0x411; /* gen1, x1 */
pciecap.link_status = 0x11; /* gen1, x1 */
}
err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap));
return (err);
}
/*
* This function assumes that 'coff' is in the capabilities region of the
* config space. A capoff parameter of zero will force a search for the
* offset and type.
*/
void
pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val,
uint8_t capoff, int capid)
{
uint8_t nextoff;
/* Do not allow un-aligned writes */
if ((offset & (bytes - 1)) != 0)
return;
if (capoff == 0) {
/* Find the capability that we want to update */
capoff = CAP_START_OFFSET;
while (1) {
nextoff = pci_get_cfgdata8(pi, capoff + 1);
if (nextoff == 0)
break;
if (offset >= capoff && offset < nextoff)
break;
capoff = nextoff;
}
assert(offset >= capoff);
capid = pci_get_cfgdata8(pi, capoff);
}
/*
* Capability ID and Next Capability Pointer are readonly.
* However, some o/s's do 4-byte writes that include these.
* For this case, trim the write back to 2 bytes and adjust
* the data.
*/
if (offset == capoff || offset == capoff + 1) {
if (offset == capoff && bytes == 4) {
bytes = 2;
offset += 2;
val >>= 16;
} else
return;
}
switch (capid) {
case PCIY_MSI:
msicap_cfgwrite(pi, capoff, offset, bytes, val);
break;
case PCIY_MSIX:
msixcap_cfgwrite(pi, capoff, offset, bytes, val);
break;
case PCIY_EXPRESS:
pciecap_cfgwrite(pi, capoff, offset, bytes, val);
break;
default:
break;
}
}
static int
pci_emul_iscap(struct pci_devinst *pi, int offset)
{
uint16_t sts;
sts = pci_get_cfgdata16(pi, PCIR_STATUS);
if ((sts & PCIM_STATUS_CAPPRESENT) != 0) {
if (offset >= CAP_START_OFFSET && offset <= pi->pi_capend)
return (1);
}
return (0);
}
static int
pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
int size, uint64_t *val, void *arg1, long arg2)
{
/*
* Ignore writes; return 0xff's for reads. The mem read code
* will take care of truncating to the correct size.
*/
if (dir == MEM_F_READ) {
*val = 0xffffffffffffffff;
}
return (0);
}
static int
pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
int bytes, uint64_t *val, void *arg1, long arg2)
{
int bus, slot, func, coff, in;
coff = addr & 0xfff;
func = (addr >> 12) & 0x7;
slot = (addr >> 15) & 0x1f;
bus = (addr >> 20) & 0xff;
in = (dir == MEM_F_READ);
if (in)
*val = ~0UL;
pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val);
return (0);
}
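/*
 * Decode sketch for the handler above, assuming the window begins at
 * PCI_EMUL_ECFG_BASE (0xE0000000 per the guest memory map below, so the
 * base bits fall outside the bus mask):
 *
 *   addr = base | (bus << 20) | (slot << 15) | (func << 12) | coff
 *
 * e.g. a read at 0xE0019010 decodes to bus 0, slot 3, func 1,
 * config offset 0x10.
 */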
uint64_t
pci_ecfg_base(void)
{
return (PCI_EMUL_ECFG_BASE);
}
#define BUSIO_ROUNDUP 32
#define BUSMEM_ROUNDUP (1024 * 1024)
+#ifdef BHYVE_SNAPSHOT
+static int pci_snapshot(struct vm_snapshot_meta *, void *);
+static int pci_pause(struct vmctx *, void *);
+static int pci_resume(struct vmctx *, void *);
+
+static struct snapshot_ops pci_snapshot_ops = {
+ .snapshot_cb = pci_snapshot,
+ .pause_cb = pci_pause,
+ .resume_cb = pci_resume,
+};
+#endif
+
int
init_pci(struct vmctx *ctx)
{
struct mem_range mr;
struct pci_devemu *pde;
struct businfo *bi;
struct slotinfo *si;
struct funcinfo *fi;
size_t lowmem;
uint64_t cpu_maxphysaddr, pci_emul_memresv64;
u_int regs[4];
int bus, slot, func, error;
pci_emul_iobase = PCI_EMUL_IOBASE;
pci_emul_membase32 = vm_get_lowmem_limit(ctx);
do_cpuid(0x80000008, regs);
cpu_maxphysaddr = 1ULL << (regs[0] & 0xff);
if (cpu_maxphysaddr > VM_MAXUSER_ADDRESS_LA48)
cpu_maxphysaddr = VM_MAXUSER_ADDRESS_LA48;
pci_emul_memresv64 = cpu_maxphysaddr / 4;
/*
* Max power of 2 that is less than
* cpu_maxphysaddr - pci_emul_memresv64.
*/
pci_emul_membase64 = 1ULL << (flsl(cpu_maxphysaddr -
pci_emul_memresv64) - 1);
pci_emul_memlim64 = cpu_maxphysaddr;
for (bus = 0; bus < MAXBUSES; bus++) {
if ((bi = pci_businfo[bus]) == NULL)
continue;
/*
* Keep track of the i/o and memory resources allocated to
* this bus.
*/
bi->iobase = pci_emul_iobase;
bi->membase32 = pci_emul_membase32;
bi->membase64 = pci_emul_membase64;
for (slot = 0; slot < MAXSLOTS; slot++) {
si = &bi->slotinfo[slot];
for (func = 0; func < MAXFUNCS; func++) {
fi = &si->si_funcs[func];
if (fi->fi_name == NULL)
continue;
pde = pci_emul_finddev(fi->fi_name);
assert(pde != NULL);
error = pci_emul_init(ctx, pde, bus, slot,
func, fi);
if (error)
return (error);
+#ifdef BHYVE_SNAPSHOT
+ register_snapshot_dev(fi->fi_devi->pi_name, &pci_snapshot_ops,
+ fi->fi_devi);
+#endif
}
}
/*
* Add some slop to the I/O and memory resources decoded by
* this bus to give a guest some flexibility if it wants to
* reprogram the BARs.
*/
pci_emul_iobase += BUSIO_ROUNDUP;
pci_emul_iobase = roundup2(pci_emul_iobase, BUSIO_ROUNDUP);
bi->iolimit = pci_emul_iobase;
pci_emul_membase32 += BUSMEM_ROUNDUP;
pci_emul_membase32 = roundup2(pci_emul_membase32,
BUSMEM_ROUNDUP);
bi->memlimit32 = pci_emul_membase32;
pci_emul_membase64 += BUSMEM_ROUNDUP;
pci_emul_membase64 = roundup2(pci_emul_membase64,
BUSMEM_ROUNDUP);
bi->memlimit64 = pci_emul_membase64;
}
/*
* PCI backends are initialized before routing INTx interrupts
* so that LPC devices are able to reserve ISA IRQs before
* routing PIRQ pins.
*/
for (bus = 0; bus < MAXBUSES; bus++) {
if ((bi = pci_businfo[bus]) == NULL)
continue;
for (slot = 0; slot < MAXSLOTS; slot++) {
si = &bi->slotinfo[slot];
for (func = 0; func < MAXFUNCS; func++) {
fi = &si->si_funcs[func];
if (fi->fi_devi == NULL)
continue;
pci_lintr_route(fi->fi_devi);
}
}
}
lpc_pirq_routed();
/*
* The guest physical memory map looks like the following:
* [0, lowmem) guest system memory
* [lowmem, lowmem_limit) memory hole (may be absent)
* [lowmem_limit, 0xE0000000) PCI hole (32-bit BAR allocation)
* [0xE0000000, 0xF0000000) PCI extended config window
* [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware
* [4GB, 4GB + highmem)
*/
/*
* Accesses to memory addresses that are not allocated to system
* memory or PCI devices return 0xff's.
*/
lowmem = vm_get_lowmem_size(ctx);
bzero(&mr, sizeof(struct mem_range));
mr.name = "PCI hole";
mr.flags = MEM_F_RW | MEM_F_IMMUTABLE;
mr.base = lowmem;
mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem;
mr.handler = pci_emul_fallback_handler;
error = register_mem_fallback(&mr);
assert(error == 0);
/* PCI extended config space */
bzero(&mr, sizeof(struct mem_range));
mr.name = "PCI ECFG";
mr.flags = MEM_F_RW | MEM_F_IMMUTABLE;
mr.base = PCI_EMUL_ECFG_BASE;
mr.size = PCI_EMUL_ECFG_SIZE;
mr.handler = pci_emul_ecfg_handler;
error = register_mem(&mr);
assert(error == 0);
return (0);
}
static void
pci_apic_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq,
void *arg)
{
dsdt_line(" Package ()");
dsdt_line(" {");
dsdt_line(" 0x%X,", slot << 16 | 0xffff);
dsdt_line(" 0x%02X,", pin - 1);
dsdt_line(" Zero,");
dsdt_line(" 0x%X", ioapic_irq);
dsdt_line(" },");
}
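/*
 * Illustrative output, with hypothetical values: a device in slot 3
 * using INTA# (pin 1) routed to I/O APIC IRQ 16 produces
 *
 *   Package ()
 *   {
 *       0x3FFFF,
 *       0x00,
 *       Zero,
 *       0x10
 *   },
 */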
static void
pci_pirq_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq,
void *arg)
{
char *name;
name = lpc_pirq_name(pirq_pin);
if (name == NULL)
return;
dsdt_line(" Package ()");
dsdt_line(" {");
dsdt_line(" 0x%X,", slot << 16 | 0xffff);
dsdt_line(" 0x%02X,", pin - 1);
dsdt_line(" %s,", name);
dsdt_line(" 0x00");
dsdt_line(" },");
free(name);
}
/*
* A bhyve virtual machine has a flat PCI hierarchy with a root port
* corresponding to each PCI bus.
*/
static void
pci_bus_write_dsdt(int bus)
{
struct businfo *bi;
struct slotinfo *si;
struct pci_devinst *pi;
int count, func, slot;
/*
* If there are no devices on this 'bus' then just return.
*/
if ((bi = pci_businfo[bus]) == NULL) {
/*
* Bus 0 is special because it decodes the I/O ports used
* for PCI config space access even if there are no devices
* on it.
*/
if (bus != 0)
return;
}
dsdt_line(" Device (PC%02X)", bus);
dsdt_line(" {");
dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))");
dsdt_line(" Method (_BBN, 0, NotSerialized)");
dsdt_line(" {");
dsdt_line(" Return (0x%08X)", bus);
dsdt_line(" }");
dsdt_line(" Name (_CRS, ResourceTemplate ()");
dsdt_line(" {");
dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, "
"MaxFixed, PosDecode,");
dsdt_line(" 0x0000, // Granularity");
dsdt_line(" 0x%04X, // Range Minimum", bus);
dsdt_line(" 0x%04X, // Range Maximum", bus);
dsdt_line(" 0x0000, // Translation Offset");
dsdt_line(" 0x0001, // Length");
dsdt_line(" ,, )");
if (bus == 0) {
dsdt_indent(3);
dsdt_fixed_ioport(0xCF8, 8);
dsdt_unindent(3);
dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, "
"PosDecode, EntireRange,");
dsdt_line(" 0x0000, // Granularity");
dsdt_line(" 0x0000, // Range Minimum");
dsdt_line(" 0x0CF7, // Range Maximum");
dsdt_line(" 0x0000, // Translation Offset");
dsdt_line(" 0x0CF8, // Length");
dsdt_line(" ,, , TypeStatic)");
dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, "
"PosDecode, EntireRange,");
dsdt_line(" 0x0000, // Granularity");
dsdt_line(" 0x0D00, // Range Minimum");
dsdt_line(" 0x%04X, // Range Maximum",
PCI_EMUL_IOBASE - 1);
dsdt_line(" 0x0000, // Translation Offset");
dsdt_line(" 0x%04X, // Length",
PCI_EMUL_IOBASE - 0x0D00);
dsdt_line(" ,, , TypeStatic)");
if (bi == NULL) {
dsdt_line(" })");
goto done;
}
}
assert(bi != NULL);
/* i/o window */
dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, "
"PosDecode, EntireRange,");
dsdt_line(" 0x0000, // Granularity");
dsdt_line(" 0x%04X, // Range Minimum", bi->iobase);
dsdt_line(" 0x%04X, // Range Maximum",
bi->iolimit - 1);
dsdt_line(" 0x0000, // Translation Offset");
dsdt_line(" 0x%04X, // Length",
bi->iolimit - bi->iobase);
dsdt_line(" ,, , TypeStatic)");
/* mmio window (32-bit) */
dsdt_line(" DWordMemory (ResourceProducer, PosDecode, "
"MinFixed, MaxFixed, NonCacheable, ReadWrite,");
dsdt_line(" 0x00000000, // Granularity");
dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32);
dsdt_line(" 0x%08X, // Range Maximum\n",
bi->memlimit32 - 1);
dsdt_line(" 0x00000000, // Translation Offset");
dsdt_line(" 0x%08X, // Length\n",
bi->memlimit32 - bi->membase32);
dsdt_line(" ,, , AddressRangeMemory, TypeStatic)");
/* mmio window (64-bit) */
dsdt_line(" QWordMemory (ResourceProducer, PosDecode, "
"MinFixed, MaxFixed, NonCacheable, ReadWrite,");
dsdt_line(" 0x0000000000000000, // Granularity");
dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64);
dsdt_line(" 0x%016lX, // Range Maximum\n",
bi->memlimit64 - 1);
dsdt_line(" 0x0000000000000000, // Translation Offset");
dsdt_line(" 0x%016lX, // Length\n",
bi->memlimit64 - bi->membase64);
dsdt_line(" ,, , AddressRangeMemory, TypeStatic)");
dsdt_line(" })");
count = pci_count_lintr(bus);
if (count != 0) {
dsdt_indent(2);
dsdt_line("Name (PPRT, Package ()");
dsdt_line("{");
pci_walk_lintr(bus, pci_pirq_prt_entry, NULL);
dsdt_line("})");
dsdt_line("Name (APRT, Package ()");
dsdt_line("{");
pci_walk_lintr(bus, pci_apic_prt_entry, NULL);
dsdt_line("})");
dsdt_line("Method (_PRT, 0, NotSerialized)");
dsdt_line("{");
dsdt_line(" If (PICM)");
dsdt_line(" {");
dsdt_line(" Return (APRT)");
dsdt_line(" }");
dsdt_line(" Else");
dsdt_line(" {");
dsdt_line(" Return (PPRT)");
dsdt_line(" }");
dsdt_line("}");
dsdt_unindent(2);
}
dsdt_indent(2);
for (slot = 0; slot < MAXSLOTS; slot++) {
si = &bi->slotinfo[slot];
for (func = 0; func < MAXFUNCS; func++) {
pi = si->si_funcs[func].fi_devi;
if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL)
pi->pi_d->pe_write_dsdt(pi);
}
}
dsdt_unindent(2);
done:
dsdt_line(" }");
}
void
pci_write_dsdt(void)
{
int bus;
dsdt_indent(1);
dsdt_line("Name (PICM, 0x00)");
dsdt_line("Method (_PIC, 1, NotSerialized)");
dsdt_line("{");
dsdt_line(" Store (Arg0, PICM)");
dsdt_line("}");
dsdt_line("");
dsdt_line("Scope (_SB)");
dsdt_line("{");
for (bus = 0; bus < MAXBUSES; bus++)
pci_bus_write_dsdt(bus);
dsdt_line("}");
dsdt_unindent(1);
}
int
pci_bus_configured(int bus)
{
assert(bus >= 0 && bus < MAXBUSES);
return (pci_businfo[bus] != NULL);
}
int
pci_msi_enabled(struct pci_devinst *pi)
{
return (pi->pi_msi.enabled);
}
int
pci_msi_maxmsgnum(struct pci_devinst *pi)
{
if (pi->pi_msi.enabled)
return (pi->pi_msi.maxmsgnum);
else
return (0);
}
int
pci_msix_enabled(struct pci_devinst *pi)
{
return (pi->pi_msix.enabled && !pi->pi_msi.enabled);
}
void
pci_generate_msix(struct pci_devinst *pi, int index)
{
struct msix_table_entry *mte;
if (!pci_msix_enabled(pi))
return;
if (pi->pi_msix.function_mask)
return;
if (index >= pi->pi_msix.table_count)
return;
mte = &pi->pi_msix.table[index];
if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
/* XXX Set PBA bit if interrupt is disabled */
vm_lapic_msi(pi->pi_vmctx, mte->addr, mte->msg_data);
}
}
void
pci_generate_msi(struct pci_devinst *pi, int index)
{
if (pci_msi_enabled(pi) && index < pci_msi_maxmsgnum(pi)) {
vm_lapic_msi(pi->pi_vmctx, pi->pi_msi.addr,
pi->pi_msi.msg_data + index);
}
}
static bool
pci_lintr_permitted(struct pci_devinst *pi)
{
uint16_t cmd;
cmd = pci_get_cfgdata16(pi, PCIR_COMMAND);
return (!(pi->pi_msi.enabled || pi->pi_msix.enabled ||
(cmd & PCIM_CMD_INTxDIS)));
}
void
pci_lintr_request(struct pci_devinst *pi)
{
struct businfo *bi;
struct slotinfo *si;
int bestpin, bestcount, pin;
bi = pci_businfo[pi->pi_bus];
assert(bi != NULL);
/*
* Just allocate a pin from our slot. The pin will be
* assigned IRQs later when interrupts are routed.
*/
si = &bi->slotinfo[pi->pi_slot];
bestpin = 0;
bestcount = si->si_intpins[0].ii_count;
for (pin = 1; pin < 4; pin++) {
if (si->si_intpins[pin].ii_count < bestcount) {
bestpin = pin;
bestcount = si->si_intpins[pin].ii_count;
}
}
si->si_intpins[bestpin].ii_count++;
pi->pi_lintr.pin = bestpin + 1;
pci_set_cfgdata8(pi, PCIR_INTPIN, bestpin + 1);
}
static void
pci_lintr_route(struct pci_devinst *pi)
{
struct businfo *bi;
struct intxinfo *ii;
if (pi->pi_lintr.pin == 0)
return;
bi = pci_businfo[pi->pi_bus];
assert(bi != NULL);
ii = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1];
/*
* Attempt to allocate an I/O APIC pin for this intpin if one
* is not yet assigned.
*/
if (ii->ii_ioapic_irq == 0)
ii->ii_ioapic_irq = ioapic_pci_alloc_irq(pi);
assert(ii->ii_ioapic_irq > 0);
/*
* Attempt to allocate a PIRQ pin for this intpin if one is
* not yet assigned.
*/
if (ii->ii_pirq_pin == 0)
ii->ii_pirq_pin = pirq_alloc_pin(pi);
assert(ii->ii_pirq_pin > 0);
pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq;
pi->pi_lintr.pirq_pin = ii->ii_pirq_pin;
pci_set_cfgdata8(pi, PCIR_INTLINE, pirq_irq(ii->ii_pirq_pin));
}
void
pci_lintr_assert(struct pci_devinst *pi)
{
assert(pi->pi_lintr.pin > 0);
pthread_mutex_lock(&pi->pi_lintr.lock);
if (pi->pi_lintr.state == IDLE) {
if (pci_lintr_permitted(pi)) {
pi->pi_lintr.state = ASSERTED;
pci_irq_assert(pi);
} else
pi->pi_lintr.state = PENDING;
}
pthread_mutex_unlock(&pi->pi_lintr.lock);
}
void
pci_lintr_deassert(struct pci_devinst *pi)
{
assert(pi->pi_lintr.pin > 0);
pthread_mutex_lock(&pi->pi_lintr.lock);
if (pi->pi_lintr.state == ASSERTED) {
pi->pi_lintr.state = IDLE;
pci_irq_deassert(pi);
} else if (pi->pi_lintr.state == PENDING)
pi->pi_lintr.state = IDLE;
pthread_mutex_unlock(&pi->pi_lintr.lock);
}
static void
pci_lintr_update(struct pci_devinst *pi)
{
pthread_mutex_lock(&pi->pi_lintr.lock);
if (pi->pi_lintr.state == ASSERTED && !pci_lintr_permitted(pi)) {
pci_irq_deassert(pi);
pi->pi_lintr.state = PENDING;
} else if (pi->pi_lintr.state == PENDING && pci_lintr_permitted(pi)) {
pi->pi_lintr.state = ASSERTED;
pci_irq_assert(pi);
}
pthread_mutex_unlock(&pi->pi_lintr.lock);
}
int
pci_count_lintr(int bus)
{
int count, slot, pin;
struct slotinfo *slotinfo;
count = 0;
if (pci_businfo[bus] != NULL) {
for (slot = 0; slot < MAXSLOTS; slot++) {
slotinfo = &pci_businfo[bus]->slotinfo[slot];
for (pin = 0; pin < 4; pin++) {
if (slotinfo->si_intpins[pin].ii_count != 0)
count++;
}
}
}
return (count);
}
void
pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg)
{
struct businfo *bi;
struct slotinfo *si;
struct intxinfo *ii;
int slot, pin;
if ((bi = pci_businfo[bus]) == NULL)
return;
for (slot = 0; slot < MAXSLOTS; slot++) {
si = &bi->slotinfo[slot];
for (pin = 0; pin < 4; pin++) {
ii = &si->si_intpins[pin];
if (ii->ii_count != 0)
cb(bus, slot, pin + 1, ii->ii_pirq_pin,
ii->ii_ioapic_irq, arg);
}
}
}
/*
* Return 1 if the emulated device in 'slot' is a multi-function device.
* Return 0 otherwise.
*/
static int
pci_emul_is_mfdev(int bus, int slot)
{
struct businfo *bi;
struct slotinfo *si;
int f, numfuncs;
numfuncs = 0;
if ((bi = pci_businfo[bus]) != NULL) {
si = &bi->slotinfo[slot];
for (f = 0; f < MAXFUNCS; f++) {
if (si->si_funcs[f].fi_devi != NULL) {
numfuncs++;
}
}
}
return (numfuncs > 1);
}
/*
* Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on
* whether or not a multi-function device is being emulated in the pci 'slot'.
*/
static void
pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv)
{
int mfdev;
if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) {
mfdev = pci_emul_is_mfdev(bus, slot);
switch (bytes) {
case 1:
case 2:
*rv &= ~PCIM_MFDEV;
if (mfdev) {
*rv |= PCIM_MFDEV;
}
break;
case 4:
*rv &= ~(PCIM_MFDEV << 16);
if (mfdev) {
*rv |= (PCIM_MFDEV << 16);
}
break;
}
}
}
/*
* Update device state in response to changes to the PCI command
* register.
*/
void
pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old)
{
int i;
uint16_t changed, new;
new = pci_get_cfgdata16(pi, PCIR_COMMAND);
changed = old ^ new;
/*
* If the MMIO or I/O address space decoding has changed then
* register/unregister all BARs that decode that address space.
*/
for (i = 0; i <= PCI_BARMAX; i++) {
switch (pi->pi_bar[i].type) {
case PCIBAR_NONE:
case PCIBAR_MEMHI64:
break;
case PCIBAR_IO:
/* I/O address space decoding changed? */
if (changed & PCIM_CMD_PORTEN) {
if (new & PCIM_CMD_PORTEN)
register_bar(pi, i);
else
unregister_bar(pi, i);
}
break;
case PCIBAR_MEM32:
case PCIBAR_MEM64:
/* MMIO address space decoding changed? */
if (changed & PCIM_CMD_MEMEN) {
if (new & PCIM_CMD_MEMEN)
register_bar(pi, i);
else
unregister_bar(pi, i);
}
break;
default:
assert(0);
}
}
/*
* If INTx has been unmasked and is pending, assert the
* interrupt.
*/
pci_lintr_update(pi);
}
static void
pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes)
{
int rshift;
uint32_t cmd, old, readonly;
cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */
/*
* From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3.
*
* XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are
* 'write 1 to clear'. However these bits are not set to '1' by
* any device emulation so it is simpler to treat them as readonly.
*/
rshift = (coff & 0x3) * 8;
readonly = 0xFFFFF880 >> rshift;
old = CFGREAD(pi, coff, bytes);
new &= ~readonly;
new |= (old & readonly);
CFGWRITE(pi, coff, new, bytes); /* update config */
pci_emul_cmd_changed(pi, cmd);
}
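/*
 * Worked example of the read-only mask above, assuming a 2-byte write:
 * at PCIR_COMMAND (coff 0x04, rshift 0) the low word of the mask is
 * 0xF880, so only the writable command bits (0x077F: I/O and memory
 * decode, bus mastering, INTx disable, ...) take effect; at PCIR_STATUS
 * (coff 0x06, rshift 16) the mask is 0xFFFF and the status word is
 * left untouched.
 */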
static void
pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func,
int coff, int bytes, uint32_t *eax)
{
struct businfo *bi;
struct slotinfo *si;
struct pci_devinst *pi;
struct pci_devemu *pe;
int idx, needcfg;
uint64_t addr, bar, mask;
if ((bi = pci_businfo[bus]) != NULL) {
si = &bi->slotinfo[slot];
pi = si->si_funcs[func].fi_devi;
} else
pi = NULL;
/*
* Just return if there is no device at this slot:func or if the
* guest is doing an un-aligned access.
*/
if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) ||
(coff & (bytes - 1)) != 0) {
if (in)
*eax = 0xffffffff;
return;
}
/*
* Ignore all writes beyond the standard config space and return all
* ones on reads.
*/
if (coff >= PCI_REGMAX + 1) {
if (in) {
*eax = 0xffffffff;
/*
* Extended capabilities begin at offset 256 in config
* space. Absence of extended capabilities is signaled
* with all 0s in the extended capability header at
* offset 256.
*/
if (coff <= PCI_REGMAX + 4)
*eax = 0x00000000;
}
return;
}
pe = pi->pi_d;
/*
* Config read
*/
if (in) {
/* Let the device emulation override the default handler */
if (pe->pe_cfgread != NULL) {
needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes,
eax);
} else {
needcfg = 1;
}
if (needcfg)
*eax = CFGREAD(pi, coff, bytes);
pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax);
} else {
/* Let the device emulation override the default handler */
if (pe->pe_cfgwrite != NULL &&
(*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0)
return;
/*
* Special handling for write to BAR registers
*/
if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) {
/*
* Ignore writes to BAR registers that are not
* 4-byte aligned.
*/
if (bytes != 4 || (coff & 0x3) != 0)
return;
idx = (coff - PCIR_BAR(0)) / 4;
mask = ~(pi->pi_bar[idx].size - 1);
switch (pi->pi_bar[idx].type) {
case PCIBAR_NONE:
pi->pi_bar[idx].addr = bar = 0;
break;
case PCIBAR_IO:
addr = *eax & mask;
addr &= 0xffff;
bar = addr | PCIM_BAR_IO_SPACE;
/*
* Register the new BAR value for interception
*/
if (addr != pi->pi_bar[idx].addr) {
update_bar_address(pi, addr, idx,
PCIBAR_IO);
}
break;
case PCIBAR_MEM32:
addr = bar = *eax & mask;
bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
if (addr != pi->pi_bar[idx].addr) {
update_bar_address(pi, addr, idx,
PCIBAR_MEM32);
}
break;
case PCIBAR_MEM64:
addr = bar = *eax & mask;
bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
PCIM_BAR_MEM_PREFETCH;
if (addr != (uint32_t)pi->pi_bar[idx].addr) {
update_bar_address(pi, addr, idx,
PCIBAR_MEM64);
}
break;
case PCIBAR_MEMHI64:
mask = ~(pi->pi_bar[idx - 1].size - 1);
addr = ((uint64_t)*eax << 32) & mask;
bar = addr >> 32;
if (bar != pi->pi_bar[idx - 1].addr >> 32) {
update_bar_address(pi, addr, idx - 1,
PCIBAR_MEMHI64);
}
break;
default:
assert(0);
}
pci_set_cfgdata32(pi, coff, bar);
} else if (pci_emul_iscap(pi, coff)) {
pci_emul_capwrite(pi, coff, bytes, *eax, 0, 0);
} else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) {
pci_emul_cmdsts_write(pi, coff, *eax, bytes);
} else {
CFGWRITE(pi, coff, *eax, bytes);
}
}
}
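/*
 * The BAR write path above also serves BAR sizing, assuming the usual
 * probe sequence: for a 4KB PCIBAR_MEM32 BAR, mask is ~(0x1000 - 1) =
 * 0xFFFFF000, so a guest write of 0xFFFFFFFF reads back as 0xFFFFF000
 * (plus the BAR type bits), from which the guest infers the 4KB size
 * before programming a real address.
 */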
static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff;
static int
pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
uint32_t x;
if (bytes != 4) {
if (in)
*eax = (bytes == 2) ? 0xffff : 0xff;
return (0);
}
if (in) {
x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff;
if (cfgenable)
x |= CONF1_ENABLE;
*eax = x;
} else {
x = *eax;
cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE;
cfgoff = x & PCI_REGMAX;
cfgfunc = (x >> 8) & PCI_FUNCMAX;
cfgslot = (x >> 11) & PCI_SLOTMAX;
cfgbus = (x >> 16) & PCI_BUSMAX;
}
return (0);
}
INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr);
static int
pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
int coff;
assert(bytes == 1 || bytes == 2 || bytes == 4);
coff = cfgoff + (port - CONF1_DATA_PORT);
if (cfgenable) {
pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes,
eax);
} else {
/* Ignore accesses to cfgdata if not enabled by cfgaddr */
if (in)
*eax = 0xffffffff;
}
return (0);
}
INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata);
INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata);
INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata);
INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);
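/*
 * Together these handlers implement PCI configuration mechanism #1.
 * A minimal guest-side sketch of a dword read from bus 0, slot 3,
 * func 1, offset 0x10:
 *
 *   outl(0xCF8, CONF1_ENABLE | (0 << 16) | (3 << 11) | (1 << 8) | 0x10);
 *   val = inl(0xCFC);    -- dispatched via pci_emul_cfgdata -> pci_cfgrw
 */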
#ifdef BHYVE_SNAPSHOT
/*
 * Saves/restores generic PCI device emulated state (MSI/MSI-X state,
 * config space and BARs). The SNAPSHOT_*_OR_LEAVE macros save or
 * restore according to meta->op and jump to 'done' on failure.
 * Returns 0 on success.
 */
static int
pci_snapshot_pci_dev(struct vm_snapshot_meta *meta)
{
struct pci_devinst *pi;
int i;
int ret;
pi = meta->dev_data;
SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.enabled, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.addr, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.msg_data, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.maxmsgnum, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.enabled, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_bar, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_bar, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_offset, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_count, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_offset, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_size, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.function_mask, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_page_offset, meta, ret, done);
SNAPSHOT_BUF_OR_LEAVE(pi->pi_cfgdata, sizeof(pi->pi_cfgdata),
meta, ret, done);
for (i = 0; i < nitems(pi->pi_bar); i++) {
SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].type, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].size, meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].addr, meta, ret, done);
}
/* Restore MSI-X table. */
for (i = 0; i < pi->pi_msix.table_count; i++) {
SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].addr,
meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].msg_data,
meta, ret, done);
SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].vector_control,
meta, ret, done);
}
done:
return (ret);
}
static int
-pci_find_slotted_dev(const char *dev_name, struct pci_devemu **pde,
- struct pci_devinst **pdi)
-{
- struct businfo *bi;
- struct slotinfo *si;
- struct funcinfo *fi;
- int bus, slot, func;
-
- assert(dev_name != NULL);
- assert(pde != NULL);
- assert(pdi != NULL);
-
- for (bus = 0; bus < MAXBUSES; bus++) {
- if ((bi = pci_businfo[bus]) == NULL)
- continue;
-
- for (slot = 0; slot < MAXSLOTS; slot++) {
- si = &bi->slotinfo[slot];
- for (func = 0; func < MAXFUNCS; func++) {
- fi = &si->si_funcs[func];
- if (fi->fi_name == NULL)
- continue;
- if (strcmp(dev_name, fi->fi_name))
- continue;
-
- *pde = pci_emul_finddev(fi->fi_name);
- assert(*pde != NULL);
-
- *pdi = fi->fi_devi;
- return (0);
- }
- }
- }
-
- return (EINVAL);
-}
-
-int
-pci_snapshot(struct vm_snapshot_meta *meta)
+pci_snapshot(struct vm_snapshot_meta *meta, void *cbdata)
{
struct pci_devemu *pde;
- struct pci_devinst *pdi;
+ struct pci_devinst *pdi = (struct pci_devinst *)cbdata;
int ret;
- assert(meta->dev_name != NULL);
-
- ret = pci_find_slotted_dev(meta->dev_name, &pde, &pdi);
- if (ret != 0) {
- fprintf(stderr, "%s: no such name: %s\r\n",
- __func__, meta->dev_name);
- memset(meta->buffer.buf_start, 0, meta->buffer.buf_size);
- return (0);
- }
+ assert(pdi != NULL);
+ pde = pdi->pi_d;
meta->dev_data = pdi;
if (pde->pe_snapshot == NULL) {
fprintf(stderr, "%s: not implemented yet for: %s\r\n",
__func__, meta->dev_name);
return (-1);
}
ret = pci_snapshot_pci_dev(meta);
if (ret != 0) {
fprintf(stderr, "%s: failed to snapshot pci dev\r\n",
__func__);
return (-1);
}
ret = (*pde->pe_snapshot)(meta);
return (ret);
}
-int
-pci_pause(struct vmctx *ctx, const char *dev_name)
+static int
+pci_pause(struct vmctx *ctx, void *cbdata)
{
struct pci_devemu *pde;
- struct pci_devinst *pdi;
- int ret;
+ struct pci_devinst *pdi = (struct pci_devinst *)cbdata;
- assert(dev_name != NULL);
+ assert(pdi != NULL);
- ret = pci_find_slotted_dev(dev_name, &pde, &pdi);
- if (ret != 0) {
- /*
- * It is possible to call this function without
- * checking that the device is inserted first.
- */
- fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name);
- return (0);
- }
+ pde = pdi->pi_d;
if (pde->pe_pause == NULL) {
/* The pause/resume functionality is optional. */
fprintf(stderr, "%s: not implemented for: %s\n",
- __func__, dev_name);
+ __func__, pdi->pi_name);
return (0);
}
return (*pde->pe_pause)(ctx, pdi);
}
-int
-pci_resume(struct vmctx *ctx, const char *dev_name)
+static int
+pci_resume(struct vmctx *ctx, void *cbdata)
{
struct pci_devemu *pde;
- struct pci_devinst *pdi;
- int ret;
+ struct pci_devinst *pdi = (struct pci_devinst *)cbdata;
- assert(dev_name != NULL);
+ assert(pdi != NULL);
- ret = pci_find_slotted_dev(dev_name, &pde, &pdi);
- if (ret != 0) {
- /*
- * It is possible to call this function without
- * checking that the device is inserted first.
- */
- fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name);
- return (0);
- }
+ pde = pdi->pi_d;
if (pde->pe_resume == NULL) {
/* The pause/resume functionality is optional. */
fprintf(stderr, "%s: not implemented for: %s\n",
- __func__, dev_name);
+ __func__, pdi->pi_name);
return (0);
}
return (*pde->pe_resume)(ctx, pdi);
}
#endif
#define PCI_EMUL_TEST
#ifdef PCI_EMUL_TEST
/*
* Define a dummy test device
*/
#define DIOSZ 8
#define DMEMSZ 4096
struct pci_emul_dsoftc {
uint8_t ioregs[DIOSZ];
uint8_t memregs[2][DMEMSZ];
};
#define PCI_EMUL_MSI_MSGS 4
#define PCI_EMUL_MSIX_MSGS 16
static int
pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
int error;
struct pci_emul_dsoftc *sc;
sc = calloc(1, sizeof(struct pci_emul_dsoftc));
pi->pi_arg = sc;
pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001);
pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD);
pci_set_cfgdata8(pi, PCIR_CLASS, 0x02);
error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS);
assert(error == 0);
error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ);
assert(error == 0);
error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ);
assert(error == 0);
error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ);
assert(error == 0);
return (0);
}
static void
pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
uint64_t offset, int size, uint64_t value)
{
int i;
struct pci_emul_dsoftc *sc = pi->pi_arg;
if (baridx == 0) {
if (offset + size > DIOSZ) {
printf("diow: iow too large, offset %ld size %d\n",
offset, size);
return;
}
if (size == 1) {
sc->ioregs[offset] = value & 0xff;
} else if (size == 2) {
*(uint16_t *)&sc->ioregs[offset] = value & 0xffff;
} else if (size == 4) {
*(uint32_t *)&sc->ioregs[offset] = value;
} else {
printf("diow: iow unknown size %d\n", size);
}
/*
* Special magic value to generate an interrupt
*/
if (offset == 4 && size == 4 && pci_msi_enabled(pi))
pci_generate_msi(pi, value % pci_msi_maxmsgnum(pi));
if (value == 0xabcdef) {
for (i = 0; i < pci_msi_maxmsgnum(pi); i++)
pci_generate_msi(pi, i);
}
}
if (baridx == 1 || baridx == 2) {
if (offset + size > DMEMSZ) {
printf("diow: memw too large, offset %ld size %d\n",
offset, size);
return;
}
i = baridx - 1; /* 'memregs' index */
if (size == 1) {
sc->memregs[i][offset] = value;
} else if (size == 2) {
*(uint16_t *)&sc->memregs[i][offset] = value;
} else if (size == 4) {
*(uint32_t *)&sc->memregs[i][offset] = value;
} else if (size == 8) {
*(uint64_t *)&sc->memregs[i][offset] = value;
} else {
printf("diow: memw unknown size %d\n", size);
}
/*
* XXX no magic interrupt for memory BAR writes yet
*/
}
if (baridx > 2 || baridx < 0) {
printf("diow: unknown bar idx %d\n", baridx);
}
}
static uint64_t
pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
uint64_t offset, int size)
{
struct pci_emul_dsoftc *sc = pi->pi_arg;
uint32_t value = 0;	/* default for unhandled sizes */
int i;
if (baridx == 0) {
if (offset + size > DIOSZ) {
printf("dior: ior too large, offset %ld size %d\n",
offset, size);
return (0);
}
value = 0;
if (size == 1) {
value = sc->ioregs[offset];
} else if (size == 2) {
value = *(uint16_t *) &sc->ioregs[offset];
} else if (size == 4) {
value = *(uint32_t *) &sc->ioregs[offset];
} else {
printf("dior: ior unknown size %d\n", size);
}
}
if (baridx == 1 || baridx == 2) {
if (offset + size > DMEMSZ) {
printf("dior: memr too large, offset %ld size %d\n",
offset, size);
return (0);
}
i = baridx - 1; /* 'memregs' index */
if (size == 1) {
value = sc->memregs[i][offset];
} else if (size == 2) {
value = *(uint16_t *) &sc->memregs[i][offset];
} else if (size == 4) {
value = *(uint32_t *) &sc->memregs[i][offset];
} else if (size == 8) {
value = *(uint64_t *) &sc->memregs[i][offset];
} else {
printf("dior: ior unknown size %d\n", size);
}
}
if (baridx > 2 || baridx < 0) {
printf("dior: unknown bar idx %d\n", baridx);
return (0);
}
return (value);
}
#ifdef BHYVE_SNAPSHOT
-int
+static int
pci_emul_snapshot(struct vm_snapshot_meta *meta)
{
-
return (0);
}
#endif
struct pci_devemu pci_dummy = {
.pe_emu = "dummy",
.pe_init = pci_emul_dinit,
.pe_barwrite = pci_emul_diow,
.pe_barread = pci_emul_dior,
#ifdef BHYVE_SNAPSHOT
.pe_snapshot = pci_emul_snapshot,
#endif
};
PCI_EMUL_SET(pci_dummy);
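/*
 * Usage sketch (slot number hypothetical): the test device attaches
 * from the bhyve command line with "-s 4,dummy" and exercises BAR 0
 * I/O plus the two memory BARs defined above.
 */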
#endif /* PCI_EMUL_TEST */
diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h
index 3e9e95a74b47..b8a8506b673b 100644
--- a/usr.sbin/bhyve/pci_emul.h
+++ b/usr.sbin/bhyve/pci_emul.h
@@ -1,299 +1,294 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _PCI_EMUL_H_
#define _PCI_EMUL_H_
#include <sys/types.h>
#include <sys/queue.h>
#include <sys/kernel.h>
#include <sys/_pthreadtypes.h>
#include <dev/pci/pcireg.h>
#include <assert.h>
#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */
struct vmctx;
struct pci_devinst;
struct memory_region;
struct vm_snapshot_meta;
struct pci_devemu {
char *pe_emu; /* Name of device emulation */
/* instance creation */
int (*pe_init)(struct vmctx *, struct pci_devinst *,
char *opts);
/* ACPI DSDT enumeration */
void (*pe_write_dsdt)(struct pci_devinst *);
/* config space read/write callbacks */
int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int offset,
int bytes, uint32_t val);
int (*pe_cfgread)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int offset,
int bytes, uint32_t *retval);
/* BAR read/write callbacks */
void (*pe_barwrite)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int baridx,
uint64_t offset, int size, uint64_t value);
uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int baridx,
uint64_t offset, int size);
/* Save/restore device state */
int (*pe_snapshot)(struct vm_snapshot_meta *meta);
int (*pe_pause)(struct vmctx *ctx, struct pci_devinst *pi);
int (*pe_resume)(struct vmctx *ctx, struct pci_devinst *pi);
};
#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x);
enum pcibar_type {
PCIBAR_NONE,
PCIBAR_IO,
PCIBAR_MEM32,
PCIBAR_MEM64,
PCIBAR_MEMHI64
};
struct pcibar {
enum pcibar_type type; /* io or memory */
uint64_t size;
uint64_t addr;
};
#define PI_NAMESZ 40
struct msix_table_entry {
uint64_t addr;
uint32_t msg_data;
uint32_t vector_control;
} __packed;
/*
* In case the structure is modified to hold extra information, use a define
* for the size that should be emulated.
*/
#define MSIX_TABLE_ENTRY_SIZE 16
#define MAX_MSIX_TABLE_ENTRIES 2048
#define PBA_SIZE(msgnum) (roundup2((msgnum), 64) / 8)
enum lintr_stat {
IDLE,
ASSERTED,
PENDING
};
struct pci_devinst {
struct pci_devemu *pi_d;
struct vmctx *pi_vmctx;
uint8_t pi_bus, pi_slot, pi_func;
char pi_name[PI_NAMESZ];
int pi_bar_getsize;
int pi_prevcap;
int pi_capend;
struct {
int8_t pin;
enum lintr_stat state;
int pirq_pin;
int ioapic_irq;
pthread_mutex_t lock;
} pi_lintr;
struct {
int enabled;
uint64_t addr;
uint64_t msg_data;
int maxmsgnum;
} pi_msi;
struct {
int enabled;
int table_bar;
int pba_bar;
uint32_t table_offset;
int table_count;
uint32_t pba_offset;
int pba_size;
int function_mask;
struct msix_table_entry *table; /* allocated at runtime */
void *pba_page;
int pba_page_offset;
} pi_msix;
void *pi_arg; /* devemu-private data */
u_char pi_cfgdata[PCI_REGMAX + 1];
struct pcibar pi_bar[PCI_BARMAX + 1];
};
struct msicap {
uint8_t capid;
uint8_t nextptr;
uint16_t msgctrl;
uint32_t addrlo;
uint32_t addrhi;
uint16_t msgdata;
} __packed;
static_assert(sizeof(struct msicap) == 14, "compile-time assertion failed");
struct msixcap {
uint8_t capid;
uint8_t nextptr;
uint16_t msgctrl;
uint32_t table_info; /* bar index and offset within it */
uint32_t pba_info; /* bar index and offset within it */
} __packed;
static_assert(sizeof(struct msixcap) == 12, "compile-time assertion failed");
struct pciecap {
uint8_t capid;
uint8_t nextptr;
uint16_t pcie_capabilities;
uint32_t dev_capabilities; /* all devices */
uint16_t dev_control;
uint16_t dev_status;
uint32_t link_capabilities; /* devices with links */
uint16_t link_control;
uint16_t link_status;
uint32_t slot_capabilities; /* ports with slots */
uint16_t slot_control;
uint16_t slot_status;
uint16_t root_control; /* root ports */
uint16_t root_capabilities;
uint32_t root_status;
uint32_t dev_capabilities2; /* all devices */
uint16_t dev_control2;
uint16_t dev_status2;
uint32_t link_capabilities2; /* devices with links */
uint16_t link_control2;
uint16_t link_status2;
uint32_t slot_capabilities2; /* ports with slots */
uint16_t slot_control2;
uint16_t slot_status2;
} __packed;
static_assert(sizeof(struct pciecap) == 60, "compile-time assertion failed");
typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin,
int ioapic_irq, void *arg);
int init_pci(struct vmctx *ctx);
void pci_callback(void);
int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx,
enum pcibar_type type, uint64_t size);
int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type);
void pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes,
uint32_t val, uint8_t capoff, int capid);
void pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old);
void pci_generate_msi(struct pci_devinst *pi, int msgnum);
void pci_generate_msix(struct pci_devinst *pi, int msgnum);
void pci_lintr_assert(struct pci_devinst *pi);
void pci_lintr_deassert(struct pci_devinst *pi);
void pci_lintr_request(struct pci_devinst *pi);
int pci_msi_enabled(struct pci_devinst *pi);
int pci_msix_enabled(struct pci_devinst *pi);
int pci_msix_table_bar(struct pci_devinst *pi);
int pci_msix_pba_bar(struct pci_devinst *pi);
int pci_msi_maxmsgnum(struct pci_devinst *pi);
int pci_parse_slot(char *opt);
void pci_print_supported_devices(void);
void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr);
int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum);
int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size,
uint64_t value);
uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size);
int pci_count_lintr(int bus);
void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg);
void pci_write_dsdt(void);
uint64_t pci_ecfg_base(void);
int pci_bus_configured(int bus);
-#ifdef BHYVE_SNAPSHOT
-int pci_snapshot(struct vm_snapshot_meta *meta);
-int pci_pause(struct vmctx *ctx, const char *dev_name);
-int pci_resume(struct vmctx *ctx, const char *dev_name);
-#endif
static __inline void
pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val)
{
assert(offset <= PCI_REGMAX);
*(uint8_t *)(pi->pi_cfgdata + offset) = val;
}
static __inline void
pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val)
{
assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
*(uint16_t *)(pi->pi_cfgdata + offset) = val;
}
static __inline void
pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val)
{
assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
*(uint32_t *)(pi->pi_cfgdata + offset) = val;
}
static __inline uint8_t
pci_get_cfgdata8(struct pci_devinst *pi, int offset)
{
assert(offset <= PCI_REGMAX);
return (*(uint8_t *)(pi->pi_cfgdata + offset));
}
static __inline uint16_t
pci_get_cfgdata16(struct pci_devinst *pi, int offset)
{
assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
return (*(uint16_t *)(pi->pi_cfgdata + offset));
}
static __inline uint32_t
pci_get_cfgdata32(struct pci_devinst *pi, int offset)
{
assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
return (*(uint32_t *)(pi->pi_cfgdata + offset));
}
#endif /* _PCI_EMUL_H_ */
diff --git a/usr.sbin/bhyve/pci_hostbridge.c b/usr.sbin/bhyve/pci_hostbridge.c
index 559496a9fee0..e2611900ed61 100644
--- a/usr.sbin/bhyve/pci_hostbridge.c
+++ b/usr.sbin/bhyve/pci_hostbridge.c
@@ -1,72 +1,86 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "pci_emul.h"
static int
pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
/* config space */
pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */
pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */
pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST);
pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_PORT);
return (0);
}
static int
pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
(void) pci_hostbridge_init(ctx, pi, opts);
pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1022); /* AMD */
pci_set_cfgdata16(pi, PCIR_DEVICE, 0x7432); /* made up */
return (0);
}
+#ifdef BHYVE_SNAPSHOT
+static int
+pci_hostbridge_snapshot(struct vm_snapshot_meta *meta)
+{
+ return (0);
+}
+#endif
+
struct pci_devemu pci_de_amd_hostbridge = {
.pe_emu = "amd_hostbridge",
.pe_init = pci_amd_hostbridge_init,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = pci_hostbridge_snapshot,
+#endif
};
PCI_EMUL_SET(pci_de_amd_hostbridge);
struct pci_devemu pci_de_hostbridge = {
.pe_emu = "hostbridge",
.pe_init = pci_hostbridge_init,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = pci_hostbridge_snapshot,
+#endif
};
PCI_EMUL_SET(pci_de_hostbridge);
diff --git a/usr.sbin/bhyve/snapshot.c b/usr.sbin/bhyve/snapshot.c
index 0ad03afd85c0..9a893625f335 100644
--- a/usr.sbin/bhyve/snapshot.c
+++ b/usr.sbin/bhyve/snapshot.c
@@ -1,1743 +1,1748 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2016 Flavius Anton
* Copyright (c) 2016 Mihai Tiganus
* Copyright (c) 2016-2019 Mihai Carabas
* Copyright (c) 2017-2019 Darius Mihai
* Copyright (c) 2017-2019 Elena Mihailescu
* Copyright (c) 2018-2019 Sergiu Weisz
* All rights reserved.
* The bhyve-snapshot feature was developed under sponsorships
* from Matthew Grooms.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/un.h>
#include <machine/atomic.h>
#include <machine/segments.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <libgen.h>
#include <signal.h>
#include <unistd.h>
#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <sysexits.h>
#include <stdbool.h>
#include <sys/ioctl.h>
#include <machine/vmm.h>
#ifndef WITHOUT_CAPSICUM
#include <machine/vmm_dev.h>
#endif
#include <machine/vmm_snapshot.h>
#include <vmmapi.h>
#include "bhyverun.h"
#include "acpi.h"
#include "atkbdc.h"
#include "inout.h"
#include "dbgport.h"
#include "fwctl.h"
#include "ioapic.h"
#include "mem.h"
#include "mevent.h"
#include "mptbl.h"
#include "pci_emul.h"
#include "pci_irq.h"
#include "pci_lpc.h"
#include "smbiostbl.h"
#include "snapshot.h"
#include "xmsr.h"
#include "spinup_ap.h"
#include "rtc.h"
#include <libxo/xo.h>
#include <ucl.h>
struct spinner_info {
const size_t *crtval;
const size_t maxval;
const size_t total;
};
extern int guest_ncpus;
static struct winsize winsize;
static sig_t old_winch_handler;
#define KB (1024UL)
#define MB (1024UL * KB)
#define GB (1024UL * MB)
#define SNAPSHOT_CHUNK (4 * MB)
#define PROG_BUF_SZ (8192)
#define BHYVE_RUN_DIR "/var/run/bhyve"
#define CHECKPOINT_RUN_DIR BHYVE_RUN_DIR "/checkpoint"
#define MAX_VMNAME 100
#define MAX_MSG_SIZE 1024
#define SNAPSHOT_BUFFER_SIZE (20 * MB)
-#define JSON_STRUCT_ARR_KEY "structs"
+#define JSON_KERNEL_ARR_KEY "kern_structs"
#define JSON_DEV_ARR_KEY "devices"
-#define JSON_BASIC_METADATA_KEY "basic metadata"
-#define JSON_SNAPSHOT_REQ_KEY "snapshot_req"
+#define JSON_BASIC_METADATA_KEY "basic metadata"
+#define JSON_SNAPSHOT_REQ_KEY "device"
#define JSON_SIZE_KEY "size"
-#define JSON_FILE_OFFSET_KEY "file_offset"
+#define JSON_FILE_OFFSET_KEY "file_offset"
#define JSON_NCPUS_KEY "ncpus"
#define JSON_VMNAME_KEY "vmname"
#define JSON_MEMSIZE_KEY "memsize"
#define JSON_MEMFLAGS_KEY "memflags"
#define min(a,b) \
({ \
__typeof__ (a) _a = (a); \
__typeof__ (b) _b = (b); \
_a < _b ? _a : _b; \
})
-const struct vm_snapshot_dev_info snapshot_devs[] = {
- { "atkbdc", atkbdc_snapshot, NULL, NULL },
- { "virtio-net", pci_snapshot, pci_pause, pci_resume },
- { "virtio-blk", pci_snapshot, pci_pause, pci_resume },
- { "virtio-rnd", pci_snapshot, NULL, NULL },
- { "lpc", pci_snapshot, NULL, NULL },
- { "fbuf", pci_snapshot, NULL, NULL },
- { "xhci", pci_snapshot, NULL, NULL },
- { "e1000", pci_snapshot, NULL, NULL },
- { "ahci", pci_snapshot, pci_pause, pci_resume },
- { "ahci-hd", pci_snapshot, pci_pause, pci_resume },
- { "ahci-cd", pci_snapshot, pci_pause, pci_resume },
-};
-
const struct vm_snapshot_kern_info snapshot_kern_structs[] = {
{ "vhpet", STRUCT_VHPET },
{ "vm", STRUCT_VM },
{ "vmx", STRUCT_VMX },
{ "vioapic", STRUCT_VIOAPIC },
{ "vlapic", STRUCT_VLAPIC },
{ "vmcx", STRUCT_VMCX },
{ "vatpit", STRUCT_VATPIT },
{ "vatpic", STRUCT_VATPIC },
{ "vpmtmr", STRUCT_VPMTMR },
{ "vrtc", STRUCT_VRTC },
};
static cpuset_t vcpus_active, vcpus_suspended;
static pthread_mutex_t vcpu_lock;
static pthread_cond_t vcpus_idle, vcpus_can_run;
static bool checkpoint_active;
+struct snapshot_dev {
+ LIST_ENTRY(snapshot_dev) dev_link;
+ const char *dev_name;
+ struct snapshot_ops *dev_ops;
+ void *dev_cbdata;
+};
+
+static LIST_HEAD(, snapshot_dev) snapshot_devices;
+
+void
+register_snapshot_dev(const char *name, struct snapshot_ops *ops,
+ void *cbdata)
+{
+ struct snapshot_dev *dev;
+
+ assert(ops != NULL && ops->snapshot_cb != NULL);
+
+ dev = calloc(1, sizeof(struct snapshot_dev));
+ assert(dev != NULL);
+
+ dev->dev_name = name;
+ dev->dev_ops = ops;
+ dev->dev_cbdata = cbdata;
+ LIST_INSERT_HEAD(&snapshot_devices, dev, dev_link);
+}
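+/*
+ * Minimal usage sketch, mirroring the call added in init_pci(): each
+ * emulated device registers its callbacks once at init time, e.g.
+ *
+ *   register_snapshot_dev(pi->pi_name, &pci_snapshot_ops, pi);
+ *
+ * after which the snapshot code can walk 'snapshot_devices' instead
+ * of the old static snapshot_devs[] table.
+ */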
+
/*
* TODO: Harden this function and all of its callers since 'base_str' is a user
* provided string.
*/
static char *
strcat_extension(const char *base_str, const char *ext)
{
char *res;
size_t base_len, ext_len;
base_len = strnlen(base_str, MAX_VMNAME);
ext_len = strnlen(ext, MAX_VMNAME);
if (base_len + ext_len > MAX_VMNAME) {
fprintf(stderr, "Filename exceeds maximum length.\n");
return (NULL);
}
res = malloc(base_len + ext_len + 1);
if (res == NULL) {
perror("Failed to allocate memory.");
return (NULL);
}
memcpy(res, base_str, base_len);
memcpy(res + base_len, ext, ext_len);
res[base_len + ext_len] = 0;
return (res);
}
void
destroy_restore_state(struct restore_state *rstate)
{
if (rstate == NULL) {
fprintf(stderr, "Attempting to destroy NULL restore struct.\n");
return;
}
if (rstate->kdata_map != MAP_FAILED)
munmap(rstate->kdata_map, rstate->kdata_len);
if (rstate->kdata_fd > 0)
close(rstate->kdata_fd);
if (rstate->vmmem_fd > 0)
close(rstate->vmmem_fd);
if (rstate->meta_root_obj != NULL)
ucl_object_unref(rstate->meta_root_obj);
if (rstate->meta_parser != NULL)
ucl_parser_free(rstate->meta_parser);
}
static int
load_vmmem_file(const char *filename, struct restore_state *rstate)
{
struct stat sb;
int err;
rstate->vmmem_fd = open(filename, O_RDONLY);
if (rstate->vmmem_fd < 0) {
perror("Failed to open restore file");
return (-1);
}
err = fstat(rstate->vmmem_fd, &sb);
if (err < 0) {
perror("Failed to stat restore file");
goto err_load_vmmem;
}
if (sb.st_size == 0) {
fprintf(stderr, "Restore file is empty.\n");
goto err_load_vmmem;
}
rstate->vmmem_len = sb.st_size;
return (0);
err_load_vmmem:
if (rstate->vmmem_fd > 0)
close(rstate->vmmem_fd);
return (-1);
}
static int
load_kdata_file(const char *filename, struct restore_state *rstate)
{
struct stat sb;
int err;
rstate->kdata_fd = open(filename, O_RDONLY);
if (rstate->kdata_fd < 0) {
perror("Failed to open kernel data file");
return (-1);
}
err = fstat(rstate->kdata_fd, &sb);
if (err < 0) {
perror("Failed to stat kernel data file");
goto err_load_kdata;
}
if (sb.st_size == 0) {
fprintf(stderr, "Kernel data file is empty.\n");
goto err_load_kdata;
}
rstate->kdata_len = sb.st_size;
rstate->kdata_map = mmap(NULL, rstate->kdata_len, PROT_READ,
MAP_SHARED, rstate->kdata_fd, 0);
if (rstate->kdata_map == MAP_FAILED) {
perror("Failed to map restore file");
goto err_load_kdata;
}
return (0);
err_load_kdata:
if (rstate->kdata_fd > 0)
close(rstate->kdata_fd);
return (-1);
}
static int
load_metadata_file(const char *filename, struct restore_state *rstate)
{
const ucl_object_t *obj;
struct ucl_parser *parser;
int err;
parser = ucl_parser_new(UCL_PARSER_DEFAULT);
if (parser == NULL) {
fprintf(stderr, "Failed to initialize UCL parser.\n");
goto err_load_metadata;
}
err = ucl_parser_add_file(parser, filename);
if (err == 0) {
fprintf(stderr, "Failed to parse metadata file: '%s'\n",
filename);
err = -1;
goto err_load_metadata;
}
obj = ucl_parser_get_object(parser);
if (obj == NULL) {
fprintf(stderr, "Failed to parse object.\n");
err = -1;
goto err_load_metadata;
}
rstate->meta_parser = parser;
rstate->meta_root_obj = (ucl_object_t *)obj;
return (0);
err_load_metadata:
if (parser != NULL)
ucl_parser_free(parser);
return (err);
}
int
load_restore_file(const char *filename, struct restore_state *rstate)
{
int err = 0;
char *kdata_filename = NULL, *meta_filename = NULL;
assert(filename != NULL);
assert(rstate != NULL);
memset(rstate, 0, sizeof(*rstate));
rstate->kdata_map = MAP_FAILED;
err = load_vmmem_file(filename, rstate);
if (err != 0) {
fprintf(stderr, "Failed to load guest RAM file.\n");
goto err_restore;
}
kdata_filename = strcat_extension(filename, ".kern");
if (kdata_filename == NULL) {
fprintf(stderr, "Failed to construct kernel data filename.\n");
goto err_restore;
}
err = load_kdata_file(kdata_filename, rstate);
if (err != 0) {
fprintf(stderr, "Failed to load guest kernel data file.\n");
goto err_restore;
}
meta_filename = strcat_extension(filename, ".meta");
if (meta_filename == NULL) {
fprintf(stderr, "Failed to construct kernel metadata filename.\n");
goto err_restore;
}
err = load_metadata_file(meta_filename, rstate);
if (err != 0) {
fprintf(stderr, "Failed to load guest metadata file.\n");
goto err_restore;
}
return (0);
err_restore:
destroy_restore_state(rstate);
if (kdata_filename != NULL)
free(kdata_filename);
if (meta_filename != NULL)
free(meta_filename);
return (-1);
}
#define JSON_GET_INT_OR_RETURN(key, obj, result_ptr, ret) \
do { \
const ucl_object_t *obj__; \
obj__ = ucl_object_lookup(obj, key); \
if (obj__ == NULL) { \
fprintf(stderr, "Missing key: '%s'", key); \
return (ret); \
} \
if (!ucl_object_toint_safe(obj__, result_ptr)) { \
fprintf(stderr, "Cannot convert '%s' value to int.", key); \
return (ret); \
} \
} while(0)
#define JSON_GET_STRING_OR_RETURN(key, obj, result_ptr, ret) \
do { \
const ucl_object_t *obj__; \
obj__ = ucl_object_lookup(obj, key); \
if (obj__ == NULL) { \
fprintf(stderr, "Missing key: '%s'", key); \
return (ret); \
} \
if (!ucl_object_tostring_safe(obj__, result_ptr)) { \
fprintf(stderr, "Cannot convert '%s' value to string.", key); \
return (ret); \
} \
} while(0)
static void *
lookup_struct(enum snapshot_req struct_id, struct restore_state *rstate,
size_t *struct_size)
{
const ucl_object_t *structs = NULL, *obj = NULL;
ucl_object_iter_t it = NULL;
int64_t snapshot_req, size, file_offset;
- structs = ucl_object_lookup(rstate->meta_root_obj, JSON_STRUCT_ARR_KEY);
+ structs = ucl_object_lookup(rstate->meta_root_obj, JSON_KERNEL_ARR_KEY);
if (structs == NULL) {
fprintf(stderr, "Failed to find '%s' object.\n",
- JSON_STRUCT_ARR_KEY);
+ JSON_KERNEL_ARR_KEY);
return (NULL);
}
if (ucl_object_type((ucl_object_t *)structs) != UCL_ARRAY) {
fprintf(stderr, "Object '%s' is not an array.\n",
- JSON_STRUCT_ARR_KEY);
+ JSON_KERNEL_ARR_KEY);
return (NULL);
}
while ((obj = ucl_object_iterate(structs, &it, true)) != NULL) {
snapshot_req = -1;
JSON_GET_INT_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj,
&snapshot_req, NULL);
assert(snapshot_req >= 0);
if ((enum snapshot_req) snapshot_req == struct_id) {
JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj,
&size, NULL);
assert(size >= 0);
JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj,
&file_offset, NULL);
assert(file_offset >= 0);
assert(file_offset + size <= rstate->kdata_len);
*struct_size = (size_t)size;
return (rstate->kdata_map + file_offset);
}
}
return (NULL);
}
static void *
lookup_check_dev(const char *dev_name, struct restore_state *rstate,
const ucl_object_t *obj, size_t *data_size)
{
const char *snapshot_req;
int64_t size, file_offset;
snapshot_req = NULL;
JSON_GET_STRING_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj,
&snapshot_req, NULL);
assert(snapshot_req != NULL);
if (!strcmp(snapshot_req, dev_name)) {
JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj,
&size, NULL);
assert(size >= 0);
JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj,
&file_offset, NULL);
assert(file_offset >= 0);
assert(file_offset + size <= rstate->kdata_len);
*data_size = (size_t)size;
return (rstate->kdata_map + file_offset);
}
return (NULL);
}
static void*
lookup_dev(const char *dev_name, struct restore_state *rstate,
size_t *data_size)
{
const ucl_object_t *devs = NULL, *obj = NULL;
ucl_object_iter_t it = NULL;
void *ret;
devs = ucl_object_lookup(rstate->meta_root_obj, JSON_DEV_ARR_KEY);
if (devs == NULL) {
fprintf(stderr, "Failed to find '%s' object.\n",
JSON_DEV_ARR_KEY);
return (NULL);
}
if (ucl_object_type((ucl_object_t *)devs) != UCL_ARRAY) {
fprintf(stderr, "Object '%s' is not an array.\n",
JSON_DEV_ARR_KEY);
return (NULL);
}
while ((obj = ucl_object_iterate(devs, &it, true)) != NULL) {
ret = lookup_check_dev(dev_name, rstate, obj, data_size);
if (ret != NULL)
return (ret);
}
return (NULL);
}
static const ucl_object_t *
lookup_basic_metadata_object(struct restore_state *rstate)
{
const ucl_object_t *basic_meta_obj = NULL;
basic_meta_obj = ucl_object_lookup(rstate->meta_root_obj,
JSON_BASIC_METADATA_KEY);
if (basic_meta_obj == NULL) {
fprintf(stderr, "Failed to find '%s' object.\n",
JSON_BASIC_METADATA_KEY);
return (NULL);
}
if (ucl_object_type((ucl_object_t *)basic_meta_obj) != UCL_OBJECT) {
fprintf(stderr, "Object '%s' is not a JSON object.\n",
JSON_BASIC_METADATA_KEY);
return (NULL);
}
return (basic_meta_obj);
}
const char *
lookup_vmname(struct restore_state *rstate)
{
const char *vmname;
const ucl_object_t *obj;
obj = lookup_basic_metadata_object(rstate);
if (obj == NULL)
return (NULL);
JSON_GET_STRING_OR_RETURN(JSON_VMNAME_KEY, obj, &vmname, NULL);
return (vmname);
}
int
lookup_memflags(struct restore_state *rstate)
{
int64_t memflags;
const ucl_object_t *obj;
obj = lookup_basic_metadata_object(rstate);
if (obj == NULL)
return (0);
JSON_GET_INT_OR_RETURN(JSON_MEMFLAGS_KEY, obj, &memflags, 0);
return ((int)memflags);
}
size_t
lookup_memsize(struct restore_state *rstate)
{
int64_t memsize;
const ucl_object_t *obj;
obj = lookup_basic_metadata_object(rstate);
if (obj == NULL)
return (0);
JSON_GET_INT_OR_RETURN(JSON_MEMSIZE_KEY, obj, &memsize, 0);
if (memsize < 0)
memsize = 0;
return ((size_t)memsize);
}
int
lookup_guest_ncpus(struct restore_state *rstate)
{
int64_t ncpus;
const ucl_object_t *obj;
obj = lookup_basic_metadata_object(rstate);
if (obj == NULL)
return (0);
JSON_GET_INT_OR_RETURN(JSON_NCPUS_KEY, obj, &ncpus, 0);
return ((int)ncpus);
}
static void
winch_handler(int signal __unused)
{
#ifdef TIOCGWINSZ
ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize);
#endif /* TIOCGWINSZ */
}
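/*
 * Render a single-line progress bar of the form
 *   [1.500GiB / 4.000GiB] |#########/___________|
 * sized to the current terminal width and redrawn in place via '\r'.
 */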
static int
print_progress(size_t crtval, const size_t maxval)
{
size_t rc;
double crtval_gb, maxval_gb;
size_t i, win_width, prog_start, prog_done, prog_end;
int mval_len;
static char prog_buf[PROG_BUF_SZ];
static const size_t len = sizeof(prog_buf);
static size_t div;
static char *div_str;
static char wip_bar[] = { '/', '-', '\\', '|' };
static int wip_idx = 0;
if (maxval == 0) {
printf("[0B / 0B]\r\n");
return (0);
}
if (crtval > maxval)
crtval = maxval;
if (maxval > 10 * GB) {
div = GB;
div_str = "GiB";
} else if (maxval > 10 * MB) {
div = MB;
div_str = "MiB";
} else {
div = KB;
div_str = "KiB";
}
crtval_gb = (double) crtval / div;
maxval_gb = (double) maxval / div;
rc = snprintf(prog_buf, len, "%.03lf", maxval_gb);
if (rc >= len) {
fprintf(stderr, "Maxval too big\n");
return (-1);
}
mval_len = rc;
rc = snprintf(prog_buf, len, "\r[%*.03lf%s / %.03lf%s] |",
mval_len, crtval_gb, div_str, maxval_gb, div_str);
if (rc >= len) {
fprintf(stderr, "Buffer too small to print progress\n");
return (-1);
}
win_width = min(winsize.ws_col, len);
prog_start = rc;
if (prog_start < (win_width - 2)) {
prog_end = win_width - prog_start - 2;
prog_done = prog_end * (crtval_gb / maxval_gb);
for (i = prog_start; i < prog_start + prog_done; i++)
prog_buf[i] = '#';
if (crtval != maxval) {
prog_buf[i] = wip_bar[wip_idx];
wip_idx = (wip_idx + 1) % sizeof(wip_bar);
i++;
} else {
prog_buf[i++] = '#';
}
for (; i < win_width - 2; i++)
prog_buf[i] = '_';
prog_buf[win_width - 2] = '|';
}
prog_buf[win_width - 1] = '\0';
write(STDOUT_FILENO, prog_buf, win_width);
return (0);
}
static void *
snapshot_spinner_cb(void *arg)
{
int rc;
size_t crtval, maxval, total;
struct spinner_info *si;
struct timespec ts;
si = arg;
if (si == NULL)
pthread_exit(NULL);
ts.tv_sec = 0;
ts.tv_nsec = 50 * 1000 * 1000; /* 50 ms sleep time */
do {
crtval = *si->crtval;
maxval = si->maxval;
total = si->total;
rc = print_progress(crtval, total);
if (rc < 0) {
fprintf(stderr, "Failed to print progress\n");
break;
}
nanosleep(&ts, NULL);
} while (crtval < maxval);
pthread_exit(NULL);
return (NULL);
}
static int
vm_snapshot_mem_part(const int snapfd, const size_t foff, void *src,
const size_t len, const size_t totalmem, const bool op_wr)
{
int rc;
size_t part_done, todo, rem;
ssize_t done;
bool show_progress;
pthread_t spinner_th;
struct spinner_info si;
if (lseek(snapfd, foff, SEEK_SET) < 0) {
perror("Failed to change file offset");
return (-1);
}
show_progress = false;
if (isatty(STDIN_FILENO) && (winsize.ws_col != 0))
show_progress = true;
part_done = foff;
rem = len;
if (show_progress) {
/* si must outlive this block; the spinner thread keeps reading it. */
si.crtval = &part_done;
si.maxval = foff + len;
si.total = totalmem;
rc = pthread_create(&spinner_th, 0, snapshot_spinner_cb, &si);
if (rc) {
perror("Unable to create spinner thread");
show_progress = false;
}
}
while (rem > 0) {
if (show_progress)
todo = min(SNAPSHOT_CHUNK, rem);
else
todo = rem;
if (op_wr)
done = write(snapfd, src, todo);
else
done = read(snapfd, src, todo);
if (done < 0) {
perror(op_wr ? "Failed to write snapshot file" :
    "Failed to read snapshot file");
return (-1);
}
src += done;
part_done += done;
rem -= done;
}
if (show_progress) {
rc = pthread_join(spinner_th, NULL);
if (rc)
perror("Unable to end spinner thread");
}
return (0);
}
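/*
 * Copy guest memory to or from the snapshot file.  Guest RAM consists of a
 * low segment below 4 GiB and, for larger guests, a high segment starting at
 * 4 GiB in the guest physical address space; the two are stored back to back
 * in the file, so highmem's file offset equals lowmem's size.
 */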
static size_t
vm_snapshot_mem(struct vmctx *ctx, int snapfd, size_t memsz, const bool op_wr)
{
int ret;
size_t lowmem, highmem, totalmem;
char *baseaddr;
ret = vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem, &highmem);
if (ret) {
fprintf(stderr, "%s: unable to retrieve guest memory size\r\n",
__func__);
return (0);
}
totalmem = lowmem + highmem;
if ((op_wr == false) && (totalmem != memsz)) {
fprintf(stderr, "%s: mem size mismatch: %zu vs %zu\r\n",
__func__, totalmem, memsz);
return (0);
}
winsize.ws_col = 80;
#ifdef TIOCGWINSZ
ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize);
#endif /* TIOCGWINSZ */
old_winch_handler = signal(SIGWINCH, winch_handler);
ret = vm_snapshot_mem_part(snapfd, 0, baseaddr, lowmem,
totalmem, op_wr);
if (ret) {
fprintf(stderr, "%s: Could not %s lowmem\r\n",
__func__, op_wr ? "write" : "read");
totalmem = 0;
goto done;
}
if (highmem == 0)
goto done;
ret = vm_snapshot_mem_part(snapfd, lowmem, baseaddr + 4*GB,
highmem, totalmem, op_wr);
if (ret) {
fprintf(stderr, "%s: Could not %s highmem\r\n",
__func__, op_wr ? "write" : "read");
totalmem = 0;
goto done;
}
done:
printf("\r\n");
signal(SIGWINCH, old_winch_handler);
return (totalmem);
}
int
restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate)
{
size_t restored;
restored = vm_snapshot_mem(ctx, rstate->vmmem_fd, rstate->vmmem_len,
false);
if (restored != rstate->vmmem_len)
return (-1);
return (0);
}
static int
vm_restore_kern_struct(struct vmctx *ctx, struct restore_state *rstate,
const struct vm_snapshot_kern_info *info)
{
void *struct_ptr;
size_t struct_size;
int ret;
struct vm_snapshot_meta *meta;
struct_ptr = lookup_struct(info->req, rstate, &struct_size);
if (struct_ptr == NULL) {
fprintf(stderr, "%s: Failed to lookup struct %s\r\n",
__func__, info->struct_name);
ret = -1;
goto done;
}
if (struct_size == 0) {
fprintf(stderr, "%s: Kernel struct size was 0 for: %s\r\n",
__func__, info->struct_name);
ret = -1;
goto done;
}
meta = &(struct vm_snapshot_meta) {
.ctx = ctx,
.dev_name = info->struct_name,
.dev_req = info->req,
.buffer.buf_start = struct_ptr,
.buffer.buf_size = struct_size,
.buffer.buf = struct_ptr,
.buffer.buf_rem = struct_size,
.op = VM_SNAPSHOT_RESTORE,
};
ret = vm_snapshot_req(meta);
if (ret != 0) {
fprintf(stderr, "%s: Failed to restore struct: %s\r\n",
__func__, info->struct_name);
goto done;
}
done:
return (ret);
}
int
vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate)
{
int ret;
int i;
for (i = 0; i < nitems(snapshot_kern_structs); i++) {
ret = vm_restore_kern_struct(ctx, rstate,
&snapshot_kern_structs[i]);
if (ret != 0)
return (ret);
}
return (0);
}
int
-vm_restore_user_dev(struct vmctx *ctx, struct restore_state *rstate,
- const struct vm_snapshot_dev_info *info)
+vm_restore_device(struct vmctx *ctx, struct restore_state *rstate,
+ struct snapshot_dev *dev)
{
void *dev_ptr;
size_t dev_size;
int ret;
struct vm_snapshot_meta *meta;
- dev_ptr = lookup_dev(info->dev_name, rstate, &dev_size);
+ dev_ptr = lookup_dev(dev->dev_name, rstate, &dev_size);
if (dev_ptr == NULL) {
- fprintf(stderr, "Failed to lookup dev: %s\r\n", info->dev_name);
+ fprintf(stderr, "Failed to lookup dev: %s\r\n", dev->dev_name);
fprintf(stderr, "Continuing the restore/migration process\r\n");
return (0);
}
if (dev_size == 0) {
- fprintf(stderr, "%s: Device size is 0. "
- "Assuming %s is not used\r\n",
- __func__, info->dev_name);
+ fprintf(stderr, "%s: Device size is 0. Assuming %s is not used\r\n",
+ __func__, dev->dev_name);
return (0);
}
meta = &(struct vm_snapshot_meta) {
.ctx = ctx,
- .dev_name = info->dev_name,
+ .dev_name = dev->dev_name,
.buffer.buf_start = dev_ptr,
.buffer.buf_size = dev_size,
.buffer.buf = dev_ptr,
.buffer.buf_rem = dev_size,
.op = VM_SNAPSHOT_RESTORE,
};
- ret = (*info->snapshot_cb)(meta);
+ ret = (dev->dev_ops->snapshot_cb)(meta, dev->dev_cbdata);
if (ret != 0) {
- fprintf(stderr, "Failed to restore dev: %s\r\n",
- info->dev_name);
+ fprintf(stderr, "Failed to restore dev: %s\r\n", dev->dev_name);
return (-1);
}
return (0);
}
-
int
-vm_restore_user_devs(struct vmctx *ctx, struct restore_state *rstate)
+vm_restore_devices(struct vmctx *ctx, struct restore_state *rstate)
{
+ struct snapshot_dev *dev;
int ret;
- int i;
- for (i = 0; i < nitems(snapshot_devs); i++) {
- ret = vm_restore_user_dev(ctx, rstate, &snapshot_devs[i]);
+ LIST_FOREACH(dev, &snapshot_devices, dev_link) {
+ ret = vm_restore_device(ctx, rstate, dev);
if (ret != 0)
return (ret);
}
return (0);
}
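For orientation, the registration side that these loops rely on can be pictured as below. This is a hedged sketch, not the patch's own code: struct snapshot_dev and register_snapshot_dev() are defined in a part of the diff not shown in this excerpt, and the sketch only mirrors the fields the loops above already dereference, assuming snapshot_devices is a queue(3) LIST.

struct snapshot_dev {
	const char		*dev_name;
	struct snapshot_ops	*dev_ops;
	void			*dev_cbdata;
	LIST_ENTRY(snapshot_dev) dev_link;
};

static LIST_HEAD(, snapshot_dev) snapshot_devices =
    LIST_HEAD_INITIALIZER(snapshot_devices);

void
register_snapshot_dev(const char *devname, struct snapshot_ops *ops,
    void *cbdata)
{
	struct snapshot_dev *dev;

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL)
		err(1, "Cannot allocate snapshot device %s", devname);
	dev->dev_name = devname;
	dev->dev_ops = ops;
	dev->dev_cbdata = cbdata;
	LIST_INSERT_HEAD(&snapshot_devices, dev, dev_link);
}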
int
-vm_pause_user_devs(struct vmctx *ctx)
+vm_pause_devices(struct vmctx *ctx)
{
- const struct vm_snapshot_dev_info *info;
- int ret;
- int i;
-
- for (i = 0; i < nitems(snapshot_devs); i++) {
- info = &snapshot_devs[i];
- if (info->pause_cb == NULL)
- continue;
+ struct snapshot_dev *dev;
+ int err;
- ret = info->pause_cb(ctx, info->dev_name);
- if (ret != 0)
- return (ret);
+ LIST_FOREACH(dev, &snapshot_devices, dev_link) {
+ if (dev->dev_ops->pause_cb) {
+ err = dev->dev_ops->pause_cb(ctx, dev->dev_cbdata);
+ if (err != 0)
+ return (err);
+ }
}
return (0);
}
int
-vm_resume_user_devs(struct vmctx *ctx)
+vm_resume_devices(struct vmctx *ctx)
{
- const struct vm_snapshot_dev_info *info;
- int ret;
- int i;
-
- for (i = 0; i < nitems(snapshot_devs); i++) {
- info = &snapshot_devs[i];
- if (info->resume_cb == NULL)
- continue;
+ struct snapshot_dev *dev;
+ int err;
- ret = info->resume_cb(ctx, info->dev_name);
- if (ret != 0)
- return (ret);
+ LIST_FOREACH(dev, &snapshot_devices, dev_link) {
+ if (dev->dev_ops->resume_cb) {
+ err = dev->dev_ops->resume_cb(ctx, dev->dev_cbdata);
+ if (err != 0)
+ return (err);
+ }
}
return (0);
}
static int
-vm_snapshot_kern_struct(int data_fd, xo_handle_t *xop, const char *array_key,
+vm_save_kern_struct(int data_fd, xo_handle_t *xop, const char *array_key,
struct vm_snapshot_meta *meta, off_t *offset)
{
int ret;
size_t data_size;
ssize_t write_cnt;
ret = vm_snapshot_req(meta);
if (ret != 0) {
fprintf(stderr, "%s: Failed to snapshot struct %s\r\n",
__func__, meta->dev_name);
ret = -1;
goto done;
}
data_size = vm_get_snapshot_size(meta);
write_cnt = write(data_fd, meta->buffer.buf_start, data_size);
if (write_cnt != data_size) {
perror("Failed to write all snapshotted data.");
ret = -1;
goto done;
}
/* Write metadata. */
xo_open_instance_h(xop, array_key);
xo_emit_h(xop, "{:debug_name/%s}\n", meta->dev_name);
xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%d}\n",
meta->dev_req);
xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size);
xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset);
- xo_close_instance_h(xop, JSON_STRUCT_ARR_KEY);
+ xo_close_instance_h(xop, JSON_KERNEL_ARR_KEY);
*offset += data_size;
done:
return (ret);
}
static int
-vm_snapshot_kern_structs(struct vmctx *ctx, int data_fd, xo_handle_t *xop)
+vm_save_kern_structs(struct vmctx *ctx, int data_fd, xo_handle_t *xop)
{
int ret, i, error;
off_t offset;
size_t buf_size;
char *buffer;
struct vm_snapshot_meta *meta;
error = 0;
- offset = 0;
buf_size = SNAPSHOT_BUFFER_SIZE;
+ offset = lseek(data_fd, 0, SEEK_CUR);
+ if (offset < 0) {
+ perror("Failed to get data file current offset.");
+ return (-1);
+ }
+
buffer = malloc(SNAPSHOT_BUFFER_SIZE * sizeof(char));
if (buffer == NULL) {
error = ENOMEM;
perror("Failed to allocate memory for snapshot buffer");
goto err_vm_snapshot_kern_data;
}
meta = &(struct vm_snapshot_meta) {
.ctx = ctx,
.buffer.buf_start = buffer,
.buffer.buf_size = buf_size,
.op = VM_SNAPSHOT_SAVE,
};
- xo_open_list_h(xop, JSON_STRUCT_ARR_KEY);
+ xo_open_list_h(xop, JSON_KERNEL_ARR_KEY);
for (i = 0; i < nitems(snapshot_kern_structs); i++) {
meta->dev_name = snapshot_kern_structs[i].struct_name;
meta->dev_req = snapshot_kern_structs[i].req;
memset(meta->buffer.buf_start, 0, meta->buffer.buf_size);
meta->buffer.buf = meta->buffer.buf_start;
meta->buffer.buf_rem = meta->buffer.buf_size;
- ret = vm_snapshot_kern_struct(data_fd, xop, JSON_DEV_ARR_KEY,
+ ret = vm_save_kern_struct(data_fd, xop, JSON_KERNEL_ARR_KEY,
meta, &offset);
if (ret != 0) {
error = -1;
goto err_vm_snapshot_kern_data;
}
}
- xo_close_list_h(xop, JSON_STRUCT_ARR_KEY);
+ xo_close_list_h(xop, JSON_KERNEL_ARR_KEY);
err_vm_snapshot_kern_data:
if (buffer != NULL)
free(buffer);
return (error);
}
static int
-vm_snapshot_basic_metadata(struct vmctx *ctx, xo_handle_t *xop, size_t memsz)
+vm_save_basic_metadata(struct vmctx *ctx, xo_handle_t *xop, size_t memsz)
{
int error;
int memflags;
char vmname_buf[MAX_VMNAME];
memset(vmname_buf, 0, MAX_VMNAME);
error = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1);
if (error != 0) {
perror("Failed to get VM name");
goto err;
}
memflags = vm_get_memflags(ctx);
xo_open_container_h(xop, JSON_BASIC_METADATA_KEY);
xo_emit_h(xop, "{:" JSON_NCPUS_KEY "/%d}\n", guest_ncpus);
xo_emit_h(xop, "{:" JSON_VMNAME_KEY "/%s}\n", vmname_buf);
xo_emit_h(xop, "{:" JSON_MEMSIZE_KEY "/%lu}\n", memsz);
xo_emit_h(xop, "{:" JSON_MEMFLAGS_KEY "/%d}\n", memflags);
xo_close_container_h(xop, JSON_BASIC_METADATA_KEY);
err:
return (error);
}
static int
vm_snapshot_dev_write_data(int data_fd, xo_handle_t *xop, const char *array_key,
struct vm_snapshot_meta *meta, off_t *offset)
{
int ret;
size_t data_size;
data_size = vm_get_snapshot_size(meta);
ret = write(data_fd, meta->buffer.buf_start, data_size);
if (ret != data_size) {
perror("Failed to write all snapshotted data.");
return (-1);
}
/* Write metadata. */
xo_open_instance_h(xop, array_key);
xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%s}\n", meta->dev_name);
xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size);
xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset);
xo_close_instance_h(xop, array_key);
*offset += data_size;
return (0);
}
static int
-vm_snapshot_user_dev(const struct vm_snapshot_dev_info *info,
- int data_fd, xo_handle_t *xop,
- struct vm_snapshot_meta *meta, off_t *offset)
+vm_save_device(struct snapshot_dev *dev,
+ int data_fd, xo_handle_t *xop,
+ struct vm_snapshot_meta *meta, off_t *offset)
{
int ret;
- ret = (*info->snapshot_cb)(meta);
+ fprintf(stderr, "Snapshotting device: %s\r\n", dev->dev_name);
+
+ meta->dev_name = dev->dev_name;
+ memset(meta->buffer.buf_start, 0, meta->buffer.buf_size);
+ meta->buffer.buf = meta->buffer.buf_start;
+ meta->buffer.buf_rem = meta->buffer.buf_size;
+
+ ret = (dev->dev_ops->snapshot_cb)(meta, dev->dev_cbdata);
if (ret != 0) {
fprintf(stderr, "Failed to snapshot %s; ret=%d\r\n",
meta->dev_name, ret);
return (ret);
}
ret = vm_snapshot_dev_write_data(data_fd, xop, JSON_DEV_ARR_KEY, meta,
offset);
if (ret != 0)
return (ret);
return (0);
}
static int
-vm_snapshot_user_devs(struct vmctx *ctx, int data_fd, xo_handle_t *xop)
+vm_save_devices(struct vmctx *ctx, int data_fd, xo_handle_t *xop)
{
- int ret, i;
+ int ret = 0;
off_t offset;
void *buffer;
size_t buf_size;
struct vm_snapshot_meta *meta;
+ struct snapshot_dev *dev;
buf_size = SNAPSHOT_BUFFER_SIZE;
offset = lseek(data_fd, 0, SEEK_CUR);
if (offset < 0) {
perror("Failed to get data file current offset.");
return (-1);
}
buffer = malloc(buf_size);
if (buffer == NULL) {
perror("Failed to allocate memory for snapshot buffer");
ret = ENOMEM;
goto snapshot_err;
}
meta = &(struct vm_snapshot_meta) {
.ctx = ctx,
.buffer.buf_start = buffer,
.buffer.buf_size = buf_size,
.op = VM_SNAPSHOT_SAVE,
};
xo_open_list_h(xop, JSON_DEV_ARR_KEY);
/* Save the state of all registered devices. */
- for (i = 0; i < nitems(snapshot_devs); i++) {
- meta->dev_name = snapshot_devs[i].dev_name;
-
- memset(meta->buffer.buf_start, 0, meta->buffer.buf_size);
- meta->buffer.buf = meta->buffer.buf_start;
- meta->buffer.buf_rem = meta->buffer.buf_size;
-
- ret = vm_snapshot_user_dev(&snapshot_devs[i], data_fd, xop,
- meta, &offset);
+ LIST_FOREACH(dev, &snapshot_devices, dev_link) {
+ ret = vm_save_device(dev, data_fd, xop, meta, &offset);
if (ret != 0)
goto snapshot_err;
}
xo_close_list_h(xop, JSON_DEV_ARR_KEY);
snapshot_err:
if (buffer != NULL)
free(buffer);
return (ret);
}
void
checkpoint_cpu_add(int vcpu)
{
pthread_mutex_lock(&vcpu_lock);
CPU_SET(vcpu, &vcpus_active);
if (checkpoint_active) {
CPU_SET(vcpu, &vcpus_suspended);
while (checkpoint_active)
pthread_cond_wait(&vcpus_can_run, &vcpu_lock);
CPU_CLR(vcpu, &vcpus_suspended);
}
pthread_mutex_unlock(&vcpu_lock);
}
/*
* When a vCPU is suspended for any reason, it calls
* checkpoint_cpu_suspend(). This records that the vCPU is idle.
* Before returning from suspension, checkpoint_cpu_resume() is
* called. In suspend we note that the vCPU is idle. In resume we
* pause the vCPU thread until the checkpoint is complete. The reason
* for the two-step process is that vCPUs might already be stopped in
* the debug server when a checkpoint is requested. This approach
* allows us to account for and handle those vCPUs.
*/
void
checkpoint_cpu_suspend(int vcpu)
{
pthread_mutex_lock(&vcpu_lock);
CPU_SET(vcpu, &vcpus_suspended);
if (checkpoint_active && CPU_CMP(&vcpus_active, &vcpus_suspended) == 0)
pthread_cond_signal(&vcpus_idle);
pthread_mutex_unlock(&vcpu_lock);
}
void
checkpoint_cpu_resume(int vcpu)
{
pthread_mutex_lock(&vcpu_lock);
while (checkpoint_active)
pthread_cond_wait(&vcpus_can_run, &vcpu_lock);
CPU_CLR(vcpu, &vcpus_suspended);
pthread_mutex_unlock(&vcpu_lock);
}
static void
vm_vcpu_pause(struct vmctx *ctx)
{
pthread_mutex_lock(&vcpu_lock);
checkpoint_active = true;
vm_suspend_cpu(ctx, -1);
while (CPU_CMP(&vcpus_active, &vcpus_suspended) != 0)
pthread_cond_wait(&vcpus_idle, &vcpu_lock);
pthread_mutex_unlock(&vcpu_lock);
}
static void
vm_vcpu_resume(struct vmctx *ctx)
{
pthread_mutex_lock(&vcpu_lock);
checkpoint_active = false;
pthread_mutex_unlock(&vcpu_lock);
vm_resume_cpu(ctx, -1);
pthread_cond_broadcast(&vcpus_can_run);
}
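/*
 * Write a checkpoint to disk: guest memory goes to <file>, kernel and device
 * state to <file>.kern, and the JSON metadata describing both to
 * <file>.meta.  When stop_vm is set, the VM is destroyed after a successful
 * save.
 */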
static int
vm_checkpoint(struct vmctx *ctx, char *checkpoint_file, bool stop_vm)
{
int fd_checkpoint = 0, kdata_fd = 0;
int ret = 0;
int error = 0;
size_t memsz;
xo_handle_t *xop = NULL;
char *meta_filename = NULL;
char *kdata_filename = NULL;
FILE *meta_file = NULL;
kdata_filename = strcat_extension(checkpoint_file, ".kern");
if (kdata_filename == NULL) {
fprintf(stderr, "Failed to construct kernel data filename.\n");
return (-1);
}
kdata_fd = open(kdata_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700);
if (kdata_fd < 0) {
perror("Failed to open kernel data snapshot file.");
error = -1;
goto done;
}
fd_checkpoint = open(checkpoint_file, O_RDWR | O_CREAT | O_TRUNC, 0700);
if (fd_checkpoint < 0) {
perror("Failed to create checkpoint file");
error = -1;
goto done;
}
meta_filename = strcat_extension(checkpoint_file, ".meta");
if (meta_filename == NULL) {
fprintf(stderr, "Failed to construct vm metadata filename.\n");
error = -1;
goto done;
}
meta_file = fopen(meta_filename, "w");
if (meta_file == NULL) {
perror("Failed to open vm metadata snapshot file.");
error = -1;
goto done;
}
xop = xo_create_to_file(meta_file, XO_STYLE_JSON, XOF_PRETTY);
if (xop == NULL) {
perror("Failed to get libxo handle on metadata file.");
error = -1;
goto done;
}
vm_vcpu_pause(ctx);
- ret = vm_pause_user_devs(ctx);
+ ret = vm_pause_devices(ctx);
if (ret != 0) {
fprintf(stderr, "Could not pause devices\r\n");
error = ret;
goto done;
}
memsz = vm_snapshot_mem(ctx, fd_checkpoint, 0, true);
if (memsz == 0) {
perror("Could not write guest memory to file");
error = -1;
goto done;
}
- ret = vm_snapshot_basic_metadata(ctx, xop, memsz);
+ ret = vm_save_basic_metadata(ctx, xop, memsz);
if (ret != 0) {
fprintf(stderr, "Failed to snapshot vm basic metadata.\n");
error = -1;
goto done;
}
-
- ret = vm_snapshot_kern_structs(ctx, kdata_fd, xop);
+ ret = vm_save_kern_structs(ctx, kdata_fd, xop);
if (ret != 0) {
fprintf(stderr, "Failed to snapshot vm kernel data.\n");
error = -1;
goto done;
}
- ret = vm_snapshot_user_devs(ctx, kdata_fd, xop);
+ ret = vm_save_devices(ctx, kdata_fd, xop);
if (ret != 0) {
fprintf(stderr, "Failed to snapshot device state.\n");
error = -1;
goto done;
}
xo_finish_h(xop);
if (stop_vm) {
vm_destroy(ctx);
exit(0);
}
done:
- ret = vm_resume_user_devs(ctx);
+ ret = vm_resume_devices(ctx);
if (ret != 0)
fprintf(stderr, "Could not resume devices\r\n");
vm_vcpu_resume(ctx);
if (fd_checkpoint > 0)
close(fd_checkpoint);
if (meta_filename != NULL)
free(meta_filename);
if (kdata_filename != NULL)
free(kdata_filename);
if (xop != NULL)
xo_destroy(xop);
if (meta_file != NULL)
fclose(meta_file);
if (kdata_fd > 0)
close(kdata_fd);
return (error);
}
int
get_checkpoint_msg(int conn_fd, struct vmctx *ctx)
{
unsigned char buf[MAX_MSG_SIZE];
struct checkpoint_op *checkpoint_op;
int len, recv_len, total_recv = 0;
int err = 0;
len = sizeof(struct checkpoint_op); /* expected length */
while ((recv_len = recv(conn_fd, buf + total_recv, len - total_recv, 0)) > 0) {
total_recv += recv_len;
}
if (recv_len < 0) {
perror("Error while receiving data from bhyvectl");
err = -1;
goto done;
}
checkpoint_op = (struct checkpoint_op *)buf;
switch (checkpoint_op->op) {
case START_CHECKPOINT:
err = vm_checkpoint(ctx, checkpoint_op->snapshot_filename, false);
break;
case START_SUSPEND:
err = vm_checkpoint(ctx, checkpoint_op->snapshot_filename, true);
break;
default:
fprintf(stderr, "Unrecognized checkpoint operation.\n");
err = -1;
}
done:
close(conn_fd);
return (err);
}
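For reference, the sending side of this exchange lives in bhyvectl, outside this excerpt. Below is a minimal client sketch, assuming struct checkpoint_op carries an int op plus a fixed-size snapshot_filename buffer (as its use above suggests) and that the socket path follows init_checkpoint_thread() below; send_checkpoint_op() itself is a hypothetical name.

static int
send_checkpoint_op(const char *vmname, int op, const char *snapshot_file)
{
	struct checkpoint_op req;
	struct sockaddr_un addr;
	int fd, rc;

	fd = socket(PF_UNIX, SOCK_STREAM, 0);
	if (fd < 0)
		return (-1);

	memset(&addr, 0, sizeof(addr));
	addr.sun_family = AF_UNIX;
	snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s",
	    CHECKPOINT_RUN_DIR, vmname);

	memset(&req, 0, sizeof(req));
	req.op = op;	/* START_CHECKPOINT or START_SUSPEND */
	strlcpy(req.snapshot_filename, snapshot_file,
	    sizeof(req.snapshot_filename));

	rc = connect(fd, (struct sockaddr *)&addr, SUN_LEN(&addr));
	if (rc == 0 && send(fd, &req, sizeof(req), 0) != (ssize_t)sizeof(req))
		rc = -1;
	close(fd);
	return (rc);
}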
/*
* Listen for commands from bhyvectl
*/
void *
checkpoint_thread(void *param)
{
struct checkpoint_thread_info *thread_info;
int conn_fd, ret;
pthread_set_name_np(pthread_self(), "checkpoint thread");
thread_info = (struct checkpoint_thread_info *)param;
while ((conn_fd = accept(thread_info->socket_fd, NULL, NULL)) > -1) {
ret = get_checkpoint_msg(conn_fd, thread_info->ctx);
if (ret != 0) {
fprintf(stderr, "Failed to read message on checkpoint "
"socket. Retrying.\n");
}
}
if (conn_fd < 0) {
perror("Failed to accept connection");
}
return (NULL);
}
/*
* Create directory tree to store runtime specific information:
* i.e. UNIX sockets for IPC with bhyvectl.
*/
static int
make_checkpoint_dir(void)
{
int err;
err = mkdir(BHYVE_RUN_DIR, 0755);
if (err < 0 && errno != EEXIST)
return (err);
err = mkdir(CHECKPOINT_RUN_DIR, 0755);
if (err < 0 && errno != EEXIST)
return (err);
return (0);
}
/*
* Create the listening socket for IPC with bhyvectl
*/
int
init_checkpoint_thread(struct vmctx *ctx)
{
struct checkpoint_thread_info *checkpoint_info = NULL;
struct sockaddr_un addr;
int socket_fd;
pthread_t checkpoint_pthread;
char vmname_buf[MAX_VMNAME];
int ret, err = 0;
memset(&addr, 0, sizeof(addr));
err = pthread_mutex_init(&vcpu_lock, NULL);
if (err != 0)
errc(1, err, "checkpoint mutex init");
err = pthread_cond_init(&vcpus_idle, NULL);
if (err == 0)
err = pthread_cond_init(&vcpus_can_run, NULL);
if (err != 0)
errc(1, err, "checkpoint cv init");
socket_fd = socket(PF_UNIX, SOCK_STREAM, 0);
if (socket_fd < 0) {
perror("Socket creation failed (IPC with bhyvectl)");
err = -1;
goto fail;
}
err = make_checkpoint_dir();
if (err < 0) {
perror("Failed to create checkpoint runtime directory");
goto fail;
}
addr.sun_family = AF_UNIX;
err = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1);
if (err != 0) {
perror("Failed to get VM name");
goto fail;
}
snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s",
CHECKPOINT_RUN_DIR, vmname_buf);
addr.sun_len = SUN_LEN(&addr);
unlink(addr.sun_path);
if (bind(socket_fd, (struct sockaddr *)&addr, addr.sun_len) != 0) {
perror("Failed to bind socket (IPC with bhyvectl)");
err = -1;
goto fail;
}
if (listen(socket_fd, 10) < 0) {
perror("Failed to listen on socket (IPC with bhyvectl)");
err = -1;
goto fail;
}
checkpoint_info = calloc(1, sizeof(*checkpoint_info));
if (checkpoint_info == NULL) {
perror("Failed to allocate checkpoint thread info");
err = -1;
goto fail;
}
checkpoint_info->ctx = ctx;
checkpoint_info->socket_fd = socket_fd;
ret = pthread_create(&checkpoint_pthread, NULL, checkpoint_thread,
checkpoint_info);
if (ret != 0) {
err = ret;
goto fail;
}
return (0);
fail:
free(checkpoint_info);
if (socket_fd > 0)
close(socket_fd);
unlink(addr.sun_path);
return (err);
}
void
vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op)
{
const char *__op;
if (op == VM_SNAPSHOT_SAVE)
__op = "save";
else if (op == VM_SNAPSHOT_RESTORE)
__op = "restore";
else
__op = "unknown";
fprintf(stderr, "%s: snapshot-%s failed for %s\r\n",
__func__, __op, bufname);
}
int
vm_snapshot_buf(volatile void *data, size_t data_size,
struct vm_snapshot_meta *meta)
{
struct vm_snapshot_buffer *buffer;
int op;
buffer = &meta->buffer;
op = meta->op;
if (buffer->buf_rem < data_size) {
fprintf(stderr, "%s: buffer too small\r\n", __func__);
return (E2BIG);
}
if (op == VM_SNAPSHOT_SAVE)
memcpy(buffer->buf, (uint8_t *) data, data_size);
else if (op == VM_SNAPSHOT_RESTORE)
memcpy((uint8_t *) data, buffer->buf, data_size);
else
return (EINVAL);
buffer->buf += data_size;
buffer->buf_rem -= data_size;
return (0);
}
size_t
vm_get_snapshot_size(struct vm_snapshot_meta *meta)
{
size_t length;
struct vm_snapshot_buffer *buffer;
buffer = &meta->buffer;
if (buffer->buf_size < buffer->buf_rem) {
fprintf(stderr, "%s: Invalid buffer: size = %zu, rem = %zu\r\n",
__func__, buffer->buf_size, buffer->buf_rem);
length = 0;
} else {
length = buffer->buf_size - buffer->buf_rem;
}
return (length);
}
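/*
 * Save or restore a host pointer into guest memory by round-tripping it
 * through the guest-physical address: on save the pointer is translated
 * with paddr_host2guest() and the gpa is recorded; on restore the gpa is
 * read back and re-translated into the new host mapping.  restore_null
 * lets a NULL pointer survive the round trip.
 */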
int
vm_snapshot_guest2host_addr(void **addrp, size_t len, bool restore_null,
struct vm_snapshot_meta *meta)
{
int ret;
vm_paddr_t gaddr;
if (meta->op == VM_SNAPSHOT_SAVE) {
gaddr = paddr_host2guest(meta->ctx, *addrp);
if (gaddr == (vm_paddr_t) -1) {
if (!restore_null || *addrp != NULL) {
ret = EFAULT;
goto done;
}
}
SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done);
} else if (meta->op == VM_SNAPSHOT_RESTORE) {
SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done);
if (gaddr == (vm_paddr_t) -1) {
if (!restore_null) {
ret = EFAULT;
goto done;
}
}
*addrp = paddr_guest2host(meta->ctx, gaddr, len);
} else {
ret = EINVAL;
}
done:
return (ret);
}
int
vm_snapshot_buf_cmp(volatile void *data, size_t data_size,
struct vm_snapshot_meta *meta)
{
struct vm_snapshot_buffer *buffer;
int op;
int ret;
buffer = &meta->buffer;
op = meta->op;
if (buffer->buf_rem < data_size) {
fprintf(stderr, "%s: buffer too small\r\n", __func__);
ret = E2BIG;
goto done;
}
if (op == VM_SNAPSHOT_SAVE) {
ret = 0;
memcpy(buffer->buf, (uint8_t *) data, data_size);
} else if (op == VM_SNAPSHOT_RESTORE) {
ret = memcmp((uint8_t *) data, buffer->buf, data_size);
} else {
ret = EINVAL;
goto done;
}
buffer->buf += data_size;
buffer->buf_rem -= data_size;
done:
return (ret);
}
diff --git a/usr.sbin/bhyve/snapshot.h b/usr.sbin/bhyve/snapshot.h
index f9ea3d573089..a46fe45a8459 100644
--- a/usr.sbin/bhyve/snapshot.h
+++ b/usr.sbin/bhyve/snapshot.h
@@ -1,105 +1,106 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2016 Flavius Anton
* Copyright (c) 2016 Mihai Tiganus
* Copyright (c) 2016-2019 Mihai Carabas
* Copyright (c) 2017-2019 Darius Mihai
* Copyright (c) 2017-2019 Elena Mihailescu
* Copyright (c) 2018-2019 Sergiu Weisz
* All rights reserved.
* The bhyve-snapshot feature was developed under sponsorships
* from Matthew Grooms.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _BHYVE_SNAPSHOT_
#define _BHYVE_SNAPSHOT_
#include <machine/vmm_snapshot.h>
#include <libxo/xo.h>
#include <ucl.h>
struct vmctx;
struct restore_state {
int kdata_fd;
int vmmem_fd;
void *kdata_map;
size_t kdata_len;
size_t vmmem_len;
struct ucl_parser *meta_parser;
ucl_object_t *meta_root_obj;
};
struct checkpoint_thread_info {
struct vmctx *ctx;
int socket_fd;
};
-typedef int (*vm_snapshot_dev_cb)(struct vm_snapshot_meta *);
-typedef int (*vm_pause_dev_cb) (struct vmctx *, const char *);
-typedef int (*vm_resume_dev_cb) (struct vmctx *, const char *);
-
-struct vm_snapshot_dev_info {
- const char *dev_name; /* device name */
- vm_snapshot_dev_cb snapshot_cb; /* callback for device snapshot */
- vm_pause_dev_cb pause_cb; /* callback for device pause */
- vm_resume_dev_cb resume_cb; /* callback for device resume */
+typedef int (*snapshot_dev_cb)(struct vm_snapshot_meta *meta, void *cbdata);
+typedef int (*pause_dev_cb) (struct vmctx *ctx, void *cbdata);
+typedef int (*resume_dev_cb) (struct vmctx *ctx, void *cbdata);
+
+struct snapshot_ops {
+ snapshot_dev_cb snapshot_cb; /* callback for device save/restore */
+ pause_dev_cb pause_cb; /* callback for device pause (optional) */
+ resume_dev_cb resume_cb; /* callback for device resume (optional) */
};
struct vm_snapshot_kern_info {
const char *struct_name; /* kernel structure name*/
enum snapshot_req req; /* request type */
};
+void register_snapshot_dev(const char *devname, struct snapshot_ops *ops,
+ void *cbdata);
void destroy_restore_state(struct restore_state *rstate);
const char *lookup_vmname(struct restore_state *rstate);
int lookup_memflags(struct restore_state *rstate);
size_t lookup_memsize(struct restore_state *rstate);
int lookup_guest_ncpus(struct restore_state *rstate);
void checkpoint_cpu_add(int vcpu);
void checkpoint_cpu_resume(int vcpu);
void checkpoint_cpu_suspend(int vcpu);
int restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate);
int vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate);
-int vm_restore_user_devs(struct vmctx *ctx, struct restore_state *rstate);
-int vm_pause_user_devs(struct vmctx *ctx);
-int vm_resume_user_devs(struct vmctx *ctx);
+int vm_restore_devices(struct vmctx *ctx, struct restore_state *rstate);
+int vm_pause_devices(struct vmctx *ctx);
+int vm_resume_devices(struct vmctx *ctx);
int get_checkpoint_msg(int conn_fd, struct vmctx *ctx);
void *checkpoint_thread(void *param);
int init_checkpoint_thread(struct vmctx *ctx);
int load_restore_file(const char *filename, struct restore_state *rstate);
#endif
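
To see the interface end to end: a device model registers an ops table once at init time, and the same snapshot callback then serves both save and restore. The sketch below is illustrative only ("mydev", mydev_softc, and mydev_init() are hypothetical names), while struct snapshot_ops, register_snapshot_dev(), and SNAPSHOT_BUF_OR_LEAVE() come from the declarations above and <machine/vmm_snapshot.h>.

struct mydev_softc {
	uint32_t regs[8];		/* hypothetical device state */
};

static int
mydev_snapshot_cb(struct vm_snapshot_meta *meta, void *cbdata)
{
	struct mydev_softc *sc = cbdata;
	int ret;

	/* One callback handles both directions; meta->op selects which. */
	SNAPSHOT_BUF_OR_LEAVE(sc->regs, sizeof(sc->regs), meta, ret, done);
done:
	return (ret);
}

static struct snapshot_ops mydev_snapshot_ops = {
	.snapshot_cb = mydev_snapshot_cb,
	/* pause_cb and resume_cb are optional and may stay NULL. */
};

static void
mydev_init(struct mydev_softc *sc)
{
	register_snapshot_dev("mydev", &mydev_snapshot_ops, sc);
}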
