Index: head/usr.sbin/bhyve/pci_virtio_console.c
===================================================================
--- head/usr.sbin/bhyve/pci_virtio_console.c	(revision 348928)
+++ head/usr.sbin/bhyve/pci_virtio_console.c	(revision 348929)
@@ -1,678 +1,678 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2016 iXsystems Inc.
  * All rights reserved.
  *
  * This software was developed by Jakub Klama <jceel@FreeBSD.org>
  * under sponsorship from iXsystems Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #ifndef WITHOUT_CAPSICUM
 #include <sys/capsicum.h>
 #endif
 #include <sys/linker_set.h>
 #include <sys/uio.h>
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <sys/un.h>
 
 #ifndef WITHOUT_CAPSICUM
 #include <capsicum_helpers.h>
 #endif
 #include <err.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdbool.h>
 #include <string.h>
 #include <unistd.h>
 #include <assert.h>
 #include <pthread.h>
 #include <libgen.h>
 #include <sysexits.h>
 
 #include "bhyverun.h"
 #include "pci_emul.h"
 #include "virtio.h"
 #include "mevent.h"
 #include "sockstream.h"
 
 #define	VTCON_RINGSZ	64
 #define	VTCON_MAXPORTS	16
 #define	VTCON_MAXQ	(VTCON_MAXPORTS * 2 + 2)
 
 #define	VTCON_DEVICE_READY	0
 #define	VTCON_DEVICE_ADD	1
 #define	VTCON_DEVICE_REMOVE	2
 #define	VTCON_PORT_READY	3
 #define	VTCON_CONSOLE_PORT	4
 #define	VTCON_CONSOLE_RESIZE	5
 #define	VTCON_PORT_OPEN		6
 #define	VTCON_PORT_NAME		7
 
 #define	VTCON_F_SIZE		0
 #define	VTCON_F_MULTIPORT	1
 #define	VTCON_F_EMERG_WRITE	2
 #define	VTCON_S_HOSTCAPS	\
     (VTCON_F_SIZE | VTCON_F_MULTIPORT | VTCON_F_EMERG_WRITE)
 
 static int pci_vtcon_debug;
 #define DPRINTF(params) if (pci_vtcon_debug) printf params
 #define WPRINTF(params) printf params
 
 struct pci_vtcon_softc;
 struct pci_vtcon_port;
 struct pci_vtcon_config;
 typedef void (pci_vtcon_cb_t)(struct pci_vtcon_port *, void *, struct iovec *,
     int);
 
 struct pci_vtcon_port {
 	struct pci_vtcon_softc * vsp_sc;
 	int                      vsp_id;
 	const char *             vsp_name;
 	bool                     vsp_enabled;
 	bool                     vsp_console;
 	bool                     vsp_rx_ready;
 	bool                     vsp_open;
 	int                      vsp_rxq;
 	int                      vsp_txq;
 	void *                   vsp_arg;
 	pci_vtcon_cb_t *         vsp_cb;
 };
 
 struct pci_vtcon_sock
 {
 	struct pci_vtcon_port *  vss_port;
 	const char *             vss_path;
 	struct mevent *          vss_server_evp;
 	struct mevent *          vss_conn_evp;
 	int                      vss_server_fd;
 	int                      vss_conn_fd;
 	bool                     vss_open;
 };
 
 struct pci_vtcon_softc {
 	struct virtio_softc      vsc_vs;
 	struct vqueue_info       vsc_queues[VTCON_MAXQ];
 	pthread_mutex_t          vsc_mtx;
 	uint64_t                 vsc_cfg;
 	uint64_t                 vsc_features;
 	char *                   vsc_rootdir;
 	int                      vsc_kq;
 	int                      vsc_nports;
 	bool                     vsc_ready;
 	struct pci_vtcon_port    vsc_control_port;
  	struct pci_vtcon_port    vsc_ports[VTCON_MAXPORTS];
 	struct pci_vtcon_config *vsc_config;
 };
 
 struct pci_vtcon_config {
 	uint16_t cols;
 	uint16_t rows;
 	uint32_t max_nr_ports;
 	uint32_t emerg_wr;
 } __attribute__((packed));
 
 struct pci_vtcon_control {
 	uint32_t id;
 	uint16_t event;
 	uint16_t value;
 } __attribute__((packed));
 
 struct pci_vtcon_console_resize {
 	uint16_t cols;
 	uint16_t rows;
 } __attribute__((packed));
 
 static void pci_vtcon_reset(void *);
 static void pci_vtcon_notify_rx(void *, struct vqueue_info *);
 static void pci_vtcon_notify_tx(void *, struct vqueue_info *);
 static int pci_vtcon_cfgread(void *, int, int, uint32_t *);
 static int pci_vtcon_cfgwrite(void *, int, int, uint32_t);
 static void pci_vtcon_neg_features(void *, uint64_t);
 static void pci_vtcon_sock_accept(int, enum ev_type,  void *);
 static void pci_vtcon_sock_rx(int, enum ev_type, void *);
 static void pci_vtcon_sock_tx(struct pci_vtcon_port *, void *, struct iovec *,
     int);
 static void pci_vtcon_control_send(struct pci_vtcon_softc *,
     struct pci_vtcon_control *, const void *, size_t);
 static void pci_vtcon_announce_port(struct pci_vtcon_port *);
 static void pci_vtcon_open_port(struct pci_vtcon_port *, bool);
 
 static struct virtio_consts vtcon_vi_consts = {
 	"vtcon",		/* our name */
 	VTCON_MAXQ,		/* we support VTCON_MAXQ virtqueues */
 	sizeof(struct pci_vtcon_config), /* config reg size */
 	pci_vtcon_reset,	/* reset */
 	NULL,			/* device-wide qnotify */
 	pci_vtcon_cfgread,	/* read virtio config */
 	pci_vtcon_cfgwrite,	/* write virtio config */
 	pci_vtcon_neg_features,	/* apply negotiated features */
 	VTCON_S_HOSTCAPS,	/* our capabilities */
 };
 
 
 static void
 pci_vtcon_reset(void *vsc)
 {
 	struct pci_vtcon_softc *sc;
 
 	sc = vsc;
 
 	DPRINTF(("vtcon: device reset requested!\n"));
 	vi_reset_dev(&sc->vsc_vs);
 }
 
 static void
 pci_vtcon_neg_features(void *vsc, uint64_t negotiated_features)
 {
 	struct pci_vtcon_softc *sc = vsc;
 
 	sc->vsc_features = negotiated_features;
 }
 
 static int
 pci_vtcon_cfgread(void *vsc, int offset, int size, uint32_t *retval)
 {
 	struct pci_vtcon_softc *sc = vsc;
 	void *ptr;
 
 	ptr = (uint8_t *)sc->vsc_config + offset;
 	memcpy(retval, ptr, size);
 	return (0);
 }
 
 static int
 pci_vtcon_cfgwrite(void *vsc, int offset, int size, uint32_t val)
 {
 
 	return (0);
 }
 
 static inline struct pci_vtcon_port *
 pci_vtcon_vq_to_port(struct pci_vtcon_softc *sc, struct vqueue_info *vq)
 {
 	uint16_t num = vq->vq_num;
 
 	if (num == 0 || num == 1)
 		return (&sc->vsc_ports[0]);
 
 	if (num == 2 || num == 3)
 		return (&sc->vsc_control_port);
 
 	return (&sc->vsc_ports[(num / 2) - 1]);
 }
 
 static inline struct vqueue_info *
 pci_vtcon_port_to_vq(struct pci_vtcon_port *port, bool tx_queue)
 {
 	int qnum;
 
 	qnum = tx_queue ? port->vsp_txq : port->vsp_rxq;
 	return (&port->vsp_sc->vsc_queues[qnum]);
 }
 
 static struct pci_vtcon_port *
 pci_vtcon_port_add(struct pci_vtcon_softc *sc, const char *name,
     pci_vtcon_cb_t *cb, void *arg)
 {
 	struct pci_vtcon_port *port;
 
 	if (sc->vsc_nports == VTCON_MAXPORTS) {
 		errno = EBUSY;
 		return (NULL);
 	}
 
 	port = &sc->vsc_ports[sc->vsc_nports++];
 	port->vsp_id = sc->vsc_nports - 1;
 	port->vsp_sc = sc;
 	port->vsp_name = name;
 	port->vsp_cb = cb;
 	port->vsp_arg = arg;
 
 	if (port->vsp_id == 0) {
 		/* port0 */
 		port->vsp_txq = 0;
 		port->vsp_rxq = 1;
 	} else {
 		port->vsp_txq = sc->vsc_nports * 2;
 		port->vsp_rxq = port->vsp_txq + 1;
 	}
 
 	port->vsp_enabled = true;
 	return (port);
 }
 
 static int
 pci_vtcon_sock_add(struct pci_vtcon_softc *sc, const char *name,
     const char *path)
 {
 	struct pci_vtcon_sock *sock;
 	struct sockaddr_un sun;
 	char *pathcopy;
 	int s = -1, fd = -1, error = 0;
 #ifndef WITHOUT_CAPSICUM
 	cap_rights_t rights;
 #endif
 
 	sock = calloc(1, sizeof(struct pci_vtcon_sock));
 	if (sock == NULL) {
 		error = -1;
 		goto out;
 	}
 
 	s = socket(AF_UNIX, SOCK_STREAM, 0);
 	if (s < 0) {
 		error = -1;
 		goto out;
 	}
 
 	pathcopy = strdup(path);
 	if (pathcopy == NULL) {
 		error = -1;
 		goto out;
 	}
 
 	fd = open(dirname(pathcopy), O_RDONLY | O_DIRECTORY);
 	if (fd < 0) {
 		free(pathcopy);
 		error = -1;
 		goto out;
 	}
 
 	sun.sun_family = AF_UNIX;
 	sun.sun_len = sizeof(struct sockaddr_un);
 	strcpy(pathcopy, path);
 	strlcpy(sun.sun_path, basename(pathcopy), sizeof(sun.sun_path));
 	free(pathcopy);
 
 	if (bindat(fd, s, (struct sockaddr *)&sun, sun.sun_len) < 0) {
 		error = -1;
 		goto out;
 	}
 
 	if (fcntl(s, F_SETFL, O_NONBLOCK) < 0) {
 		error = -1;
 		goto out;
 	}
 
 	if (listen(s, 1) < 0) {
 		error = -1;
 		goto out;
 	}
 
 #ifndef WITHOUT_CAPSICUM
 	cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE);
 	if (caph_rights_limit(s, &rights) == -1)
 		errx(EX_OSERR, "Unable to apply rights for sandbox");
 #endif
 
 	sock->vss_port = pci_vtcon_port_add(sc, name, pci_vtcon_sock_tx, sock);
 	if (sock->vss_port == NULL) {
 		error = -1;
 		goto out;
 	}
 
 	sock->vss_open = false;
 	sock->vss_conn_fd = -1;
 	sock->vss_server_fd = s;
 	sock->vss_server_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_accept,
 	    sock);
 
 	if (sock->vss_server_evp == NULL) {
 		error = -1;
 		goto out;
 	}
 
 out:
 	if (fd != -1)
 		close(fd);
 
 	if (error != 0 && s != -1)
 		close(s);
 
 	return (error);
 }
 
 static void
 pci_vtcon_sock_accept(int fd __unused, enum ev_type t __unused, void *arg)
 {
 	struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg;
 	int s;
 
 	s = accept(sock->vss_server_fd, NULL, NULL);
 	if (s < 0)
 		return;
 
 	if (sock->vss_open) {
 		close(s);
 		return;
 	}
 
 	sock->vss_open = true;
 	sock->vss_conn_fd = s;
 	sock->vss_conn_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_rx, sock);
 
 	pci_vtcon_open_port(sock->vss_port, true);
 }
 
 static void
 pci_vtcon_sock_rx(int fd __unused, enum ev_type t __unused, void *arg)
 {
 	struct pci_vtcon_port *port;
 	struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg;
 	struct vqueue_info *vq;
 	struct iovec iov;
 	static char dummybuf[2048];
 	int len, n;
 	uint16_t idx;
 
 	port = sock->vss_port;
 	vq = pci_vtcon_port_to_vq(port, true);
 
 	if (!sock->vss_open || !port->vsp_rx_ready) {
 		len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf));
 		if (len == 0)
 			goto close;
 
 		return;
 	}
 
 	if (!vq_has_descs(vq)) {
 		len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf));
 		vq_endchains(vq, 1);
 		if (len == 0)
 			goto close;
 
 		return;
 	}
 
 	do {
 		n = vq_getchain(vq, &idx, &iov, 1, NULL);
 		len = readv(sock->vss_conn_fd, &iov, n);
 
 		if (len == 0 || (len < 0 && errno == EWOULDBLOCK)) {
 			vq_retchain(vq);
 			vq_endchains(vq, 0);
 			if (len == 0)
 				goto close;
 
 			return;
 		}
 
 		vq_relchain(vq, idx, len);
 	} while (vq_has_descs(vq));
 
 	vq_endchains(vq, 1);
 
 close:
 	mevent_delete_close(sock->vss_conn_evp);
 	sock->vss_conn_fd = -1;
 	sock->vss_open = false;
 }
 
 static void
 pci_vtcon_sock_tx(struct pci_vtcon_port *port, void *arg, struct iovec *iov,
     int niov)
 {
 	struct pci_vtcon_sock *sock;
 	int i, ret;
 
 	sock = (struct pci_vtcon_sock *)arg;
 
 	if (sock->vss_conn_fd == -1)
 		return;
 
 	for (i = 0; i < niov; i++) {
 		ret = stream_write(sock->vss_conn_fd, iov[i].iov_base,
 		    iov[i].iov_len);
 		if (ret <= 0)
 			break;
 	}
 
 	if (ret <= 0) {
 		mevent_delete_close(sock->vss_conn_evp);
 		sock->vss_conn_fd = -1;
 		sock->vss_open = false;
 	}
 }
 
 static void
 pci_vtcon_control_tx(struct pci_vtcon_port *port, void *arg, struct iovec *iov,
     int niov)
 {
 	struct pci_vtcon_softc *sc;
 	struct pci_vtcon_port *tmp;
 	struct pci_vtcon_control resp, *ctrl;
 	int i;
 
 	assert(niov == 1);
 
 	sc = port->vsp_sc;
 	ctrl = (struct pci_vtcon_control *)iov->iov_base;
 
 	switch (ctrl->event) {
 	case VTCON_DEVICE_READY:
 		sc->vsc_ready = true;
 		/* set port ready events for registered ports */
 		for (i = 0; i < VTCON_MAXPORTS; i++) {
 			tmp = &sc->vsc_ports[i];
 			if (tmp->vsp_enabled)
 				pci_vtcon_announce_port(tmp);
 
 			if (tmp->vsp_open)
 				pci_vtcon_open_port(tmp, true);
 		}
 		break;
 
 	case VTCON_PORT_READY:
 		if (ctrl->id >= sc->vsc_nports) {
 			WPRINTF(("VTCON_PORT_READY event for unknown port %d\n",
 			    ctrl->id));
 			return;
 		}
 
 		tmp = &sc->vsc_ports[ctrl->id];
 		if (tmp->vsp_console) {
 			resp.event = VTCON_CONSOLE_PORT;
 			resp.id = ctrl->id;
 			resp.value = 1;
 			pci_vtcon_control_send(sc, &resp, NULL, 0);
 		}
 		break;
 	}
 }
 
 static void
 pci_vtcon_announce_port(struct pci_vtcon_port *port)
 {
 	struct pci_vtcon_control event;
 
 	event.id = port->vsp_id;
 	event.event = VTCON_DEVICE_ADD;
 	event.value = 1;
 	pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0);
 
 	event.event = VTCON_PORT_NAME;
 	pci_vtcon_control_send(port->vsp_sc, &event, port->vsp_name,
 	    strlen(port->vsp_name));
 }
 
 static void
 pci_vtcon_open_port(struct pci_vtcon_port *port, bool open)
 {
 	struct pci_vtcon_control event;
 
 	if (!port->vsp_sc->vsc_ready) {
 		port->vsp_open = true;
 		return;
 	}
 
 	event.id = port->vsp_id;
 	event.event = VTCON_PORT_OPEN;
 	event.value = (int)open;
 	pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0);
 }
 
 static void
 pci_vtcon_control_send(struct pci_vtcon_softc *sc,
     struct pci_vtcon_control *ctrl, const void *payload, size_t len)
 {
 	struct vqueue_info *vq;
 	struct iovec iov;
 	uint16_t idx;
 	int n;
 
 	vq = pci_vtcon_port_to_vq(&sc->vsc_control_port, true);
 
 	if (!vq_has_descs(vq))
 		return;
 
 	n = vq_getchain(vq, &idx, &iov, 1, NULL);
 
 	assert(n == 1);
 
 	memcpy(iov.iov_base, ctrl, sizeof(struct pci_vtcon_control));
 	if (payload != NULL && len > 0)
 		memcpy(iov.iov_base + sizeof(struct pci_vtcon_control),
 		     payload, len);
 
 	vq_relchain(vq, idx, sizeof(struct pci_vtcon_control) + len);
 	vq_endchains(vq, 1);
 }
     
 
 static void
 pci_vtcon_notify_tx(void *vsc, struct vqueue_info *vq)
 {
 	struct pci_vtcon_softc *sc;
 	struct pci_vtcon_port *port;
 	struct iovec iov[1];
 	uint16_t idx, n;
 	uint16_t flags[8];
 
 	sc = vsc;
 	port = pci_vtcon_vq_to_port(sc, vq);
 
 	while (vq_has_descs(vq)) {
 		n = vq_getchain(vq, &idx, iov, 1, flags);
 		assert(n >= 1);
 		if (port != NULL)
 			port->vsp_cb(port, port->vsp_arg, iov, 1);
 
 		/*
 		 * Release this chain and handle more
 		 */
 		vq_relchain(vq, idx, 0);
 	}
 	vq_endchains(vq, 1);	/* Generate interrupt if appropriate. */
 }
 
 static void
 pci_vtcon_notify_rx(void *vsc, struct vqueue_info *vq)
 {
 	struct pci_vtcon_softc *sc;
 	struct pci_vtcon_port *port;
 
 	sc = vsc;
 	port = pci_vtcon_vq_to_port(sc, vq);
 
 	if (!port->vsp_rx_ready) {
 		port->vsp_rx_ready = 1;
-		vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY;
+		vq_kick_disable(vq);
 	}
 }
 
 static int
 pci_vtcon_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 {
 	struct pci_vtcon_softc *sc;
 	char *portname = NULL;
 	char *portpath = NULL;
 	char *opt;
 	int i;	
 
 	sc = calloc(1, sizeof(struct pci_vtcon_softc));
 	sc->vsc_config = calloc(1, sizeof(struct pci_vtcon_config));
 	sc->vsc_config->max_nr_ports = VTCON_MAXPORTS;
 	sc->vsc_config->cols = 80;
 	sc->vsc_config->rows = 25; 
 
 	vi_softc_linkup(&sc->vsc_vs, &vtcon_vi_consts, sc, pi, sc->vsc_queues);
 	sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
 
 	for (i = 0; i < VTCON_MAXQ; i++) {
 		sc->vsc_queues[i].vq_qsize = VTCON_RINGSZ;
 		sc->vsc_queues[i].vq_notify = i % 2 == 0
 		    ? pci_vtcon_notify_rx
 		    : pci_vtcon_notify_tx;
 	}
 
 	/* initialize config space */
 	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_CONSOLE);
 	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM);
 	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_CONSOLE);
 	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
 
 	if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix()))
 		return (1);
 	vi_set_io_bar(&sc->vsc_vs, 0);
 
 	/* create control port */
 	sc->vsc_control_port.vsp_sc = sc;
 	sc->vsc_control_port.vsp_txq = 2;
 	sc->vsc_control_port.vsp_rxq = 3;
 	sc->vsc_control_port.vsp_cb = pci_vtcon_control_tx;
 	sc->vsc_control_port.vsp_enabled = true;
 
 	while ((opt = strsep(&opts, ",")) != NULL) {
 		portname = strsep(&opt, "=");
 		portpath = opt;
 
 		/* create port */
 		if (pci_vtcon_sock_add(sc, portname, portpath) < 0) {
 			fprintf(stderr, "cannot create port %s: %s\n",
 			    portname, strerror(errno));
 			return (1);
 		}
 	}
 
 	return (0);
 }
 
 struct pci_devemu pci_de_vcon = {
 	.pe_emu =	"virtio-console",
 	.pe_init =	pci_vtcon_init,
 	.pe_barwrite =	vi_pci_write,
 	.pe_barread =	vi_pci_read
 };
 PCI_EMUL_SET(pci_de_vcon);
Index: head/usr.sbin/bhyve/pci_virtio_net.c
===================================================================
--- head/usr.sbin/bhyve/pci_virtio_net.c	(revision 348928)
+++ head/usr.sbin/bhyve/pci_virtio_net.c	(revision 348929)
@@ -1,968 +1,966 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #ifndef WITHOUT_CAPSICUM
 #include <sys/capsicum.h>
 #endif
 #include <sys/linker_set.h>
 #include <sys/select.h>
 #include <sys/uio.h>
 #include <sys/ioctl.h>
-#include <machine/atomic.h>
 #include <net/ethernet.h>
 #ifndef NETMAP_WITH_LIBS
 #define NETMAP_WITH_LIBS
 #endif
 #include <net/netmap_user.h>
 
 #ifndef WITHOUT_CAPSICUM
 #include <capsicum_helpers.h>
 #endif
 #include <err.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <strings.h>
 #include <unistd.h>
 #include <assert.h>
 #include <md5.h>
 #include <pthread.h>
 #include <pthread_np.h>
 #include <sysexits.h>
 
 #include "bhyverun.h"
 #include "pci_emul.h"
 #include "mevent.h"
 #include "virtio.h"
 
 #define VTNET_RINGSZ	1024
 
 #define VTNET_MAXSEGS	256
 
 /*
  * Host capabilities.  Note that we only offer a few of these.
  */
 #define	VIRTIO_NET_F_CSUM	(1 <<  0) /* host handles partial cksum */
 #define	VIRTIO_NET_F_GUEST_CSUM	(1 <<  1) /* guest handles partial cksum */
 #define	VIRTIO_NET_F_MAC	(1 <<  5) /* host supplies MAC */
 #define	VIRTIO_NET_F_GSO_DEPREC	(1 <<  6) /* deprecated: host handles GSO */
 #define	VIRTIO_NET_F_GUEST_TSO4	(1 <<  7) /* guest can rcv TSOv4 */
 #define	VIRTIO_NET_F_GUEST_TSO6	(1 <<  8) /* guest can rcv TSOv6 */
 #define	VIRTIO_NET_F_GUEST_ECN	(1 <<  9) /* guest can rcv TSO with ECN */
 #define	VIRTIO_NET_F_GUEST_UFO	(1 << 10) /* guest can rcv UFO */
 #define	VIRTIO_NET_F_HOST_TSO4	(1 << 11) /* host can rcv TSOv4 */
 #define	VIRTIO_NET_F_HOST_TSO6	(1 << 12) /* host can rcv TSOv6 */
 #define	VIRTIO_NET_F_HOST_ECN	(1 << 13) /* host can rcv TSO with ECN */
 #define	VIRTIO_NET_F_HOST_UFO	(1 << 14) /* host can rcv UFO */
 #define	VIRTIO_NET_F_MRG_RXBUF	(1 << 15) /* host can merge RX buffers */
 #define	VIRTIO_NET_F_STATUS	(1 << 16) /* config status field available */
 #define	VIRTIO_NET_F_CTRL_VQ	(1 << 17) /* control channel available */
 #define	VIRTIO_NET_F_CTRL_RX	(1 << 18) /* control channel RX mode support */
 #define	VIRTIO_NET_F_CTRL_VLAN	(1 << 19) /* control channel VLAN filtering */
 #define	VIRTIO_NET_F_GUEST_ANNOUNCE \
 				(1 << 21) /* guest can send gratuitous pkts */
 
 #define VTNET_S_HOSTCAPS      \
   ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \
     VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC)
 
 /*
  * PCI config-space "registers"
  */
 struct virtio_net_config {
 	uint8_t  mac[6];
 	uint16_t status;
 } __packed;
 
 /*
  * Queue definitions.
  */
 #define VTNET_RXQ	0
 #define VTNET_TXQ	1
 #define VTNET_CTLQ	2	/* NB: not yet supported */
 
 #define VTNET_MAXQ	3
 
 /*
  * Fixed network header size
  */
 struct virtio_net_rxhdr {
 	uint8_t		vrh_flags;
 	uint8_t		vrh_gso_type;
 	uint16_t	vrh_hdr_len;
 	uint16_t	vrh_gso_size;
 	uint16_t	vrh_csum_start;
 	uint16_t	vrh_csum_offset;
 	uint16_t	vrh_bufs;
 } __packed;
 
 /*
  * Debug printf
  */
 static int pci_vtnet_debug;
 #define DPRINTF(params) if (pci_vtnet_debug) printf params
 #define WPRINTF(params) printf params
 
 /*
  * Per-device softc
  */
 struct pci_vtnet_softc {
 	struct virtio_softc vsc_vs;
 	struct vqueue_info vsc_queues[VTNET_MAXQ - 1];
 	pthread_mutex_t vsc_mtx;
 	struct mevent	*vsc_mevp;
 
 	int		vsc_tapfd;
 	struct nm_desc	*vsc_nmd;
 
 	int		vsc_rx_ready;
 	int		resetting;	/* protected by tx_mtx */
 
 	uint64_t	vsc_features;	/* negotiated features */
 	
 	struct virtio_net_config vsc_config;
 
 	pthread_mutex_t	rx_mtx;
 	int		rx_vhdrlen;
 	int		rx_merge;	/* merged rx bufs in use */
 
 	pthread_t 	tx_tid;
 	pthread_mutex_t	tx_mtx;
 	pthread_cond_t	tx_cond;
 	int		tx_in_progress;
 
 	void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc);
 	void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov,
 			     int iovcnt, int len);
 };
 
 static void pci_vtnet_reset(void *);
 /* static void pci_vtnet_notify(void *, struct vqueue_info *); */
 static int pci_vtnet_cfgread(void *, int, int, uint32_t *);
 static int pci_vtnet_cfgwrite(void *, int, int, uint32_t);
 static void pci_vtnet_neg_features(void *, uint64_t);
 
 static struct virtio_consts vtnet_vi_consts = {
 	"vtnet",		/* our name */
 	VTNET_MAXQ - 1,		/* we currently support 2 virtqueues */
 	sizeof(struct virtio_net_config), /* config reg size */
 	pci_vtnet_reset,	/* reset */
 	NULL,			/* device-wide qnotify -- not used */
 	pci_vtnet_cfgread,	/* read PCI config */
 	pci_vtnet_cfgwrite,	/* write PCI config */
 	pci_vtnet_neg_features,	/* apply negotiated features */
 	VTNET_S_HOSTCAPS,	/* our capabilities */
 };
 
 static void
 pci_vtnet_reset(void *vsc)
 {
 	struct pci_vtnet_softc *sc = vsc;
 
 	DPRINTF(("vtnet: device reset requested !\n"));
 
 	/* Acquire the RX lock to block RX processing. */
 	pthread_mutex_lock(&sc->rx_mtx);
 
 	/* Set sc->resetting and give a chance to the TX thread to stop. */
 	pthread_mutex_lock(&sc->tx_mtx);
 	sc->resetting = 1;
 	while (sc->tx_in_progress) {
 		pthread_mutex_unlock(&sc->tx_mtx);
 		usleep(10000);
 		pthread_mutex_lock(&sc->tx_mtx);
 	}
 
 	sc->vsc_rx_ready = 0;
 	sc->rx_merge = 1;
 	sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
 
 	/*
 	 * Now reset rings, MSI-X vectors, and negotiated capabilities.
 	 * Do that with the TX lock held, since we need to reset
 	 * sc->resetting.
 	 */
 	vi_reset_dev(&sc->vsc_vs);
 
 	sc->resetting = 0;
 	pthread_mutex_unlock(&sc->tx_mtx);
 	pthread_mutex_unlock(&sc->rx_mtx);
 }
 
 /*
  * Called to send a buffer chain out to the tap device
  */
 static void
 pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
 		 int len)
 {
 	static char pad[60]; /* all zero bytes */
 
 	if (sc->vsc_tapfd == -1)
 		return;
 
 	/*
 	 * If the length is < 60, pad out to that and add the
 	 * extra zero'd segment to the iov. It is guaranteed that
 	 * there is always an extra iov available by the caller.
 	 */
 	if (len < 60) {
 		iov[iovcnt].iov_base = pad;
 		iov[iovcnt].iov_len = 60 - len;
 		iovcnt++;
 	}
 	(void) writev(sc->vsc_tapfd, iov, iovcnt);
 }
 
 /*
  *  Called when there is read activity on the tap file descriptor.
  * Each buffer posted by the guest is assumed to be able to contain
  * an entire ethernet frame + rx header.
  *  MP note: the dummybuf is only used for discarding frames, so there
  * is no need for it to be per-vtnet or locked.
  */
 static uint8_t dummybuf[2048];
 
 static __inline struct iovec *
 rx_iov_trim(struct iovec *iov, int *niov, int tlen)
 {
 	struct iovec *riov;
 
 	/* XXX short-cut: assume first segment is >= tlen */
 	assert(iov[0].iov_len >= tlen);
 
 	iov[0].iov_len -= tlen;
 	if (iov[0].iov_len == 0) {
 		assert(*niov > 1);
 		*niov -= 1;
 		riov = &iov[1];
 	} else {
 		iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
 		riov = &iov[0];
 	}
 
 	return (riov);
 }
 
 static void
 pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
 {
 	struct iovec iov[VTNET_MAXSEGS], *riov;
 	struct vqueue_info *vq;
 	void *vrx;
 	int len, n;
 	uint16_t idx;
 
 	/*
 	 * Should never be called without a valid tap fd
 	 */
 	assert(sc->vsc_tapfd != -1);
 
 	/*
 	 * But, will be called when the rx ring hasn't yet
 	 * been set up.
 	 */
 	if (!sc->vsc_rx_ready) {
 		/*
 		 * Drop the packet and try later.
 		 */
 		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
 		return;
 	}
 
 	/*
 	 * Check for available rx buffers
 	 */
 	vq = &sc->vsc_queues[VTNET_RXQ];
 	if (!vq_has_descs(vq)) {
 		/*
 		 * Drop the packet and try later.  Interrupt on
 		 * empty, if that's negotiated.
 		 */
 		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
 		vq_endchains(vq, 1);
 		return;
 	}
 
 	do {
 		/*
 		 * Get descriptor chain.
 		 */
 		n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
 		assert(n >= 1 && n <= VTNET_MAXSEGS);
 
 		/*
 		 * Get a pointer to the rx header, and use the
 		 * data immediately following it for the packet buffer.
 		 */
 		vrx = iov[0].iov_base;
 		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
 
 		len = readv(sc->vsc_tapfd, riov, n);
 
 		if (len < 0 && errno == EWOULDBLOCK) {
 			/*
 			 * No more packets, but still some avail ring
 			 * entries.  Interrupt if needed/appropriate.
 			 */
 			vq_retchain(vq);
 			vq_endchains(vq, 0);
 			return;
 		}
 
 		/*
 		 * The only valid field in the rx packet header is the
 		 * number of buffers if merged rx bufs were negotiated.
 		 */
 		memset(vrx, 0, sc->rx_vhdrlen);
 
 		if (sc->rx_merge) {
 			struct virtio_net_rxhdr *vrxh;
 
 			vrxh = vrx;
 			vrxh->vrh_bufs = 1;
 		}
 
 		/*
 		 * Release this chain and handle more chains.
 		 */
 		vq_relchain(vq, idx, len + sc->rx_vhdrlen);
 	} while (vq_has_descs(vq));
 
 	/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
 	vq_endchains(vq, 1);
 }
 
 static __inline int
 pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
 {
 	int r, i;
 	int len = 0;
 
 	for (r = nmd->cur_tx_ring; ; ) {
 		struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r);
 		uint32_t cur, idx;
 		char *buf;
 
 		if (nm_ring_empty(ring)) {
 			r++;
 			if (r > nmd->last_tx_ring)
 				r = nmd->first_tx_ring;
 			if (r == nmd->cur_tx_ring)
 				break;
 			continue;
 		}
 		cur = ring->cur;
 		idx = ring->slot[cur].buf_idx;
 		buf = NETMAP_BUF(ring, idx);
 
 		for (i = 0; i < iovcnt; i++) {
 			if (len + iov[i].iov_len > 2048)
 				break;
 			memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len);
 			len += iov[i].iov_len;
 		}
 		ring->slot[cur].len = len;
 		ring->head = ring->cur = nm_ring_next(ring, cur);
 		nmd->cur_tx_ring = r;
 		ioctl(nmd->fd, NIOCTXSYNC, NULL);
 		break;
 	}
 
 	return (len);
 }
 
 static __inline int
 pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
 {
 	int len = 0;
 	int i = 0;
 	int r;
 
 	for (r = nmd->cur_rx_ring; ; ) {
 		struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r);
 		uint32_t cur, idx;
 		char *buf;
 		size_t left;
 
 		if (nm_ring_empty(ring)) {
 			r++;
 			if (r > nmd->last_rx_ring)
 				r = nmd->first_rx_ring;
 			if (r == nmd->cur_rx_ring)
 				break;
 			continue;
 		}
 		cur = ring->cur;
 		idx = ring->slot[cur].buf_idx;
 		buf = NETMAP_BUF(ring, idx);
 		left = ring->slot[cur].len;
 
 		for (i = 0; i < iovcnt && left > 0; i++) {
 			if (iov[i].iov_len > left)
 				iov[i].iov_len = left;
 			memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len);
 			len += iov[i].iov_len;
 			left -= iov[i].iov_len;
 		}
 		ring->head = ring->cur = nm_ring_next(ring, cur);
 		nmd->cur_rx_ring = r;
 		ioctl(nmd->fd, NIOCRXSYNC, NULL);
 		break;
 	}
 	for (; i < iovcnt; i++)
 		iov[i].iov_len = 0;
 
 	return (len);
 }
 
 /*
  * Called to send a buffer chain out to the vale port
  */
 static void
 pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
 		    int len)
 {
 	static char pad[60]; /* all zero bytes */
 
 	if (sc->vsc_nmd == NULL)
 		return;
 
 	/*
 	 * If the length is < 60, pad out to that and add the
 	 * extra zero'd segment to the iov. It is guaranteed that
 	 * there is always an extra iov available by the caller.
 	 */
 	if (len < 60) {
 		iov[iovcnt].iov_base = pad;
 		iov[iovcnt].iov_len = 60 - len;
 		iovcnt++;
 	}
 	(void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt);
 }
 
 /*
  * Receive handler for the netmap backend.  Copies packets from the
  * netmap port into guest rx descriptor chains until either side runs
  * dry, then raises an interrupt where appropriate.
  */
 static void
 pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc)
 {
 	struct iovec segs[VTNET_MAXSEGS], *payload;
 	struct vqueue_info *rxq;
 	void *hdr;
 	int pktlen, nsegs;
 	uint16_t head;
 
 	/* Must never run without a valid netmap descriptor. */
 	assert(sc->vsc_nmd != NULL);
 
 	/*
 	 * The rx ring may not have been set up by the guest yet;
 	 * in that case drop the packet and try again later.
 	 */
 	if (!sc->vsc_rx_ready) {
 		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
 		return;
 	}
 
 	/*
 	 * No rx buffers available: drop the packet and interrupt on
 	 * empty, if that was negotiated.
 	 */
 	rxq = &sc->vsc_queues[VTNET_RXQ];
 	if (!vq_has_descs(rxq)) {
 		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
 		vq_endchains(rxq, 1);
 		return;
 	}
 
 	for (;;) {
 		/* Fetch the next descriptor chain. */
 		nsegs = vq_getchain(rxq, &head, segs, VTNET_MAXSEGS, NULL);
 		assert(nsegs >= 1 && nsegs <= VTNET_MAXSEGS);
 
 		/*
 		 * The rx header sits at the start of the chain; the
 		 * packet data goes into the space right after it.
 		 */
 		hdr = segs[0].iov_base;
 		payload = rx_iov_trim(segs, &nsegs, sc->rx_vhdrlen);
 
 		pktlen = pci_vtnet_netmap_readv(sc->vsc_nmd, payload, nsegs);
 		if (pktlen == 0) {
 			/*
 			 * Out of packets but avail ring entries remain:
 			 * hand the chain back and interrupt if needed.
 			 */
 			vq_retchain(rxq);
 			vq_endchains(rxq, 0);
 			return;
 		}
 
 		/*
 		 * Only the buffer count is meaningful in the rx header,
 		 * and only when merged rx buffers were negotiated.
 		 */
 		memset(hdr, 0, sc->rx_vhdrlen);
 		if (sc->rx_merge)
 			((struct virtio_net_rxhdr *)hdr)->vrh_bufs = 1;
 
 		/* Release this chain, then look for more. */
 		vq_relchain(rxq, head, pktlen + sc->rx_vhdrlen);
 		if (!vq_has_descs(rxq))
 			break;
 	}
 
 	/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
 	vq_endchains(rxq, 1);
 }
 
 /*
  * Event-loop callback for a readable backend descriptor: run the
  * backend-specific rx handler with the rx mutex held.
  */
 static void
 pci_vtnet_rx_callback(int fd, enum ev_type type, void *param)
 {
 	struct pci_vtnet_softc *sc;
 
 	sc = param;
 	pthread_mutex_lock(&sc->rx_mtx);
 	(*sc->pci_vtnet_rx)(sc);
 	pthread_mutex_unlock(&sc->rx_mtx);
 }
 
 /*
  * Notify handler for the rx queue.  The first notification marks the
  * rx ring as ready (vsc_rx_ready) and turns off further guest kicks
  * for this queue; later notifications do nothing.
  */
 static void
 pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
 {
 	struct pci_vtnet_softc *sc = vsc;
 
 	/*
 	 * A qnotify means that the rx process can now begin
 	 */
 	if (sc->vsc_rx_ready == 0) {
 		sc->vsc_rx_ready = 1;
-		vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY;
+		vq_kick_disable(vq);
 	}
 }
 
 /*
  * Transmit a single descriptor chain from the tx queue through the
  * backend's tx routine and return the chain to the guest.
  */
 static void
 pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
 {
 	struct iovec segs[VTNET_MAXSEGS + 1];
 	int seg, nsegs;
 	int pktlen, xferlen;
 	uint16_t head;
 
 	/*
 	 * The first descriptor of the chain is really the header, so
 	 * two totals are kept: the packet length (header excluded) and
 	 * the transfer length (header included).
 	 */
 	nsegs = vq_getchain(vq, &head, segs, VTNET_MAXSEGS, NULL);
 	assert(nsegs >= 1 && nsegs <= VTNET_MAXSEGS);
 
 	xferlen = segs[0].iov_len;
 	pktlen = 0;
 	seg = 1;
 	while (seg < nsegs) {
 		pktlen += segs[seg].iov_len;
 		xferlen += segs[seg].iov_len;
 		seg++;
 	}
 
 	DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", pktlen, nsegs));
 	(*sc->pci_vtnet_tx)(sc, &segs[1], nsegs - 1, pktlen);
 
 	/* Chain fully processed: release it, reporting the transfer length. */
 	vq_relchain(vq, head, xferlen);
 }
 
 /*
  * Notify handler for the tx queue.  If the guest has queued
  * descriptors, turn off further kicks and wake the tx thread unless it
  * is already processing (tx_in_progress).
  */
 static void
 pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
 {
 	struct pci_vtnet_softc *sc = vsc;
 
 	/*
 	 * Any ring entries to process?
 	 */
 	if (!vq_has_descs(vq))
 		return;
 
 	/* Signal the tx thread for processing */
 	pthread_mutex_lock(&sc->tx_mtx);
-	vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY;
+	vq_kick_disable(vq);
 	if (sc->tx_in_progress == 0)
 		pthread_cond_signal(&sc->tx_cond);
 	pthread_mutex_unlock(&sc->tx_mtx);
 }
 
 /*
  * Thread which will handle processing of TX desc
  *
  * The thread sleeps on tx_cond (woken by pci_vtnet_ping_txq) while the
  * device is resetting or the tx queue is empty, and otherwise drains
  * the queue one chain at a time via pci_vtnet_proctx().  It never
  * returns.
  */
 static void *
 pci_vtnet_tx_thread(void *param)
 {
 	struct pci_vtnet_softc *sc = param;
 	struct vqueue_info *vq;
 	int error;
 
 	vq = &sc->vsc_queues[VTNET_TXQ];
 
 	/*
 	 * Let us wait till the tx queue pointers get initialised &
 	 * first tx signaled
 	 */
 	pthread_mutex_lock(&sc->tx_mtx);
 	error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
 	assert(error == 0);
 
 	for (;;) {
 		/* note - tx mutex is locked here */
 		while (sc->resetting || !vq_has_descs(vq)) {
-			vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY;
-			mb();
+			vq_kick_enable(vq);
 			/*
 			 * Re-check after re-enabling kicks so that
 			 * descriptors queued in the meantime are not
 			 * missed before going to sleep.
 			 */
 			if (!sc->resetting && vq_has_descs(vq))
 				break;
 
 			sc->tx_in_progress = 0;
 			error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
 			assert(error == 0);
 		}
-		vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY;
+		vq_kick_disable(vq);
 		sc->tx_in_progress = 1;
 		/* Process the queue without holding the tx mutex. */
 		pthread_mutex_unlock(&sc->tx_mtx);
 
 		do {
 			/*
 			 * Run through entries, placing them into
 			 * iovecs and sending when an end-of-packet
 			 * is found
 			 */
 			pci_vtnet_proctx(sc, vq);
 		} while (vq_has_descs(vq));
 
 		/*
 		 * Generate an interrupt if needed.
 		 */
 		vq_endchains(vq, 1);
 
 		pthread_mutex_lock(&sc->tx_mtx);
 	}
 }
 
 #ifdef notyet
 /*
  * Notify handler for the control queue.  Compiled only when "notyet"
  * is defined; it merely logs that a notification arrived.
  */
 static void
 pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq)
 {
 
 	DPRINTF(("vtnet: control qnotify!\n\r"));
 }
 #endif
 
 /*
  * Parse a "mac=<address>" option into mac_addr.
  *
  * mac_str is modified in place by strsep().  Anything that is not a
  * "mac=" option is ignored and leaves mac_addr untouched.
  *
  * Returns 0 on success or when the option is ignored, and EINVAL when
  * the address is unparsable, multicast, or all zeroes.
  */
 static int
 pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr)
 {
 	char all_zero[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 };
 	struct ether_addr *parsed;
 	char *key;
 
 	key = strsep(&mac_str, "=");
 
 	/* Nothing to do without a value or for a key other than "mac". */
 	if (mac_str == NULL || strcmp(key, "mac") != 0)
 		return (0);
 
 	parsed = ether_aton(mac_str);
 	if (parsed == NULL || ETHER_IS_MULTICAST(parsed->octet) ||
 	    memcmp(parsed->octet, all_zero, ETHER_ADDR_LEN) == 0) {
 		fprintf(stderr, "Invalid MAC %s\n", mac_str);
 		return (EINVAL);
 	}
 
 	memcpy(mac_addr, parsed->octet, ETHER_ADDR_LEN);
 	return (0);
 }
 
 /*
  * Open /dev/<devname> as the tap backend for this device, switch it to
  * non-blocking mode and register it for read events.
  *
  * The backend rx/tx hooks are installed unconditionally.  On any
  * failure sc->vsc_tapfd is left at -1, which the caller uses to report
  * the link as down.
  */
 static void
 pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname)
 {
 	char tbuf[80];
 #ifndef WITHOUT_CAPSICUM
 	cap_rights_t rights;
 #endif
 	int opt = 1;
 
 	strcpy(tbuf, "/dev/");
 	strlcat(tbuf, devname, sizeof(tbuf));
 
 	sc->pci_vtnet_rx = pci_vtnet_tap_rx;
 	sc->pci_vtnet_tx = pci_vtnet_tap_tx;
 
 	sc->vsc_tapfd = open(tbuf, O_RDWR);
 	if (sc->vsc_tapfd == -1) {
 		WPRINTF(("open of tap device %s failed\n", tbuf));
 		return;
 	}
 
 	/*
 	 * Set non-blocking and register for read
 	 * notifications with the event loop
 	 */
 	if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
 		WPRINTF(("tap device O_NONBLOCK failed\n"));
 		close(sc->vsc_tapfd);
 		sc->vsc_tapfd = -1;
 		/*
 		 * Stop here: the descriptor is gone, so it must not be
 		 * rights-limited or handed to the event loop.
 		 */
 		return;
 	}
 
 #ifndef WITHOUT_CAPSICUM
 	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
 	if (caph_rights_limit(sc->vsc_tapfd, &rights) == -1)
 		errx(EX_OSERR, "Unable to apply rights for sandbox");
 #endif
 
 	sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
 				  EVF_READ,
 				  pci_vtnet_rx_callback,
 				  sc);
 	if (sc->vsc_mevp == NULL) {
 		WPRINTF(("Could not register event\n"));
 		close(sc->vsc_tapfd);
 		sc->vsc_tapfd = -1;
 	}
 }
 
 /*
  * Attach the device to the netmap port named by ifname and register
  * its descriptor for read events.  The backend rx/tx hooks are always
  * installed; on failure sc->vsc_nmd ends up NULL.
  */
 static void
 pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname)
 {
 	sc->pci_vtnet_rx = pci_vtnet_netmap_rx;
 	sc->pci_vtnet_tx = pci_vtnet_netmap_tx;
 
 	if ((sc->vsc_nmd = nm_open(ifname, NULL, 0, 0)) == NULL) {
 		WPRINTF(("open of netmap device %s failed\n", ifname));
 		return;
 	}
 
 	sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd, EVF_READ,
 	    pci_vtnet_rx_callback, sc);
 	if (sc->vsc_mevp != NULL)
 		return;
 
 	/* Event registration failed: undo the netmap open. */
 	WPRINTF(("Could not register event\n"));
 	nm_close(sc->vsc_nmd);
 	sc->vsc_nmd = NULL;
 }
 
 /*
  * Device-model init entry point for virtio-net.
  *
  * Allocates the softc, wires up the rx/tx virtqueues, opens the backend
  * named in opts ("vale*" -> netmap, "tap*"/"vmnet*" -> tap), settles the
  * MAC address, fills in PCI config space and starts the tx thread.
  *
  * opts has the form "<devname>[,<option>]"; the text after the first
  * comma is handed to pci_vtnet_parsemac().
  *
  * Returns 0 on success, the parsemac error code for a bad MAC, or 1 if
  * interrupt setup fails.
  */
 static int
 pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 {
 	MD5_CTX mdctx;
 	unsigned char digest[16];
 	char nstr[80];
 	char tname[MAXCOMLEN + 1];
 	struct pci_vtnet_softc *sc;
 	char *devname;
 	char *vtopts;
 	int mac_provided;
 
 	/* NOTE(review): the calloc() result is used without a NULL check. */
 	sc = calloc(1, sizeof(struct pci_vtnet_softc));
 
 	pthread_mutex_init(&sc->vsc_mtx, NULL);
 
 	vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues);
 	sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
 
 	sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ;
 	sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq;
 	sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ;
 	sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq;
 #ifdef notyet
 	sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ;
         sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq;
 #endif
  
 	/*
 	 * Attempt to open the tap device and read the MAC address
 	 * if specified
 	 */
 	mac_provided = 0;
 	sc->vsc_tapfd = -1;
 	sc->vsc_nmd = NULL;
 	if (opts != NULL) {
 		int err;
 
 		/* Work on a private copy; strsep() splits it at the first comma. */
 		devname = vtopts = strdup(opts);
 		(void) strsep(&vtopts, ",");
 
 		if (vtopts != NULL) {
 			err = pci_vtnet_parsemac(vtopts, sc->vsc_config.mac);
 			if (err != 0) {
 				/* NOTE(review): sc is not released on this path. */
 				free(devname);
 				return (err);
 			}
 			/*
 			 * NOTE(review): set even when the option was not
 			 * "mac=...", in which case parsemac returns 0
 			 * without writing a MAC.
 			 */
 			mac_provided = 1;
 		}
 
 		if (strncmp(devname, "vale", 4) == 0)
 			pci_vtnet_netmap_setup(sc, devname);
 		if (strncmp(devname, "tap", 3) == 0 ||
 		    strncmp(devname, "vmnet", 5) == 0)
 			pci_vtnet_tap_setup(sc, devname);
 
 		free(devname);
 	}
 
 	/*
 	 * The default MAC address is the standard NetApp OUI of 00-a0-98,
 	 * followed by an MD5 of the PCI slot/func number and dev name
 	 */
 	if (!mac_provided) {
 		snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot,
 		    pi->pi_func, vmname);
 
 		MD5Init(&mdctx);
 		MD5Update(&mdctx, nstr, strlen(nstr));
 		MD5Final(digest, &mdctx);
 
 		sc->vsc_config.mac[0] = 0x00;
 		sc->vsc_config.mac[1] = 0xa0;
 		sc->vsc_config.mac[2] = 0x98;
 		sc->vsc_config.mac[3] = digest[0];
 		sc->vsc_config.mac[4] = digest[1];
 		sc->vsc_config.mac[5] = digest[2];
 	}
 
 	/* initialize config space */
 	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
 	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
 	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
 	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
 
 	/* Link is up if we managed to open tap device or vale port. */
 	sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0 ||
 	    sc->vsc_nmd != NULL);
 	
 	/* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */
 	if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix()))
 		return (1);
 
 	/* use BAR 0 to map config regs in IO space */
 	vi_set_io_bar(&sc->vsc_vs, 0);
 
 	sc->resetting = 0;
 
 	/* Start with merged rx buffers; pci_vtnet_neg_features() may clear this. */
 	sc->rx_merge = 1;
 	sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
 	pthread_mutex_init(&sc->rx_mtx, NULL); 
 
 	/* 
 	 * Initialize tx semaphore & spawn TX processing thread.
 	 * As of now, only one thread for TX desc processing is
 	 * spawned. 
 	 */
 	sc->tx_in_progress = 0;
 	pthread_mutex_init(&sc->tx_mtx, NULL);
 	pthread_cond_init(&sc->tx_cond, NULL);
 	pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc);
 	snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot,
 	    pi->pi_func);
 	pthread_set_name_np(sc->tx_tid, tname);
 
 	return (0);
 }
 
 /*
  * Guest write to the device-specific config area.  Only the first six
  * bytes (the MAC address) are writable; writes to any other offset are
  * ignored.  Always returns 0.
  */
 static int
 pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value)
 {
 	struct pci_vtnet_softc *sc;
 
 	sc = vsc;
 
 	if (offset >= 6) {
 		/* silently ignore other writes */
 		DPRINTF(("vtnet: write to readonly reg %d\n\r", offset));
 		return (0);
 	}
 
 	/* The driver is allowed to change the MAC address. */
 	assert(offset + size <= 6);
 	memcpy(&sc->vsc_config.mac[offset], &value, size);
 
 	return (0);
 }
 
 /*
  * Guest read from the device-specific config area: copy size bytes
  * starting at offset within vsc_config into *retval.  Always returns 0.
  */
 static int
 pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval)
 {
 	struct pci_vtnet_softc *sc;
 	uint8_t *cfg;
 
 	sc = vsc;
 	cfg = (uint8_t *)&sc->vsc_config;
 	memcpy(retval, cfg + offset, size);
 	return (0);
 }
 
 static void
 pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features)
 {
 	struct pci_vtnet_softc *sc = vsc;
 
 	sc->vsc_features = negotiated_features;
 
 	if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) {
 		sc->rx_merge = 0;
 		/* non-merge rx header is 2 bytes shorter */
 		sc->rx_vhdrlen -= 2;
 	}
 }
 
 /*
  * PCI device-emulation descriptor for "virtio-net": init goes to
  * pci_vtnet_init, BAR accesses go to the vi_pci_read/vi_pci_write
  * handlers.  PCI_EMUL_SET() registers the descriptor.
  */
 struct pci_devemu pci_de_vnet = {
 	.pe_emu = 	"virtio-net",
 	.pe_init =	pci_vtnet_init,
 	.pe_barwrite =	vi_pci_write,
 	.pe_barread =	vi_pci_read
 };
 PCI_EMUL_SET(pci_de_vnet);
Index: head/usr.sbin/bhyve/pci_virtio_scsi.c
===================================================================
--- head/usr.sbin/bhyve/pci_virtio_scsi.c	(revision 348928)
+++ head/usr.sbin/bhyve/pci_virtio_scsi.c	(revision 348929)
@@ -1,737 +1,737 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>.
  * Copyright (c) 2018 Marcelo Araujo <araujo@FreeBSD.org>.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/linker_set.h>
 #include <sys/types.h>
 #include <sys/uio.h>
 #include <sys/time.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 
 #include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdbool.h>
 #include <string.h>
 #include <unistd.h>
 #include <assert.h>
 #include <pthread.h>
 #include <pthread_np.h>
 
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_message.h>
 #include <cam/ctl/ctl.h>
 #include <cam/ctl/ctl_io.h>
 #include <cam/ctl/ctl_backend.h>
 #include <cam/ctl/ctl_ioctl.h>
 #include <cam/ctl/ctl_util.h>
 #include <cam/ctl/ctl_scsi_all.h>
 #include <camlib.h>
 
 #include "bhyverun.h"
 #include "pci_emul.h"
 #include "virtio.h"
 #include "iov.h"
 
 /* Ring size assigned to every virtqueue of the device. */
 #define VTSCSI_RINGSZ		64
 /* Number of request queues (also advertised as num_queues). */
 #define	VTSCSI_REQUESTQ		1
 /* Worker threads started per request queue. */
 #define	VTSCSI_THR_PER_Q	16
 /* Total virtqueues: control queue, event queue, then the request queues. */
 #define	VTSCSI_MAXQ		(VTSCSI_REQUESTQ + 2)
 /* Maximum descriptors (iovec entries) handled per chain. */
 #define	VTSCSI_MAXSEG		64
 
 /* Size of the device-readable request header, including the CDB. */
 #define	VTSCSI_IN_HEADER_LEN(_sc)	\
 	(sizeof(struct pci_vtscsi_req_cmd_rd) + _sc->vss_config.cdb_size)
 
 /* Size of the device-writable response header, including sense data. */
 #define	VTSCSI_OUT_HEADER_LEN(_sc) 	\
 	(sizeof(struct pci_vtscsi_req_cmd_wr) + _sc->vss_config.sense_size)
 
 /* Limits reported to the guest in the device configuration. */
 #define	VIRTIO_SCSI_MAX_CHANNEL	0
 #define	VIRTIO_SCSI_MAX_TARGET	0
 #define	VIRTIO_SCSI_MAX_LUN	16383
 
 /* Feature bits; NOTE(review): not referenced in the code shown here. */
 #define	VIRTIO_SCSI_F_INOUT	(1 << 0)
 #define	VIRTIO_SCSI_F_HOTPLUG	(1 << 1)
 #define	VIRTIO_SCSI_F_CHANGE	(1 << 2)
 
 /* DPRINTF prints only when pci_vtscsi_debug is non-zero; WPRINTF always. */
 static int pci_vtscsi_debug = 0;
 #define	DPRINTF(params) if (pci_vtscsi_debug) printf params
 #define	WPRINTF(params) printf params
 
 /*
  * Device-specific configuration area.  It is filled in by
  * pci_vtscsi_reset() and copied out byte-wise by pci_vtscsi_cfgread(),
  * hence the packed layout.
  */
 struct pci_vtscsi_config {
 	uint32_t num_queues;
 	uint32_t seg_max;
 	uint32_t max_sectors;
 	uint32_t cmd_per_lun;
 	uint32_t event_info_size;
 	uint32_t sense_size;	/* sense bytes following the response header */
 	uint32_t cdb_size;	/* CDB bytes following the request header */
 	uint16_t max_channel;
 	uint16_t max_target;
 	uint32_t max_lun;
 } __attribute__((packed));
 
 /*
  * Per-request-queue state shared between the notify handler, which
  * enqueues requests, and the worker threads, which execute them.
  */
 struct pci_vtscsi_queue {
 	struct pci_vtscsi_softc *         vsq_sc;	/* owning device */
 	struct vqueue_info *              vsq_vq;	/* backing virtqueue */
 	pthread_mutex_t                   vsq_mtx;	/* guards vsq_requests; paired with vsq_cv */
 	pthread_mutex_t                   vsq_qmtx;	/* held by workers while returning chains to vsq_vq */
 	pthread_cond_t                    vsq_cv;	/* signalled when a request is queued */
 	STAILQ_HEAD(, pci_vtscsi_request) vsq_requests;	/* pending requests, FIFO */
 	LIST_HEAD(, pci_vtscsi_worker)    vsq_workers;	/* workers serving this queue */
 };
 
 /* One worker thread serving a request queue (see pci_vtscsi_proc). */
 struct pci_vtscsi_worker {
 	struct pci_vtscsi_queue *     vsw_queue;	/* queue this worker drains */
 	pthread_t                     vsw_thread;
 	bool                          vsw_exiting;	/* when set, the worker loop terminates */
 	LIST_ENTRY(pci_vtscsi_worker) vsw_link;	/* linkage on vsq_workers */
 };
 
 /*
  * A request taken off a virtqueue and waiting for a worker.  The
  * descriptor chain is split into its leading device-readable part
  * (vsr_iov_in) and the remainder (vsr_iov_out).
  */
 struct pci_vtscsi_request {
 	struct pci_vtscsi_queue * vsr_queue;
 	struct iovec              vsr_iov_in[VTSCSI_MAXSEG];
 	int                       vsr_niov_in;
 	struct iovec              vsr_iov_out[VTSCSI_MAXSEG];
 	int                       vsr_niov_out;
 	uint32_t                  vsr_idx;	/* chain index passed back to vq_relchain() */
 	STAILQ_ENTRY(pci_vtscsi_request) vsr_link;
 };
 
 /*
  * Per-device softc
  */
 struct pci_vtscsi_softc {
 	struct virtio_softc      vss_vs;
 	struct vqueue_info       vss_vq[VTSCSI_MAXQ];	/* [0] control, [1] event, [2..] request */
 	struct pci_vtscsi_queue  vss_queues[VTSCSI_REQUESTQ];	/* worker state per request queue */
 	pthread_mutex_t          vss_mtx;	/* installed as vss_vs.vs_mtx */
 	int                      vss_iid;	/* initiator id used for CTL I/O */
 	int                      vss_ctl_fd;	/* descriptor of the opened CTL device */
 	uint32_t                 vss_features;	/* negotiated feature bits */
 	struct pci_vtscsi_config vss_config;
 };
 
 /* Control request type and subtypes for task management functions. */
 #define	VIRTIO_SCSI_T_TMF			0
 #define	VIRTIO_SCSI_T_TMF_ABORT_TASK		0
 #define	VIRTIO_SCSI_T_TMF_ABORT_TASK_SET	1
 #define	VIRTIO_SCSI_T_TMF_CLEAR_ACA		2
 #define	VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET	3
 #define	VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET	4
 #define	VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET	5
 #define	VIRTIO_SCSI_T_TMF_QUERY_TASK		6
 #define	VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 	7
 
 /* command-specific response values */
 #define	VIRTIO_SCSI_S_FUNCTION_COMPLETE		0
 #define	VIRTIO_SCSI_S_FUNCTION_SUCCEEDED	10
 #define	VIRTIO_SCSI_S_FUNCTION_REJECTED		11
 
 /*
  * Task management request as laid out in the control queue buffer.
  * The final byte (response) is the part written back to the guest.
  */
 struct pci_vtscsi_ctrl_tmf {
 	uint32_t type;
 	uint32_t subtype;
 	uint8_t lun[8];
 	uint64_t id;
 	uint8_t response;
 } __attribute__((packed));
 
 /*
  * Control request type for asynchronous notification queries.  The
  * EVT_ASYNC values are presumably bit flags for the event fields below;
  * NOTE(review): they are not referenced in the code shown here.
  */
 #define	VIRTIO_SCSI_T_AN_QUERY			1
 #define	VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2
 #define	VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT	4
 #define	VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST	8
 #define	VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE	16
 #define	VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST	32
 #define	VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY	64
 
 /* Asynchronous notification request as laid out in the control queue buffer. */
 struct pci_vtscsi_ctrl_an {
 	uint32_t type;
 	uint8_t lun[8];
 	uint32_t event_requested;
 	uint32_t event_actual;
 	uint8_t response;
 } __attribute__((packed));
 
 /* command-specific response values */
 #define	VIRTIO_SCSI_S_OK 			0
 #define	VIRTIO_SCSI_S_OVERRUN			1
 #define	VIRTIO_SCSI_S_ABORTED			2
 #define	VIRTIO_SCSI_S_BAD_TARGET		3
 #define	VIRTIO_SCSI_S_RESET			4
 #define	VIRTIO_SCSI_S_BUSY			5
 #define	VIRTIO_SCSI_S_TRANSPORT_FAILURE		6
 #define	VIRTIO_SCSI_S_TARGET_FAILURE		7
 #define	VIRTIO_SCSI_S_NEXUS_FAILURE		8
 #define	VIRTIO_SCSI_S_FAILURE			9
 #define	VIRTIO_SCSI_S_INCORRECT_LUN		12
 
 /* task_attr */
 #define	VIRTIO_SCSI_S_SIMPLE			0
 #define	VIRTIO_SCSI_S_ORDERED			1
 #define	VIRTIO_SCSI_S_HEAD			2
 #define	VIRTIO_SCSI_S_ACA			3
 
 /*
  * Event record; its size is advertised to the guest as
  * event_info_size.
  */
 struct pci_vtscsi_event {
 	uint32_t event;
 	uint8_t lun[8];
 	uint32_t reason;
 } __attribute__((packed));
 
 /*
  * Device-readable request header.  The trailing cdb[] occupies
  * vss_config.cdb_size bytes (see VTSCSI_IN_HEADER_LEN).
  */
 struct pci_vtscsi_req_cmd_rd {
 	uint8_t lun[8];
 	uint64_t id;
 	uint8_t task_attr;	/* one of the task_attr values */
 	uint8_t prio;
 	uint8_t crn;
 	uint8_t cdb[];
 } __attribute__((packed));
 
 /*
  * Device-writable response header.  The trailing sense[] occupies
  * vss_config.sense_size bytes (see VTSCSI_OUT_HEADER_LEN).
  */
 struct pci_vtscsi_req_cmd_wr {
 	uint32_t sense_len;	/* valid bytes in sense[] */
 	uint32_t residual;
 	uint16_t status_qualifier;
 	uint8_t status;		/* SCSI status of the command */
 	uint8_t response;	/* VIRTIO_SCSI_S_* response code */
 	uint8_t sense[];
 } __attribute__((packed));
 
 /* Forward declarations for the file-local functions defined below. */
 static void *pci_vtscsi_proc(void *);
 static void pci_vtscsi_reset(void *);
 static void pci_vtscsi_neg_features(void *, uint64_t);
 static int pci_vtscsi_cfgread(void *, int, int, uint32_t *);
 static int pci_vtscsi_cfgwrite(void *, int, int, uint32_t);
 static inline int pci_vtscsi_get_lun(uint8_t *);
 static int pci_vtscsi_control_handle(struct pci_vtscsi_softc *, void *, size_t);
 static int pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *,
     struct pci_vtscsi_ctrl_tmf *);
 static int pci_vtscsi_an_handle(struct pci_vtscsi_softc *,
     struct pci_vtscsi_ctrl_an *);
 static int pci_vtscsi_request_handle(struct pci_vtscsi_queue *, struct iovec *,
     int, struct iovec *, int);
 static void pci_vtscsi_controlq_notify(void *, struct vqueue_info *);
 static void pci_vtscsi_eventq_notify(void *, struct vqueue_info *);
 static void pci_vtscsi_requestq_notify(void *, struct vqueue_info *);
 static int  pci_vtscsi_init_queue(struct pci_vtscsi_softc *,
     struct pci_vtscsi_queue *, int);
 static int pci_vtscsi_init(struct vmctx *, struct pci_devinst *, char *);
 
 /*
  * Constants and callbacks describing this device to the common virtio
  * layer; passed to vi_softc_linkup() in pci_vtscsi_init().
  */
 static struct virtio_consts vtscsi_vi_consts = {
 	"vtscsi",				/* our name */
 	VTSCSI_MAXQ,				/* we support 2+n virtqueues */
 	sizeof(struct pci_vtscsi_config),	/* config reg size */
 	pci_vtscsi_reset,			/* reset */
 	NULL,					/* device-wide qnotify */
 	pci_vtscsi_cfgread,			/* read virtio config */
 	pci_vtscsi_cfgwrite,			/* write virtio config */
 	pci_vtscsi_neg_features,		/* apply negotiated features */
 	0,					/* our capabilities */
 };
 
 /*
  * Worker thread body.  Repeatedly takes the oldest pending request off
  * its queue, executes it, and returns the descriptor chain to the
  * guest.  Terminates (returning NULL) once vsw_exiting is set.
  */
 static void *
 pci_vtscsi_proc(void *arg)
 {
 	struct pci_vtscsi_worker *self;
 	struct pci_vtscsi_queue *queue;
 	struct pci_vtscsi_request *job;
 	int nbytes;
 
 	self = (struct pci_vtscsi_worker *)arg;
 	queue = self->vsw_queue;
 
 	pthread_mutex_lock(&queue->vsq_mtx);
 	for (;;) {
 		/* The request-list mutex is held at the top of each pass. */
 		while (STAILQ_EMPTY(&queue->vsq_requests)
 		    && !self->vsw_exiting)
 			pthread_cond_wait(&queue->vsq_cv, &queue->vsq_mtx);
 
 		if (self->vsw_exiting) {
 			pthread_mutex_unlock(&queue->vsq_mtx);
 			return (NULL);
 		}
 
 		job = STAILQ_FIRST(&queue->vsq_requests);
 		STAILQ_REMOVE_HEAD(&queue->vsq_requests, vsr_link);
 		pthread_mutex_unlock(&queue->vsq_mtx);
 
 		/* Run the request with no locks held. */
 		nbytes = pci_vtscsi_request_handle(queue, job->vsr_iov_in,
 		    job->vsr_niov_in, job->vsr_iov_out, job->vsr_niov_out);
 
 		/* Completions on the virtqueue are serialised by vsq_qmtx. */
 		pthread_mutex_lock(&queue->vsq_qmtx);
 		vq_relchain(queue->vsq_vq, job->vsr_idx, nbytes);
 		vq_endchains(queue->vsq_vq, 0);
 		pthread_mutex_unlock(&queue->vsq_qmtx);
 
 		DPRINTF(("virtio-scsi: request <idx=%d> completed\n",
 		    job->vsr_idx));
 		free(job);
 
 		pthread_mutex_lock(&queue->vsq_mtx);
 	}
 }
 
 static void
 pci_vtscsi_reset(void *vsc)
 {
 	struct pci_vtscsi_softc *sc;
 
 	sc = vsc;
 
 	DPRINTF(("vtscsi: device reset requested\n"));
 	vi_reset_dev(&sc->vss_vs);
 
 	/* initialize config structure */
 	sc->vss_config = (struct pci_vtscsi_config){
 		.num_queues = VTSCSI_REQUESTQ,
 		.seg_max = VTSCSI_MAXSEG,
 		.max_sectors = 2,
 		.cmd_per_lun = 1,
 		.event_info_size = sizeof(struct pci_vtscsi_event),
 		.sense_size = 96,
 		.cdb_size = 32,
 		.max_channel = VIRTIO_SCSI_MAX_CHANNEL,
 		.max_target = VIRTIO_SCSI_MAX_TARGET,
 		.max_lun = VIRTIO_SCSI_MAX_LUN
 	};
 }
 
 /*
  * Feature negotiation callback: remember the negotiated feature bits.
  * The 64-bit value is stored into the 32-bit vss_features field.
  */
 static void
 pci_vtscsi_neg_features(void *vsc, uint64_t negotiated_features)
 {
 	struct pci_vtscsi_softc *sc = vsc;
 
 	sc->vss_features = negotiated_features;
 }
 
 /*
  * Guest read from the device-specific config area: copy size bytes
  * starting at offset within vss_config into *retval.  Always returns 0.
  */
 static int
 pci_vtscsi_cfgread(void *vsc, int offset, int size, uint32_t *retval)
 {
 	struct pci_vtscsi_softc *sc;
 	uint8_t *cfg;
 
 	sc = vsc;
 	cfg = (uint8_t *)&sc->vss_config;
 	memcpy(retval, cfg + offset, size);
 	return (0);
 }
 
 /*
  * Guest write to the device-specific config area.  Writes are accepted
  * and discarded; always returns 0.
  */
 static int
 pci_vtscsi_cfgwrite(void *vsc, int offset, int size, uint32_t val)
 {
 
 	return (0);
 }
 
 /*
  * Extract the 14-bit LUN number from an 8-byte virtio LUN field: the
  * low six bits of byte 2 form the high part, byte 3 the low part.
  */
 static inline int
 pci_vtscsi_get_lun(uint8_t *lun)
 {
 	int high, low;
 
 	high = lun[2] & 0x3f;
 	low = lun[3];
 	return ((high << 8) | low);
 }
 
 static int
 pci_vtscsi_control_handle(struct pci_vtscsi_softc *sc, void *buf,
     size_t bufsize)
 {
 	struct pci_vtscsi_ctrl_tmf *tmf;
 	struct pci_vtscsi_ctrl_an *an;
 	uint32_t type;
 
 	type = *(uint32_t *)buf;
 
 	if (type == VIRTIO_SCSI_T_TMF) {
 		tmf = (struct pci_vtscsi_ctrl_tmf *)buf;
 		return (pci_vtscsi_tmf_handle(sc, tmf));
 	}
 
 	if (type == VIRTIO_SCSI_T_AN_QUERY) {
 		an = (struct pci_vtscsi_ctrl_an *)buf;
 		return (pci_vtscsi_an_handle(sc, an));
 	}
 
 	return (0);
 }
 
 static int
 pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *sc,
     struct pci_vtscsi_ctrl_tmf *tmf)
 {
 	union ctl_io *io;
 	int err;
 
 	io = ctl_scsi_alloc_io(sc->vss_iid);
 	ctl_scsi_zero_io(io);
 
 	io->io_hdr.io_type = CTL_IO_TASK;
 	io->io_hdr.nexus.initid = sc->vss_iid;
 	io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(tmf->lun);
 	io->taskio.tag_type = CTL_TAG_SIMPLE;
 	io->taskio.tag_num = (uint32_t)tmf->id;
 
 	switch (tmf->subtype) {
 	case VIRTIO_SCSI_T_TMF_ABORT_TASK:
 		io->taskio.task_action = CTL_TASK_ABORT_TASK;
 		break;
 
 	case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET:
 		io->taskio.task_action = CTL_TASK_ABORT_TASK_SET;
 		break;
 
 	case VIRTIO_SCSI_T_TMF_CLEAR_ACA:
 		io->taskio.task_action = CTL_TASK_CLEAR_ACA;
 		break;
 
 	case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET:
 		io->taskio.task_action = CTL_TASK_CLEAR_TASK_SET;
 		break;
 
 	case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET:
 		io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET;
 		break;
 
 	case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET:
 		io->taskio.task_action = CTL_TASK_LUN_RESET;
 		break;
 
 	case VIRTIO_SCSI_T_TMF_QUERY_TASK:
 		io->taskio.task_action = CTL_TASK_QUERY_TASK;
 		break;
 
 	case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET:
 		io->taskio.task_action = CTL_TASK_QUERY_TASK_SET;
 		break;
 	}
 
 	if (pci_vtscsi_debug) {
 		struct sbuf *sb = sbuf_new_auto();
 		ctl_io_sbuf(io, sb);
 		sbuf_finish(sb);
 		DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb)));
 		sbuf_delete(sb);
 	}
 
 	err = ioctl(sc->vss_ctl_fd, CTL_IO, io);
 	if (err != 0)
 		WPRINTF(("CTL_IO: err=%d (%s)\n", errno, strerror(errno)));
 
 	tmf->response = io->taskio.task_status;
 	ctl_scsi_free_io(io);
 	return (1);
 }
 
 /*
  * Asynchronous notification query handler.  Currently a stub: it does
  * nothing and returns 0, so no response bytes are written back.
  */
 static int
 pci_vtscsi_an_handle(struct pci_vtscsi_softc *sc,
     struct pci_vtscsi_ctrl_an *an)
 {
 
 	return (0);
 }
 
 /*
  * Execute one SCSI request through CTL.
  *
  * iov_in/niov_in describe the device-readable part of the chain
  * (request header, then any data-out payload); iov_out/niov_out the
  * device-writable part (response header, then any data-in buffer).
  *
  * Returns the number of bytes written to the guest: the response
  * header plus whatever CTL reports as transferred data.
  */
 static int
 pci_vtscsi_request_handle(struct pci_vtscsi_queue *q, struct iovec *iov_in,
     int niov_in, struct iovec *iov_out, int niov_out)
 {
 	struct pci_vtscsi_softc *sc = q->vsq_sc;
 	struct pci_vtscsi_req_cmd_rd *cmd_rd = NULL;
 	struct pci_vtscsi_req_cmd_wr *cmd_wr;
 	struct iovec data_iov_in[VTSCSI_MAXSEG], data_iov_out[VTSCSI_MAXSEG];
 	union ctl_io *io;
 	int data_niov_in, data_niov_out;
 	void *ext_data_ptr = NULL;
 	uint32_t ext_data_len = 0, ext_sg_entries = 0;
 	int err, nxferred;
 
 	/* Split each direction into its header and its data payload. */
 	seek_iov(iov_in, niov_in, data_iov_in, &data_niov_in,
 	    VTSCSI_IN_HEADER_LEN(sc));
 	seek_iov(iov_out, niov_out, data_iov_out, &data_niov_out,
 	    VTSCSI_OUT_HEADER_LEN(sc));
 
 	truncate_iov(iov_in, &niov_in, VTSCSI_IN_HEADER_LEN(sc));
 	truncate_iov(iov_out, &niov_out, VTSCSI_OUT_HEADER_LEN(sc));
 	iov_to_buf(iov_in, niov_in, (void **)&cmd_rd);
 
 	/*
 	 * Zero the response header: on the error path only the response
 	 * byte is set, and the whole header is copied to the guest.
 	 */
 	cmd_wr = calloc(1, VTSCSI_OUT_HEADER_LEN(sc));
 	io = ctl_scsi_alloc_io(sc->vss_iid);
 	ctl_scsi_zero_io(io);
 
 	io->io_hdr.nexus.initid = sc->vss_iid;
 	io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(cmd_rd->lun);
 
 	io->io_hdr.io_type = CTL_IO_SCSI;
 
 	if (data_niov_in > 0) {
 		ext_data_ptr = (void *)data_iov_in;
 		ext_sg_entries = data_niov_in;
 		ext_data_len = count_iov(data_iov_in, data_niov_in);
 		io->io_hdr.flags |= CTL_FLAG_DATA_OUT;
 	} else if (data_niov_out > 0) {
 		ext_data_ptr = (void *)data_iov_out;
 		ext_sg_entries = data_niov_out;
 		ext_data_len = count_iov(data_iov_out, data_niov_out);
 		io->io_hdr.flags |= CTL_FLAG_DATA_IN;
 	}
 
 	io->scsiio.sense_len = sc->vss_config.sense_size;
 	io->scsiio.tag_num = (uint32_t)cmd_rd->id;
 	switch (cmd_rd->task_attr) {
 	case VIRTIO_SCSI_S_ORDERED:
 		io->scsiio.tag_type = CTL_TAG_ORDERED;
 		break;
 	case VIRTIO_SCSI_S_HEAD:
 		io->scsiio.tag_type = CTL_TAG_HEAD_OF_QUEUE;
 		break;
 	case VIRTIO_SCSI_S_ACA:
 		io->scsiio.tag_type = CTL_TAG_ACA;
 		break;
 	case VIRTIO_SCSI_S_SIMPLE:
 	default:
 		io->scsiio.tag_type = CTL_TAG_SIMPLE;
 		break;
 	}
 	io->scsiio.ext_sg_entries = ext_sg_entries;
 	io->scsiio.ext_data_ptr = ext_data_ptr;
 	io->scsiio.ext_data_len = ext_data_len;
 	io->scsiio.ext_data_filled = 0;
 	io->scsiio.cdb_len = sc->vss_config.cdb_size;
 	memcpy(io->scsiio.cdb, cmd_rd->cdb, sc->vss_config.cdb_size);
 
 	if (pci_vtscsi_debug) {
 		struct sbuf *sb = sbuf_new_auto();
 		ctl_io_sbuf(io, sb);
 		sbuf_finish(sb);
 		DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb)));
 		sbuf_delete(sb);
 	}
 
 	err = ioctl(sc->vss_ctl_fd, CTL_IO, io);
 	if (err != 0) {
 		WPRINTF(("CTL_IO: err=%d (%s)\n", errno, strerror(errno)));
 		cmd_wr->response = VIRTIO_SCSI_S_FAILURE;
 	} else {
 		cmd_wr->sense_len = MIN(io->scsiio.sense_len,
 		    sc->vss_config.sense_size);
 		cmd_wr->residual = io->scsiio.residual;
 		cmd_wr->status = io->scsiio.scsi_status;
 		cmd_wr->response = VIRTIO_SCSI_S_OK;
 		memcpy(&cmd_wr->sense, &io->scsiio.sense_data,
 		    cmd_wr->sense_len);
 	}
 
 	buf_to_iov(cmd_wr, VTSCSI_OUT_HEADER_LEN(sc), iov_out, niov_out, 0);
 
 	/* Read the transfer count before the ctl_io is released. */
 	nxferred = VTSCSI_OUT_HEADER_LEN(sc) + io->scsiio.ext_data_filled;
 
 	free(cmd_rd);
 	free(cmd_wr);
 	ctl_scsi_free_io(io);
 	return (nxferred);
 }
 
 /*
  * Notify handler for the control queue.  Each chain is flattened into
  * a scratch buffer, handled, and the trailing response bytes produced
  * by the handler are copied back to the same offset in the chain.
  */
 static void
 pci_vtscsi_controlq_notify(void *vsc, struct vqueue_info *vq)
 {
 	struct pci_vtscsi_softc *sc = vsc;
 	struct iovec segs[VTSCSI_MAXSEG];
 	void *scratch = NULL;
 	size_t reqlen, resp_off;
 	uint16_t head, nsegs;
 	int resplen;
 
 	while (vq_has_descs(vq)) {
 		nsegs = vq_getchain(vq, &head, segs, VTSCSI_MAXSEG, NULL);
 		/* The scratch buffer is reused across iterations. */
 		reqlen = iov_to_buf(segs, nsegs, &scratch);
 		resplen = pci_vtscsi_control_handle(sc, scratch, reqlen);
 
 		/* The response occupies the last resplen bytes of the request. */
 		resp_off = reqlen - resplen;
 		buf_to_iov((uint8_t *)scratch + resp_off, resplen, segs,
 		    nsegs, resp_off);
 
 		/* Release this chain and handle more. */
 		vq_relchain(vq, head, resplen);
 	}
 
 	/* Generate interrupt if appropriate. */
 	vq_endchains(vq, 1);
 	free(scratch);
 }
 
 /*
  * Notify handler for the event queue.  No events are processed here;
  * the handler only turns off further guest kicks for the queue.
  */
 static void
 pci_vtscsi_eventq_notify(void *vsc, struct vqueue_info *vq)
 {
 
-	vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY;
+	vq_kick_disable(vq);
 }
 
 /*
  * Notify handler for a request queue.  Every available chain is
  * wrapped in a pci_vtscsi_request and handed to the queue's workers;
  * the worker that runs it returns the chain to the guest.
  */
 static void
 pci_vtscsi_requestq_notify(void *vsc, struct vqueue_info *vq)
 {
 	struct pci_vtscsi_softc *sc = vsc;
 	struct pci_vtscsi_queue *queue;
 	struct pci_vtscsi_request *job;
 	struct iovec segs[VTSCSI_MAXSEG];
 	uint16_t segflags[VTSCSI_MAXSEG];
 	uint16_t head, nsegs;
 	int nread;
 
 	/* Request queues start at virtqueue index 2. */
 	queue = &sc->vss_queues[vq->vq_num - 2];
 
 	while (vq_has_descs(vq)) {
 		nsegs = vq_getchain(vq, &head, segs, VTSCSI_MAXSEG, segflags);
 
 		/* Count the leading run of device-readable descriptors. */
 		nread = 0;
 		while (nread < nsegs &&
 		    !(segflags[nread] & VRING_DESC_F_WRITE))
 			nread++;
 
 		job = calloc(1, sizeof(*job));
 		job->vsr_queue = queue;
 		job->vsr_idx = head;
 		job->vsr_niov_in = nread;
 		job->vsr_niov_out = nsegs - nread;
 		memcpy(job->vsr_iov_in, &segs[0],
 		    job->vsr_niov_in * sizeof(struct iovec));
 		memcpy(job->vsr_iov_out, &segs[nread],
 		    job->vsr_niov_out * sizeof(struct iovec));
 
 		/* Queue the job and wake one worker. */
 		pthread_mutex_lock(&queue->vsq_mtx);
 		STAILQ_INSERT_TAIL(&queue->vsq_requests, job, vsr_link);
 		pthread_cond_signal(&queue->vsq_cv);
 		pthread_mutex_unlock(&queue->vsq_mtx);
 
 		DPRINTF(("virtio-scsi: request <idx=%d> enqueued\n", head));
 	}
 }
 
 /*
  * Set up request queue number num: bind it to its virtqueue, initialise
  * its locks and lists, and start VTSCSI_THR_PER_Q worker threads named
  * "vtscsi:<queue>-<worker>".  Always returns 0.
  */
 static int
 pci_vtscsi_init_queue(struct pci_vtscsi_softc *sc, 
     struct pci_vtscsi_queue *queue, int num)
 {
 	char thrname[MAXCOMLEN + 1];
 	struct pci_vtscsi_worker *w;
 	int t;
 
 	/* Request queues follow the control and event virtqueues. */
 	queue->vsq_vq = &sc->vss_vq[num + 2];
 	queue->vsq_sc = sc;
 
 	STAILQ_INIT(&queue->vsq_requests);
 	LIST_INIT(&queue->vsq_workers);
 	pthread_mutex_init(&queue->vsq_mtx, NULL);
 	pthread_mutex_init(&queue->vsq_qmtx, NULL);
 	pthread_cond_init(&queue->vsq_cv, NULL);
 
 	t = 0;
 	while (t < VTSCSI_THR_PER_Q) {
 		w = calloc(1, sizeof(*w));
 		w->vsw_queue = queue;
 
 		pthread_create(&w->vsw_thread, NULL, &pci_vtscsi_proc,
 		    (void *)w);
 
 		snprintf(thrname, sizeof(thrname), "vtscsi:%d-%d", num, t);
 		pthread_set_name_np(w->vsw_thread, thrname);
 		LIST_INSERT_HEAD(&queue->vsq_workers, w, vsw_link);
 		t++;
 	}
 
 	return (0);
 }
 
 /*
  * Device-model init entry point for virtio-scsi.
  *
  * opts is a comma-separated list.  A first element without '=' is taken
  * as the CTL device path; "dev=<path>" and "iid=<n>" are also accepted.
  * Anything else is rejected.  The CTL device defaults to /dev/cam/ctl.
  *
  * Returns 0 on success and 1 on a bad option, a failed open, or failed
  * interrupt setup.
  */
 static int
 pci_vtscsi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 {
 	struct pci_vtscsi_softc *sc;
 	char *opt, *optname;
 	const char *devname;
 	int i, optidx = 0;
 
 	/* NOTE(review): the calloc() result is used without a NULL check. */
 	sc = calloc(1, sizeof(struct pci_vtscsi_softc));
 	devname = "/dev/cam/ctl";
 	/* strsep() splits opts in place; devname may point into it. */
 	while ((opt = strsep(&opts, ",")) != NULL) {
 		optname = strsep(&opt, "=");
 		if (opt == NULL && optidx == 0) {
 			/* Bare first element: the device path, if non-empty. */
 			if (optname[0] != 0)
 				devname = optname;
 		} else if (strcmp(optname, "dev") == 0 && opt != NULL) {
 			devname = opt;
 		} else if (strcmp(optname, "iid") == 0 && opt != NULL) {
 			sc->vss_iid = strtoul(opt, NULL, 10);
 		} else {
 			fprintf(stderr, "Invalid option %s\n", optname);
 			free(sc);
 			return (1);
 		}
 		optidx++;
 	}
 
 	sc->vss_ctl_fd = open(devname, O_RDWR);
 	if (sc->vss_ctl_fd < 0) {
 		WPRINTF(("cannot open %s: %s\n", devname, strerror(errno)));
 		free(sc);
 		return (1);
 	}
 
 	vi_softc_linkup(&sc->vss_vs, &vtscsi_vi_consts, sc, pi, sc->vss_vq);
 	/*
 	 * NOTE(review): vss_mtx is never passed to pthread_mutex_init()
 	 * in the code shown; it relies on the zeroed calloc() memory.
 	 */
 	sc->vss_vs.vs_mtx = &sc->vss_mtx;
 
 	/* controlq */
 	sc->vss_vq[0].vq_qsize = VTSCSI_RINGSZ;
 	sc->vss_vq[0].vq_notify = pci_vtscsi_controlq_notify;
 
 	/* eventq */
 	sc->vss_vq[1].vq_qsize = VTSCSI_RINGSZ;
 	sc->vss_vq[1].vq_notify = pci_vtscsi_eventq_notify;
 
 	/* request queues */
 	for (i = 2; i < VTSCSI_MAXQ; i++) {
 		sc->vss_vq[i].vq_qsize = VTSCSI_RINGSZ;
 		sc->vss_vq[i].vq_notify = pci_vtscsi_requestq_notify;
 		/* Also starts the worker threads for this queue. */
 		pci_vtscsi_init_queue(sc, &sc->vss_queues[i - 2], i - 2);
 	}
 
 	/* initialize config space */
 	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_SCSI);
 	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
 	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_SCSI);
 	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
 
 	/*
 	 * NOTE(review): on failure here the CTL descriptor stays open and
 	 * the worker threads started above keep running.
 	 */
 	if (vi_intr_init(&sc->vss_vs, 1, fbsdrun_virtio_msix()))
 		return (1);
 	vi_set_io_bar(&sc->vss_vs, 0);
 
 	return (0);
 }
 
 
 /*
  * PCI device-emulation descriptor for "virtio-scsi": init goes to
  * pci_vtscsi_init, BAR accesses go to the vi_pci_read/vi_pci_write
  * handlers.  PCI_EMUL_SET() registers the descriptor.
  */
 struct pci_devemu pci_de_vscsi = {
 	.pe_emu =	"virtio-scsi",
 	.pe_init =	pci_vtscsi_init,
 	.pe_barwrite =	vi_pci_write,
 	.pe_barread =	vi_pci_read
 };
 PCI_EMUL_SET(pci_de_vscsi);
Index: head/usr.sbin/bhyve/virtio.c
===================================================================
--- head/usr.sbin/bhyve/virtio.c	(revision 348928)
+++ head/usr.sbin/bhyve/virtio.c	(revision 348929)
@@ -1,795 +1,796 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2013  Chris Torek <torek @ torek net>
  * All rights reserved.
  * Copyright (c) 2019 Joyent, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/uio.h>
 
 #include <machine/atomic.h>
 
 #include <stdio.h>
 #include <stdint.h>
 #include <pthread.h>
 #include <pthread_np.h>
 
 #include "bhyverun.h"
 #include "pci_emul.h"
 #include "virtio.h"
 
 /*
  * Functions for dealing with generalized "virtual devices" as
  * defined by <https://www.google.com/#output=search&q=virtio+spec>
  */
 
 /*
  * Convert a virtio_softc pointer into the pointer that is handed to
  * the device-specific callbacks.  Today this is a plain cast, which is
  * valid because vi_softc_linkup() asserts that both addresses match.
  * Keeping the conversion behind a macro leaves room to relax the
  * "virtio softc comes at the front of virtio-based device softc"
  * constraint later.
  */
 #define	DEV_SOFTC(vs) ((void *)(vs))
 
 /*
  * Bind a virtio_softc to its constants table, to the PCI device
  * instance that emulates it, and to its array of virtqueues.
  *
  * dev_softc is the device-specific softc; it must live at the same
  * address as vs (this is asserted).  Each of the vc->vc_nvq entries
  * of queues[] receives a back pointer to vs and its own queue number.
  */
 void
 vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
 		void *dev_softc, struct pci_devinst *pi,
 		struct vqueue_info *queues)
 {
 	struct vqueue_info *vq;
 	int qnum;
 
 	/* The virtio softc has to sit at the very front of the device softc. */
 	assert(dev_softc == (void *)vs);
 
 	vs->vs_vc = vc;
 	vs->vs_pi = pi;
 	pi->pi_arg = vs;
 	vs->vs_queues = queues;
 
 	/* Tell every queue who owns it and which index it has. */
 	qnum = 0;
 	while (qnum < vc->vc_nvq) {
 		vq = &queues[qnum];
 		vq->vq_vs = vs;
 		vq->vq_num = qnum;
 		qnum++;
 	}
 }
 
 /*
  * Device-wide reset.
  *
  * Every queue is invalidated: its flags (including VQ_ALLOC) are
  * cleared, its avail/used bookkeeping and page frame number go back
  * to zero, and its MSI-X vector becomes VIRTIO_MSI_NO_VECTOR.  The
  * ring pointers themselves are left untouched.
  *
  * The negotiated features return to "none", queue 0 is selected, a
  * pending legacy interrupt is deasserted, the ISR is cleared and the
  * config-change MSI-X vector becomes VIRTIO_MSI_NO_VECTOR.
  *
  * If the device has a mutex, the caller must hold it.
  */
 void
 vi_reset_dev(struct virtio_softc *vs)
 {
 	struct vqueue_info *queue;
 	int qidx, nqueues;
 
 	if (vs->vs_mtx != NULL)
 		assert(pthread_mutex_isowned_np(vs->vs_mtx));
 
 	nqueues = vs->vs_vc->vc_nvq;
 	for (qidx = 0; qidx < nqueues; qidx++) {
 		queue = &vs->vs_queues[qidx];
 		queue->vq_flags = 0;
 		queue->vq_last_avail = 0;
 		queue->vq_save_used = 0;
 		queue->vq_pfn = 0;
 		queue->vq_msix_idx = VIRTIO_MSI_NO_VECTOR;
 	}
 
 	vs->vs_negotiated_caps = 0;
 	vs->vs_curq = 0;
 	/* vs_status is not cleared here; doing so was deemed redundant. */
 
 	/* Drop the legacy interrupt line only if it is currently raised. */
 	if (vs->vs_isr != 0)
 		pci_lintr_deassert(vs->vs_pi);
 	vs->vs_isr = 0;
 	vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR;
 }
 
 /*
  * Allocate I/O BAR `barnum' (usually 0) for the virtio registers:
  * the standard header followed by the device-specific config area.
  */
 void
 vi_set_io_bar(struct virtio_softc *vs, int barnum)
 {
 	/*
 	 * The header is always sized with the VTCFG_R_CFG1 layout.
 	 * Open question carried over from the original code: should
 	 * the VTCFG_R_CFG0 layout be used when MSI-X is disabled?
 	 * Historically it was not.
 	 */
 	size_t barsize = VTCFG_R_CFG1 + vs->vs_vc->vc_cfgsize;
 
 	pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, barsize);
 }
 
 /*
  * Set up interrupt delivery for a virtio device.
  *
  * With use_msix set, the device is flagged VIRTIO_USE_MSIX, reset
  * (which sets every vector to NO_VECTOR) and given an MSI-X
  * capability in BAR `barnum' with one vector per queue plus one for
  * config changes.  Without it, the VIRTIO_USE_MSIX flag is cleared.
  *
  * In both cases a single-vector MSI capability is added and a legacy
  * interrupt is requested.
  *
  * Returns 1 if the MSI-X capability could not be added, 0 otherwise.
  */
 int
 vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix)
 {
 
 	if (!use_msix) {
 		vs->vs_flags &= ~VIRTIO_USE_MSIX;
 	} else {
 		vs->vs_flags |= VIRTIO_USE_MSIX;
 
 		/* The reset sets all vectors to NO_VECTOR. */
 		VS_LOCK(vs);
 		vi_reset_dev(vs);
 		VS_UNLOCK(vs);
 
 		/* One vector per queue, plus one for the config vector. */
 		if (pci_emul_add_msixcap(vs->vs_pi, vs->vs_vc->vc_nvq + 1,
 		    barnum) != 0)
 			return (1);
 	}
 
 	/* Only 1 MSI vector for bhyve */
 	pci_emul_add_msicap(vs->vs_pi, 1);
 
 	/* Legacy interrupts are mandatory for virtio devices */
 	pci_lintr_request(vs->vs_pi);
 
 	return (0);
 }
 
 /*
  * Set up the currently selected virtqueue (vs->vs_curq) from the page
  * frame number the guest just wrote.  The guest-physical ring address
  * is pfn << VRING_PFN; the host addresses of the descriptor table,
  * the "avail" ring and the "used" ring are derived from it.
  */
 void
 vi_vq_init(struct virtio_softc *vs, uint32_t pfn)
 {
 	struct vqueue_info *queue = &vs->vs_queues[vs->vs_curq];
 	uint64_t ring_gpa = (uint64_t)pfn << VRING_PFN;
 	size_t ring_bytes;
 	char *cursor;
 
 	queue->vq_pfn = pfn;
 	ring_bytes = vring_size(queue->vq_qsize);
 	cursor = paddr_guest2host(vs->vs_pi->pi_vmctx, ring_gpa, ring_bytes);
 
 	/* The descriptor table comes first ... */
 	queue->vq_desc = (struct virtio_desc *)cursor;
 	cursor += queue->vq_qsize * sizeof(struct virtio_desc);
 
 	/* ... then the "avail" ring, (2 + qsize + 1) uint16_t's in all ... */
 	queue->vq_avail = (struct vring_avail *)cursor;
 	cursor += (2 + queue->vq_qsize + 1) * sizeof(uint16_t);
 
 	/* ... and, after rounding up to VRING_ALIGN, the "used" ring. */
 	cursor = (char *)roundup2((uintptr_t)cursor, VRING_ALIGN);
 	queue->vq_used = (struct vring_used *)cursor;
 
 	/* The queue is now allocated; both ring positions restart at 0. */
 	queue->vq_flags = VQ_ALLOC;
 	queue->vq_last_avail = 0;
 	queue->vq_save_used = 0;
 }
 
 /*
  * Helper for vq_getchain(): store the i'th "real" descriptor in
  * iov[i] (guest address translated to a host pointer, plus length)
  * and, when flags is non-NULL, its vd_flags in flags[i].
  *
  * Nothing is stored once i has reached n_iov, so the caller may keep
  * counting descriptors beyond the capacity of the arrays.
  */
 static inline void
 _vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx,
 	   struct iovec *iov, int n_iov, uint16_t *flags)
 {
 	struct iovec *slot;
 
 	if (i < n_iov) {
 		slot = &iov[i];
 		slot->iov_base = paddr_guest2host(ctx, vd->vd_addr, vd->vd_len);
 		slot->iov_len = vd->vd_len;
 		if (flags != NULL)
 			flags[i] = vd->vd_flags;
 	}
 }
 #define	VQ_MAX_DESCRIPTORS	512	/* see below */
 
 /*
  * Examine the chain of descriptors starting at the "next one" to
  * make sure that they describe a sensible request.  If so, return
  * the number of "real" descriptors that would be needed/used in
  * acting on this request.  This may be smaller than the number of
  * available descriptors, e.g., if there are two available but
  * they are two separate requests, this just returns 1.  Or, it
  * may be larger: if there are indirect descriptors involved,
  * there may only be one descriptor available but it may be an
  * indirect pointing to eight more.  We return 8 in this case,
  * i.e., we do not count the indirect descriptors, only the "real"
  * ones.
  *
  * Basically, this vets the vd_flags and vd_next field of each
  * descriptor and tells you how many are involved.  Since some may
  * be indirect, this also needs the vmctx (in the pci_devinst
  * at vs->vs_pi) so that it can find indirect descriptors.
  *
  * As we process each descriptor, we copy and adjust it (guest to
  * host address wise, also using the vmctx) into the given iov[]
  * array (of the given size).  If the array overflows, we stop
  * placing values into the array but keep processing descriptors,
  * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1.
  * So you, the caller, must not assume that iov[] is as big as the
  * return value (you can process the same thing twice to allocate
  * a larger iov array if needed, or supply a zero length to find
  * out how much space is needed).
  *
  * If you want to verify the WRITE flag on each descriptor, pass a
  * non-NULL "flags" pointer to an array of "uint16_t" of the same size
  * as n_iov and we'll copy each vd_flags field after unwinding any
  * indirects.
  *
  * If some descriptor(s) are invalid, this prints a diagnostic message
  * and returns -1.  If no descriptors are ready now it simply returns 0.
  *
  * You are assumed to have done a vq_ring_ready() if needed (note
  * that vq_has_descs() does one).
  */
 int
 vq_getchain(struct vqueue_info *vq, uint16_t *pidx,
 	    struct iovec *iov, int n_iov, uint16_t *flags)
 {
 	int i;
 	u_int ndesc, n_indir;
 	u_int idx, next;
 	volatile struct virtio_desc *vdir, *vindir, *vp;
 	struct vmctx *ctx;
 	struct virtio_softc *vs;
 	const char *name;
 
 	vs = vq->vq_vs;
 	name = vs->vs_vc->vc_name;
 
 	/*
 	 * Note: it's the responsibility of the guest not to
 	 * update vq->vq_avail->va_idx until all of the descriptors
 	 * the guest has written are valid (including all their
 	 * vd_next fields and vd_flags).
 	 *
 	 * Compute (va_idx - last_avail) in integers mod 2**16.  This is
 	 * the number of avail-ring entries the guest has made available
 	 * since the last time we updated vq->vq_last_avail.
 	 *
 	 * We just need to do the subtraction as an unsigned int,
 	 * then trim off excess bits.
 	 */
 	idx = vq->vq_last_avail;
 	ndesc = (uint16_t)((u_int)vq->vq_avail->va_idx - idx);
 	if (ndesc == 0)
 		return (0);
 	if (ndesc > vq->vq_qsize) {
 		/* XXX need better way to diagnose issues */
 		fprintf(stderr,
 		    "%s: ndesc (%u) out of range, driver confused?\r\n",
 		    name, (u_int)ndesc);
 		return (-1);
 	}
 
 	/*
 	 * Now count/parse "involved" descriptors starting from
 	 * the head of the chain.
 	 *
 	 * To prevent loops, we could be more complicated and
 	 * check whether we're re-visiting a previously visited
 	 * index, but we just abort if the count gets excessive.
 	 */
 	ctx = vs->vs_pi->pi_vmctx;
 	/*
 	 * Fetch the head descriptor index from the avail ring and hand it
 	 * back through *pidx.  Masking with (vq_qsize - 1) wraps the ring
 	 * position, which relies on vq_qsize being a power of two.
 	 */
 	*pidx = next = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)];
 	/*
 	 * The avail entry is consumed here; none of the error returns
 	 * below undo this increment.
 	 */
 	vq->vq_last_avail++;
 	/* i counts recorded "real" descriptors and is advanced in the body. */
 	for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) {
 		if (next >= vq->vq_qsize) {
 			fprintf(stderr,
 			    "%s: descriptor index %u out of range, "
 			    "driver confused?\r\n",
 			    name, next);
 			return (-1);
 		}
 		vdir = &vq->vq_desc[next];
 		if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) {
 			/* Ordinary descriptor: record it directly. */
 			_vq_record(i, vdir, ctx, iov, n_iov, flags);
 			i++;
 		} else if ((vs->vs_vc->vc_hv_caps &
 		    VIRTIO_RING_F_INDIRECT_DESC) == 0) {
 			/* Indirect descriptor, but the device never offered them. */
 			fprintf(stderr,
 			    "%s: descriptor has forbidden INDIRECT flag, "
 			    "driver confused?\r\n",
 			    name);
 			return (-1);
 		} else {
 			/*
 			 * Indirect descriptor: vd_addr/vd_len describe a table
 			 * of descriptors in guest memory.  Each descriptor is
 			 * 16 bytes, so the length must be a non-zero multiple
 			 * of 16.
 			 */
 			n_indir = vdir->vd_len / 16;
 			if ((vdir->vd_len & 0xf) || n_indir == 0) {
 				fprintf(stderr,
 				    "%s: invalid indir len 0x%x, "
 				    "driver confused?\r\n",
 				    name, (u_int)vdir->vd_len);
 				return (-1);
 			}
 			vindir = paddr_guest2host(ctx,
 			    vdir->vd_addr, vdir->vd_len);
 			/*
 			 * Indirects start at the 0th, then follow
 			 * their own embedded "next"s until those run
 			 * out.  Each one's indirect flag must be off
 			 * (we don't really have to check, could just
 			 * ignore errors...).
 			 */
 			next = 0;
 			for (;;) {
 				vp = &vindir[next];
 				if (vp->vd_flags & VRING_DESC_F_INDIRECT) {
 					fprintf(stderr,
 					    "%s: indirect desc has INDIR flag,"
 					    " driver confused?\r\n",
 					    name);
 					return (-1);
 				}
 				_vq_record(i, vp, ctx, iov, n_iov, flags);
 				/* Bound the walk so a looping table cannot hang us. */
 				if (++i > VQ_MAX_DESCRIPTORS)
 					goto loopy;
 				if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0)
 					break;
 				next = vp->vd_next;
 				/* "next" must stay inside the indirect table. */
 				if (next >= n_indir) {
 					fprintf(stderr,
 					    "%s: invalid next %u > %u, "
 					    "driver confused?\r\n",
 					    name, (u_int)next, n_indir);
 					return (-1);
 				}
 			}
 		}
 		/* No NEXT flag on the ring descriptor: the chain is complete. */
 		if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0)
 			return (i);
 	}
 loopy:
 	fprintf(stderr,
 	    "%s: descriptor loop? count > %d - driver confused?\r\n",
 	    name, i);
 	return (-1);
 }
 
 /*
  * Push the most recently fetched request chain back onto the
  * available queue.
  *
  * This steps vq_last_avail back by one, so the next vq_getchain()
  * call hands out the same avail-ring entry again.  (The chain meant
  * here is the one you got from vq_getchain() with a positive return
  * value.)
  */
 void
 vq_retchain(struct vqueue_info *vq)
 {
 
 	vq->vq_last_avail -= 1;
 }
 
 /*
  * Return specified request chain to the guest, setting its I/O length
  * to the provided value.
  *
  * (This chain is the one you handled when you called vq_getchain()
  * and used its positive return value.)
  */
 void
 vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen)
 {
 	uint16_t uidx, mask;
 	volatile struct vring_used *vuh;
 	volatile struct virtio_used *vue;
 
 	/*
 	 * Notes:
 	 *  - mask is N-1 where N is a power of 2 so computes x % N
 	 *  - vuh points to the "used" data shared with guest
 	 *  - vue points to the "used" ring entry we want to update
 	 *  - idx identifies the chain being returned; presumably it is
 	 *    the head index vq_getchain() handed back through *pidx.
 	 *
 	 * (I apologize for the two fields named vu_idx; the
 	 * virtio spec calls the one that vue points to, "id"...)
 	 */
 	mask = vq->vq_qsize - 1;
 	vuh = vq->vq_used;
 
 	/*
 	 * Fill in the ring slot at the current used index.  The
 	 * post-increment leaves uidx holding the index to publish below.
 	 */
 	uidx = vuh->vu_idx;
 	vue = &vuh->vu_ring[uidx++ & mask];
 	vue->vu_idx = idx;
 	vue->vu_tlen = iolen;
 
 	/*
 	 * Ensure the used descriptor is visible before updating the index.
-	 * This is necessary on ISAs with memory ordering less strict than x86.
+	 * This is necessary on ISAs with memory ordering less strict than x86
+	 * (and even on x86 to act as a compiler barrier).
 	 */
 	atomic_thread_fence_rel();
 	vuh->vu_idx = uidx;
 }
 
 /*
  * Driver has finished processing "available" chains and calling
  * vq_relchain on each one.  If driver used all the available
  * chains, used_all_avail should be set.
  *
  * If the "used" index moved we may need to inform the guest, i.e.,
  * deliver an interrupt.  Even if the used index did NOT move we
  * may need to deliver an interrupt, if the avail ring is empty and
  * we are supposed to interrupt on empty.
  *
  * Note that used_all_avail is provided by the caller because it's
  * a snapshot of the ring state when the caller decided to finish
  * interrupt processing -- it's possible that descriptors became
  * available after that point.  (It's also typically a constant
  * 1/True as well.)
  *
  * As a side effect, vq->vq_save_used is updated to the current
  * "used" index so the next call can tell whether the index moved.
  */
 void
 vq_endchains(struct vqueue_info *vq, int used_all_avail)
 {
 	struct virtio_softc *vs;
 	uint16_t event_idx, new_idx, old_idx;
 	int intr;
 
 	/*
 	 * Interrupt generation: if we're using EVENT_IDX,
 	 * interrupt if we've crossed the event threshold.
 	 * Otherwise interrupt is generated if we added "used" entries,
 	 * but suppressed by VRING_AVAIL_F_NO_INTERRUPT.
 	 *
 	 * In any case, though, if NOTIFY_ON_EMPTY is set and the
 	 * entire avail was processed, we need to interrupt always.
 	 */
 	vs = vq->vq_vs;
 	/* old_idx: used index at the previous call; new_idx: used index now. */
 	old_idx = vq->vq_save_used;
 	vq->vq_save_used = new_idx = vq->vq_used->vu_idx;
 
 	/*
 	 * Use full memory barrier between vu_idx store from preceding
 	 * vq_relchain() call and the loads from VQ_USED_EVENT_IDX() or
 	 * va_flags below.
 	 */
 	atomic_thread_fence_seq_cst();
 	if (used_all_avail &&
 	    (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY))
 		intr = 1;
 	else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) {
 		event_idx = VQ_USED_EVENT_IDX(vq);
 		/*
 		 * This calculation is per docs and the kernel
 		 * (see src/sys/dev/virtio/virtio_ring.h).
 		 */
 		intr = (uint16_t)(new_idx - event_idx - 1) <
 			(uint16_t)(new_idx - old_idx);
 	} else {
 		/* Interrupt if entries were added and the guest allows it. */
 		intr = new_idx != old_idx &&
 		    !(vq->vq_avail->va_flags & VRING_AVAIL_F_NO_INTERRUPT);
 	}
 	if (intr)
 		vq_interrupt(vs, vq);
 }
 
 /*
  * Table of the standard virtio config registers handled by the generic
  * code.  Note: vi_find_cr() binary-searches this table on cr_offset,
  * so the entries must stay sorted by ascending cr_offset.
  */
 static struct config_reg {
 	uint16_t	cr_offset;	/* register offset */
 	uint8_t		cr_size;	/* size (bytes) */
 	uint8_t		cr_ro;		/* true => reg is read only */
 	const char	*cr_name;	/* name of reg */
 } config_regs[] = {
 	{ VTCFG_R_HOSTCAP,	4, 1, "HOSTCAP" },
 	{ VTCFG_R_GUESTCAP,	4, 0, "GUESTCAP" },
 	{ VTCFG_R_PFN,		4, 0, "PFN" },
 	{ VTCFG_R_QNUM,		2, 1, "QNUM" },
 	{ VTCFG_R_QSEL,		2, 0, "QSEL" },
 	{ VTCFG_R_QNOTIFY,	2, 0, "QNOTIFY" },
 	{ VTCFG_R_STATUS,	1, 0, "STATUS" },
 	{ VTCFG_R_ISR,		1, 0, "ISR" },
 	{ VTCFG_R_CFGVEC,	2, 0, "CFGVEC" },
 	{ VTCFG_R_QVEC,		2, 0, "QVEC" },
 };
 
 /*
  * Look up the standard virtio config register that starts at `offset'.
  *
  * This is a binary search over config_regs[], which must be sorted by
  * ascending cr_offset.  Returns a pointer to the matching table entry,
  * or NULL if no register starts at that offset.
  *
  * The search runs on the half-open interval [lo, hi) so that neither
  * bound can wrap.  With an inclusive unsigned upper bound, the step
  * "hi = mid - 1" wraps to UINT_MAX when mid is 0 (an offset below the
  * first table entry), and the next probe indexes far outside the table.
  */
 static inline struct config_reg *
 vi_find_cr(int offset)
 {
 	u_int hi, lo, mid;
 	struct config_reg *cr;
 
 	lo = 0;
 	hi = sizeof(config_regs) / sizeof(*config_regs);
 	while (lo < hi) {
 		/* lo <= mid < hi, so mid is always a valid index. */
 		mid = lo + ((hi - lo) >> 1);
 		cr = &config_regs[mid];
 		if (cr->cr_offset == offset)
 			return (cr);
 		if (cr->cr_offset < offset)
 			lo = mid + 1;
 		else
 			hi = mid;
 	}
 	return (NULL);
 }
 
 /*
  * Handle a guest read from one of the device's BARs.
  * If it's to the MSI-X info, do that.
  * If it's part of the virtio standard stuff, do that.
  * Otherwise dispatch to the actual driver.
  *
  * Returns the value read.  An invalid access (bad size, unknown
  * offset) is reported on stderr and yields all-ones for the access
  * size (32-bit all-ones when the size itself is invalid).
  *
  * Apart from the MSI-X table/PBA path, the device mutex (if any) is
  * held for the duration of the access.  ctx and vcpu are not used.
  */
 uint64_t
 vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
 	    int baridx, uint64_t offset, int size)
 {
 	struct virtio_softc *vs = pi->pi_arg;
 	struct virtio_consts *vc;
 	struct config_reg *cr;
 	uint64_t virtio_config_size, max;
 	const char *name;
 	uint32_t newoff;
 	uint32_t value;
 	int error;
 
 	/* Reads of the MSI-X table or PBA BAR go to the MSI-X emulation. */
 	if (vs->vs_flags & VIRTIO_USE_MSIX) {
 		if (baridx == pci_msix_table_bar(pi) ||
 		    baridx == pci_msix_pba_bar(pi)) {
 			return (pci_emul_msix_tread(pi, offset, size));
 		}
 	}
 
 	/* XXX probably should do something better than just assert() */
 	assert(baridx == 0);
 
 	if (vs->vs_mtx)
 		pthread_mutex_lock(vs->vs_mtx);
 
 	vc = vs->vs_vc;
 	name = vc->vc_name;
 	/* Default result: all-ones, returned whenever the access is rejected. */
 	value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff;
 
 	if (size != 1 && size != 2 && size != 4)
 		goto bad;
 
 	/* The standard header is larger while MSI-X is enabled. */
 	if (pci_msix_enabled(pi))
 		virtio_config_size = VTCFG_R_CFG1;
 	else
 		virtio_config_size = VTCFG_R_CFG0;
 
 	if (offset >= virtio_config_size) {
 		/*
 		 * Subtract off the standard size (including MSI-X
 		 * registers if enabled) and dispatch to underlying driver.
 		 * If that fails, fall into general code.
 		 */
 		newoff = offset - virtio_config_size;
 		/* A vc_cfgsize of 0 means "no limit" (max becomes 2^32). */
 		max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000;
 		/*
 		 * NOTE(review): newoff is a uint32_t and size an int, so this
 		 * sum looks like it is computed in 32-bit unsigned arithmetic
 		 * before the comparison with the 64-bit max -- confirm that
 		 * it cannot wrap for the offsets that reach this point.
 		 */
 		if (newoff + size > max)
 			goto bad;
 		error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value);
 		if (!error)
 			goto done;
 	}
 
 bad:
 	/*
 	 * Standard register access (or a rejected access): the register
 	 * must exist and be read with exactly its declared size.
 	 */
 	cr = vi_find_cr(offset);
 	if (cr == NULL || cr->cr_size != size) {
 		if (cr != NULL) {
 			/* offset must be OK, so size must be bad */
 			fprintf(stderr,
 			    "%s: read from %s: bad size %d\r\n",
 			    name, cr->cr_name, size);
 		} else {
 			fprintf(stderr,
 			    "%s: read from bad offset/size %jd/%d\r\n",
 			    name, (uintmax_t)offset, size);
 		}
 		goto done;
 	}
 
 	switch (offset) {
 	case VTCFG_R_HOSTCAP:
 		value = vc->vc_hv_caps;
 		break;
 	case VTCFG_R_GUESTCAP:
 		value = vs->vs_negotiated_caps;
 		break;
 	case VTCFG_R_PFN:
 		/* With an invalid queue selected, value stays all-ones. */
 		if (vs->vs_curq < vc->vc_nvq)
 			value = vs->vs_queues[vs->vs_curq].vq_pfn;
 		break;
 	case VTCFG_R_QNUM:
 		/* An invalid queue selection reads back as size 0. */
 		value = vs->vs_curq < vc->vc_nvq ?
 		    vs->vs_queues[vs->vs_curq].vq_qsize : 0;
 		break;
 	case VTCFG_R_QSEL:
 		value = vs->vs_curq;
 		break;
 	case VTCFG_R_QNOTIFY:
 		value = 0;	/* XXX */
 		break;
 	case VTCFG_R_STATUS:
 		value = vs->vs_status;
 		break;
 	case VTCFG_R_ISR:
 		value = vs->vs_isr;
 		vs->vs_isr = 0;		/* a read clears this flag */
 		if (value)
 			pci_lintr_deassert(pi);
 		break;
 	case VTCFG_R_CFGVEC:
 		value = vs->vs_msix_cfg_idx;
 		break;
 	case VTCFG_R_QVEC:
 		value = vs->vs_curq < vc->vc_nvq ?
 		    vs->vs_queues[vs->vs_curq].vq_msix_idx :
 		    VIRTIO_MSI_NO_VECTOR;
 		break;
 	}
 done:
 	if (vs->vs_mtx)
 		pthread_mutex_unlock(vs->vs_mtx);
 	return (value);
 }
 
 /*
  * Handle a guest write to one of the device's BARs.
  * If it's to the MSI-X info, do that.
  * If it's part of the virtio standard stuff, do that.
  * Otherwise dispatch to the actual driver.
  *
  * An invalid access (bad size, unknown offset, read-only register,
  * invalid queue selected) is reported on stderr and otherwise ignored.
  *
  * Apart from the MSI-X table/PBA path, the device mutex (if any) is
  * held for the duration of the access, including while the queue
  * notify, feature and reset callbacks run.  ctx and vcpu are not used.
  */
 void
 vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
 	     int baridx, uint64_t offset, int size, uint64_t value)
 {
 	struct virtio_softc *vs = pi->pi_arg;
 	struct vqueue_info *vq;
 	struct virtio_consts *vc;
 	struct config_reg *cr;
 	uint64_t virtio_config_size, max;
 	const char *name;
 	uint32_t newoff;
 	int error;
 
 	/* Writes to the MSI-X table or PBA BAR go to the MSI-X emulation. */
 	if (vs->vs_flags & VIRTIO_USE_MSIX) {
 		if (baridx == pci_msix_table_bar(pi) ||
 		    baridx == pci_msix_pba_bar(pi)) {
 			pci_emul_msix_twrite(pi, offset, size, value);
 			return;
 		}
 	}
 
 	/* XXX probably should do something better than just assert() */
 	assert(baridx == 0);
 
 	if (vs->vs_mtx)
 		pthread_mutex_lock(vs->vs_mtx);
 
 	vc = vs->vs_vc;
 	name = vc->vc_name;
 
 	if (size != 1 && size != 2 && size != 4)
 		goto bad;
 
 	/* The standard header is larger while MSI-X is enabled. */
 	if (pci_msix_enabled(pi))
 		virtio_config_size = VTCFG_R_CFG1;
 	else
 		virtio_config_size = VTCFG_R_CFG0;
 
 	if (offset >= virtio_config_size) {
 		/*
 		 * Subtract off the standard size (including MSI-X
 		 * registers if enabled) and dispatch to underlying driver.
 		 */
 		newoff = offset - virtio_config_size;
 		/* A vc_cfgsize of 0 means "no limit" (max becomes 2^32). */
 		max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000;
 		/*
 		 * NOTE(review): newoff is a uint32_t and size an int, so this
 		 * sum looks like it is computed in 32-bit unsigned arithmetic
 		 * before the comparison with the 64-bit max -- confirm that
 		 * it cannot wrap for the offsets that reach this point.
 		 */
 		if (newoff + size > max)
 			goto bad;
 		error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value);
 		if (!error)
 			goto done;
 	}
 
 bad:
 	/*
 	 * Standard register access (or a rejected access): the register
 	 * must exist, be writable and be written with its declared size.
 	 */
 	cr = vi_find_cr(offset);
 	if (cr == NULL || cr->cr_size != size || cr->cr_ro) {
 		if (cr != NULL) {
 			/* offset must be OK, wrong size and/or reg is R/O */
 			if (cr->cr_size != size)
 				fprintf(stderr,
 				    "%s: write to %s: bad size %d\r\n",
 				    name, cr->cr_name, size);
 			if (cr->cr_ro)
 				fprintf(stderr,
 				    "%s: write to read-only reg %s\r\n",
 				    name, cr->cr_name);
 		} else {
 			fprintf(stderr,
 			    "%s: write to bad offset/size %jd/%d\r\n",
 			    name, (uintmax_t)offset, size);
 		}
 		goto done;
 	}
 
 	switch (offset) {
 	case VTCFG_R_GUESTCAP:
 		/* The guest can only enable features the device offers. */
 		vs->vs_negotiated_caps = value & vc->vc_hv_caps;
 		if (vc->vc_apply_features)
 			(*vc->vc_apply_features)(DEV_SOFTC(vs),
 			    vs->vs_negotiated_caps);
 		break;
 	case VTCFG_R_PFN:
 		if (vs->vs_curq >= vc->vc_nvq)
 			goto bad_qindex;
 		vi_vq_init(vs, value);
 		break;
 	case VTCFG_R_QSEL:
 		/*
 		 * Note that the guest is allowed to select an
 		 * invalid queue; we just need to return a QNUM
 		 * of 0 while the bad queue is selected.
 		 */
 		vs->vs_curq = value;
 		break;
 	case VTCFG_R_QNOTIFY:
 		if (value >= vc->vc_nvq) {
 			fprintf(stderr, "%s: queue %d notify out of range\r\n",
 				name, (int)value);
 			goto done;
 		}
 		/* A per-queue notify handler takes precedence over vc_qnotify. */
 		vq = &vs->vs_queues[value];
 		if (vq->vq_notify)
 			(*vq->vq_notify)(DEV_SOFTC(vs), vq);
 		else if (vc->vc_qnotify)
 			(*vc->vc_qnotify)(DEV_SOFTC(vs), vq);
 		else
 			fprintf(stderr,
 			    "%s: qnotify queue %d: missing vq/vc notify\r\n",
 				name, (int)value);
 		break;
 	case VTCFG_R_STATUS:
 		/* Writing a status of 0 asks the device to reset itself. */
 		vs->vs_status = value;
 		if (value == 0)
 			(*vc->vc_reset)(DEV_SOFTC(vs));
 		break;
 	case VTCFG_R_CFGVEC:
 		vs->vs_msix_cfg_idx = value;
 		break;
 	case VTCFG_R_QVEC:
 		if (vs->vs_curq >= vc->vc_nvq)
 			goto bad_qindex;
 		vq = &vs->vs_queues[vs->vs_curq];
 		vq->vq_msix_idx = value;
 		break;
 	}
 	goto done;
 
 bad_qindex:
 	/* Reached only from the switch above, so cr is non-NULL here. */
 	fprintf(stderr,
 	    "%s: write config reg %s: curq %d >= max %d\r\n",
 	    name, cr->cr_name, vs->vs_curq, vc->vc_nvq);
 done:
 	if (vs->vs_mtx)
 		pthread_mutex_unlock(vs->vs_mtx);
 }
Index: head/usr.sbin/bhyve/virtio.h
===================================================================
--- head/usr.sbin/bhyve/virtio.h	(revision 348928)
+++ head/usr.sbin/bhyve/virtio.h	(revision 348929)
@@ -1,468 +1,490 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2013  Chris Torek <torek @ torek net>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef	_VIRTIO_H_
 #define	_VIRTIO_H_
 
+#include <machine/atomic.h>
+
 /*
  * These are derived from several virtio specifications.
  *
  * Some useful links:
  *    https://github.com/rustyrussell/virtio-spec
  *    http://people.redhat.com/pbonzini/virtio-spec.pdf
  */
 
 /*
  * A virtual device has zero or more "virtual queues" (virtqueue).
  * Each virtqueue uses at least two 4096-byte pages, laid out thus:
  *
  *      +-----------------------------------------------+
  *      |    "desc":  <N> descriptors, 16 bytes each    |
  *      |   -----------------------------------------   |
  *      |   "avail":   2 uint16; <N> uint16; 1 uint16   |
  *      |   -----------------------------------------   |
  *      |              pad to 4k boundary               |
  *      +-----------------------------------------------+
  *      |   "used": 2 x uint16; <N> elems; 1 uint16     |
  *      |   -----------------------------------------   |
  *      |              pad to 4k boundary               |
  *      +-----------------------------------------------+
  *
  * The number <N> that appears here is always a power of two and is
  * limited to no more than 32768 (as it must fit in a 16-bit field).
  * If <N> is sufficiently large, the above will occupy more than
  * two pages.  In any case, all pages must be physically contiguous
  * within the guest's physical address space.
  *
  * The <N> 16-byte "desc" descriptors consist of a 64-bit guest
  * physical address <addr>, a 32-bit length <len>, a 16-bit
  * <flags>, and a 16-bit <next> field (all in guest byte order).
  *
  * There are three flags that may be set :
  *	NEXT    descriptor is chained, so use its "next" field
  *	WRITE   descriptor is for host to write into guest RAM
  *		(else host is to read from guest RAM)
  *	INDIRECT   descriptor address field is (guest physical)
  *		address of a linear array of descriptors
  *
  * Unless INDIRECT is set, <len> is the number of bytes that may
  * be read/written from guest physical address <addr>.  If
  * INDIRECT is set, WRITE is ignored and <len> provides the length
  * of the indirect descriptors (and <len> must be a multiple of
  * 16).  Note that NEXT may still be set in the main descriptor
  * pointing to the indirect, and should be set in each indirect
  * descriptor that uses the next descriptor (these should generally
  * be numbered sequentially).  However, INDIRECT must not be set
  * in the indirect descriptors.  Upon reaching an indirect descriptor
  * without a NEXT bit, control returns to the direct descriptors.
  *
  * Except inside an indirect, each <next> value must be in the
  * range [0 .. N) (i.e., the half-open interval).  (Inside an
  * indirect, each <next> must be in the range [0 .. <len>/16).)
  *
  * The "avail" data structures reside in the same pages as the
  * "desc" structures since both together are used by the device to
  * pass information to the hypervisor's virtual driver.  These
  * begin with a 16-bit <flags> field and 16-bit index <idx>, then
  * have <N> 16-bit <ring> values, followed by one final 16-bit
  * field <used_event>.  The <N> <ring> entries are simply
  * indices into the descriptor ring (and thus must meet the same
  * constraints as each <next> value).  However, <idx> is counted
  * up from 0 (initially) and simply wraps around after 65535; it
  * is taken mod <N> to find the next available entry.
  *
  * The "used" ring occupies a separate page or pages, and contains
  * values written from the virtual driver back to the guest OS.
  * This begins with a 16-bit <flags> and 16-bit <idx>, then there
  * are <N> "vring_used" elements, followed by a 16-bit <avail_event>.
  * The <N> "vring_used" elements consist of a 32-bit <id> and a
  * 32-bit <len> (vu_tlen below).  The <id> is simply the index of
  * the head of a descriptor chain the guest made available
  * earlier, and the <len> is the number of bytes actually written,
  * e.g., in the case of a network driver that provided a large
  * receive buffer but received only a small amount of data.
  *
  * The two event fields, <used_event> and <avail_event>, in the
  * avail and used rings (respectively -- note the reversal!), are
  * always provided, but are used only if the virtual device
  * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature
  * negotiation.  Similarly, both rings provide a flag --
  * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in
  * their <flags> field, indicating that the guest does not need an
  * interrupt, or that the hypervisor driver does not need a
  * notify, when descriptors are added to the corresponding ring.
  * (These are provided only for interrupt optimization and need
  * not be implemented.)
  */
 #define VRING_ALIGN	4096
 
 #define VRING_DESC_F_NEXT	(1 << 0)
 #define VRING_DESC_F_WRITE	(1 << 1)
 #define VRING_DESC_F_INDIRECT	(1 << 2)
 
 struct virtio_desc {			/* AKA vring_desc */
 	uint64_t	vd_addr;	/* guest physical address */
 	uint32_t	vd_len;		/* length of scatter/gather seg */
 	uint16_t	vd_flags;	/* VRING_DESC_F_* */
 	uint16_t	vd_next;	/* next desc if F_NEXT */
 } __packed;
 
 struct virtio_used {			/* AKA vring_used_elem */
 	uint32_t	vu_idx;		/* <id>: head of used desc chain */
 	uint32_t	vu_tlen;	/* <len>: length written-to */
 } __packed;
 
 #define VRING_AVAIL_F_NO_INTERRUPT   1
 
 struct vring_avail {
 	uint16_t	va_flags;	/* VRING_AVAIL_F_* */
 	uint16_t	va_idx;		/* counts to 65535, then cycles */
 	uint16_t	va_ring[];	/* size N, reported in QNUM value */
 /*	uint16_t	va_used_event;	-- after N ring entries */
 } __packed;
 
 #define	VRING_USED_F_NO_NOTIFY		1
 struct vring_used {
 	uint16_t	vu_flags;	/* VRING_USED_F_* */
 	uint16_t	vu_idx;		/* counts to 65535, then cycles */
 	struct virtio_used vu_ring[];	/* size N */
 /*	uint16_t	vu_avail_event;	-- after N ring entries */
 } __packed;
 
 /*
  * The address of any given virtual queue is determined by a single
  * Page Frame Number register.  The guest writes the PFN into the
  * PCI config space.  However, a device that has two or more
  * virtqueues can have a different PFN, and size, for each queue.
  * The number of queues is determinable via the PCI config space
  * VTCFG_R_QSEL register.  Writes to QSEL select the queue: 0 means
  * queue #0, 1 means queue#1, etc.  Once a queue is selected, the
  * remaining PFN and QNUM registers refer to that queue.
  *
  * QNUM is a read-only register containing a nonzero power of two
  * that indicates the (hypervisor's) queue size.  Or, if reading it
  * produces zero, the hypervisor does not have a corresponding
  * queue.  (The number of possible queues depends on the virtual
  * device.  The block device has just one; the network device
  * provides either two -- 0 = receive, 1 = transmit -- or three,
  * with 2 = control.)
  *
  * PFN is a read/write register giving the physical page address of
  * the virtqueue in guest memory (the guest must allocate enough space
  * based on the hypervisor's provided QNUM).
  *
  * QNOTIFY is effectively write-only: when the guest writes a queue
  * number to the register, the hypervisor should scan the specified
  * virtqueue. (Reading QNOTIFY currently always gets 0).
  */
 
 /*
  * PFN register shift amount
  */
 #define	VRING_PFN		12
 
 /*
  * Virtio device types
  *
  * XXX Should really be merged with <dev/virtio/virtio.h> defines
  */
 #define	VIRTIO_TYPE_NET		1
 #define	VIRTIO_TYPE_BLOCK	2
 #define	VIRTIO_TYPE_CONSOLE	3
 #define	VIRTIO_TYPE_ENTROPY	4
 #define	VIRTIO_TYPE_BALLOON	5
 #define	VIRTIO_TYPE_IOMEMORY	6
 #define	VIRTIO_TYPE_RPMSG	7
 #define	VIRTIO_TYPE_SCSI	8
 #define	VIRTIO_TYPE_9P		9
 
 /* experimental IDs start at 65535 and work down */
 
 /*
  * PCI vendor/device IDs
  */
 #define	VIRTIO_VENDOR		0x1AF4
 #define	VIRTIO_DEV_NET		0x1000
 #define	VIRTIO_DEV_BLOCK	0x1001
 #define	VIRTIO_DEV_CONSOLE	0x1003
 #define	VIRTIO_DEV_RANDOM	0x1005
 #define	VIRTIO_DEV_SCSI		0x1008
 
 /*
  * PCI config space constants.
  *
  * If MSI-X is enabled, the ISR register is generally not used,
  * and the configuration vector and queue vector appear at offsets
  * 20 and 22 with the remaining configuration registers at 24.
  * If MSI-X is not enabled, those two registers disappear and
  * the remaining configuration registers start at offset 20.
  */
 #define	VTCFG_R_HOSTCAP		0
 #define	VTCFG_R_GUESTCAP	4
 #define	VTCFG_R_PFN		8
 #define	VTCFG_R_QNUM		12
 #define	VTCFG_R_QSEL		14
 #define	VTCFG_R_QNOTIFY		16
 #define	VTCFG_R_STATUS		18
 #define	VTCFG_R_ISR		19
 #define	VTCFG_R_CFGVEC		20
 #define	VTCFG_R_QVEC		22
 #define	VTCFG_R_CFG0		20	/* No MSI-X */
 #define	VTCFG_R_CFG1		24	/* With MSI-X */
 #define	VTCFG_R_MSIX		20
 
 /*
  * Bits in VTCFG_R_STATUS.  Guests need not actually set any of these,
  * but a guest writing 0 to this register means "please reset".
  */
 #define	VTCFG_STATUS_ACK	0x01	/* guest OS has acknowledged dev */
 #define	VTCFG_STATUS_DRIVER	0x02	/* guest OS driver is loaded */
 #define	VTCFG_STATUS_DRIVER_OK	0x04	/* guest OS driver ready */
 #define	VTCFG_STATUS_FAILED	0x80	/* guest has given up on this dev */
 
 /*
  * Bits in VTCFG_R_ISR.  These apply only if not using MSI-X.
  *
  * (We don't [yet?] ever use CONF_CHANGED.)
  */
 #define	VTCFG_ISR_QUEUES	0x01	/* re-scan queues */
 #define	VTCFG_ISR_CONF_CHANGED	0x80	/* configuration changed */
 
 #define	VIRTIO_MSI_NO_VECTOR	0xFFFF
 
 /*
  * Feature flags.
  * Note: bits 0 through 23 are reserved to each device type.
  */
 #define	VIRTIO_F_NOTIFY_ON_EMPTY	(1 << 24)
 #define	VIRTIO_RING_F_INDIRECT_DESC	(1 << 28)
 #define	VIRTIO_RING_F_EVENT_IDX		(1 << 29)
 
 /* From section 2.3, "Virtqueue Configuration", of the virtio specification */
 static inline size_t
 vring_size(u_int qsz)
 {
 	size_t avail_part, used_part;
 
 	/*
 	 * First part: the descriptor table followed by the avail ring.
 	 * Besides its qsz ring entries the avail ring carries three more
 	 * uint16_t fields: va_flags, va_idx and va_used_event.
 	 */
 	avail_part = sizeof(struct virtio_desc) * qsz +
 	    sizeof(uint16_t) * (3 + qsz);
 
 	/*
 	 * Second part: the used ring, i.e. qsz elements plus three
 	 * uint16_t fields: vu_flags, vu_idx and vu_avail_event.
 	 */
 	used_part = sizeof(uint16_t) * 3 + sizeof(struct virtio_used) * qsz;
 
 	/* Each part is padded out to a VRING_ALIGN boundary. */
 	return (roundup2(roundup2(avail_part, VRING_ALIGN) + used_part,
 	    VRING_ALIGN));
 }
 
 struct vmctx;
 struct pci_devinst;
 struct vqueue_info;
 
 /*
  * A virtual device, with some number (possibly 0) of virtual
  * queues and some size (possibly 0) of configuration-space
  * registers private to the device.  The virtio_softc should come
  * at the front of each "derived class", so that a pointer to the
  * virtio_softc is also a pointer to the more specific, derived-
  * from-virtio driver's softc.
  *
  * Note: inside each hypervisor virtio driver, changes to these
  * data structures must be locked against other threads, if any.
  * Except for PCI config space register read/write, we assume each
  * driver does the required locking, but we need a pointer to the
  * lock (if there is one) for PCI config space read/write ops.
  *
  * When the guest reads or writes the device's config space, the
  * generic layer checks for operations on the special registers
  * described above.  If the offset of the register(s) being read
  * or written is past the CFG area (CFG0 or CFG1), the request is
  * passed on to the virtual device, after subtracting off the
  * generic-layer size.  (So, drivers can just use the offset as
  * an offset into "struct config", for instance.)
  *
  * (The virtio layer also makes sure that the read or write is to/
  * from a "good" config offset, hence vc_cfgsize, and on BAR #0.
  * However, the driver must verify the read or write size and offset
  * and that no one is writing a readonly register.)
  *
  * The BROKED flag ("this thing done gone and broked") is for future
  * use.
  */
 #define	VIRTIO_USE_MSIX		0x01
 #define	VIRTIO_EVENT_IDX	0x02	/* use the event-index values */
 #define	VIRTIO_BROKED		0x08	/* ??? */
 
 struct virtio_softc {
 	struct virtio_consts *vs_vc;	/* constants (see below) */
 	int	vs_flags;		/* VIRTIO_* flags from above */
 	pthread_mutex_t *vs_mtx;	/* POSIX mutex, or NULL if none */
 	struct pci_devinst *vs_pi;	/* PCI device instance */
 	uint32_t vs_negotiated_caps;	/* negotiated capabilities */
 	struct vqueue_info *vs_queues;	/* one per vc_nvq */
 	int	vs_curq;		/* queue set by QSEL (unchecked) */
 	uint8_t	vs_status;		/* value from last status write */
 	uint8_t	vs_isr;			/* ISR flags, if not MSI-X */
 	uint16_t vs_msix_cfg_idx;	/* MSI-X vector for config event */
 };
 
 /*
  * Lock/unlock the softc's mutex; both are no-ops when vs_mtx is NULL.
  * "vs" is any expression yielding a pointer to the softc.  It is
  * evaluated twice, so it must not have side effects.
  */
 #define	VS_LOCK(vs)							\
 do {									\
 	if ((vs)->vs_mtx)						\
 		pthread_mutex_lock((vs)->vs_mtx);			\
 } while (0)
 
 #define	VS_UNLOCK(vs)							\
 do {									\
 	if ((vs)->vs_mtx)						\
 		pthread_mutex_unlock((vs)->vs_mtx);			\
 } while (0)
 
 struct virtio_consts {
 	const char *vc_name;		/* name of driver (for diagnostics) */
 	int	vc_nvq;			/* number of virtual queues */
 	size_t	vc_cfgsize;		/* size of dev-specific config regs */
 	void	(*vc_reset)(void *);	/* called when guest writes STATUS=0 */
 	void	(*vc_qnotify)(void *, struct vqueue_info *);
 					/* QNOTIFY fallback if no vq_notify */
 	int	(*vc_cfgread)(void *, int, int, uint32_t *);
 					/* called to read config regs */
 	int	(*vc_cfgwrite)(void *, int, int, uint32_t);
 					/* called to write config regs */
 	void    (*vc_apply_features)(void *, uint64_t);
 				/* if set, called on GUESTCAP write */
 	uint64_t vc_hv_caps;		/* hypervisor-provided capabilities */
 };
 
 /*
  * Data structure allocated (statically) per virtual queue.
  *
  * Drivers may change vq_qsize after a reset.  When the guest OS
  * requests a device reset, the hypervisor first calls
  * vs->vs_vc->vc_reset(); then the data structure below is
  * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq).
  *
  * The remaining fields should only be fussed-with by the generic
  * code.
  *
  * Note: the addresses of vq_desc, vq_avail, and vq_used are all
  * computable from each other, but it's a lot simpler if we just
  * keep a pointer to each one.  The event indices are similarly
  * (but more easily) computable, and this time we'll compute them:
  * they're just XX_ring[N].
  */
 #define	VQ_ALLOC	0x01	/* set once we have a pfn */
 #define	VQ_BROKED	0x02	/* ??? */
 struct vqueue_info {
 	uint16_t vq_qsize;	/* size of this queue (a power of 2) */
 	void	(*vq_notify)(void *, struct vqueue_info *);
 				/* called instead of vc_qnotify, if not NULL */
 
 	struct virtio_softc *vq_vs;	/* backpointer to softc */
 	uint16_t vq_num;	/* we're the num'th queue in the softc */
 
 	uint16_t vq_flags;	/* flags (see above) */
 	uint16_t vq_last_avail;	/* a recent value of vq_avail->va_idx */
 	uint16_t vq_save_used;	/* saved vq_used->vu_idx; see vq_endchains */
 	uint16_t vq_msix_idx;	/* MSI-X index, or VIRTIO_MSI_NO_VECTOR */
 
 	uint32_t vq_pfn;	/* PFN of virt queue (not shifted!) */
 
 	volatile struct virtio_desc *vq_desc;	/* descriptor array */
 	volatile struct vring_avail *vq_avail;	/* the "avail" ring */
 	volatile struct vring_used *vq_used;	/* the "used" ring */
 
 };
 /* as noted above, these are sort of backwards, name-wise */
 #define VQ_AVAIL_EVENT_IDX(vq) \
 	(*(volatile uint16_t *)&(vq)->vq_used->vu_ring[(vq)->vq_qsize])
 #define VQ_USED_EVENT_IDX(vq) \
 	((vq)->vq_avail->va_ring[(vq)->vq_qsize])
 
 /*
  * Is this ring ready for I/O?
  */
 static inline int
 vq_ring_ready(struct vqueue_info *vq)
 {
 	/* Nonzero (the VQ_ALLOC bit itself) once the queue is flagged. */
 	const int allocated = vq->vq_flags & VQ_ALLOC;
 
 	return (allocated);
 }
 
 /*
  * Are there "available" descriptors?  (This does not count
  * how many, just returns True if there are some.)
  */
 static inline int
 vq_has_descs(struct vqueue_info *vq)
 {
 
 	/* A ring that is not ready yet never reports descriptors. */
 	if (!vq_ring_ready(vq))
 		return (0);
 
 	/* Work is pending when va_idx differs from our last-seen value. */
 	return (vq->vq_last_avail != vq->vq_avail->va_idx);
 }
 
 /*
  * Deliver an interrupt to guest on the given virtual queue
  * (if possible, or a generic MSI interrupt if not using MSI-X).
  */
 static inline void
 vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq)
 {
 
 	if (pci_msix_enabled(vs->vs_pi))
 		pci_generate_msix(vs->vs_pi, vq->vq_msix_idx);
 	else {
 		/*
 		 * No MSI-X: set the QUEUES bit in the ISR, then call
 		 * pci_generate_msi() with index 0 and pci_lintr_assert(),
 		 * all while holding the softc mutex (if there is one).
 		 */
 		VS_LOCK(vs);
 		vs->vs_isr |= VTCFG_ISR_QUEUES;
 		pci_generate_msi(vs->vs_pi, 0);
 		pci_lintr_assert(vs->vs_pi);
 		VS_UNLOCK(vs);
 	}
+}
+
+static inline void
+vq_kick_enable(struct vqueue_info *vq)
+{
+
+	vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY;
+	/*
+	 * VRING_USED_F_NO_NOTIFY is now clear in the used ring's flags,
+	 * so the ring no longer advertises that the hypervisor driver
+	 * does not need a notify.
+	 *
+	 * Full memory barrier to make sure the store to vu_flags
+	 * happens before the load from va_idx, which results from
+	 * a subsequent call to vq_has_descs().
+	 */
+	atomic_thread_fence_seq_cst();
+}
+
+static inline void
+vq_kick_disable(struct vqueue_info *vq)
+{
+	/*
+	 * Set VRING_USED_F_NO_NOTIFY in the used ring's flags, the flag
+	 * that indicates the hypervisor driver does not need a notify.
+	 * Unlike vq_kick_enable(), no memory barrier is issued here.
+	 */
+	vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY;
 }
 
 struct iovec;
 void	vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
 			void *dev_softc, struct pci_devinst *pi,
 			struct vqueue_info *queues);
 int	vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix);
 void	vi_reset_dev(struct virtio_softc *);
 void	vi_set_io_bar(struct virtio_softc *, int);
 
 int	vq_getchain(struct vqueue_info *vq, uint16_t *pidx,
 		    struct iovec *iov, int n_iov, uint16_t *flags);
 void	vq_retchain(struct vqueue_info *vq);
 void	vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen);
 void	vq_endchains(struct vqueue_info *vq, int used_all_avail);
 
 uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
 		     int baridx, uint64_t offset, int size);
 void	vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
 		     int baridx, uint64_t offset, int size, uint64_t value);
 #endif	/* _VIRTIO_H_ */