Index: usr.sbin/bhyve/Makefile =================================================================== --- usr.sbin/bhyve/Makefile +++ usr.sbin/bhyve/Makefile @@ -4,6 +4,7 @@ .include CFLAGS+=-I${SRCTOP}/sys +CFLAGS+=-rdynamic .PATH: ${SRCTOP}/sys/cam/ctl PROG= bhyve @@ -20,6 +21,7 @@ bhyvegc.c \ bhyverun.c \ block_if.c \ + block_local.c \ bootrom.c \ console.c \ consport.c \ Index: usr.sbin/bhyve/bhyve.8 =================================================================== --- usr.sbin/bhyve/bhyve.8 +++ usr.sbin/bhyve/bhyve.8 @@ -274,10 +274,10 @@ format. .El .Pp -Block storage devices: +Local block storage devices: .Bl -tag -width 10n -.It Pa /filename Ns Oo , Ns Ar block-device-options Oc -.It Pa /dev/xxx Ns Oo , Ns Ar block-device-options Oc +.It Oo Ar Ns file: Oc Ns Ar /filename Ns Oo , Ns Ar block-device-options Oc +.It Oo Ar Ns file: Oc Ns Ar /dev/xxx Ns Oo , Ns Ar block-device-options Oc .El .Pp The Index: usr.sbin/bhyve/block_if.h =================================================================== --- usr.sbin/bhyve/block_if.h +++ usr.sbin/bhyve/block_if.h @@ -38,9 +38,14 @@ #ifndef _BLOCK_IF_H_ #define _BLOCK_IF_H_ +#include + +#include #include #include +#include + /* * BLOCKIF_IOV_MAX is the maximum number of scatter/gather entries in * a single request. 
BLOCKIF_RING_MAX is the maxmimum number of @@ -49,6 +54,26 @@ #define BLOCKIF_IOV_MAX 128 /* not practical to be IOV_MAX */ #define BLOCKIF_RING_MAX 128 +#define BLOCKIF_SIG 0xb109b109 + +#define BLOCKIF_NUMTHR 8 +#define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR) + +enum blockop { + BOP_READ, + BOP_WRITE, + BOP_FLUSH, + BOP_DELETE +}; + +enum blockstat { + BST_FREE, + BST_BLOCK, + BST_PEND, + BST_BUSY, + BST_DONE +}; + struct blockif_req { int br_iovcnt; off_t br_offset; @@ -58,11 +83,44 @@ struct iovec br_iov[BLOCKIF_IOV_MAX]; }; -struct blockif_ctxt; -struct blockif_ctxt *blockif_open(const char *optstr, const char *ident); +struct blockif_elem { + TAILQ_ENTRY(blockif_elem) be_link; + struct blockif_req *be_req; + enum blockop be_op; + enum blockstat be_status; + pthread_t be_tid; + off_t be_block; +}; + +struct blockif_ctxt { + int bc_magic; + /* For data specific for this instance of the backend*/ + intptr_t bc_desc; + int bc_ischr; + int bc_isgeom; + int bc_candelete; + int bc_rdonly; + off_t bc_size; + int bc_sectsz; + int bc_psectsz; + int bc_psectoff; + int bc_closing; + pthread_t bc_btid[BLOCKIF_NUMTHR]; + pthread_mutex_t bc_mtx; + pthread_cond_t bc_cond; + + struct block_backend *be; + + /* Request elements and free/pending/busy queues */ + TAILQ_HEAD(, blockif_elem) bc_freeq; + TAILQ_HEAD(, blockif_elem) bc_pendq; + TAILQ_HEAD(, blockif_elem) bc_busyq; + struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; +}; + +struct blockif_ctxt *blockif_open(char *optstr, const char *ident); off_t blockif_size(struct blockif_ctxt *bc); -void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, - uint8_t *s); +void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s); int blockif_sectsz(struct blockif_ctxt *bc); void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off); int blockif_queuesz(struct blockif_ctxt *bc); @@ -74,5 +132,57 @@ int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq); int 
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_close(struct blockif_ctxt *bc); + +/* + * Each block device backend registers a set of function pointers that are + * used to implement the block backend API. + */ +struct block_backend { + const char *bb_name; /* identifier used to parse the option string */ + /* + * Routines used to initialize and clean up the resources needed + * by a backend. The init and cleanup functions are used internally, + * and should not be called by the frontend. + */ + void (*bb_init)(void); + + void (*bb_cleanup)(struct blockif_ctxt *bc); + + struct blockif_ctxt * (*bb_open)(const char *optstr, const char *ident); + + off_t (*bb_size)(struct blockif_ctxt *bc); + + void (*bb_chs)(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, + uint8_t *s); + + int (*bb_sectsz)(struct blockif_ctxt *bc); + + void (*bb_psectsz)(struct blockif_ctxt *bc, int *size, int *off); + + int (*bb_queuesz)(struct blockif_ctxt *bc); + + int (*bb_is_ro)(struct blockif_ctxt *bc); + + int (*bb_candelete)(struct blockif_ctxt *bc); + + int (*bb_read)(struct blockif_ctxt *bc, struct blockif_req *breq); + + int (*bb_write)(struct blockif_ctxt *bc, struct blockif_req *breq); + + int (*bb_flush)(struct blockif_ctxt *bc, struct blockif_req *breq); + + int (*bb_delete)(struct blockif_ctxt *bc, struct blockif_req *breq); + + int (*bb_cancel)(struct blockif_ctxt *bc, struct blockif_req *breq); + + int (*bb_close)(struct blockif_ctxt *bc); + + /* Room for backend-specific data. 
*/ + void *bb_opaque; +}; + +typedef struct block_backend block_backend_t; +SET_DECLARE(block_backend_set, block_backend_t); + #endif /* _BLOCK_IF_H_ */ Index: usr.sbin/bhyve/block_if.c =================================================================== --- usr.sbin/bhyve/block_if.c +++ usr.sbin/bhyve/block_if.c @@ -32,820 +32,183 @@ __FBSDID("$FreeBSD$"); #include -#ifndef WITHOUT_CAPSICUM -#include -#endif -#include -#include -#include -#include -#include - -#include -#ifndef WITHOUT_CAPSICUM -#include -#endif -#include -#include +#include +#include +#include +#include #include -#include #include -#include -#include -#include -#include -#include -#include - -#include "bhyverun.h" -#include "debug.h" -#include "mevent.h" #include "block_if.h" +#include "debug.h" -#define BLOCKIF_SIG 0xb109b109 - -#define BLOCKIF_NUMTHR 8 -#define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR) - -enum blockop { - BOP_READ, - BOP_WRITE, - BOP_FLUSH, - BOP_DELETE -}; - -enum blockstat { - BST_FREE, - BST_BLOCK, - BST_PEND, - BST_BUSY, - BST_DONE -}; - -struct blockif_elem { - TAILQ_ENTRY(blockif_elem) be_link; - struct blockif_req *be_req; - enum blockop be_op; - enum blockstat be_status; - pthread_t be_tid; - off_t be_block; -}; - -struct blockif_ctxt { - int bc_magic; - int bc_fd; - int bc_ischr; - int bc_isgeom; - int bc_candelete; - int bc_rdonly; - off_t bc_size; - int bc_sectsz; - int bc_psectsz; - int bc_psectoff; - int bc_closing; - pthread_t bc_btid[BLOCKIF_NUMTHR]; - pthread_mutex_t bc_mtx; - pthread_cond_t bc_cond; - - /* Request elements and free/pending/busy queues */ - TAILQ_HEAD(, blockif_elem) bc_freeq; - TAILQ_HEAD(, blockif_elem) bc_pendq; - TAILQ_HEAD(, blockif_elem) bc_busyq; - struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; -}; - -static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; - -struct blockif_sig_elem { - pthread_mutex_t bse_mtx; - pthread_cond_t bse_cond; - int bse_pending; - struct blockif_sig_elem *bse_next; -}; - -static struct blockif_sig_elem 
*blockif_bse_head; - -static int -blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, - enum blockop op) +struct blockif_ctxt * +blockif_open(char *optstr, const char *ident) { - struct blockif_elem *be, *tbe; - off_t off; - int i; + struct block_backend **bbe = NULL; + struct blockif_ctxt *ret = NULL; + char *optrest; /* if found, optstr without scheme: */ + char *backend_name; + regex_t re; + int res; - be = TAILQ_FIRST(&bc->bc_freeq); - assert(be != NULL); - assert(be->be_status == BST_FREE); - TAILQ_REMOVE(&bc->bc_freeq, be, be_link); - be->be_req = breq; - be->be_op = op; - switch (op) { - case BOP_READ: - case BOP_WRITE: - case BOP_DELETE: - off = breq->br_offset; - for (i = 0; i < breq->br_iovcnt; i++) - off += breq->br_iov[i].iov_len; - break; - default: - off = OFF_MAX; + backend_name = "file"; + optrest = strchr(optstr, ':'); + if (optrest != NULL) { + *optrest = '\0'; + backend_name = optstr; + optstr = ++optrest; } - be->be_block = off; - TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { - if (tbe->be_block == breq->br_offset) - break; + /* is optstr now a valid block driver string?? 
*/ + res = regcomp (&re, "^[a-zA-Z0-9]+$" , REG_EXTENDED); + res = regexec (&re, backend_name, 0, NULL, 0); + regfree (&re); + if (res == REG_NOMATCH) { + EPRINTLN("%s is not a valid block device descriptor", backend_name); + return (NULL); } - if (tbe == NULL) { - TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { - if (tbe->be_block == breq->br_offset) - break; - } - } - if (tbe == NULL) - be->be_status = BST_PEND; - else - be->be_status = BST_BLOCK; - TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); - return (be->be_status == BST_PEND); -} -static int -blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) -{ - struct blockif_elem *be; - - TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { - if (be->be_status == BST_PEND) - break; - assert(be->be_status == BST_BLOCK); - } - if (be == NULL) - return (0); - TAILQ_REMOVE(&bc->bc_pendq, be, be_link); - be->be_status = BST_BUSY; - be->be_tid = t; - TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); - *bep = be; - return (1); -} - -static void -blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) -{ - struct blockif_elem *tbe; - - if (be->be_status == BST_DONE || be->be_status == BST_BUSY) - TAILQ_REMOVE(&bc->bc_busyq, be, be_link); - else - TAILQ_REMOVE(&bc->bc_pendq, be, be_link); - TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { - if (tbe->be_req->br_offset == be->be_block) - tbe->be_status = BST_PEND; - } - be->be_tid = 0; - be->be_status = BST_FREE; - be->be_req = NULL; - TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); -} - -static void -blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) -{ - struct blockif_req *br; - off_t arg[2]; - ssize_t clen, len, off, boff, voff; - int i, err; - - br = be->be_req; - if (br->br_iovcnt <= 1) - buf = NULL; - err = 0; - switch (be->be_op) { - case BOP_READ: - if (buf == NULL) { - if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, - br->br_offset)) < 0) - err = errno; - else - br->br_resid -= len; - break; - } - i = 0; - off = voff 
= 0; - while (br->br_resid > 0) { - len = MIN(br->br_resid, MAXPHYS); - if (pread(bc->bc_fd, buf, len, br->br_offset + - off) < 0) { - err = errno; - break; - } - boff = 0; - do { - clen = MIN(len - boff, br->br_iov[i].iov_len - - voff); - memcpy(br->br_iov[i].iov_base + voff, - buf + boff, clen); - if (clen < br->br_iov[i].iov_len - voff) - voff += clen; - else { - i++; - voff = 0; - } - boff += clen; - } while (boff < len); - off += len; - br->br_resid -= len; - } - break; - case BOP_WRITE: - if (bc->bc_rdonly) { - err = EROFS; - break; - } - if (buf == NULL) { - if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, - br->br_offset)) < 0) - err = errno; - else - br->br_resid -= len; - break; - } - i = 0; - off = voff = 0; - while (br->br_resid > 0) { - len = MIN(br->br_resid, MAXPHYS); - boff = 0; - do { - clen = MIN(len - boff, br->br_iov[i].iov_len - - voff); - memcpy(buf + boff, - br->br_iov[i].iov_base + voff, clen); - if (clen < br->br_iov[i].iov_len - voff) - voff += clen; - else { - i++; - voff = 0; - } - boff += clen; - } while (boff < len); - if (pwrite(bc->bc_fd, buf, len, br->br_offset + - off) < 0) { - err = errno; - break; - } - off += len; - br->br_resid -= len; - } - break; - case BOP_FLUSH: - if (bc->bc_ischr) { - if (ioctl(bc->bc_fd, DIOCGFLUSH)) - err = errno; - } else if (fsync(bc->bc_fd)) - err = errno; - break; - case BOP_DELETE: - if (!bc->bc_candelete) - err = EOPNOTSUPP; - else if (bc->bc_rdonly) - err = EROFS; - else if (bc->bc_ischr) { - arg[0] = br->br_offset; - arg[1] = br->br_resid; - if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) - err = errno; - else - br->br_resid = 0; - } else - err = EOPNOTSUPP; - break; - default: - err = EINVAL; - break; - } - - be->be_status = BST_DONE; - - (*br->br_callback)(br, err); -} - -static void * -blockif_thr(void *arg) -{ - struct blockif_ctxt *bc; - struct blockif_elem *be; - pthread_t t; - uint8_t *buf; - - bc = arg; - if (bc->bc_isgeom) - buf = malloc(MAXPHYS); - else - buf = NULL; - t = 
pthread_self(); - - pthread_mutex_lock(&bc->bc_mtx); - for (;;) { - while (blockif_dequeue(bc, t, &be)) { - pthread_mutex_unlock(&bc->bc_mtx); - blockif_proc(bc, be, buf); - pthread_mutex_lock(&bc->bc_mtx); - blockif_complete(bc, be); - } - /* Check ctxt status here to see if exit requested */ - if (bc->bc_closing) - break; - pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); - } - pthread_mutex_unlock(&bc->bc_mtx); - - if (buf) - free(buf); - pthread_exit(NULL); - return (NULL); -} - -static void -blockif_sigcont_handler(int signal, enum ev_type type, void *arg) -{ - struct blockif_sig_elem *bse; - - for (;;) { - /* - * Process the entire list even if not intended for - * this thread. - */ - do { - bse = blockif_bse_head; - if (bse == NULL) - return; - } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, - (uintptr_t)bse, - (uintptr_t)bse->bse_next)); - - pthread_mutex_lock(&bse->bse_mtx); - bse->bse_pending = 0; - pthread_cond_signal(&bse->bse_cond); - pthread_mutex_unlock(&bse->bse_mtx); - } -} - -static void -blockif_init(void) -{ - mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); - (void) signal(SIGCONT, SIG_IGN); -} - -struct blockif_ctxt * -blockif_open(const char *optstr, const char *ident) -{ - char tname[MAXCOMLEN + 1]; - char name[MAXPATHLEN]; - char *nopt, *xopts, *cp; - struct blockif_ctxt *bc; - struct stat sbuf; - struct diocgattr_arg arg; - off_t size, psectsz, psectoff; - int extra, fd, i, sectsz; - int nocache, sync, ro, candelete, geom, ssopt, pssopt; -#ifndef WITHOUT_CAPSICUM - cap_rights_t rights; - cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE }; -#endif - - pthread_once(&blockif_once, blockif_init); - - fd = -1; - ssopt = 0; - nocache = 0; - sync = 0; - ro = 0; - /* - * The first element in the optstring is always a pathname. - * Optional elements follow + * Find the block device backend that matches the user-provided + * device name. block_backend_set is built using a linker set. 
*/ - nopt = xopts = strdup(optstr); - while (xopts != NULL) { - cp = strsep(&xopts, ","); - if (cp == nopt) /* file or device pathname */ - continue; - else if (!strcmp(cp, "nocache")) - nocache = 1; - else if (!strcmp(cp, "sync") || !strcmp(cp, "direct")) - sync = 1; - else if (!strcmp(cp, "ro")) - ro = 1; - else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2) - ; - else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1) - pssopt = ssopt; - else { - EPRINTLN("Invalid device option \"%s\"", cp); - goto err; - } - } - - extra = 0; - if (nocache) - extra |= O_DIRECT; - if (sync) - extra |= O_SYNC; - - fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra); - if (fd < 0 && !ro) { - /* Attempt a r/w fail with a r/o open */ - fd = open(nopt, O_RDONLY | extra); - ro = 1; - } - - if (fd < 0) { - warn("Could not open backing file: %s", nopt); - goto err; - } - - if (fstat(fd, &sbuf) < 0) { - warn("Could not stat backing file %s", nopt); - goto err; - } - -#ifndef WITHOUT_CAPSICUM - cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK, - CAP_WRITE); - if (ro) - cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE); - - if (caph_rights_limit(fd, &rights) == -1) - errx(EX_OSERR, "Unable to apply rights for sandbox"); -#endif - - /* - * Deal with raw devices - */ - size = sbuf.st_size; - sectsz = DEV_BSIZE; - psectsz = psectoff = 0; - candelete = geom = 0; - if (S_ISCHR(sbuf.st_mode)) { - if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || - ioctl(fd, DIOCGSECTORSIZE, §sz)) { - perror("Could not fetch dev blk/sector size"); - goto err; - } - assert(size != 0); - assert(sectsz != 0); - if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) - ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); - strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); - arg.len = sizeof(arg.value.i); - if (ioctl(fd, DIOCGATTR, &arg) == 0) - candelete = arg.value.i; - if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) - geom = 1; - } else - psectsz = sbuf.st_blksize; - -#ifndef WITHOUT_CAPSICUM - if 
(caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) - errx(EX_OSERR, "Unable to apply rights for sandbox"); -#endif - - if (ssopt != 0) { - if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || - ssopt > pssopt) { - EPRINTLN("Invalid sector size %d/%d", - ssopt, pssopt); - goto err; - } - + SET_FOREACH(bbe, block_backend_set) { /* - * Some backend drivers (e.g. cd0, ada0) require that the I/O - * size be a multiple of the device's sector size. - * - * Validate that the emulated sector size complies with this - * requirement. + * We match the backend_name against the names of the + * backends until we have a match */ - if (S_ISCHR(sbuf.st_mode)) { - if (ssopt < sectsz || (ssopt % sectsz) != 0) { - EPRINTLN("Sector size %d incompatible " - "with underlying device sector size %d", - ssopt, sectsz); - goto err; + /* + * Local access has a pattern like: + * 3:0,virtio-blk,file/somewhere/guest.img + * 3:0,virtio-blk,/dev/xxxx + * Or new style: + * 3:0,virtio-blk,file:file/somewhere/guest.img + * 3:0,virtio-blk,file:/dev/xxxxx + * For local access, the file or device referenced in optstr shall exist. + * This is handled by the blk-local backend. + * If this does not match then other backends in the block_backend_set have + * their bb_open() called. The first one returning a non-NULL backend pointer + * is a match and is used with the specification in optstr. + */ + if (strcmp(backend_name, (*bbe)->bb_name) == 0) { + ret = (*bbe)->bb_open(optrest, ident); + /* fill in the backend that is used to open this request */ + if (ret != NULL) { + ret->be = *bbe; + return (ret); } + break; } - - sectsz = ssopt; - psectsz = pssopt; - psectoff = 0; + } + /* + * We did not find any matching block backend drivers to 'optstr' + * Can we dynamically load a driver? 
+ */ + char dlpath[MAXPATHLEN] = "/usr/lib/libblock_"; + char dlbb[MAXPATHLEN] = "block_backend_"; + void *dlfd; - bc = calloc(1, sizeof(struct blockif_ctxt)); - if (bc == NULL) { - perror("calloc"); - goto err; + strcat(dlpath, backend_name); + strcat(dlpath, ".so"); + if (( dlfd = dlopen(dlpath, RTLD_NOW)) == NULL) { + /* report error and exit */ + printf("dlopen error for %s: %s. \n", dlpath, dlerror()); + return (NULL); } - - bc->bc_magic = BLOCKIF_SIG; - bc->bc_fd = fd; - bc->bc_ischr = S_ISCHR(sbuf.st_mode); - bc->bc_isgeom = geom; - bc->bc_candelete = candelete; - bc->bc_rdonly = ro; - bc->bc_size = size; - bc->bc_sectsz = sectsz; - bc->bc_psectsz = psectsz; - bc->bc_psectoff = psectoff; - pthread_mutex_init(&bc->bc_mtx, NULL); - pthread_cond_init(&bc->bc_cond, NULL); - TAILQ_INIT(&bc->bc_freeq); - TAILQ_INIT(&bc->bc_pendq); - TAILQ_INIT(&bc->bc_busyq); - for (i = 0; i < BLOCKIF_MAXREQ; i++) { - bc->bc_reqs[i].be_status = BST_FREE; - TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); + strcat(dlbb, backend_name); + struct block_backend *dynbb = (struct block_backend*)dlsym( dlfd, dlbb); + if (dynbb == NULL) { + /* report error and exit */ + printf("dlsym error for %s: %s. 
\n", dlbb, dlerror()); + return (NULL); } - for (i = 0; i < BLOCKIF_NUMTHR; i++) { - pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); - snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); - pthread_set_name_np(bc->bc_btid[i], tname); + /* now execute the open of the new found backend */ + // DATA_SET(block_backend_set, dynbb); + ret = dynbb->bb_open(optrest, ident); + /* fill in the backend that is used to open this request */ + if (ret != NULL) { + ret->be = dynbb; + return (ret); } - - return (bc); -err: - if (fd >= 0) - close(fd); - free(nopt); return (NULL); } -static int -blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, - enum blockop op) -{ - int err; - - err = 0; - - pthread_mutex_lock(&bc->bc_mtx); - if (!TAILQ_EMPTY(&bc->bc_freeq)) { - /* - * Enqueue and inform the block i/o thread - * that there is work available - */ - if (blockif_enqueue(bc, breq, op)) - pthread_cond_signal(&bc->bc_cond); - } else { - /* - * Callers are not allowed to enqueue more than - * the specified blockif queue limit. Return an - * error to indicate that the queue length has been - * exceeded. 
- */ - err = E2BIG; - } - pthread_mutex_unlock(&bc->bc_mtx); - - return (err); -} - int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) { - - assert(bc->bc_magic == BLOCKIF_SIG); - return (blockif_request(bc, breq, BOP_READ)); + return ((bc->be)->bb_read(bc, breq)); } int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) { - - assert(bc->bc_magic == BLOCKIF_SIG); - return (blockif_request(bc, breq, BOP_WRITE)); + return ((bc->be)->bb_write(bc, breq)); } int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) { - - assert(bc->bc_magic == BLOCKIF_SIG); - return (blockif_request(bc, breq, BOP_FLUSH)); + return ((bc->be)->bb_flush(bc, breq)); } int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) { - - assert(bc->bc_magic == BLOCKIF_SIG); - return (blockif_request(bc, breq, BOP_DELETE)); + return ((bc->be)->bb_delete(bc, breq)); } int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) { - struct blockif_elem *be; - - assert(bc->bc_magic == BLOCKIF_SIG); - - pthread_mutex_lock(&bc->bc_mtx); - /* - * Check pending requests. - */ - TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { - if (be->be_req == breq) - break; - } - if (be != NULL) { - /* - * Found it. - */ - blockif_complete(bc, be); - pthread_mutex_unlock(&bc->bc_mtx); - - return (0); - } - - /* - * Check in-flight requests. - */ - TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { - if (be->be_req == breq) - break; - } - if (be == NULL) { - /* - * Didn't find it. - */ - pthread_mutex_unlock(&bc->bc_mtx); - return (EINVAL); - } - - /* - * Interrupt the processing thread to force it return - * prematurely via it's normal callback path. 
- */ - while (be->be_status == BST_BUSY) { - struct blockif_sig_elem bse, *old_head; - - pthread_mutex_init(&bse.bse_mtx, NULL); - pthread_cond_init(&bse.bse_cond, NULL); - - bse.bse_pending = 1; - - do { - old_head = blockif_bse_head; - bse.bse_next = old_head; - } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, - (uintptr_t)old_head, - (uintptr_t)&bse)); - - pthread_kill(be->be_tid, SIGCONT); - - pthread_mutex_lock(&bse.bse_mtx); - while (bse.bse_pending) - pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); - pthread_mutex_unlock(&bse.bse_mtx); - } - - pthread_mutex_unlock(&bc->bc_mtx); - - /* - * The processing thread has been interrupted. Since it's not - * clear if the callback has been invoked yet, return EBUSY. - */ - return (EBUSY); + return ((bc->be)->bb_cancel(bc, breq)); } int blockif_close(struct blockif_ctxt *bc) { - void *jval; - int i; - - assert(bc->bc_magic == BLOCKIF_SIG); - - /* - * Stop the block i/o thread - */ - pthread_mutex_lock(&bc->bc_mtx); - bc->bc_closing = 1; - pthread_mutex_unlock(&bc->bc_mtx); - pthread_cond_broadcast(&bc->bc_cond); - for (i = 0; i < BLOCKIF_NUMTHR; i++) - pthread_join(bc->bc_btid[i], &jval); - - /* XXX Cancel queued i/o's ??? */ - - /* - * Release resources - */ - bc->bc_magic = 0; - close(bc->bc_fd); - free(bc); - - return (0); + return ((bc->be)->bb_close(bc)); } -/* - * Return virtual C/H/S values for a given block. Use the algorithm - * outlined in the VHD specification to calculate values. 
- */ void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) { - off_t sectors; /* total sectors of the block dev */ - off_t hcyl; /* cylinders times heads */ - uint16_t secpt; /* sectors per track */ - uint8_t heads; - - assert(bc->bc_magic == BLOCKIF_SIG); - - sectors = bc->bc_size / bc->bc_sectsz; - - /* Clamp the size to the largest possible with CHS */ - if (sectors > 65535UL*16*255) - sectors = 65535UL*16*255; - - if (sectors >= 65536UL*16*63) { - secpt = 255; - heads = 16; - hcyl = sectors / secpt; - } else { - secpt = 17; - hcyl = sectors / secpt; - heads = (hcyl + 1023) / 1024; - - if (heads < 4) - heads = 4; - - if (hcyl >= (heads * 1024) || heads > 16) { - secpt = 31; - heads = 16; - hcyl = sectors / secpt; - } - if (hcyl >= (heads * 1024)) { - secpt = 63; - heads = 16; - hcyl = sectors / secpt; - } - } - - *c = hcyl / heads; - *h = heads; - *s = secpt; + (bc->be)->bb_chs(bc, c, h, s); } -/* - * Accessors - */ off_t blockif_size(struct blockif_ctxt *bc) { - - assert(bc->bc_magic == BLOCKIF_SIG); - return (bc->bc_size); + return ((bc->be)->bb_size(bc)); } int blockif_sectsz(struct blockif_ctxt *bc) { - - assert(bc->bc_magic == BLOCKIF_SIG); - return (bc->bc_sectsz); + return ((bc->be)->bb_sectsz(bc)); } void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) { - - assert(bc->bc_magic == BLOCKIF_SIG); - *size = bc->bc_psectsz; - *off = bc->bc_psectoff; + (bc->be)->bb_psectsz(bc, size, off); } int blockif_queuesz(struct blockif_ctxt *bc) { - - assert(bc->bc_magic == BLOCKIF_SIG); - return (BLOCKIF_MAXREQ - 1); + return ((bc->be)->bb_queuesz(bc)); } int blockif_is_ro(struct blockif_ctxt *bc) { - - assert(bc->bc_magic == BLOCKIF_SIG); - return (bc->bc_rdonly); + return ((bc->be)->bb_is_ro(bc)); } int blockif_candelete(struct blockif_ctxt *bc) { - - assert(bc->bc_magic == BLOCKIF_SIG); - return (bc->bc_candelete); + return ((bc->be)->bb_candelete(bc)); } Index: usr.sbin/bhyve/block_local.c 
=================================================================== --- usr.sbin/bhyve/block_local.c +++ usr.sbin/bhyve/block_local.c @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2013 Peter Grehan + * Copyright (c) 2020 Willem Jan Withagen * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -25,17 +25,16 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/block_if.c 356523 2020-01-08 22:55:22Z vmaffione $ + * $FreeBSD$ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/block_if.c 356523 2020-01-08 22:55:22Z vmaffione $"); +__FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif -#include #include #include #include @@ -63,71 +62,21 @@ #include "mevent.h" #include "block_if.h" -#define BLOCKIF_SIG 0xb109b109 +#include -#define BLOCKIF_NUMTHR 8 -#define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR) +static pthread_once_t blocklocal_once = PTHREAD_ONCE_INIT; -enum blockop { - BOP_READ, - BOP_WRITE, - BOP_FLUSH, - BOP_DELETE -}; - -enum blockstat { - BST_FREE, - BST_BLOCK, - BST_PEND, - BST_BUSY, - BST_DONE -}; - -struct blockif_elem { - TAILQ_ENTRY(blockif_elem) be_link; - struct blockif_req *be_req; - enum blockop be_op; - enum blockstat be_status; - pthread_t be_tid; - off_t be_block; -}; - -struct blockif_ctxt { - int bc_magic; - int bc_fd; - int bc_ischr; - int bc_isgeom; - int bc_candelete; - int bc_rdonly; - off_t bc_size; - int bc_sectsz; - int bc_psectsz; - int bc_psectoff; - int bc_closing; - pthread_t bc_btid[BLOCKIF_NUMTHR]; - pthread_mutex_t bc_mtx; - pthread_cond_t bc_cond; - - /* Request elements and free/pending/busy queues */ - TAILQ_HEAD(, blockif_elem) bc_freeq; - TAILQ_HEAD(, blockif_elem) bc_pendq; - TAILQ_HEAD(, blockif_elem) bc_busyq; - struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; -}; - -static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; - -struct 
blockif_sig_elem { +struct blocklocal_sig_elem { pthread_mutex_t bse_mtx; pthread_cond_t bse_cond; int bse_pending; - struct blockif_sig_elem *bse_next; + struct blocklocal_sig_elem *bse_next; }; -static struct blockif_sig_elem *blockif_bse_head; +static struct blocklocal_sig_elem *blocklocal_bse_head; static int -blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, +blocklocal_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { struct blockif_elem *be, *tbe; @@ -171,7 +120,7 @@ } static int -blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) +blocklocal_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) { struct blockif_elem *be; @@ -191,7 +140,7 @@ } static void -blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) +blocklocal_complete(struct blockif_ctxt *bc, struct blockif_elem *be) { struct blockif_elem *tbe; @@ -210,7 +159,7 @@ } static void -blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) +blocklocal_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) { struct blockif_req *br; off_t arg[2]; @@ -224,7 +173,7 @@ switch (be->be_op) { case BOP_READ: if (buf == NULL) { - if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, + if ((len = preadv((int)bc->bc_desc, br->br_iov, br->br_iovcnt, br->br_offset)) < 0) err = errno; else @@ -235,7 +184,7 @@ off = voff = 0; while (br->br_resid > 0) { len = MIN(br->br_resid, MAXPHYS); - if (pread(bc->bc_fd, buf, len, br->br_offset + + if (pread((int)bc->bc_desc, buf, len, br->br_offset + off) < 0) { err = errno; break; @@ -264,7 +213,7 @@ break; } if (buf == NULL) { - if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, + if ((len = pwritev((int)bc->bc_desc, br->br_iov, br->br_iovcnt, br->br_offset)) < 0) err = errno; else @@ -289,7 +238,7 @@ } boff += clen; } while (boff < len); - if (pwrite(bc->bc_fd, buf, len, br->br_offset + + if (pwrite((int)bc->bc_desc, buf, len, 
br->br_offset + off) < 0) { err = errno; break; @@ -300,9 +249,9 @@ break; case BOP_FLUSH: if (bc->bc_ischr) { - if (ioctl(bc->bc_fd, DIOCGFLUSH)) + if (ioctl((int)bc->bc_desc, DIOCGFLUSH)) err = errno; - } else if (fsync(bc->bc_fd)) + } else if (fsync((int)bc->bc_desc)) err = errno; break; case BOP_DELETE: @@ -313,7 +262,7 @@ else if (bc->bc_ischr) { arg[0] = br->br_offset; arg[1] = br->br_resid; - if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) + if (ioctl((int)bc->bc_desc, DIOCGDELETE, arg)) err = errno; else br->br_resid = 0; @@ -331,7 +280,7 @@ } static void * -blockif_thr(void *arg) +blocklocal_thr(void *arg) { struct blockif_ctxt *bc; struct blockif_elem *be; @@ -347,11 +296,11 @@ pthread_mutex_lock(&bc->bc_mtx); for (;;) { - while (blockif_dequeue(bc, t, &be)) { + while (blocklocal_dequeue(bc, t, &be)) { pthread_mutex_unlock(&bc->bc_mtx); - blockif_proc(bc, be, buf); + blocklocal_proc(bc, be, buf); pthread_mutex_lock(&bc->bc_mtx); - blockif_complete(bc, be); + blocklocal_complete(bc, be); } /* Check ctxt status here to see if exit requested */ if (bc->bc_closing) @@ -367,9 +316,9 @@ } static void -blockif_sigcont_handler(int signal, enum ev_type type, void *arg) +blocklocal_sigcont_handler(int signal, enum ev_type type, void *arg) { - struct blockif_sig_elem *bse; + struct blocklocal_sig_elem *bse; for (;;) { /* @@ -377,10 +326,10 @@ * this thread. 
*/ do { - bse = blockif_bse_head; + bse = blocklocal_bse_head; if (bse == NULL) return; - } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, + } while (!atomic_cmpset_ptr((uintptr_t *)&blocklocal_bse_head, (uintptr_t)bse, (uintptr_t)bse->bse_next)); @@ -391,15 +340,22 @@ } } -static void -blockif_init(void) +void +blocklocal_init(void) { - mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); + mevent_add(SIGCONT, EVF_SIGNAL, blocklocal_sigcont_handler, NULL); (void) signal(SIGCONT, SIG_IGN); } -struct blockif_ctxt * -blockif_open(const char *optstr, const char *ident) +void +blocklocal_cleanup(struct blockif_ctxt *bc) +{ /* empty block + * currently no cleanup required. + */ +} + +static struct blockif_ctxt * +blocklocal_open(const char *optstr, const char *ident) { char tname[MAXCOMLEN + 1]; char name[MAXPATHLEN]; @@ -415,7 +371,7 @@ cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE }; #endif - pthread_once(&blockif_once, blockif_init); + pthread_once(&blocklocal_once, blocklocal_init); fd = -1; ssopt = 0; @@ -466,10 +422,10 @@ goto err; } - if (fstat(fd, &sbuf) < 0) { + if (fstat(fd, &sbuf) < 0) { warn("Could not stat backing file %s", nopt); goto err; - } + } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK, @@ -481,10 +437,10 @@ errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif - /* + /* * Deal with raw devices */ - size = sbuf.st_size; + size = sbuf.st_size; sectsz = DEV_BSIZE; psectsz = psectoff = 0; candelete = geom = 0; @@ -548,7 +504,7 @@ } bc->bc_magic = BLOCKIF_SIG; - bc->bc_fd = fd; + bc->bc_desc = fd; bc->bc_ischr = S_ISCHR(sbuf.st_mode); bc->bc_isgeom = geom; bc->bc_candelete = candelete; @@ -568,7 +524,7 @@ } for (i = 0; i < BLOCKIF_NUMTHR; i++) { - pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); + pthread_create(&bc->bc_btid[i], NULL, blocklocal_thr, bc); snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); pthread_set_name_np(bc->bc_btid[i], tname); } @@ -595,7 +551,7 
@@ * Enqueue and inform the block i/o thread * that there is work available */ - if (blockif_enqueue(bc, breq, op)) + if (blocklocal_enqueue(bc, breq, op)) pthread_cond_signal(&bc->bc_cond); } else { /* @@ -611,40 +567,40 @@ return (err); } -int -blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) +static int +blocklocal_read(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_READ)); } -int -blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) +static int +blocklocal_write(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_WRITE)); } -int -blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) +static int +blocklocal_flush(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_FLUSH)); } -int -blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) +static int +blocklocal_delete(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_DELETE)); } -int -blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) +static int +blocklocal_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) { struct blockif_elem *be; @@ -662,7 +618,7 @@ /* * Found it. */ - blockif_complete(bc, be); + blocklocal_complete(bc, be); pthread_mutex_unlock(&bc->bc_mtx); return (0); @@ -688,7 +644,7 @@ * prematurely via it's normal callback path. 
*/ while (be->be_status == BST_BUSY) { - struct blockif_sig_elem bse, *old_head; + struct blocklocal_sig_elem bse, *old_head; pthread_mutex_init(&bse.bse_mtx, NULL); pthread_cond_init(&bse.bse_cond, NULL); @@ -696,9 +652,9 @@ bse.bse_pending = 1; do { - old_head = blockif_bse_head; + old_head = blocklocal_bse_head; bse.bse_next = old_head; - } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, + } while (!atomic_cmpset_ptr((uintptr_t *)&blocklocal_bse_head, (uintptr_t)old_head, (uintptr_t)&bse)); @@ -719,8 +675,8 @@ return (EBUSY); } -int -blockif_close(struct blockif_ctxt *bc) +static int +blocklocal_close(struct blockif_ctxt *bc) { void *jval; int i; @@ -743,7 +699,7 @@ * Release resources */ bc->bc_magic = 0; - close(bc->bc_fd); + close((int)bc->bc_desc); free(bc); return (0); @@ -753,8 +709,8 @@ * Return virtual C/H/S values for a given block. Use the algorithm * outlined in the VHD specification to calculate values. */ -void -blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) +static void +blocklocal_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) { off_t sectors; /* total sectors of the block dev */ off_t hcyl; /* cylinders times heads */ @@ -801,24 +757,24 @@ /* * Accessors */ -off_t -blockif_size(struct blockif_ctxt *bc) +static off_t +blocklocal_size(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_size); } -int -blockif_sectsz(struct blockif_ctxt *bc) +static int +blocklocal_sectsz(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_sectsz); } -void -blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) +static void +blocklocal_psectsz(struct blockif_ctxt *bc, int *size, int *off) { assert(bc->bc_magic == BLOCKIF_SIG); @@ -826,26 +782,47 @@ *off = bc->bc_psectoff; } -int -blockif_queuesz(struct blockif_ctxt *bc) +static int +blocklocal_queuesz(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (BLOCKIF_MAXREQ - 1); } -int 
-blockif_is_ro(struct blockif_ctxt *bc) +static int +blocklocal_is_ro(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_rdonly); } -int -blockif_candelete(struct blockif_ctxt *bc) +static int +blocklocal_candelete(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_candelete); } + +struct block_backend blocklocal_backend = { + .bb_name = "file", + .bb_init = blocklocal_init, + .bb_cleanup = blocklocal_cleanup, + .bb_open = blocklocal_open, + .bb_size = blocklocal_size, + .bb_chs = blocklocal_chs, + .bb_sectsz = blocklocal_sectsz, + .bb_psectsz = blocklocal_psectsz, + .bb_queuesz = blocklocal_queuesz, + .bb_is_ro = blocklocal_is_ro, + .bb_candelete = blocklocal_candelete, + .bb_read = blocklocal_read, + .bb_write = blocklocal_write, + .bb_flush = blocklocal_flush, + .bb_delete = blocklocal_delete, + .bb_cancel = blocklocal_cancel, + .bb_close = blocklocal_close, +}; +DATA_SET(block_backend_set, blocklocal_backend);