Index: head/sys/cam/ctl/ctl_backend_block.c =================================================================== --- head/sys/cam/ctl/ctl_backend_block.c (revision 356199) +++ head/sys/cam/ctl/ctl_backend_block.c (revision 356200) @@ -1,2901 +1,2889 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003 Silicon Graphics International Corp. * Copyright (c) 2009-2011 Spectra Logic Corporation * Copyright (c) 2012 The FreeBSD Foundation * Copyright (c) 2014-2015 Alexander Motin * All rights reserved. * * Portions of this software were developed by Edward Tomasz Napierala * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_backend_block.c#5 $ */ /* * CAM Target Layer driver backend for block devices. * * Author: Ken Merry */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The idea here is that we'll allocate enough S/G space to hold a 1MB * I/O. If we get an I/O larger than that, we'll split it. */ #define CTLBLK_HALF_IO_SIZE (512 * 1024) #define CTLBLK_MAX_IO_SIZE (CTLBLK_HALF_IO_SIZE * 2) #define CTLBLK_MAX_SEG MAXPHYS #define CTLBLK_HALF_SEGS MAX(CTLBLK_HALF_IO_SIZE / CTLBLK_MAX_SEG, 1) #define CTLBLK_MAX_SEGS (CTLBLK_HALF_SEGS * 2) #ifdef CTLBLK_DEBUG #define DPRINTF(fmt, args...) \ printf("cbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) #else #define DPRINTF(fmt, args...) do {} while(0) #endif #define PRIV(io) \ ((struct ctl_ptr_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_BACKEND]) #define ARGS(io) \ ((struct ctl_lba_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]) SDT_PROVIDER_DEFINE(cbb); typedef enum { CTL_BE_BLOCK_LUN_UNCONFIGURED = 0x01, CTL_BE_BLOCK_LUN_CONFIG_ERR = 0x02, CTL_BE_BLOCK_LUN_WAITING = 0x04, } ctl_be_block_lun_flags; typedef enum { CTL_BE_BLOCK_NONE, CTL_BE_BLOCK_DEV, CTL_BE_BLOCK_FILE } ctl_be_block_type; struct ctl_be_block_filedata { struct ucred *cred; }; union ctl_be_block_bedata { struct ctl_be_block_filedata file; }; struct ctl_be_block_io; struct ctl_be_block_lun; typedef void (*cbb_dispatch_t)(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); typedef uint64_t (*cbb_getattr_t)(struct ctl_be_block_lun *be_lun, const char *attrname); /* * Backend LUN structure. There is a 1:1 mapping between a block device * and a backend block LUN, and between a backend block LUN and a CTL LUN. */ struct ctl_be_block_lun { struct ctl_lun_create_params params; char lunname[32]; char *dev_path; ctl_be_block_type dev_type; struct vnode *vn; union ctl_be_block_bedata backend; cbb_dispatch_t dispatch; cbb_dispatch_t lun_flush; cbb_dispatch_t unmap; cbb_dispatch_t get_lba_status; cbb_getattr_t getattr; uma_zone_t lun_zone; uint64_t size_blocks; uint64_t size_bytes; struct ctl_be_block_softc *softc; struct devstat *disk_stats; ctl_be_block_lun_flags flags; STAILQ_ENTRY(ctl_be_block_lun) links; struct ctl_be_lun cbe_lun; struct taskqueue *io_taskqueue; struct task io_task; int num_threads; STAILQ_HEAD(, ctl_io_hdr) input_queue; STAILQ_HEAD(, ctl_io_hdr) config_read_queue; STAILQ_HEAD(, ctl_io_hdr) config_write_queue; STAILQ_HEAD(, ctl_io_hdr) datamove_queue; struct mtx_padalign io_lock; struct mtx_padalign queue_lock; }; /* * Overall softc structure for the block backend module. */ struct ctl_be_block_softc { struct mtx lock; uma_zone_t beio_zone; int num_luns; STAILQ_HEAD(, ctl_be_block_lun) lun_list; }; static struct ctl_be_block_softc backend_block_softc; /* * Per-I/O information. */ struct ctl_be_block_io { union ctl_io *io; struct ctl_sg_entry sg_segs[CTLBLK_MAX_SEGS]; struct iovec xiovecs[CTLBLK_MAX_SEGS]; int bio_cmd; int num_segs; int num_bios_sent; int num_bios_done; int send_complete; int first_error; uint64_t first_error_offset; struct bintime ds_t0; devstat_tag_type ds_tag_type; devstat_trans_flags ds_trans_type; uint64_t io_len; uint64_t io_offset; int io_arg; struct ctl_be_block_softc *softc; struct ctl_be_block_lun *lun; void (*beio_cont)(struct ctl_be_block_io *beio); /* to continue processing */ }; extern struct ctl_softc *control_softc; static int cbb_num_threads = 14; SYSCTL_NODE(_kern_cam_ctl, OID_AUTO, block, CTLFLAG_RD, 0, "CAM Target Layer Block Backend"); SYSCTL_INT(_kern_cam_ctl_block, OID_AUTO, num_threads, CTLFLAG_RWTUN, &cbb_num_threads, 0, "Number of threads per backing file"); static struct ctl_be_block_io *ctl_alloc_beio(struct ctl_be_block_softc *softc); static void ctl_free_beio(struct ctl_be_block_io *beio); static void ctl_complete_beio(struct ctl_be_block_io *beio); static int ctl_be_block_move_done(union ctl_io *io); static void ctl_be_block_biodone(struct bio *bio); static void ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); static void ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); static void ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); static uint64_t ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun, const char *attrname); static void ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); static void ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); static void ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); static uint64_t ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun, const char *attrname); static void ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun, union ctl_io *io); static void ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun, union ctl_io *io); static void ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun, union ctl_io *io); static void ctl_be_block_worker(void *context, int pending); static int ctl_be_block_submit(union ctl_io *io); static int ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td); static int ctl_be_block_open_file(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req); static int ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req); static int ctl_be_block_close(struct ctl_be_block_lun *be_lun); static int ctl_be_block_open(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req); static int ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req); static int ctl_be_block_rm(struct ctl_be_block_softc *softc, struct ctl_lun_req *req); static int ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req); static void ctl_be_block_lun_shutdown(void *be_lun); static void ctl_be_block_lun_config_status(void *be_lun, ctl_lun_config_status status); static int ctl_be_block_config_write(union ctl_io *io); static int ctl_be_block_config_read(union ctl_io *io); static int ctl_be_block_lun_info(void *be_lun, struct sbuf *sb); static uint64_t ctl_be_block_lun_attr(void *be_lun, const char *attrname); static int ctl_be_block_init(void); static int ctl_be_block_shutdown(void); static struct ctl_backend_driver ctl_be_block_driver = { .name = "block", .flags = CTL_BE_FLAG_HAS_CONFIG, .init = ctl_be_block_init, .shutdown = ctl_be_block_shutdown, .data_submit = ctl_be_block_submit, .data_move_done = ctl_be_block_move_done, .config_read = ctl_be_block_config_read, .config_write = ctl_be_block_config_write, .ioctl = ctl_be_block_ioctl, .lun_info = ctl_be_block_lun_info, .lun_attr = ctl_be_block_lun_attr }; MALLOC_DEFINE(M_CTLBLK, "ctlblk", "Memory used for CTL block backend"); CTL_BACKEND_DECLARE(cbb, ctl_be_block_driver); static struct ctl_be_block_io * ctl_alloc_beio(struct ctl_be_block_softc *softc) { struct ctl_be_block_io *beio; beio = uma_zalloc(softc->beio_zone, M_WAITOK | M_ZERO); beio->softc = softc; return (beio); } static void ctl_free_beio(struct ctl_be_block_io *beio) { int duplicate_free; int i; duplicate_free = 0; for (i = 0; i < beio->num_segs; i++) { if (beio->sg_segs[i].addr == NULL) duplicate_free++; uma_zfree(beio->lun->lun_zone, beio->sg_segs[i].addr); beio->sg_segs[i].addr = NULL; /* For compare we had two equal S/G lists. */ if (ARGS(beio->io)->flags & CTL_LLF_COMPARE) { uma_zfree(beio->lun->lun_zone, beio->sg_segs[i + CTLBLK_HALF_SEGS].addr); beio->sg_segs[i + CTLBLK_HALF_SEGS].addr = NULL; } } if (duplicate_free > 0) { printf("%s: %d duplicate frees out of %d segments\n", __func__, duplicate_free, beio->num_segs); } uma_zfree(beio->softc->beio_zone, beio); } static void ctl_complete_beio(struct ctl_be_block_io *beio) { union ctl_io *io = beio->io; if (beio->beio_cont != NULL) { beio->beio_cont(beio); } else { ctl_free_beio(beio); ctl_data_submit_done(io); } } static size_t cmp(uint8_t *a, uint8_t *b, size_t size) { size_t i; for (i = 0; i < size; i++) { if (a[i] != b[i]) break; } return (i); } static void ctl_be_block_compare(union ctl_io *io) { struct ctl_be_block_io *beio; uint64_t off, res; int i; uint8_t info[8]; beio = (struct ctl_be_block_io *)PRIV(io)->ptr; off = 0; for (i = 0; i < beio->num_segs; i++) { res = cmp(beio->sg_segs[i].addr, beio->sg_segs[i + CTLBLK_HALF_SEGS].addr, beio->sg_segs[i].len); off += res; if (res < beio->sg_segs[i].len) break; } if (i < beio->num_segs) { scsi_u64to8b(off, info); ctl_set_sense(&io->scsiio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_MISCOMPARE, /*asc*/ 0x1D, /*ascq*/ 0x00, /*type*/ SSD_ELEM_INFO, /*size*/ sizeof(info), /*data*/ &info, /*type*/ SSD_ELEM_NONE); } else ctl_set_success(&io->scsiio); } static int ctl_be_block_move_done(union ctl_io *io) { struct ctl_be_block_io *beio; struct ctl_be_block_lun *be_lun; struct ctl_lba_len_flags *lbalen; #ifdef CTL_TIME_IO struct bintime cur_bt; #endif beio = (struct ctl_be_block_io *)PRIV(io)->ptr; be_lun = beio->lun; DPRINTF("entered\n"); #ifdef CTL_TIME_IO getbinuptime(&cur_bt); bintime_sub(&cur_bt, &io->io_hdr.dma_start_bt); bintime_add(&io->io_hdr.dma_bt, &cur_bt); #endif io->io_hdr.num_dmas++; io->scsiio.kern_rel_offset += io->scsiio.kern_data_len; /* * We set status at this point for read commands, and write * commands with errors. */ if (io->io_hdr.flags & CTL_FLAG_ABORT) { ; } else if ((io->io_hdr.port_status != 0) && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE || (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) { ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1, /*retry_count*/ io->io_hdr.port_status); } else if (io->scsiio.kern_data_resid != 0 && (io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_OUT && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE || (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) { ctl_set_invalid_field_ciu(&io->scsiio); } else if ((io->io_hdr.port_status == 0) && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE)) { lbalen = ARGS(beio->io); if (lbalen->flags & CTL_LLF_READ) { ctl_set_success(&io->scsiio); } else if (lbalen->flags & CTL_LLF_COMPARE) { /* We have two data blocks ready for comparison. */ ctl_be_block_compare(io); } } /* * If this is a read, or a write with errors, it is done. */ if ((beio->bio_cmd == BIO_READ) || ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0) || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE)) { ctl_complete_beio(beio); return (0); } /* * At this point, we have a write and the DMA completed * successfully. We now have to queue it to the task queue to * execute the backend I/O. That is because we do blocking * memory allocations, and in the file backing case, blocking I/O. * This move done routine is generally called in the SIM's * interrupt context, and therefore we cannot block. */ mtx_lock(&be_lun->queue_lock); STAILQ_INSERT_TAIL(&be_lun->datamove_queue, &io->io_hdr, links); mtx_unlock(&be_lun->queue_lock); taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task); return (0); } static void ctl_be_block_biodone(struct bio *bio) { struct ctl_be_block_io *beio; struct ctl_be_block_lun *be_lun; union ctl_io *io; int error; beio = bio->bio_caller1; be_lun = beio->lun; io = beio->io; DPRINTF("entered\n"); error = bio->bio_error; mtx_lock(&be_lun->io_lock); if (error != 0 && (beio->first_error == 0 || bio->bio_offset < beio->first_error_offset)) { beio->first_error = error; beio->first_error_offset = bio->bio_offset; } beio->num_bios_done++; /* * XXX KDM will this cause WITNESS to complain? Holding a lock * during the free might cause it to complain. */ g_destroy_bio(bio); /* * If the send complete bit isn't set, or we aren't the last I/O to * complete, then we're done. */ if ((beio->send_complete == 0) || (beio->num_bios_done < beio->num_bios_sent)) { mtx_unlock(&be_lun->io_lock); return; } /* * At this point, we've verified that we are the last I/O to * complete, so it's safe to drop the lock. */ devstat_end_transaction(beio->lun->disk_stats, beio->io_len, beio->ds_tag_type, beio->ds_trans_type, /*now*/ NULL, /*then*/&beio->ds_t0); mtx_unlock(&be_lun->io_lock); /* * If there are any errors from the backing device, we fail the * entire I/O with a medium error. */ error = beio->first_error; if (error != 0) { if (error == EOPNOTSUPP) { ctl_set_invalid_opcode(&io->scsiio); } else if (error == ENOSPC || error == EDQUOT) { ctl_set_space_alloc_fail(&io->scsiio); } else if (error == EROFS || error == EACCES) { ctl_set_hw_write_protected(&io->scsiio); } else if (beio->bio_cmd == BIO_FLUSH) { /* XXX KDM is there is a better error here? */ ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1, /*retry_count*/ 0xbad2); } else { ctl_set_medium_error(&io->scsiio, beio->bio_cmd == BIO_READ); } ctl_complete_beio(beio); return; } /* * If this is a write, a flush, a delete or verify, we're all done. * If this is a read, we can now send the data to the user. */ if ((beio->bio_cmd == BIO_WRITE) || (beio->bio_cmd == BIO_FLUSH) || (beio->bio_cmd == BIO_DELETE) || (ARGS(io)->flags & CTL_LLF_VERIFY)) { ctl_set_success(&io->scsiio); ctl_complete_beio(beio); } else { if ((ARGS(io)->flags & CTL_LLF_READ) && beio->beio_cont == NULL) { ctl_set_success(&io->scsiio); ctl_serseq_done(io); } #ifdef CTL_TIME_IO getbinuptime(&io->io_hdr.dma_start_bt); #endif ctl_datamove(io); } } static void ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { union ctl_io *io = beio->io; struct mount *mountpoint; int error, lock_flags; DPRINTF("entered\n"); binuptime(&beio->ds_t0); - mtx_lock(&be_lun->io_lock); devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0); - mtx_unlock(&be_lun->io_lock); (void) vn_start_write(be_lun->vn, &mountpoint, V_WAIT); if (MNT_SHARED_WRITES(mountpoint) || ((mountpoint == NULL) && MNT_SHARED_WRITES(be_lun->vn->v_mount))) lock_flags = LK_SHARED; else lock_flags = LK_EXCLUSIVE; vn_lock(be_lun->vn, lock_flags | LK_RETRY); error = VOP_FSYNC(be_lun->vn, beio->io_arg ? MNT_NOWAIT : MNT_WAIT, curthread); VOP_UNLOCK(be_lun->vn, 0); vn_finished_write(mountpoint); mtx_lock(&be_lun->io_lock); devstat_end_transaction(beio->lun->disk_stats, beio->io_len, beio->ds_tag_type, beio->ds_trans_type, /*now*/ NULL, /*then*/&beio->ds_t0); mtx_unlock(&be_lun->io_lock); if (error == 0) ctl_set_success(&io->scsiio); else { /* XXX KDM is there is a better error here? */ ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1, /*retry_count*/ 0xbad1); } ctl_complete_beio(beio); } SDT_PROBE_DEFINE1(cbb, , read, file_start, "uint64_t"); SDT_PROBE_DEFINE1(cbb, , write, file_start, "uint64_t"); SDT_PROBE_DEFINE1(cbb, , read, file_done,"uint64_t"); SDT_PROBE_DEFINE1(cbb, , write, file_done, "uint64_t"); static void ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { struct ctl_be_block_filedata *file_data; union ctl_io *io; struct uio xuio; struct iovec *xiovec; size_t s; int error, flags, i; DPRINTF("entered\n"); file_data = &be_lun->backend.file; io = beio->io; flags = 0; if (ARGS(io)->flags & CTL_LLF_DPO) flags |= IO_DIRECT; if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA) flags |= IO_SYNC; bzero(&xuio, sizeof(xuio)); if (beio->bio_cmd == BIO_READ) { SDT_PROBE0(cbb, , read, file_start); xuio.uio_rw = UIO_READ; } else { SDT_PROBE0(cbb, , write, file_start); xuio.uio_rw = UIO_WRITE; } xuio.uio_offset = beio->io_offset; xuio.uio_resid = beio->io_len; xuio.uio_segflg = UIO_SYSSPACE; xuio.uio_iov = beio->xiovecs; xuio.uio_iovcnt = beio->num_segs; xuio.uio_td = curthread; for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) { xiovec->iov_base = beio->sg_segs[i].addr; xiovec->iov_len = beio->sg_segs[i].len; } binuptime(&beio->ds_t0); - mtx_lock(&be_lun->io_lock); devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0); - mtx_unlock(&be_lun->io_lock); if (beio->bio_cmd == BIO_READ) { vn_lock(be_lun->vn, LK_SHARED | LK_RETRY); /* * UFS pays attention to IO_DIRECT for reads. If the * DIRECTIO option is configured into the kernel, it calls * ffs_rawread(). But that only works for single-segment * uios with user space addresses. In our case, with a * kernel uio, it still reads into the buffer cache, but it * will just try to release the buffer from the cache later * on in ffs_read(). * * ZFS does not pay attention to IO_DIRECT for reads. * * UFS does not pay attention to IO_SYNC for reads. * * ZFS pays attention to IO_SYNC (which translates into the * Solaris define FRSYNC for zfs_read()) for reads. It * attempts to sync the file before reading. */ error = VOP_READ(be_lun->vn, &xuio, flags, file_data->cred); VOP_UNLOCK(be_lun->vn, 0); SDT_PROBE0(cbb, , read, file_done); if (error == 0 && xuio.uio_resid > 0) { /* * If we red less then requested (EOF), then * we should clean the rest of the buffer. */ s = beio->io_len - xuio.uio_resid; for (i = 0; i < beio->num_segs; i++) { if (s >= beio->sg_segs[i].len) { s -= beio->sg_segs[i].len; continue; } bzero((uint8_t *)beio->sg_segs[i].addr + s, beio->sg_segs[i].len - s); s = 0; } } } else { struct mount *mountpoint; int lock_flags; (void)vn_start_write(be_lun->vn, &mountpoint, V_WAIT); if (MNT_SHARED_WRITES(mountpoint) || ((mountpoint == NULL) && MNT_SHARED_WRITES(be_lun->vn->v_mount))) lock_flags = LK_SHARED; else lock_flags = LK_EXCLUSIVE; vn_lock(be_lun->vn, lock_flags | LK_RETRY); /* * UFS pays attention to IO_DIRECT for writes. The write * is done asynchronously. (Normally the write would just * get put into cache. * * UFS pays attention to IO_SYNC for writes. It will * attempt to write the buffer out synchronously if that * flag is set. * * ZFS does not pay attention to IO_DIRECT for writes. * * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) * for writes. It will flush the transaction from the * cache before returning. */ error = VOP_WRITE(be_lun->vn, &xuio, flags, file_data->cred); VOP_UNLOCK(be_lun->vn, 0); vn_finished_write(mountpoint); SDT_PROBE0(cbb, , write, file_done); } mtx_lock(&be_lun->io_lock); devstat_end_transaction(beio->lun->disk_stats, beio->io_len, beio->ds_tag_type, beio->ds_trans_type, /*now*/ NULL, /*then*/&beio->ds_t0); mtx_unlock(&be_lun->io_lock); /* * If we got an error, set the sense data to "MEDIUM ERROR" and * return the I/O to the user. */ if (error != 0) { if (error == ENOSPC || error == EDQUOT) { ctl_set_space_alloc_fail(&io->scsiio); } else if (error == EROFS || error == EACCES) { ctl_set_hw_write_protected(&io->scsiio); } else { ctl_set_medium_error(&io->scsiio, beio->bio_cmd == BIO_READ); } ctl_complete_beio(beio); return; } /* * If this is a write or a verify, we're all done. * If this is a read, we can now send the data to the user. */ if ((beio->bio_cmd == BIO_WRITE) || (ARGS(io)->flags & CTL_LLF_VERIFY)) { ctl_set_success(&io->scsiio); ctl_complete_beio(beio); } else { if ((ARGS(io)->flags & CTL_LLF_READ) && beio->beio_cont == NULL) { ctl_set_success(&io->scsiio); ctl_serseq_done(io); } #ifdef CTL_TIME_IO getbinuptime(&io->io_hdr.dma_start_bt); #endif ctl_datamove(io); } } static void ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { union ctl_io *io = beio->io; struct ctl_lba_len_flags *lbalen = ARGS(io); struct scsi_get_lba_status_data *data; off_t roff, off; int error, status; DPRINTF("entered\n"); off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize; vn_lock(be_lun->vn, LK_SHARED | LK_RETRY); error = VOP_IOCTL(be_lun->vn, FIOSEEKHOLE, &off, 0, curthread->td_ucred, curthread); if (error == 0 && off > roff) status = 0; /* mapped up to off */ else { error = VOP_IOCTL(be_lun->vn, FIOSEEKDATA, &off, 0, curthread->td_ucred, curthread); if (error == 0 && off > roff) status = 1; /* deallocated up to off */ else { status = 0; /* unknown up to the end */ off = be_lun->size_bytes; } } VOP_UNLOCK(be_lun->vn, 0); data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr; scsi_u64to8b(lbalen->lba, data->descr[0].addr); scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize - lbalen->lba), data->descr[0].length); data->descr[0].status = status; ctl_complete_beio(beio); } static uint64_t ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun, const char *attrname) { struct vattr vattr; struct statfs statfs; uint64_t val; int error; val = UINT64_MAX; if (be_lun->vn == NULL) return (val); vn_lock(be_lun->vn, LK_SHARED | LK_RETRY); if (strcmp(attrname, "blocksused") == 0) { error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred); if (error == 0) val = vattr.va_bytes / be_lun->cbe_lun.blocksize; } if (strcmp(attrname, "blocksavail") == 0 && !VN_IS_DOOMED(be_lun->vn)) { error = VFS_STATFS(be_lun->vn->v_mount, &statfs); if (error == 0) val = statfs.f_bavail * statfs.f_bsize / be_lun->cbe_lun.blocksize; } VOP_UNLOCK(be_lun->vn, 0); return (val); } static void ctl_be_block_dispatch_zvol(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { union ctl_io *io; struct cdevsw *csw; struct cdev *dev; struct uio xuio; struct iovec *xiovec; int error, flags, i, ref; DPRINTF("entered\n"); io = beio->io; flags = 0; if (ARGS(io)->flags & CTL_LLF_DPO) flags |= IO_DIRECT; if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA) flags |= IO_SYNC; bzero(&xuio, sizeof(xuio)); if (beio->bio_cmd == BIO_READ) { SDT_PROBE0(cbb, , read, file_start); xuio.uio_rw = UIO_READ; } else { SDT_PROBE0(cbb, , write, file_start); xuio.uio_rw = UIO_WRITE; } xuio.uio_offset = beio->io_offset; xuio.uio_resid = beio->io_len; xuio.uio_segflg = UIO_SYSSPACE; xuio.uio_iov = beio->xiovecs; xuio.uio_iovcnt = beio->num_segs; xuio.uio_td = curthread; for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) { xiovec->iov_base = beio->sg_segs[i].addr; xiovec->iov_len = beio->sg_segs[i].len; } binuptime(&beio->ds_t0); - mtx_lock(&be_lun->io_lock); devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0); - mtx_unlock(&be_lun->io_lock); csw = devvn_refthread(be_lun->vn, &dev, &ref); if (csw) { if (beio->bio_cmd == BIO_READ) error = csw->d_read(dev, &xuio, flags); else error = csw->d_write(dev, &xuio, flags); dev_relthread(dev, ref); } else error = ENXIO; if (beio->bio_cmd == BIO_READ) SDT_PROBE0(cbb, , read, file_done); else SDT_PROBE0(cbb, , write, file_done); mtx_lock(&be_lun->io_lock); devstat_end_transaction(beio->lun->disk_stats, beio->io_len, beio->ds_tag_type, beio->ds_trans_type, /*now*/ NULL, /*then*/&beio->ds_t0); mtx_unlock(&be_lun->io_lock); /* * If we got an error, set the sense data to "MEDIUM ERROR" and * return the I/O to the user. */ if (error != 0) { if (error == ENOSPC || error == EDQUOT) { ctl_set_space_alloc_fail(&io->scsiio); } else if (error == EROFS || error == EACCES) { ctl_set_hw_write_protected(&io->scsiio); } else { ctl_set_medium_error(&io->scsiio, beio->bio_cmd == BIO_READ); } ctl_complete_beio(beio); return; } /* * If this is a write or a verify, we're all done. * If this is a read, we can now send the data to the user. */ if ((beio->bio_cmd == BIO_WRITE) || (ARGS(io)->flags & CTL_LLF_VERIFY)) { ctl_set_success(&io->scsiio); ctl_complete_beio(beio); } else { if ((ARGS(io)->flags & CTL_LLF_READ) && beio->beio_cont == NULL) { ctl_set_success(&io->scsiio); ctl_serseq_done(io); } #ifdef CTL_TIME_IO getbinuptime(&io->io_hdr.dma_start_bt); #endif ctl_datamove(io); } } static void ctl_be_block_gls_zvol(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { union ctl_io *io = beio->io; struct cdevsw *csw; struct cdev *dev; struct ctl_lba_len_flags *lbalen = ARGS(io); struct scsi_get_lba_status_data *data; off_t roff, off; int error, ref, status; DPRINTF("entered\n"); csw = devvn_refthread(be_lun->vn, &dev, &ref); if (csw == NULL) { status = 0; /* unknown up to the end */ off = be_lun->size_bytes; goto done; } off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize; error = csw->d_ioctl(dev, FIOSEEKHOLE, (caddr_t)&off, FREAD, curthread); if (error == 0 && off > roff) status = 0; /* mapped up to off */ else { error = csw->d_ioctl(dev, FIOSEEKDATA, (caddr_t)&off, FREAD, curthread); if (error == 0 && off > roff) status = 1; /* deallocated up to off */ else { status = 0; /* unknown up to the end */ off = be_lun->size_bytes; } } dev_relthread(dev, ref); done: data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr; scsi_u64to8b(lbalen->lba, data->descr[0].addr); scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize - lbalen->lba), data->descr[0].length); data->descr[0].status = status; ctl_complete_beio(beio); } static void ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { struct bio *bio; struct cdevsw *csw; struct cdev *dev; int ref; DPRINTF("entered\n"); /* This can't fail, it's a blocking allocation. */ bio = g_alloc_bio(); bio->bio_cmd = BIO_FLUSH; bio->bio_offset = 0; bio->bio_data = 0; bio->bio_done = ctl_be_block_biodone; bio->bio_caller1 = beio; bio->bio_pblkno = 0; /* * We don't need to acquire the LUN lock here, because we are only * sending one bio, and so there is no other context to synchronize * with. */ beio->num_bios_sent = 1; beio->send_complete = 1; binuptime(&beio->ds_t0); - mtx_lock(&be_lun->io_lock); devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0); - mtx_unlock(&be_lun->io_lock); csw = devvn_refthread(be_lun->vn, &dev, &ref); if (csw) { bio->bio_dev = dev; csw->d_strategy(bio); dev_relthread(dev, ref); } else { bio->bio_error = ENXIO; ctl_be_block_biodone(bio); } } static void ctl_be_block_unmap_dev_range(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio, uint64_t off, uint64_t len, int last) { struct bio *bio; uint64_t maxlen; struct cdevsw *csw; struct cdev *dev; int ref; csw = devvn_refthread(be_lun->vn, &dev, &ref); maxlen = LONG_MAX - (LONG_MAX % be_lun->cbe_lun.blocksize); while (len > 0) { bio = g_alloc_bio(); bio->bio_cmd = BIO_DELETE; bio->bio_dev = dev; bio->bio_offset = off; bio->bio_length = MIN(len, maxlen); bio->bio_data = 0; bio->bio_done = ctl_be_block_biodone; bio->bio_caller1 = beio; bio->bio_pblkno = off / be_lun->cbe_lun.blocksize; off += bio->bio_length; len -= bio->bio_length; mtx_lock(&be_lun->io_lock); beio->num_bios_sent++; if (last && len == 0) beio->send_complete = 1; mtx_unlock(&be_lun->io_lock); if (csw) { csw->d_strategy(bio); } else { bio->bio_error = ENXIO; ctl_be_block_biodone(bio); } } if (csw) dev_relthread(dev, ref); } static void ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { union ctl_io *io; struct ctl_ptr_len_flags *ptrlen; struct scsi_unmap_desc *buf, *end; uint64_t len; io = beio->io; DPRINTF("entered\n"); binuptime(&beio->ds_t0); - mtx_lock(&be_lun->io_lock); devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0); - mtx_unlock(&be_lun->io_lock); if (beio->io_offset == -1) { beio->io_len = 0; ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; buf = (struct scsi_unmap_desc *)ptrlen->ptr; end = buf + ptrlen->len / sizeof(*buf); for (; buf < end; buf++) { len = (uint64_t)scsi_4btoul(buf->length) * be_lun->cbe_lun.blocksize; beio->io_len += len; ctl_be_block_unmap_dev_range(be_lun, beio, scsi_8btou64(buf->lba) * be_lun->cbe_lun.blocksize, len, (end - buf < 2) ? TRUE : FALSE); } } else ctl_be_block_unmap_dev_range(be_lun, beio, beio->io_offset, beio->io_len, TRUE); } static void ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue); struct bio *bio; struct cdevsw *csw; struct cdev *dev; off_t cur_offset; int i, max_iosize, ref; DPRINTF("entered\n"); csw = devvn_refthread(be_lun->vn, &dev, &ref); /* * We have to limit our I/O size to the maximum supported by the * backend device. Hopefully it is MAXPHYS. If the driver doesn't * set it properly, use DFLTPHYS. */ if (csw) { max_iosize = dev->si_iosize_max; if (max_iosize < PAGE_SIZE) max_iosize = DFLTPHYS; } else max_iosize = DFLTPHYS; cur_offset = beio->io_offset; for (i = 0; i < beio->num_segs; i++) { size_t cur_size; uint8_t *cur_ptr; cur_size = beio->sg_segs[i].len; cur_ptr = beio->sg_segs[i].addr; while (cur_size > 0) { /* This can't fail, it's a blocking allocation. */ bio = g_alloc_bio(); KASSERT(bio != NULL, ("g_alloc_bio() failed!\n")); bio->bio_cmd = beio->bio_cmd; bio->bio_dev = dev; bio->bio_caller1 = beio; bio->bio_length = min(cur_size, max_iosize); bio->bio_offset = cur_offset; bio->bio_data = cur_ptr; bio->bio_done = ctl_be_block_biodone; bio->bio_pblkno = cur_offset / be_lun->cbe_lun.blocksize; cur_offset += bio->bio_length; cur_ptr += bio->bio_length; cur_size -= bio->bio_length; TAILQ_INSERT_TAIL(&queue, bio, bio_queue); beio->num_bios_sent++; } } + beio->send_complete = 1; binuptime(&beio->ds_t0); - mtx_lock(&be_lun->io_lock); devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0); - beio->send_complete = 1; - mtx_unlock(&be_lun->io_lock); /* * Fire off all allocated requests! */ while ((bio = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, bio, bio_queue); if (csw) csw->d_strategy(bio); else { bio->bio_error = ENXIO; ctl_be_block_biodone(bio); } } if (csw) dev_relthread(dev, ref); } static uint64_t ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun, const char *attrname) { struct diocgattr_arg arg; struct cdevsw *csw; struct cdev *dev; int error, ref; csw = devvn_refthread(be_lun->vn, &dev, &ref); if (csw == NULL) return (UINT64_MAX); strlcpy(arg.name, attrname, sizeof(arg.name)); arg.len = sizeof(arg.value.off); if (csw->d_ioctl) { error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD, curthread); } else error = ENODEV; dev_relthread(dev, ref); if (error != 0) return (UINT64_MAX); return (arg.value.off); } static void ctl_be_block_cw_dispatch_sync(struct ctl_be_block_lun *be_lun, union ctl_io *io) { struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; struct ctl_be_block_io *beio; struct ctl_lba_len_flags *lbalen; DPRINTF("entered\n"); beio = (struct ctl_be_block_io *)PRIV(io)->ptr; lbalen = (struct ctl_lba_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; beio->io_len = lbalen->len * cbe_lun->blocksize; beio->io_offset = lbalen->lba * cbe_lun->blocksize; beio->io_arg = (lbalen->flags & SSC_IMMED) != 0; beio->bio_cmd = BIO_FLUSH; beio->ds_trans_type = DEVSTAT_NO_DATA; DPRINTF("SYNC\n"); be_lun->lun_flush(be_lun, beio); } static void ctl_be_block_cw_done_ws(struct ctl_be_block_io *beio) { union ctl_io *io; io = beio->io; ctl_free_beio(beio); if ((io->io_hdr.flags & CTL_FLAG_ABORT) || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE && (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) { ctl_config_write_done(io); return; } ctl_be_block_config_write(io); } static void ctl_be_block_cw_dispatch_ws(struct ctl_be_block_lun *be_lun, union ctl_io *io) { struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; struct ctl_be_block_io *beio; struct ctl_lba_len_flags *lbalen; uint64_t len_left, lba; uint32_t pb, pbo, adj; int i, seglen; uint8_t *buf, *end; DPRINTF("entered\n"); beio = (struct ctl_be_block_io *)PRIV(io)->ptr; lbalen = ARGS(beio->io); if (lbalen->flags & ~(SWS_LBDATA | SWS_UNMAP | SWS_ANCHOR | SWS_NDOB) || (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR) && be_lun->unmap == NULL)) { ctl_free_beio(beio); ctl_set_invalid_field(&io->scsiio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 1, /*bit_valid*/ 0, /*bit*/ 0); ctl_config_write_done(io); return; } if (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR)) { beio->io_offset = lbalen->lba * cbe_lun->blocksize; beio->io_len = (uint64_t)lbalen->len * cbe_lun->blocksize; beio->bio_cmd = BIO_DELETE; beio->ds_trans_type = DEVSTAT_FREE; be_lun->unmap(be_lun, beio); return; } beio->bio_cmd = BIO_WRITE; beio->ds_trans_type = DEVSTAT_WRITE; DPRINTF("WRITE SAME at LBA %jx len %u\n", (uintmax_t)lbalen->lba, lbalen->len); pb = cbe_lun->blocksize << be_lun->cbe_lun.pblockexp; if (be_lun->cbe_lun.pblockoff > 0) pbo = pb - cbe_lun->blocksize * be_lun->cbe_lun.pblockoff; else pbo = 0; len_left = (uint64_t)lbalen->len * cbe_lun->blocksize; for (i = 0, lba = 0; i < CTLBLK_MAX_SEGS && len_left > 0; i++) { /* * Setup the S/G entry for this chunk. */ seglen = MIN(CTLBLK_MAX_SEG, len_left); if (pb > cbe_lun->blocksize) { adj = ((lbalen->lba + lba) * cbe_lun->blocksize + seglen - pbo) % pb; if (seglen > adj) seglen -= adj; else seglen -= seglen % cbe_lun->blocksize; } else seglen -= seglen % cbe_lun->blocksize; beio->sg_segs[i].len = seglen; beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK); DPRINTF("segment %d addr %p len %zd\n", i, beio->sg_segs[i].addr, beio->sg_segs[i].len); beio->num_segs++; len_left -= seglen; buf = beio->sg_segs[i].addr; end = buf + seglen; for (; buf < end; buf += cbe_lun->blocksize) { if (lbalen->flags & SWS_NDOB) { memset(buf, 0, cbe_lun->blocksize); } else { memcpy(buf, io->scsiio.kern_data_ptr, cbe_lun->blocksize); } if (lbalen->flags & SWS_LBDATA) scsi_ulto4b(lbalen->lba + lba, buf); lba++; } } beio->io_offset = lbalen->lba * cbe_lun->blocksize; beio->io_len = lba * cbe_lun->blocksize; /* We can not do all in one run. Correct and schedule rerun. */ if (len_left > 0) { lbalen->lba += lba; lbalen->len -= lba; beio->beio_cont = ctl_be_block_cw_done_ws; } be_lun->dispatch(be_lun, beio); } static void ctl_be_block_cw_dispatch_unmap(struct ctl_be_block_lun *be_lun, union ctl_io *io) { struct ctl_be_block_io *beio; struct ctl_ptr_len_flags *ptrlen; DPRINTF("entered\n"); beio = (struct ctl_be_block_io *)PRIV(io)->ptr; ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; if ((ptrlen->flags & ~SU_ANCHOR) != 0 || be_lun->unmap == NULL) { ctl_free_beio(beio); ctl_set_invalid_field(&io->scsiio, /*sks_valid*/ 0, /*command*/ 1, /*field*/ 0, /*bit_valid*/ 0, /*bit*/ 0); ctl_config_write_done(io); return; } beio->io_len = 0; beio->io_offset = -1; beio->bio_cmd = BIO_DELETE; beio->ds_trans_type = DEVSTAT_FREE; DPRINTF("UNMAP\n"); be_lun->unmap(be_lun, beio); } static void ctl_be_block_cr_done(struct ctl_be_block_io *beio) { union ctl_io *io; io = beio->io; ctl_free_beio(beio); ctl_config_read_done(io); } static void ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun, union ctl_io *io) { struct ctl_be_block_io *beio; struct ctl_be_block_softc *softc; DPRINTF("entered\n"); softc = be_lun->softc; beio = ctl_alloc_beio(softc); beio->io = io; beio->lun = be_lun; beio->beio_cont = ctl_be_block_cr_done; PRIV(io)->ptr = (void *)beio; switch (io->scsiio.cdb[0]) { case SERVICE_ACTION_IN: /* GET LBA STATUS */ beio->bio_cmd = -1; beio->ds_trans_type = DEVSTAT_NO_DATA; beio->ds_tag_type = DEVSTAT_TAG_ORDERED; beio->io_len = 0; if (be_lun->get_lba_status) be_lun->get_lba_status(be_lun, beio); else ctl_be_block_cr_done(beio); break; default: panic("Unhandled CDB type %#x", io->scsiio.cdb[0]); break; } } static void ctl_be_block_cw_done(struct ctl_be_block_io *beio) { union ctl_io *io; io = beio->io; ctl_free_beio(beio); ctl_config_write_done(io); } static void ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun, union ctl_io *io) { struct ctl_be_block_io *beio; struct ctl_be_block_softc *softc; DPRINTF("entered\n"); softc = be_lun->softc; beio = ctl_alloc_beio(softc); beio->io = io; beio->lun = be_lun; beio->beio_cont = ctl_be_block_cw_done; switch (io->scsiio.tag_type) { case CTL_TAG_ORDERED: beio->ds_tag_type = DEVSTAT_TAG_ORDERED; break; case CTL_TAG_HEAD_OF_QUEUE: beio->ds_tag_type = DEVSTAT_TAG_HEAD; break; case CTL_TAG_UNTAGGED: case CTL_TAG_SIMPLE: case CTL_TAG_ACA: default: beio->ds_tag_type = DEVSTAT_TAG_SIMPLE; break; } PRIV(io)->ptr = (void *)beio; switch (io->scsiio.cdb[0]) { case SYNCHRONIZE_CACHE: case SYNCHRONIZE_CACHE_16: ctl_be_block_cw_dispatch_sync(be_lun, io); break; case WRITE_SAME_10: case WRITE_SAME_16: ctl_be_block_cw_dispatch_ws(be_lun, io); break; case UNMAP: ctl_be_block_cw_dispatch_unmap(be_lun, io); break; default: panic("Unhandled CDB type %#x", io->scsiio.cdb[0]); break; } } SDT_PROBE_DEFINE1(cbb, , read, start, "uint64_t"); SDT_PROBE_DEFINE1(cbb, , write, start, "uint64_t"); SDT_PROBE_DEFINE1(cbb, , read, alloc_done, "uint64_t"); SDT_PROBE_DEFINE1(cbb, , write, alloc_done, "uint64_t"); static void ctl_be_block_next(struct ctl_be_block_io *beio) { struct ctl_be_block_lun *be_lun; union ctl_io *io; io = beio->io; be_lun = beio->lun; ctl_free_beio(beio); if ((io->io_hdr.flags & CTL_FLAG_ABORT) || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE && (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) { ctl_data_submit_done(io); return; } io->io_hdr.status &= ~CTL_STATUS_MASK; io->io_hdr.status |= CTL_STATUS_NONE; mtx_lock(&be_lun->queue_lock); STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links); mtx_unlock(&be_lun->queue_lock); taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task); } static void ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun, union ctl_io *io) { struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; struct ctl_be_block_io *beio; struct ctl_be_block_softc *softc; struct ctl_lba_len_flags *lbalen; struct ctl_ptr_len_flags *bptrlen; uint64_t len_left, lbas; int i; softc = be_lun->softc; DPRINTF("entered\n"); lbalen = ARGS(io); if (lbalen->flags & CTL_LLF_WRITE) { SDT_PROBE0(cbb, , write, start); } else { SDT_PROBE0(cbb, , read, start); } beio = ctl_alloc_beio(softc); beio->io = io; beio->lun = be_lun; bptrlen = PRIV(io); bptrlen->ptr = (void *)beio; switch (io->scsiio.tag_type) { case CTL_TAG_ORDERED: beio->ds_tag_type = DEVSTAT_TAG_ORDERED; break; case CTL_TAG_HEAD_OF_QUEUE: beio->ds_tag_type = DEVSTAT_TAG_HEAD; break; case CTL_TAG_UNTAGGED: case CTL_TAG_SIMPLE: case CTL_TAG_ACA: default: beio->ds_tag_type = DEVSTAT_TAG_SIMPLE; break; } if (lbalen->flags & CTL_LLF_WRITE) { beio->bio_cmd = BIO_WRITE; beio->ds_trans_type = DEVSTAT_WRITE; } else { beio->bio_cmd = BIO_READ; beio->ds_trans_type = DEVSTAT_READ; } DPRINTF("%s at LBA %jx len %u @%ju\n", (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE", (uintmax_t)lbalen->lba, lbalen->len, bptrlen->len); if (lbalen->flags & CTL_LLF_COMPARE) lbas = CTLBLK_HALF_IO_SIZE; else lbas = CTLBLK_MAX_IO_SIZE; lbas = MIN(lbalen->len - bptrlen->len, lbas / cbe_lun->blocksize); beio->io_offset = (lbalen->lba + bptrlen->len) * cbe_lun->blocksize; beio->io_len = lbas * cbe_lun->blocksize; bptrlen->len += lbas; for (i = 0, len_left = beio->io_len; len_left > 0; i++) { KASSERT(i < CTLBLK_MAX_SEGS, ("Too many segs (%d >= %d)", i, CTLBLK_MAX_SEGS)); /* * Setup the S/G entry for this chunk. */ beio->sg_segs[i].len = min(CTLBLK_MAX_SEG, len_left); beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK); DPRINTF("segment %d addr %p len %zd\n", i, beio->sg_segs[i].addr, beio->sg_segs[i].len); /* Set up second segment for compare operation. */ if (lbalen->flags & CTL_LLF_COMPARE) { beio->sg_segs[i + CTLBLK_HALF_SEGS].len = beio->sg_segs[i].len; beio->sg_segs[i + CTLBLK_HALF_SEGS].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK); } beio->num_segs++; len_left -= beio->sg_segs[i].len; } if (bptrlen->len < lbalen->len) beio->beio_cont = ctl_be_block_next; io->scsiio.be_move_done = ctl_be_block_move_done; /* For compare we have separate S/G lists for read and datamove. */ if (lbalen->flags & CTL_LLF_COMPARE) io->scsiio.kern_data_ptr = (uint8_t *)&beio->sg_segs[CTLBLK_HALF_SEGS]; else io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs; io->scsiio.kern_data_len = beio->io_len; io->scsiio.kern_sg_entries = beio->num_segs; io->io_hdr.flags |= CTL_FLAG_ALLOCATED; /* * For the read case, we need to read the data into our buffers and * then we can send it back to the user. For the write case, we * need to get the data from the user first. */ if (beio->bio_cmd == BIO_READ) { SDT_PROBE0(cbb, , read, alloc_done); be_lun->dispatch(be_lun, beio); } else { SDT_PROBE0(cbb, , write, alloc_done); #ifdef CTL_TIME_IO getbinuptime(&io->io_hdr.dma_start_bt); #endif ctl_datamove(io); } } static void ctl_be_block_worker(void *context, int pending) { struct ctl_be_block_lun *be_lun = (struct ctl_be_block_lun *)context; struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; union ctl_io *io; struct ctl_be_block_io *beio; DPRINTF("entered\n"); /* * Fetch and process I/Os from all queues. If we detect LUN * CTL_LUN_FLAG_NO_MEDIA status here -- it is result of a race, * so make response maximally opaque to not confuse initiator. */ for (;;) { mtx_lock(&be_lun->queue_lock); io = (union ctl_io *)STAILQ_FIRST(&be_lun->datamove_queue); if (io != NULL) { DPRINTF("datamove queue\n"); STAILQ_REMOVE(&be_lun->datamove_queue, &io->io_hdr, ctl_io_hdr, links); mtx_unlock(&be_lun->queue_lock); beio = (struct ctl_be_block_io *)PRIV(io)->ptr; if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) { ctl_set_busy(&io->scsiio); ctl_complete_beio(beio); return; } be_lun->dispatch(be_lun, beio); continue; } io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_write_queue); if (io != NULL) { DPRINTF("config write queue\n"); STAILQ_REMOVE(&be_lun->config_write_queue, &io->io_hdr, ctl_io_hdr, links); mtx_unlock(&be_lun->queue_lock); if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) { ctl_set_busy(&io->scsiio); ctl_config_write_done(io); return; } ctl_be_block_cw_dispatch(be_lun, io); continue; } io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_read_queue); if (io != NULL) { DPRINTF("config read queue\n"); STAILQ_REMOVE(&be_lun->config_read_queue, &io->io_hdr, ctl_io_hdr, links); mtx_unlock(&be_lun->queue_lock); if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) { ctl_set_busy(&io->scsiio); ctl_config_read_done(io); return; } ctl_be_block_cr_dispatch(be_lun, io); continue; } io = (union ctl_io *)STAILQ_FIRST(&be_lun->input_queue); if (io != NULL) { DPRINTF("input queue\n"); STAILQ_REMOVE(&be_lun->input_queue, &io->io_hdr, ctl_io_hdr, links); mtx_unlock(&be_lun->queue_lock); if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) { ctl_set_busy(&io->scsiio); ctl_data_submit_done(io); return; } ctl_be_block_dispatch(be_lun, io); continue; } /* * If we get here, there is no work left in the queues, so * just break out and let the task queue go to sleep. */ mtx_unlock(&be_lun->queue_lock); break; } } /* * Entry point from CTL to the backend for I/O. We queue everything to a * work thread, so this just puts the I/O on a queue and wakes up the * thread. */ static int ctl_be_block_submit(union ctl_io *io) { struct ctl_be_block_lun *be_lun; struct ctl_be_lun *cbe_lun; DPRINTF("entered\n"); cbe_lun = CTL_BACKEND_LUN(io); be_lun = (struct ctl_be_block_lun *)cbe_lun->be_lun; /* * Make sure we only get SCSI I/O. */ KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, ("Non-SCSI I/O (type " "%#x) encountered", io->io_hdr.io_type)); PRIV(io)->len = 0; mtx_lock(&be_lun->queue_lock); STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links); mtx_unlock(&be_lun->queue_lock); taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task); return (CTL_RETVAL_COMPLETE); } static int ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { struct ctl_be_block_softc *softc; int error; softc = &backend_block_softc; error = 0; switch (cmd) { case CTL_LUN_REQ: { struct ctl_lun_req *lun_req; lun_req = (struct ctl_lun_req *)addr; switch (lun_req->reqtype) { case CTL_LUNREQ_CREATE: error = ctl_be_block_create(softc, lun_req); break; case CTL_LUNREQ_RM: error = ctl_be_block_rm(softc, lun_req); break; case CTL_LUNREQ_MODIFY: error = ctl_be_block_modify(softc, lun_req); break; default: lun_req->status = CTL_LUN_ERROR; snprintf(lun_req->error_str, sizeof(lun_req->error_str), "invalid LUN request type %d", lun_req->reqtype); break; } break; } default: error = ENOTTY; break; } return (error); } static int ctl_be_block_open_file(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req) { struct ctl_be_lun *cbe_lun; struct ctl_be_block_filedata *file_data; struct ctl_lun_create_params *params; const char *value; struct vattr vattr; off_t ps, pss, po, pos, us, uss, uo, uos; int error; cbe_lun = &be_lun->cbe_lun; file_data = &be_lun->backend.file; params = &be_lun->params; be_lun->dev_type = CTL_BE_BLOCK_FILE; be_lun->dispatch = ctl_be_block_dispatch_file; be_lun->lun_flush = ctl_be_block_flush_file; be_lun->get_lba_status = ctl_be_block_gls_file; be_lun->getattr = ctl_be_block_getattr_file; be_lun->unmap = NULL; cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP; error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred); if (error != 0) { snprintf(req->error_str, sizeof(req->error_str), "error calling VOP_GETATTR() for file %s", be_lun->dev_path); return (error); } file_data->cred = crhold(curthread->td_ucred); if (params->lun_size_bytes != 0) be_lun->size_bytes = params->lun_size_bytes; else be_lun->size_bytes = vattr.va_size; /* * For files we can use any logical block size. Prefer 512 bytes * for compatibility reasons. If file's vattr.va_blocksize * (preferred I/O block size) is bigger and multiple to chosen * logical block size -- report it as physical block size. */ if (params->blocksize_bytes != 0) cbe_lun->blocksize = params->blocksize_bytes; else if (cbe_lun->lun_type == T_CDROM) cbe_lun->blocksize = 2048; else cbe_lun->blocksize = 512; be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize; cbe_lun->maxlba = (be_lun->size_blocks == 0) ? 0 : (be_lun->size_blocks - 1); us = ps = vattr.va_blocksize; uo = po = 0; value = dnvlist_get_string(cbe_lun->options, "pblocksize", NULL); if (value != NULL) ctl_expand_number(value, &ps); value = dnvlist_get_string(cbe_lun->options, "pblockoffset", NULL); if (value != NULL) ctl_expand_number(value, &po); pss = ps / cbe_lun->blocksize; pos = po / cbe_lun->blocksize; if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) && ((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) { cbe_lun->pblockexp = fls(pss) - 1; cbe_lun->pblockoff = (pss - pos) % pss; } value = dnvlist_get_string(cbe_lun->options, "ublocksize", NULL); if (value != NULL) ctl_expand_number(value, &us); value = dnvlist_get_string(cbe_lun->options, "ublockoffset", NULL); if (value != NULL) ctl_expand_number(value, &uo); uss = us / cbe_lun->blocksize; uos = uo / cbe_lun->blocksize; if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) && ((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) { cbe_lun->ublockexp = fls(uss) - 1; cbe_lun->ublockoff = (uss - uos) % uss; } /* * Sanity check. The media size has to be at least one * sector long. */ if (be_lun->size_bytes < cbe_lun->blocksize) { error = EINVAL; snprintf(req->error_str, sizeof(req->error_str), "file %s size %ju < block size %u", be_lun->dev_path, (uintmax_t)be_lun->size_bytes, cbe_lun->blocksize); } cbe_lun->opttxferlen = CTLBLK_MAX_IO_SIZE / cbe_lun->blocksize; return (error); } static int ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req) { struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; struct ctl_lun_create_params *params; struct cdevsw *csw; struct cdev *dev; const char *value; int error, atomic, maxio, ref, unmap, tmp; off_t ps, pss, po, pos, us, uss, uo, uos, otmp; params = &be_lun->params; be_lun->dev_type = CTL_BE_BLOCK_DEV; csw = devvn_refthread(be_lun->vn, &dev, &ref); if (csw == NULL) return (ENXIO); if (strcmp(csw->d_name, "zvol") == 0) { be_lun->dispatch = ctl_be_block_dispatch_zvol; be_lun->get_lba_status = ctl_be_block_gls_zvol; atomic = maxio = CTLBLK_MAX_IO_SIZE; } else { be_lun->dispatch = ctl_be_block_dispatch_dev; be_lun->get_lba_status = NULL; atomic = 0; maxio = dev->si_iosize_max; if (maxio <= 0) maxio = DFLTPHYS; if (maxio > CTLBLK_MAX_IO_SIZE) maxio = CTLBLK_MAX_IO_SIZE; } be_lun->lun_flush = ctl_be_block_flush_dev; be_lun->getattr = ctl_be_block_getattr_dev; be_lun->unmap = ctl_be_block_unmap_dev; if (!csw->d_ioctl) { dev_relthread(dev, ref); snprintf(req->error_str, sizeof(req->error_str), "no d_ioctl for device %s!", be_lun->dev_path); return (ENODEV); } error = csw->d_ioctl(dev, DIOCGSECTORSIZE, (caddr_t)&tmp, FREAD, curthread); if (error) { dev_relthread(dev, ref); snprintf(req->error_str, sizeof(req->error_str), "error %d returned for DIOCGSECTORSIZE ioctl " "on %s!", error, be_lun->dev_path); return (error); } /* * If the user has asked for a blocksize that is greater than the * backing device's blocksize, we can do it only if the blocksize * the user is asking for is an even multiple of the underlying * device's blocksize. */ if ((params->blocksize_bytes != 0) && (params->blocksize_bytes >= tmp)) { if (params->blocksize_bytes % tmp == 0) { cbe_lun->blocksize = params->blocksize_bytes; } else { dev_relthread(dev, ref); snprintf(req->error_str, sizeof(req->error_str), "requested blocksize %u is not an even " "multiple of backing device blocksize %u", params->blocksize_bytes, tmp); return (EINVAL); } } else if (params->blocksize_bytes != 0) { dev_relthread(dev, ref); snprintf(req->error_str, sizeof(req->error_str), "requested blocksize %u < backing device " "blocksize %u", params->blocksize_bytes, tmp); return (EINVAL); } else if (cbe_lun->lun_type == T_CDROM) cbe_lun->blocksize = MAX(tmp, 2048); else cbe_lun->blocksize = tmp; error = csw->d_ioctl(dev, DIOCGMEDIASIZE, (caddr_t)&otmp, FREAD, curthread); if (error) { dev_relthread(dev, ref); snprintf(req->error_str, sizeof(req->error_str), "error %d returned for DIOCGMEDIASIZE " " ioctl on %s!", error, be_lun->dev_path); return (error); } if (params->lun_size_bytes != 0) { if (params->lun_size_bytes > otmp) { dev_relthread(dev, ref); snprintf(req->error_str, sizeof(req->error_str), "requested LUN size %ju > backing device " "size %ju", (uintmax_t)params->lun_size_bytes, (uintmax_t)otmp); return (EINVAL); } be_lun->size_bytes = params->lun_size_bytes; } else be_lun->size_bytes = otmp; be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize; cbe_lun->maxlba = (be_lun->size_blocks == 0) ? 0 : (be_lun->size_blocks - 1); error = csw->d_ioctl(dev, DIOCGSTRIPESIZE, (caddr_t)&ps, FREAD, curthread); if (error) ps = po = 0; else { error = csw->d_ioctl(dev, DIOCGSTRIPEOFFSET, (caddr_t)&po, FREAD, curthread); if (error) po = 0; } us = ps; uo = po; value = dnvlist_get_string(cbe_lun->options, "pblocksize", NULL); if (value != NULL) ctl_expand_number(value, &ps); value = dnvlist_get_string(cbe_lun->options, "pblockoffset", NULL); if (value != NULL) ctl_expand_number(value, &po); pss = ps / cbe_lun->blocksize; pos = po / cbe_lun->blocksize; if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) && ((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) { cbe_lun->pblockexp = fls(pss) - 1; cbe_lun->pblockoff = (pss - pos) % pss; } value = dnvlist_get_string(cbe_lun->options, "ublocksize", NULL); if (value != NULL) ctl_expand_number(value, &us); value = dnvlist_get_string(cbe_lun->options, "ublockoffset", NULL); if (value != NULL) ctl_expand_number(value, &uo); uss = us / cbe_lun->blocksize; uos = uo / cbe_lun->blocksize; if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) && ((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) { cbe_lun->ublockexp = fls(uss) - 1; cbe_lun->ublockoff = (uss - uos) % uss; } cbe_lun->atomicblock = atomic / cbe_lun->blocksize; cbe_lun->opttxferlen = maxio / cbe_lun->blocksize; if (be_lun->dispatch == ctl_be_block_dispatch_zvol) { unmap = 1; } else { struct diocgattr_arg arg; strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); arg.len = sizeof(arg.value.i); error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD, curthread); unmap = (error == 0) ? arg.value.i : 0; } value = dnvlist_get_string(cbe_lun->options, "unmap", NULL); if (value != NULL) unmap = (strcmp(value, "on") == 0); if (unmap) cbe_lun->flags |= CTL_LUN_FLAG_UNMAP; else cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP; dev_relthread(dev, ref); return (0); } static int ctl_be_block_close(struct ctl_be_block_lun *be_lun) { struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; int flags; if (be_lun->vn) { flags = FREAD; if ((cbe_lun->flags & CTL_LUN_FLAG_READONLY) == 0) flags |= FWRITE; (void)vn_close(be_lun->vn, flags, NOCRED, curthread); be_lun->vn = NULL; switch (be_lun->dev_type) { case CTL_BE_BLOCK_DEV: break; case CTL_BE_BLOCK_FILE: if (be_lun->backend.file.cred != NULL) { crfree(be_lun->backend.file.cred); be_lun->backend.file.cred = NULL; } break; case CTL_BE_BLOCK_NONE: break; default: panic("Unexpected backend type %d", be_lun->dev_type); break; } be_lun->dev_type = CTL_BE_BLOCK_NONE; } return (0); } static int ctl_be_block_open(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req) { struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; struct nameidata nd; const char *value; int error, flags; error = 0; if (rootvnode == NULL) { snprintf(req->error_str, sizeof(req->error_str), "Root filesystem is not mounted"); return (1); } pwd_ensure_dirs(); value = dnvlist_get_string(cbe_lun->options, "file", NULL); if (value == NULL) { snprintf(req->error_str, sizeof(req->error_str), "no file argument specified"); return (1); } free(be_lun->dev_path, M_CTLBLK); be_lun->dev_path = strdup(value, M_CTLBLK); flags = FREAD; value = dnvlist_get_string(cbe_lun->options, "readonly", NULL); if (value != NULL) { if (strcmp(value, "on") != 0) flags |= FWRITE; } else if (cbe_lun->lun_type == T_DIRECT) flags |= FWRITE; again: NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, be_lun->dev_path, curthread); error = vn_open(&nd, &flags, 0, NULL); if ((error == EROFS || error == EACCES) && (flags & FWRITE)) { flags &= ~FWRITE; goto again; } if (error) { /* * This is the only reasonable guess we can make as far as * path if the user doesn't give us a fully qualified path. * If they want to specify a file, they need to specify the * full path. */ if (be_lun->dev_path[0] != '/') { char *dev_name; asprintf(&dev_name, M_CTLBLK, "/dev/%s", be_lun->dev_path); free(be_lun->dev_path, M_CTLBLK); be_lun->dev_path = dev_name; goto again; } snprintf(req->error_str, sizeof(req->error_str), "error opening %s: %d", be_lun->dev_path, error); return (error); } if (flags & FWRITE) cbe_lun->flags &= ~CTL_LUN_FLAG_READONLY; else cbe_lun->flags |= CTL_LUN_FLAG_READONLY; NDFREE(&nd, NDF_ONLY_PNBUF); be_lun->vn = nd.ni_vp; /* We only support disks and files. */ if (vn_isdisk(be_lun->vn, &error)) { error = ctl_be_block_open_dev(be_lun, req); } else if (be_lun->vn->v_type == VREG) { error = ctl_be_block_open_file(be_lun, req); } else { error = EINVAL; snprintf(req->error_str, sizeof(req->error_str), "%s is not a disk or plain file", be_lun->dev_path); } VOP_UNLOCK(be_lun->vn, 0); if (error != 0) ctl_be_block_close(be_lun); cbe_lun->serseq = CTL_LUN_SERSEQ_OFF; if (be_lun->dispatch != ctl_be_block_dispatch_dev) cbe_lun->serseq = CTL_LUN_SERSEQ_READ; value = dnvlist_get_string(cbe_lun->options, "serseq", NULL); if (value != NULL && strcmp(value, "on") == 0) cbe_lun->serseq = CTL_LUN_SERSEQ_ON; else if (value != NULL && strcmp(value, "read") == 0) cbe_lun->serseq = CTL_LUN_SERSEQ_READ; else if (value != NULL && strcmp(value, "off") == 0) cbe_lun->serseq = CTL_LUN_SERSEQ_OFF; return (0); } static int ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req) { struct ctl_be_lun *cbe_lun; struct ctl_be_block_lun *be_lun; struct ctl_lun_create_params *params; char num_thread_str[16]; char tmpstr[32]; const char *value; int retval, num_threads; int tmp_num_threads; params = &req->reqdata.create; retval = 0; req->status = CTL_LUN_OK; be_lun = malloc(sizeof(*be_lun), M_CTLBLK, M_ZERO | M_WAITOK); cbe_lun = &be_lun->cbe_lun; cbe_lun->be_lun = be_lun; be_lun->params = req->reqdata.create; be_lun->softc = softc; STAILQ_INIT(&be_lun->input_queue); STAILQ_INIT(&be_lun->config_read_queue); STAILQ_INIT(&be_lun->config_write_queue); STAILQ_INIT(&be_lun->datamove_queue); sprintf(be_lun->lunname, "cblk%d", softc->num_luns); mtx_init(&be_lun->io_lock, "cblk io lock", NULL, MTX_DEF); mtx_init(&be_lun->queue_lock, "cblk queue lock", NULL, MTX_DEF); cbe_lun->options = nvlist_clone(req->args_nvl); be_lun->lun_zone = uma_zcreate(be_lun->lunname, CTLBLK_MAX_SEG, NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/0); if (be_lun->lun_zone == NULL) { snprintf(req->error_str, sizeof(req->error_str), "error allocating UMA zone"); goto bailout_error; } if (params->flags & CTL_LUN_FLAG_DEV_TYPE) cbe_lun->lun_type = params->device_type; else cbe_lun->lun_type = T_DIRECT; be_lun->flags = CTL_BE_BLOCK_LUN_UNCONFIGURED; cbe_lun->flags = 0; value = dnvlist_get_string(cbe_lun->options, "ha_role", NULL); if (value != NULL) { if (strcmp(value, "primary") == 0) cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY; } else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF) cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY; if (cbe_lun->lun_type == T_DIRECT || cbe_lun->lun_type == T_CDROM) { be_lun->size_bytes = params->lun_size_bytes; if (params->blocksize_bytes != 0) cbe_lun->blocksize = params->blocksize_bytes; else if (cbe_lun->lun_type == T_CDROM) cbe_lun->blocksize = 2048; else cbe_lun->blocksize = 512; be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize; cbe_lun->maxlba = (be_lun->size_blocks == 0) ? 0 : (be_lun->size_blocks - 1); if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) || control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) { retval = ctl_be_block_open(be_lun, req); if (retval != 0) { retval = 0; req->status = CTL_LUN_WARNING; } } num_threads = cbb_num_threads; } else { num_threads = 1; } value = dnvlist_get_string(cbe_lun->options, "num_threads", NULL); if (value != NULL) { tmp_num_threads = strtol(value, NULL, 0); /* * We don't let the user specify less than one * thread, but hope he's clueful enough not to * specify 1000 threads. */ if (tmp_num_threads < 1) { snprintf(req->error_str, sizeof(req->error_str), "invalid number of threads %s", num_thread_str); goto bailout_error; } num_threads = tmp_num_threads; } if (be_lun->vn == NULL) cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA; /* Tell the user the blocksize we ended up using */ params->lun_size_bytes = be_lun->size_bytes; params->blocksize_bytes = cbe_lun->blocksize; if (params->flags & CTL_LUN_FLAG_ID_REQ) { cbe_lun->req_lun_id = params->req_lun_id; cbe_lun->flags |= CTL_LUN_FLAG_ID_REQ; } else cbe_lun->req_lun_id = 0; cbe_lun->lun_shutdown = ctl_be_block_lun_shutdown; cbe_lun->lun_config_status = ctl_be_block_lun_config_status; cbe_lun->be = &ctl_be_block_driver; if ((params->flags & CTL_LUN_FLAG_SERIAL_NUM) == 0) { snprintf(tmpstr, sizeof(tmpstr), "MYSERIAL%04d", softc->num_luns); strncpy((char *)cbe_lun->serial_num, tmpstr, MIN(sizeof(cbe_lun->serial_num), sizeof(tmpstr))); /* Tell the user what we used for a serial number */ strncpy((char *)params->serial_num, tmpstr, MIN(sizeof(params->serial_num), sizeof(tmpstr))); } else { strncpy((char *)cbe_lun->serial_num, params->serial_num, MIN(sizeof(cbe_lun->serial_num), sizeof(params->serial_num))); } if ((params->flags & CTL_LUN_FLAG_DEVID) == 0) { snprintf(tmpstr, sizeof(tmpstr), "MYDEVID%04d", softc->num_luns); strncpy((char *)cbe_lun->device_id, tmpstr, MIN(sizeof(cbe_lun->device_id), sizeof(tmpstr))); /* Tell the user what we used for a device ID */ strncpy((char *)params->device_id, tmpstr, MIN(sizeof(params->device_id), sizeof(tmpstr))); } else { strncpy((char *)cbe_lun->device_id, params->device_id, MIN(sizeof(cbe_lun->device_id), sizeof(params->device_id))); } TASK_INIT(&be_lun->io_task, /*priority*/0, ctl_be_block_worker, be_lun); be_lun->io_taskqueue = taskqueue_create(be_lun->lunname, M_WAITOK, taskqueue_thread_enqueue, /*context*/&be_lun->io_taskqueue); if (be_lun->io_taskqueue == NULL) { snprintf(req->error_str, sizeof(req->error_str), "unable to create taskqueue"); goto bailout_error; } /* * Note that we start the same number of threads by default for * both the file case and the block device case. For the file * case, we need multiple threads to allow concurrency, because the * vnode interface is designed to be a blocking interface. For the * block device case, ZFS zvols at least will block the caller's * context in many instances, and so we need multiple threads to * overcome that problem. Other block devices don't need as many * threads, but they shouldn't cause too many problems. * * If the user wants to just have a single thread for a block * device, he can specify that when the LUN is created, or change * the tunable/sysctl to alter the default number of threads. */ retval = taskqueue_start_threads(&be_lun->io_taskqueue, /*num threads*/num_threads, /*priority*/PUSER, /*thread name*/ "%s taskq", be_lun->lunname); if (retval != 0) goto bailout_error; be_lun->num_threads = num_threads; mtx_lock(&softc->lock); softc->num_luns++; STAILQ_INSERT_TAIL(&softc->lun_list, be_lun, links); mtx_unlock(&softc->lock); retval = ctl_add_lun(&be_lun->cbe_lun); if (retval != 0) { mtx_lock(&softc->lock); STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun, links); softc->num_luns--; mtx_unlock(&softc->lock); snprintf(req->error_str, sizeof(req->error_str), "ctl_add_lun() returned error %d, see dmesg for " "details", retval); retval = 0; goto bailout_error; } mtx_lock(&softc->lock); /* * Tell the config_status routine that we're waiting so it won't * clean up the LUN in the event of an error. */ be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING; while (be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) { retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0); if (retval == EINTR) break; } be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING; if (be_lun->flags & CTL_BE_BLOCK_LUN_CONFIG_ERR) { snprintf(req->error_str, sizeof(req->error_str), "LUN configuration error, see dmesg for details"); STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun, links); softc->num_luns--; mtx_unlock(&softc->lock); goto bailout_error; } else { params->req_lun_id = cbe_lun->lun_id; } mtx_unlock(&softc->lock); be_lun->disk_stats = devstat_new_entry("cbb", params->req_lun_id, cbe_lun->blocksize, DEVSTAT_ALL_SUPPORTED, cbe_lun->lun_type | DEVSTAT_TYPE_IF_OTHER, DEVSTAT_PRIORITY_OTHER); return (retval); bailout_error: req->status = CTL_LUN_ERROR; if (be_lun->io_taskqueue != NULL) taskqueue_free(be_lun->io_taskqueue); ctl_be_block_close(be_lun); if (be_lun->dev_path != NULL) free(be_lun->dev_path, M_CTLBLK); if (be_lun->lun_zone != NULL) uma_zdestroy(be_lun->lun_zone); nvlist_destroy(cbe_lun->options); mtx_destroy(&be_lun->queue_lock); mtx_destroy(&be_lun->io_lock); free(be_lun, M_CTLBLK); return (retval); } static int ctl_be_block_rm(struct ctl_be_block_softc *softc, struct ctl_lun_req *req) { struct ctl_lun_rm_params *params; struct ctl_be_block_lun *be_lun; struct ctl_be_lun *cbe_lun; int retval; params = &req->reqdata.rm; mtx_lock(&softc->lock); STAILQ_FOREACH(be_lun, &softc->lun_list, links) { if (be_lun->cbe_lun.lun_id == params->lun_id) break; } mtx_unlock(&softc->lock); if (be_lun == NULL) { snprintf(req->error_str, sizeof(req->error_str), "LUN %u is not managed by the block backend", params->lun_id); goto bailout_error; } cbe_lun = &be_lun->cbe_lun; retval = ctl_disable_lun(cbe_lun); if (retval != 0) { snprintf(req->error_str, sizeof(req->error_str), "error %d returned from ctl_disable_lun() for " "LUN %d", retval, params->lun_id); goto bailout_error; } if (be_lun->vn != NULL) { cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA; ctl_lun_no_media(cbe_lun); taskqueue_drain_all(be_lun->io_taskqueue); ctl_be_block_close(be_lun); } retval = ctl_invalidate_lun(cbe_lun); if (retval != 0) { snprintf(req->error_str, sizeof(req->error_str), "error %d returned from ctl_invalidate_lun() for " "LUN %d", retval, params->lun_id); goto bailout_error; } mtx_lock(&softc->lock); be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING; while ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) { retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0); if (retval == EINTR) break; } be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING; if ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) { snprintf(req->error_str, sizeof(req->error_str), "interrupted waiting for LUN to be freed"); mtx_unlock(&softc->lock); goto bailout_error; } STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun, links); softc->num_luns--; mtx_unlock(&softc->lock); taskqueue_drain_all(be_lun->io_taskqueue); taskqueue_free(be_lun->io_taskqueue); if (be_lun->disk_stats != NULL) devstat_remove_entry(be_lun->disk_stats); uma_zdestroy(be_lun->lun_zone); nvlist_destroy(cbe_lun->options); free(be_lun->dev_path, M_CTLBLK); mtx_destroy(&be_lun->queue_lock); mtx_destroy(&be_lun->io_lock); free(be_lun, M_CTLBLK); req->status = CTL_LUN_OK; return (0); bailout_error: req->status = CTL_LUN_ERROR; return (0); } static int ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req) { struct ctl_lun_modify_params *params; struct ctl_be_block_lun *be_lun; struct ctl_be_lun *cbe_lun; const char *value; uint64_t oldsize; int error, wasprim; params = &req->reqdata.modify; mtx_lock(&softc->lock); STAILQ_FOREACH(be_lun, &softc->lun_list, links) { if (be_lun->cbe_lun.lun_id == params->lun_id) break; } mtx_unlock(&softc->lock); if (be_lun == NULL) { snprintf(req->error_str, sizeof(req->error_str), "LUN %u is not managed by the block backend", params->lun_id); goto bailout_error; } cbe_lun = &be_lun->cbe_lun; if (params->lun_size_bytes != 0) be_lun->params.lun_size_bytes = params->lun_size_bytes; nvlist_destroy(cbe_lun->options); cbe_lun->options = nvlist_clone(req->args_nvl); wasprim = (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY); value = dnvlist_get_string(cbe_lun->options, "ha_role", NULL); if (value != NULL) { if (strcmp(value, "primary") == 0) cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY; else cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY; } else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF) cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY; else cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY; if (wasprim != (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY)) { if (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) ctl_lun_primary(cbe_lun); else ctl_lun_secondary(cbe_lun); } oldsize = be_lun->size_blocks; if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) || control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) { if (be_lun->vn == NULL) error = ctl_be_block_open(be_lun, req); else if (vn_isdisk(be_lun->vn, &error)) error = ctl_be_block_open_dev(be_lun, req); else if (be_lun->vn->v_type == VREG) { vn_lock(be_lun->vn, LK_SHARED | LK_RETRY); error = ctl_be_block_open_file(be_lun, req); VOP_UNLOCK(be_lun->vn, 0); } else error = EINVAL; if ((cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) && be_lun->vn != NULL) { cbe_lun->flags &= ~CTL_LUN_FLAG_NO_MEDIA; ctl_lun_has_media(cbe_lun); } else if ((cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) == 0 && be_lun->vn == NULL) { cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA; ctl_lun_no_media(cbe_lun); } cbe_lun->flags &= ~CTL_LUN_FLAG_EJECTED; } else { if (be_lun->vn != NULL) { cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA; ctl_lun_no_media(cbe_lun); taskqueue_drain_all(be_lun->io_taskqueue); error = ctl_be_block_close(be_lun); } else error = 0; } if (be_lun->size_blocks != oldsize) ctl_lun_capacity_changed(cbe_lun); /* Tell the user the exact size we ended up using */ params->lun_size_bytes = be_lun->size_bytes; req->status = error ? CTL_LUN_WARNING : CTL_LUN_OK; return (0); bailout_error: req->status = CTL_LUN_ERROR; return (0); } static void ctl_be_block_lun_shutdown(void *be_lun) { struct ctl_be_block_lun *lun = be_lun; struct ctl_be_block_softc *softc = lun->softc; mtx_lock(&softc->lock); lun->flags |= CTL_BE_BLOCK_LUN_UNCONFIGURED; if (lun->flags & CTL_BE_BLOCK_LUN_WAITING) wakeup(lun); mtx_unlock(&softc->lock); } static void ctl_be_block_lun_config_status(void *be_lun, ctl_lun_config_status status) { struct ctl_be_block_lun *lun; struct ctl_be_block_softc *softc; lun = (struct ctl_be_block_lun *)be_lun; softc = lun->softc; if (status == CTL_LUN_CONFIG_OK) { mtx_lock(&softc->lock); lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED; if (lun->flags & CTL_BE_BLOCK_LUN_WAITING) wakeup(lun); mtx_unlock(&softc->lock); /* * We successfully added the LUN, attempt to enable it. */ if (ctl_enable_lun(&lun->cbe_lun) != 0) { printf("%s: ctl_enable_lun() failed!\n", __func__); if (ctl_invalidate_lun(&lun->cbe_lun) != 0) { printf("%s: ctl_invalidate_lun() failed!\n", __func__); } } return; } mtx_lock(&softc->lock); lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED; lun->flags |= CTL_BE_BLOCK_LUN_CONFIG_ERR; wakeup(lun); mtx_unlock(&softc->lock); } static int ctl_be_block_config_write(union ctl_io *io) { struct ctl_be_block_lun *be_lun; struct ctl_be_lun *cbe_lun; int retval; DPRINTF("entered\n"); cbe_lun = CTL_BACKEND_LUN(io); be_lun = (struct ctl_be_block_lun *)cbe_lun->be_lun; retval = 0; switch (io->scsiio.cdb[0]) { case SYNCHRONIZE_CACHE: case SYNCHRONIZE_CACHE_16: case WRITE_SAME_10: case WRITE_SAME_16: case UNMAP: /* * The upper level CTL code will filter out any CDBs with * the immediate bit set and return the proper error. * * We don't really need to worry about what LBA range the * user asked to be synced out. When they issue a sync * cache command, we'll sync out the whole thing. */ mtx_lock(&be_lun->queue_lock); STAILQ_INSERT_TAIL(&be_lun->config_write_queue, &io->io_hdr, links); mtx_unlock(&be_lun->queue_lock); taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task); break; case START_STOP_UNIT: { struct scsi_start_stop_unit *cdb; struct ctl_lun_req req; cdb = (struct scsi_start_stop_unit *)io->scsiio.cdb; if ((cdb->how & SSS_PC_MASK) != 0) { ctl_set_success(&io->scsiio); ctl_config_write_done(io); break; } if (cdb->how & SSS_START) { if ((cdb->how & SSS_LOEJ) && be_lun->vn == NULL) { retval = ctl_be_block_open(be_lun, &req); cbe_lun->flags &= ~CTL_LUN_FLAG_EJECTED; if (retval == 0) { cbe_lun->flags &= ~CTL_LUN_FLAG_NO_MEDIA; ctl_lun_has_media(cbe_lun); } else { cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA; ctl_lun_no_media(cbe_lun); } } ctl_start_lun(cbe_lun); } else { ctl_stop_lun(cbe_lun); if (cdb->how & SSS_LOEJ) { cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA; cbe_lun->flags |= CTL_LUN_FLAG_EJECTED; ctl_lun_ejected(cbe_lun); if (be_lun->vn != NULL) ctl_be_block_close(be_lun); } } ctl_set_success(&io->scsiio); ctl_config_write_done(io); break; } case PREVENT_ALLOW: ctl_set_success(&io->scsiio); ctl_config_write_done(io); break; default: ctl_set_invalid_opcode(&io->scsiio); ctl_config_write_done(io); retval = CTL_RETVAL_COMPLETE; break; } return (retval); } static int ctl_be_block_config_read(union ctl_io *io) { struct ctl_be_block_lun *be_lun; struct ctl_be_lun *cbe_lun; int retval = 0; DPRINTF("entered\n"); cbe_lun = CTL_BACKEND_LUN(io); be_lun = (struct ctl_be_block_lun *)cbe_lun->be_lun; switch (io->scsiio.cdb[0]) { case SERVICE_ACTION_IN: if (io->scsiio.cdb[1] == SGLS_SERVICE_ACTION) { mtx_lock(&be_lun->queue_lock); STAILQ_INSERT_TAIL(&be_lun->config_read_queue, &io->io_hdr, links); mtx_unlock(&be_lun->queue_lock); taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task); retval = CTL_RETVAL_QUEUED; break; } ctl_set_invalid_field(&io->scsiio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 1, /*bit_valid*/ 1, /*bit*/ 4); ctl_config_read_done(io); retval = CTL_RETVAL_COMPLETE; break; default: ctl_set_invalid_opcode(&io->scsiio); ctl_config_read_done(io); retval = CTL_RETVAL_COMPLETE; break; } return (retval); } static int ctl_be_block_lun_info(void *be_lun, struct sbuf *sb) { struct ctl_be_block_lun *lun; int retval; lun = (struct ctl_be_block_lun *)be_lun; retval = sbuf_printf(sb, "\t"); if (retval != 0) goto bailout; retval = sbuf_printf(sb, "%d", lun->num_threads); if (retval != 0) goto bailout; retval = sbuf_printf(sb, "\n"); bailout: return (retval); } static uint64_t ctl_be_block_lun_attr(void *be_lun, const char *attrname) { struct ctl_be_block_lun *lun = (struct ctl_be_block_lun *)be_lun; if (lun->getattr == NULL) return (UINT64_MAX); return (lun->getattr(lun, attrname)); } static int ctl_be_block_init(void) { struct ctl_be_block_softc *softc = &backend_block_softc; mtx_init(&softc->lock, "ctlblock", NULL, MTX_DEF); softc->beio_zone = uma_zcreate("beio", sizeof(struct ctl_be_block_io), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); STAILQ_INIT(&softc->lun_list); return (0); } static int ctl_be_block_shutdown(void) { struct ctl_be_block_softc *softc = &backend_block_softc; struct ctl_be_block_lun *lun, *next_lun; mtx_lock(&softc->lock); STAILQ_FOREACH_SAFE(lun, &softc->lun_list, links, next_lun) { /* * Drop our lock here. Since ctl_invalidate_lun() can call * back into us, this could potentially lead to a recursive * lock of the same mutex, which would cause a hang. */ mtx_unlock(&softc->lock); ctl_disable_lun(&lun->cbe_lun); ctl_invalidate_lun(&lun->cbe_lun); mtx_lock(&softc->lock); } mtx_unlock(&softc->lock); uma_zdestroy(softc->beio_zone); mtx_destroy(&softc->lock); return (0); } Index: head/sys/dev/md/md.c =================================================================== --- head/sys/dev/md/md.c (revision 356199) +++ head/sys/dev/md/md.c (revision 356200) @@ -1,2163 +1,2157 @@ /*- * SPDX-License-Identifier: (Beerware AND BSD-3-Clause) * * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * * $FreeBSD$ * */ /*- * The following functions are based in the vn(4) driver: mdstart_swap(), * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(), * and as such under the following copyright: * * Copyright (c) 1988 University of Utah. * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah Hdr: vn.c 1.13 94/04/02 * * from: @(#)vn.c 8.6 (Berkeley) 4/1/94 * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03 */ #include "opt_rootdevname.h" #include "opt_geom.h" #include "opt_md.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define MD_MODVER 1 #define MD_SHUTDOWN 0x10000 /* Tell worker thread to terminate. */ #define MD_EXITING 0x20000 /* Worker thread is exiting. */ #define MD_PROVIDERGONE 0x40000 /* Safe to free the softc */ #ifndef MD_NSECT #define MD_NSECT (10000 * 2) #endif struct md_req { unsigned md_unit; /* unit number */ enum md_types md_type; /* type of disk */ off_t md_mediasize; /* size of disk in bytes */ unsigned md_sectorsize; /* sectorsize */ unsigned md_options; /* options */ int md_fwheads; /* firmware heads */ int md_fwsectors; /* firmware sectors */ char *md_file; /* pathname of file to mount */ enum uio_seg md_file_seg; /* location of md_file */ char *md_label; /* label of the device (userspace) */ int *md_units; /* pointer to units array (kernel) */ size_t md_units_nitems; /* items in md_units array */ }; #ifdef COMPAT_FREEBSD32 struct md_ioctl32 { unsigned md_version; unsigned md_unit; enum md_types md_type; uint32_t md_file; off_t md_mediasize; unsigned md_sectorsize; unsigned md_options; uint64_t md_base; int md_fwheads; int md_fwsectors; uint32_t md_label; int md_pad[MDNPAD]; } __attribute__((__packed__)); CTASSERT((sizeof(struct md_ioctl32)) == 436); #define MDIOCATTACH_32 _IOC_NEWTYPE(MDIOCATTACH, struct md_ioctl32) #define MDIOCDETACH_32 _IOC_NEWTYPE(MDIOCDETACH, struct md_ioctl32) #define MDIOCQUERY_32 _IOC_NEWTYPE(MDIOCQUERY, struct md_ioctl32) #define MDIOCRESIZE_32 _IOC_NEWTYPE(MDIOCRESIZE, struct md_ioctl32) #endif /* COMPAT_FREEBSD32 */ static MALLOC_DEFINE(M_MD, "md_disk", "Memory Disk"); static MALLOC_DEFINE(M_MDSECT, "md_sectors", "Memory Disk Sectors"); static int md_debug; SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0, "Enable md(4) debug messages"); static int md_malloc_wait; SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0, "Allow malloc to wait for memory allocations"); #if defined(MD_ROOT) && !defined(MD_ROOT_FSTYPE) #define MD_ROOT_FSTYPE "ufs" #endif #if defined(MD_ROOT) /* * Preloaded image gets put here. */ #if defined(MD_ROOT_SIZE) /* * We put the mfs_root symbol into the oldmfs section of the kernel object file. * Applications that patch the object with the image can determine * the size looking at the oldmfs section size within the kernel. */ u_char mfs_root[MD_ROOT_SIZE*1024] __attribute__ ((section ("oldmfs"))); const int mfs_root_size = sizeof(mfs_root); #elif defined(MD_ROOT_MEM) /* MD region already mapped in the memory */ u_char *mfs_root; int mfs_root_size; #else extern volatile u_char __weak_symbol mfs_root; extern volatile u_char __weak_symbol mfs_root_end; __GLOBL(mfs_root); __GLOBL(mfs_root_end); #define mfs_root_size ((uintptr_t)(&mfs_root_end - &mfs_root)) #endif #endif static g_init_t g_md_init; static g_fini_t g_md_fini; static g_start_t g_md_start; static g_access_t g_md_access; static void g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp); static g_provgone_t g_md_providergone; static struct cdev *status_dev = NULL; static struct sx md_sx; static struct unrhdr *md_uh; static d_ioctl_t mdctlioctl; static struct cdevsw mdctl_cdevsw = { .d_version = D_VERSION, .d_ioctl = mdctlioctl, .d_name = MD_NAME, }; struct g_class g_md_class = { .name = "MD", .version = G_VERSION, .init = g_md_init, .fini = g_md_fini, .start = g_md_start, .access = g_md_access, .dumpconf = g_md_dumpconf, .providergone = g_md_providergone, }; DECLARE_GEOM_CLASS(g_md_class, g_md); static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(md_softc_list); #define NINDIR (PAGE_SIZE / sizeof(uintptr_t)) #define NMASK (NINDIR-1) static int nshift; static uma_zone_t md_pbuf_zone; struct indir { uintptr_t *array; u_int total; u_int used; u_int shift; }; struct md_s { int unit; LIST_ENTRY(md_s) list; struct bio_queue_head bio_queue; struct mtx queue_mtx; - struct mtx stat_mtx; struct cdev *dev; enum md_types type; off_t mediasize; unsigned sectorsize; unsigned opencount; unsigned fwheads; unsigned fwsectors; char ident[32]; unsigned flags; char name[20]; struct proc *procp; struct g_geom *gp; struct g_provider *pp; int (*start)(struct md_s *sc, struct bio *bp); struct devstat *devstat; /* MD_MALLOC related fields */ struct indir *indir; uma_zone_t uma; /* MD_PRELOAD related fields */ u_char *pl_ptr; size_t pl_len; /* MD_VNODE related fields */ struct vnode *vnode; char file[PATH_MAX]; char label[PATH_MAX]; struct ucred *cred; /* MD_SWAP related fields */ vm_object_t object; }; static struct indir * new_indir(u_int shift) { struct indir *ip; ip = malloc(sizeof *ip, M_MD, (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO); if (ip == NULL) return (NULL); ip->array = malloc(sizeof(uintptr_t) * NINDIR, M_MDSECT, (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO); if (ip->array == NULL) { free(ip, M_MD); return (NULL); } ip->total = NINDIR; ip->shift = shift; return (ip); } static void del_indir(struct indir *ip) { free(ip->array, M_MDSECT); free(ip, M_MD); } static void destroy_indir(struct md_s *sc, struct indir *ip) { int i; for (i = 0; i < NINDIR; i++) { if (!ip->array[i]) continue; if (ip->shift) destroy_indir(sc, (struct indir*)(ip->array[i])); else if (ip->array[i] > 255) uma_zfree(sc->uma, (void *)(ip->array[i])); } del_indir(ip); } /* * This function does the math and allocates the top level "indir" structure * for a device of "size" sectors. */ static struct indir * dimension(off_t size) { off_t rcnt; struct indir *ip; int layer; rcnt = size; layer = 0; while (rcnt > NINDIR) { rcnt /= NINDIR; layer++; } /* * XXX: the top layer is probably not fully populated, so we allocate * too much space for ip->array in here. */ ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO); ip->array = malloc(sizeof(uintptr_t) * NINDIR, M_MDSECT, M_WAITOK | M_ZERO); ip->total = NINDIR; ip->shift = layer * nshift; return (ip); } /* * Read a given sector */ static uintptr_t s_read(struct indir *ip, off_t offset) { struct indir *cip; int idx; uintptr_t up; if (md_debug > 1) printf("s_read(%jd)\n", (intmax_t)offset); up = 0; for (cip = ip; cip != NULL;) { if (cip->shift) { idx = (offset >> cip->shift) & NMASK; up = cip->array[idx]; cip = (struct indir *)up; continue; } idx = offset & NMASK; return (cip->array[idx]); } return (0); } /* * Write a given sector, prune the tree if the value is 0 */ static int s_write(struct indir *ip, off_t offset, uintptr_t ptr) { struct indir *cip, *lip[10]; int idx, li; uintptr_t up; if (md_debug > 1) printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr); up = 0; li = 0; cip = ip; for (;;) { lip[li++] = cip; if (cip->shift) { idx = (offset >> cip->shift) & NMASK; up = cip->array[idx]; if (up != 0) { cip = (struct indir *)up; continue; } /* Allocate branch */ cip->array[idx] = (uintptr_t)new_indir(cip->shift - nshift); if (cip->array[idx] == 0) return (ENOSPC); cip->used++; up = cip->array[idx]; cip = (struct indir *)up; continue; } /* leafnode */ idx = offset & NMASK; up = cip->array[idx]; if (up != 0) cip->used--; cip->array[idx] = ptr; if (ptr != 0) cip->used++; break; } if (cip->used != 0 || li == 1) return (0); li--; while (cip->used == 0 && cip != ip) { li--; idx = (offset >> lip[li]->shift) & NMASK; up = lip[li]->array[idx]; KASSERT(up == (uintptr_t)cip, ("md screwed up")); del_indir(cip); lip[li]->array[idx] = 0; lip[li]->used--; cip = lip[li]; } return (0); } static int g_md_access(struct g_provider *pp, int r, int w, int e) { struct md_s *sc; sc = pp->geom->softc; if (sc == NULL) { if (r <= 0 && w <= 0 && e <= 0) return (0); return (ENXIO); } r += pp->acr; w += pp->acw; e += pp->ace; if ((sc->flags & MD_READONLY) != 0 && w > 0) return (EROFS); if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) { sc->opencount = 1; } else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) { sc->opencount = 0; } return (0); } static void g_md_start(struct bio *bp) { struct md_s *sc; sc = bp->bio_to->geom->softc; if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE)) { - mtx_lock(&sc->stat_mtx); devstat_start_transaction_bio(sc->devstat, bp); - mtx_unlock(&sc->stat_mtx); } mtx_lock(&sc->queue_mtx); bioq_disksort(&sc->bio_queue, bp); wakeup(sc); mtx_unlock(&sc->queue_mtx); } #define MD_MALLOC_MOVE_ZERO 1 #define MD_MALLOC_MOVE_FILL 2 #define MD_MALLOC_MOVE_READ 3 #define MD_MALLOC_MOVE_WRITE 4 #define MD_MALLOC_MOVE_CMP 5 static int md_malloc_move_ma(vm_page_t **mp, int *ma_offs, unsigned sectorsize, void *ptr, u_char fill, int op) { struct sf_buf *sf; vm_page_t m, *mp1; char *p, first; off_t *uc; unsigned n; int error, i, ma_offs1, sz, first_read; m = NULL; error = 0; sf = NULL; /* if (op == MD_MALLOC_MOVE_CMP) { gcc */ first = 0; first_read = 0; uc = ptr; mp1 = *mp; ma_offs1 = *ma_offs; /* } */ sched_pin(); for (n = sectorsize; n != 0; n -= sz) { sz = imin(PAGE_SIZE - *ma_offs, n); if (m != **mp) { if (sf != NULL) sf_buf_free(sf); m = **mp; sf = sf_buf_alloc(m, SFB_CPUPRIVATE | (md_malloc_wait ? 0 : SFB_NOWAIT)); if (sf == NULL) { error = ENOMEM; break; } } p = (char *)sf_buf_kva(sf) + *ma_offs; switch (op) { case MD_MALLOC_MOVE_ZERO: bzero(p, sz); break; case MD_MALLOC_MOVE_FILL: memset(p, fill, sz); break; case MD_MALLOC_MOVE_READ: bcopy(ptr, p, sz); cpu_flush_dcache(p, sz); break; case MD_MALLOC_MOVE_WRITE: bcopy(p, ptr, sz); break; case MD_MALLOC_MOVE_CMP: for (i = 0; i < sz; i++, p++) { if (!first_read) { *uc = (u_char)*p; first = *p; first_read = 1; } else if (*p != first) { error = EDOOFUS; break; } } break; default: KASSERT(0, ("md_malloc_move_ma unknown op %d\n", op)); break; } if (error != 0) break; *ma_offs += sz; *ma_offs %= PAGE_SIZE; if (*ma_offs == 0) (*mp)++; ptr = (char *)ptr + sz; } if (sf != NULL) sf_buf_free(sf); sched_unpin(); if (op == MD_MALLOC_MOVE_CMP && error != 0) { *mp = mp1; *ma_offs = ma_offs1; } return (error); } static int md_malloc_move_vlist(bus_dma_segment_t **pvlist, int *pma_offs, unsigned len, void *ptr, u_char fill, int op) { bus_dma_segment_t *vlist; uint8_t *p, *end, first; off_t *uc; int ma_offs, seg_len; vlist = *pvlist; ma_offs = *pma_offs; uc = ptr; for (; len != 0; len -= seg_len) { seg_len = imin(vlist->ds_len - ma_offs, len); p = (uint8_t *)(uintptr_t)vlist->ds_addr + ma_offs; switch (op) { case MD_MALLOC_MOVE_ZERO: bzero(p, seg_len); break; case MD_MALLOC_MOVE_FILL: memset(p, fill, seg_len); break; case MD_MALLOC_MOVE_READ: bcopy(ptr, p, seg_len); cpu_flush_dcache(p, seg_len); break; case MD_MALLOC_MOVE_WRITE: bcopy(p, ptr, seg_len); break; case MD_MALLOC_MOVE_CMP: end = p + seg_len; first = *uc = *p; /* Confirm all following bytes match the first */ while (++p < end) { if (*p != first) return (EDOOFUS); } break; default: KASSERT(0, ("md_malloc_move_vlist unknown op %d\n", op)); break; } ma_offs += seg_len; if (ma_offs == vlist->ds_len) { ma_offs = 0; vlist++; } ptr = (uint8_t *)ptr + seg_len; } *pvlist = vlist; *pma_offs = ma_offs; return (0); } static int mdstart_malloc(struct md_s *sc, struct bio *bp) { u_char *dst; vm_page_t *m; bus_dma_segment_t *vlist; int i, error, error1, ma_offs, notmapped; off_t secno, nsec, uc; uintptr_t sp, osp; switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; default: return (EOPNOTSUPP); } notmapped = (bp->bio_flags & BIO_UNMAPPED) != 0; vlist = (bp->bio_flags & BIO_VLIST) != 0 ? (bus_dma_segment_t *)bp->bio_data : NULL; if (notmapped) { m = bp->bio_ma; ma_offs = bp->bio_ma_offset; dst = NULL; KASSERT(vlist == NULL, ("vlists cannot be unmapped")); } else if (vlist != NULL) { ma_offs = bp->bio_ma_offset; dst = NULL; } else { dst = bp->bio_data; } nsec = bp->bio_length / sc->sectorsize; secno = bp->bio_offset / sc->sectorsize; error = 0; while (nsec--) { osp = s_read(sc->indir, secno); if (bp->bio_cmd == BIO_DELETE) { if (osp != 0) error = s_write(sc->indir, secno, 0); } else if (bp->bio_cmd == BIO_READ) { if (osp == 0) { if (notmapped) { error = md_malloc_move_ma(&m, &ma_offs, sc->sectorsize, NULL, 0, MD_MALLOC_MOVE_ZERO); } else if (vlist != NULL) { error = md_malloc_move_vlist(&vlist, &ma_offs, sc->sectorsize, NULL, 0, MD_MALLOC_MOVE_ZERO); } else bzero(dst, sc->sectorsize); } else if (osp <= 255) { if (notmapped) { error = md_malloc_move_ma(&m, &ma_offs, sc->sectorsize, NULL, osp, MD_MALLOC_MOVE_FILL); } else if (vlist != NULL) { error = md_malloc_move_vlist(&vlist, &ma_offs, sc->sectorsize, NULL, osp, MD_MALLOC_MOVE_FILL); } else memset(dst, osp, sc->sectorsize); } else { if (notmapped) { error = md_malloc_move_ma(&m, &ma_offs, sc->sectorsize, (void *)osp, 0, MD_MALLOC_MOVE_READ); } else if (vlist != NULL) { error = md_malloc_move_vlist(&vlist, &ma_offs, sc->sectorsize, (void *)osp, 0, MD_MALLOC_MOVE_READ); } else { bcopy((void *)osp, dst, sc->sectorsize); cpu_flush_dcache(dst, sc->sectorsize); } } osp = 0; } else if (bp->bio_cmd == BIO_WRITE) { if (sc->flags & MD_COMPRESS) { if (notmapped) { error1 = md_malloc_move_ma(&m, &ma_offs, sc->sectorsize, &uc, 0, MD_MALLOC_MOVE_CMP); i = error1 == 0 ? sc->sectorsize : 0; } else if (vlist != NULL) { error1 = md_malloc_move_vlist(&vlist, &ma_offs, sc->sectorsize, &uc, 0, MD_MALLOC_MOVE_CMP); i = error1 == 0 ? sc->sectorsize : 0; } else { uc = dst[0]; for (i = 1; i < sc->sectorsize; i++) { if (dst[i] != uc) break; } } } else { i = 0; uc = 0; } if (i == sc->sectorsize) { if (osp != uc) error = s_write(sc->indir, secno, uc); } else { if (osp <= 255) { sp = (uintptr_t)uma_zalloc(sc->uma, md_malloc_wait ? M_WAITOK : M_NOWAIT); if (sp == 0) { error = ENOSPC; break; } if (notmapped) { error = md_malloc_move_ma(&m, &ma_offs, sc->sectorsize, (void *)sp, 0, MD_MALLOC_MOVE_WRITE); } else if (vlist != NULL) { error = md_malloc_move_vlist( &vlist, &ma_offs, sc->sectorsize, (void *)sp, 0, MD_MALLOC_MOVE_WRITE); } else { bcopy(dst, (void *)sp, sc->sectorsize); } error = s_write(sc->indir, secno, sp); } else { if (notmapped) { error = md_malloc_move_ma(&m, &ma_offs, sc->sectorsize, (void *)osp, 0, MD_MALLOC_MOVE_WRITE); } else if (vlist != NULL) { error = md_malloc_move_vlist( &vlist, &ma_offs, sc->sectorsize, (void *)osp, 0, MD_MALLOC_MOVE_WRITE); } else { bcopy(dst, (void *)osp, sc->sectorsize); } osp = 0; } } } else { error = EOPNOTSUPP; } if (osp > 255) uma_zfree(sc->uma, (void*)osp); if (error != 0) break; secno++; if (!notmapped && vlist == NULL) dst += sc->sectorsize; } bp->bio_resid = 0; return (error); } static void mdcopyto_vlist(void *src, bus_dma_segment_t *vlist, off_t offset, off_t len) { off_t seg_len; while (offset >= vlist->ds_len) { offset -= vlist->ds_len; vlist++; } while (len != 0) { seg_len = omin(len, vlist->ds_len - offset); bcopy(src, (void *)(uintptr_t)(vlist->ds_addr + offset), seg_len); offset = 0; src = (uint8_t *)src + seg_len; len -= seg_len; vlist++; } } static void mdcopyfrom_vlist(bus_dma_segment_t *vlist, off_t offset, void *dst, off_t len) { off_t seg_len; while (offset >= vlist->ds_len) { offset -= vlist->ds_len; vlist++; } while (len != 0) { seg_len = omin(len, vlist->ds_len - offset); bcopy((void *)(uintptr_t)(vlist->ds_addr + offset), dst, seg_len); offset = 0; dst = (uint8_t *)dst + seg_len; len -= seg_len; vlist++; } } static int mdstart_preload(struct md_s *sc, struct bio *bp) { uint8_t *p; p = sc->pl_ptr + bp->bio_offset; switch (bp->bio_cmd) { case BIO_READ: if ((bp->bio_flags & BIO_VLIST) != 0) { mdcopyto_vlist(p, (bus_dma_segment_t *)bp->bio_data, bp->bio_ma_offset, bp->bio_length); } else { bcopy(p, bp->bio_data, bp->bio_length); } cpu_flush_dcache(bp->bio_data, bp->bio_length); break; case BIO_WRITE: if ((bp->bio_flags & BIO_VLIST) != 0) { mdcopyfrom_vlist((bus_dma_segment_t *)bp->bio_data, bp->bio_ma_offset, p, bp->bio_length); } else { bcopy(bp->bio_data, p, bp->bio_length); } break; } bp->bio_resid = 0; return (0); } static int mdstart_vnode(struct md_s *sc, struct bio *bp) { int error; struct uio auio; struct iovec aiov; struct iovec *piov; struct mount *mp; struct vnode *vp; struct buf *pb; bus_dma_segment_t *vlist; struct thread *td; off_t iolen, iostart, len, zerosize; int ma_offs, npages; switch (bp->bio_cmd) { case BIO_READ: auio.uio_rw = UIO_READ; break; case BIO_WRITE: case BIO_DELETE: auio.uio_rw = UIO_WRITE; break; case BIO_FLUSH: break; default: return (EOPNOTSUPP); } td = curthread; vp = sc->vnode; pb = NULL; piov = NULL; ma_offs = bp->bio_ma_offset; len = bp->bio_length; /* * VNODE I/O * * If an error occurs, we set BIO_ERROR but we do not set * B_INVAL because (for a write anyway), the buffer is * still valid. */ if (bp->bio_cmd == BIO_FLUSH) { (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0); vn_finished_write(mp); return (error); } auio.uio_offset = (vm_ooffset_t)bp->bio_offset; auio.uio_resid = bp->bio_length; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; if (bp->bio_cmd == BIO_DELETE) { /* * Emulate BIO_DELETE by writing zeros. */ zerosize = ZERO_REGION_SIZE - (ZERO_REGION_SIZE % sc->sectorsize); auio.uio_iovcnt = howmany(bp->bio_length, zerosize); piov = malloc(sizeof(*piov) * auio.uio_iovcnt, M_MD, M_WAITOK); auio.uio_iov = piov; while (len > 0) { piov->iov_base = __DECONST(void *, zero_region); piov->iov_len = len; if (len > zerosize) piov->iov_len = zerosize; len -= piov->iov_len; piov++; } piov = auio.uio_iov; } else if ((bp->bio_flags & BIO_VLIST) != 0) { piov = malloc(sizeof(*piov) * bp->bio_ma_n, M_MD, M_WAITOK); auio.uio_iov = piov; vlist = (bus_dma_segment_t *)bp->bio_data; while (len > 0) { piov->iov_base = (void *)(uintptr_t)(vlist->ds_addr + ma_offs); piov->iov_len = vlist->ds_len - ma_offs; if (piov->iov_len > len) piov->iov_len = len; len -= piov->iov_len; ma_offs = 0; vlist++; piov++; } auio.uio_iovcnt = piov - auio.uio_iov; piov = auio.uio_iov; } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) { pb = uma_zalloc(md_pbuf_zone, M_WAITOK); bp->bio_resid = len; unmapped_step: npages = atop(min(MAXPHYS, round_page(len + (ma_offs & PAGE_MASK)))); iolen = min(ptoa(npages) - (ma_offs & PAGE_MASK), len); KASSERT(iolen > 0, ("zero iolen")); pmap_qenter((vm_offset_t)pb->b_data, &bp->bio_ma[atop(ma_offs)], npages); aiov.iov_base = (void *)((vm_offset_t)pb->b_data + (ma_offs & PAGE_MASK)); aiov.iov_len = iolen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = iolen; } else { aiov.iov_base = bp->bio_data; aiov.iov_len = bp->bio_length; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; } iostart = auio.uio_offset; if (auio.uio_rw == UIO_READ) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_READ(vp, &auio, 0, sc->cred); VOP_UNLOCK(vp, 0); } else { (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_WRITE(vp, &auio, sc->flags & MD_ASYNC ? 0 : IO_SYNC, sc->cred); VOP_UNLOCK(vp, 0); vn_finished_write(mp); if (error == 0) sc->flags &= ~MD_VERIFY; } /* When MD_CACHE is set, try to avoid double-caching the data. */ if (error == 0 && (sc->flags & MD_CACHE) == 0) VOP_ADVISE(vp, iostart, auio.uio_offset - 1, POSIX_FADV_DONTNEED); if (pb != NULL) { pmap_qremove((vm_offset_t)pb->b_data, npages); if (error == 0) { len -= iolen; bp->bio_resid -= iolen; ma_offs += iolen; if (len > 0) goto unmapped_step; } uma_zfree(md_pbuf_zone, pb); } free(piov, M_MD); if (pb == NULL) bp->bio_resid = auio.uio_resid; return (error); } static int mdstart_swap(struct md_s *sc, struct bio *bp) { vm_page_t m; u_char *p; vm_pindex_t i, lastp; bus_dma_segment_t *vlist; int rv, ma_offs, offs, len, lastend; switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; default: return (EOPNOTSUPP); } p = bp->bio_data; ma_offs = (bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0 ? bp->bio_ma_offset : 0; vlist = (bp->bio_flags & BIO_VLIST) != 0 ? (bus_dma_segment_t *)bp->bio_data : NULL; /* * offs is the offset at which to start operating on the * next (ie, first) page. lastp is the last page on * which we're going to operate. lastend is the ending * position within that last page (ie, PAGE_SIZE if * we're operating on complete aligned pages). */ offs = bp->bio_offset % PAGE_SIZE; lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE; lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1; rv = VM_PAGER_OK; VM_OBJECT_WLOCK(sc->object); vm_object_pip_add(sc->object, 1); for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) { len = ((i == lastp) ? lastend : PAGE_SIZE) - offs; m = vm_page_grab(sc->object, i, VM_ALLOC_SYSTEM); if (bp->bio_cmd == BIO_READ) { if (vm_page_all_valid(m)) rv = VM_PAGER_OK; else rv = vm_pager_get_pages(sc->object, &m, 1, NULL, NULL); if (rv == VM_PAGER_ERROR) { vm_page_free(m); break; } else if (rv == VM_PAGER_FAIL) { /* * Pager does not have the page. Zero * the allocated page, and mark it as * valid. Do not set dirty, the page * can be recreated if thrown out. */ pmap_zero_page(m); vm_page_valid(m); } if ((bp->bio_flags & BIO_UNMAPPED) != 0) { pmap_copy_pages(&m, offs, bp->bio_ma, ma_offs, len); } else if ((bp->bio_flags & BIO_VLIST) != 0) { physcopyout_vlist(VM_PAGE_TO_PHYS(m) + offs, vlist, ma_offs, len); cpu_flush_dcache(p, len); } else { physcopyout(VM_PAGE_TO_PHYS(m) + offs, p, len); cpu_flush_dcache(p, len); } } else if (bp->bio_cmd == BIO_WRITE) { if (len == PAGE_SIZE || vm_page_all_valid(m)) rv = VM_PAGER_OK; else rv = vm_pager_get_pages(sc->object, &m, 1, NULL, NULL); if (rv == VM_PAGER_ERROR) { vm_page_free(m); break; } else if (rv == VM_PAGER_FAIL) pmap_zero_page(m); if ((bp->bio_flags & BIO_UNMAPPED) != 0) { pmap_copy_pages(bp->bio_ma, ma_offs, &m, offs, len); } else if ((bp->bio_flags & BIO_VLIST) != 0) { physcopyin_vlist(vlist, ma_offs, VM_PAGE_TO_PHYS(m) + offs, len); } else { physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len); } vm_page_valid(m); vm_page_set_dirty(m); } else if (bp->bio_cmd == BIO_DELETE) { if (len == PAGE_SIZE || vm_page_all_valid(m)) rv = VM_PAGER_OK; else rv = vm_pager_get_pages(sc->object, &m, 1, NULL, NULL); if (rv == VM_PAGER_ERROR) { vm_page_free(m); break; } else if (rv == VM_PAGER_FAIL) { vm_page_free(m); m = NULL; } else { /* Page is valid. */ if (len != PAGE_SIZE) { pmap_zero_page_area(m, offs, len); vm_page_set_dirty(m); } else { vm_pager_page_unswapped(m); vm_page_free(m); m = NULL; } } } if (m != NULL) { vm_page_xunbusy(m); vm_page_reference(m); } /* Actions on further pages start at offset 0 */ p += PAGE_SIZE - offs; offs = 0; ma_offs += len; } vm_object_pip_wakeup(sc->object); VM_OBJECT_WUNLOCK(sc->object); return (rv != VM_PAGER_ERROR ? 0 : ENOSPC); } static int mdstart_null(struct md_s *sc, struct bio *bp) { switch (bp->bio_cmd) { case BIO_READ: bzero(bp->bio_data, bp->bio_length); cpu_flush_dcache(bp->bio_data, bp->bio_length); break; case BIO_WRITE: break; } bp->bio_resid = 0; return (0); } static void md_kthread(void *arg) { struct md_s *sc; struct bio *bp; int error; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); if (sc->type == MD_VNODE) curthread->td_pflags |= TDP_NORUNNINGBUF; for (;;) { mtx_lock(&sc->queue_mtx); if (sc->flags & MD_SHUTDOWN) { sc->flags |= MD_EXITING; mtx_unlock(&sc->queue_mtx); kproc_exit(0); } bp = bioq_takefirst(&sc->bio_queue); if (!bp) { msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0); continue; } mtx_unlock(&sc->queue_mtx); if (bp->bio_cmd == BIO_GETATTR) { int isv = ((sc->flags & MD_VERIFY) != 0); if ((sc->fwsectors && sc->fwheads && (g_handleattr_int(bp, "GEOM::fwsectors", sc->fwsectors) || g_handleattr_int(bp, "GEOM::fwheads", sc->fwheads))) || g_handleattr_int(bp, "GEOM::candelete", 1)) error = -1; else if (sc->ident[0] != '\0' && g_handleattr_str(bp, "GEOM::ident", sc->ident)) error = -1; else if (g_handleattr_int(bp, "MNT::verified", isv)) error = -1; else error = EOPNOTSUPP; } else { error = sc->start(sc, bp); } if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) { /* * Devstat uses (bio_bcount, bio_resid) for * determining the length of the completed part of * the i/o. g_io_deliver() will translate from * bio_completed to that, but it also destroys the * bio so we must do our own translation. */ bp->bio_bcount = bp->bio_length; bp->bio_resid = (error == -1 ? bp->bio_bcount : 0); devstat_end_transaction_bio(sc->devstat, bp); } if (error != -1) { bp->bio_completed = bp->bio_length; g_io_deliver(bp, error); } } } static struct md_s * mdfind(int unit) { struct md_s *sc; LIST_FOREACH(sc, &md_softc_list, list) { if (sc->unit == unit) break; } return (sc); } static struct md_s * mdnew(int unit, int *errp, enum md_types type) { struct md_s *sc; int error; *errp = 0; if (unit == -1) unit = alloc_unr(md_uh); else unit = alloc_unr_specific(md_uh, unit); if (unit == -1) { *errp = EBUSY; return (NULL); } sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO); sc->type = type; bioq_init(&sc->bio_queue); mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF); - mtx_init(&sc->stat_mtx, "md stat", NULL, MTX_DEF); sc->unit = unit; sprintf(sc->name, "md%d", unit); LIST_INSERT_HEAD(&md_softc_list, sc, list); error = kproc_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name); if (error == 0) return (sc); LIST_REMOVE(sc, list); - mtx_destroy(&sc->stat_mtx); mtx_destroy(&sc->queue_mtx); free_unr(md_uh, sc->unit); free(sc, M_MD); *errp = error; return (NULL); } static void mdinit(struct md_s *sc) { struct g_geom *gp; struct g_provider *pp; g_topology_lock(); gp = g_new_geomf(&g_md_class, "md%d", sc->unit); gp->softc = sc; pp = g_new_providerf(gp, "md%d", sc->unit); pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; pp->mediasize = sc->mediasize; pp->sectorsize = sc->sectorsize; switch (sc->type) { case MD_MALLOC: case MD_VNODE: case MD_SWAP: pp->flags |= G_PF_ACCEPT_UNMAPPED; break; case MD_PRELOAD: case MD_NULL: break; } sc->gp = gp; sc->pp = pp; g_error_provider(pp, 0); g_topology_unlock(); sc->devstat = devstat_new_entry("md", sc->unit, sc->sectorsize, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); } static int mdcreate_malloc(struct md_s *sc, struct md_req *mdr) { uintptr_t sp; int error; off_t u; error = 0; if (mdr->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE)) return (EINVAL); if (mdr->md_sectorsize != 0 && !powerof2(mdr->md_sectorsize)) return (EINVAL); /* Compression doesn't make sense if we have reserved space */ if (mdr->md_options & MD_RESERVE) mdr->md_options &= ~MD_COMPRESS; if (mdr->md_fwsectors != 0) sc->fwsectors = mdr->md_fwsectors; if (mdr->md_fwheads != 0) sc->fwheads = mdr->md_fwheads; sc->flags = mdr->md_options & (MD_COMPRESS | MD_FORCE); sc->indir = dimension(sc->mediasize / sc->sectorsize); sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL, 0x1ff, 0); if (mdr->md_options & MD_RESERVE) { off_t nsectors; nsectors = sc->mediasize / sc->sectorsize; for (u = 0; u < nsectors; u++) { sp = (uintptr_t)uma_zalloc(sc->uma, (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO); if (sp != 0) error = s_write(sc->indir, u, sp); else error = ENOMEM; if (error != 0) break; } } return (error); } static int mdsetcred(struct md_s *sc, struct ucred *cred) { char *tmpbuf; int error = 0; /* * Set credits in our softc */ if (sc->cred) crfree(sc->cred); sc->cred = crhold(cred); /* * Horrible kludge to establish credentials for NFS XXX. */ if (sc->vnode) { struct uio auio; struct iovec aiov; tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK); bzero(&auio, sizeof(auio)); aiov.iov_base = tmpbuf; aiov.iov_len = sc->sectorsize; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_resid = aiov.iov_len; vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY); error = VOP_READ(sc->vnode, &auio, 0, sc->cred); VOP_UNLOCK(sc->vnode, 0); free(tmpbuf, M_TEMP); } return (error); } static int mdcreate_vnode(struct md_s *sc, struct md_req *mdr, struct thread *td) { struct vattr vattr; struct nameidata nd; char *fname; int error, flags; fname = mdr->md_file; if (mdr->md_file_seg == UIO_USERSPACE) { error = copyinstr(fname, sc->file, sizeof(sc->file), NULL); if (error != 0) return (error); } else if (mdr->md_file_seg == UIO_SYSSPACE) strlcpy(sc->file, fname, sizeof(sc->file)); else return (EDOOFUS); /* * If the user specified that this is a read only device, don't * set the FWRITE mask before trying to open the backing store. */ flags = FREAD | ((mdr->md_options & MD_READONLY) ? 0 : FWRITE) \ | ((mdr->md_options & MD_VERIFY) ? O_VERIFY : 0); NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, sc->file, td); error = vn_open(&nd, &flags, 0, NULL); if (error != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp->v_type != VREG) { error = EINVAL; goto bad; } error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred); if (error != 0) goto bad; if (VOP_ISLOCKED(nd.ni_vp) != LK_EXCLUSIVE) { vn_lock(nd.ni_vp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED(nd.ni_vp)) { /* Forced unmount. */ error = EBADF; goto bad; } } nd.ni_vp->v_vflag |= VV_MD; VOP_UNLOCK(nd.ni_vp, 0); if (mdr->md_fwsectors != 0) sc->fwsectors = mdr->md_fwsectors; if (mdr->md_fwheads != 0) sc->fwheads = mdr->md_fwheads; snprintf(sc->ident, sizeof(sc->ident), "MD-DEV%ju-INO%ju", (uintmax_t)vattr.va_fsid, (uintmax_t)vattr.va_fileid); sc->flags = mdr->md_options & (MD_ASYNC | MD_CACHE | MD_FORCE | MD_VERIFY); if (!(flags & FWRITE)) sc->flags |= MD_READONLY; sc->vnode = nd.ni_vp; error = mdsetcred(sc, td->td_ucred); if (error != 0) { sc->vnode = NULL; vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY); nd.ni_vp->v_vflag &= ~VV_MD; goto bad; } return (0); bad: VOP_UNLOCK(nd.ni_vp, 0); (void)vn_close(nd.ni_vp, flags, td->td_ucred, td); return (error); } static void g_md_providergone(struct g_provider *pp) { struct md_s *sc = pp->geom->softc; mtx_lock(&sc->queue_mtx); sc->flags |= MD_PROVIDERGONE; wakeup(&sc->flags); mtx_unlock(&sc->queue_mtx); } static int mddestroy(struct md_s *sc, struct thread *td) { if (sc->gp) { g_topology_lock(); g_wither_geom(sc->gp, ENXIO); g_topology_unlock(); mtx_lock(&sc->queue_mtx); while (!(sc->flags & MD_PROVIDERGONE)) msleep(&sc->flags, &sc->queue_mtx, PRIBIO, "mddestroy", 0); mtx_unlock(&sc->queue_mtx); } if (sc->devstat) { devstat_remove_entry(sc->devstat); sc->devstat = NULL; } mtx_lock(&sc->queue_mtx); sc->flags |= MD_SHUTDOWN; wakeup(sc); while (!(sc->flags & MD_EXITING)) msleep(sc->procp, &sc->queue_mtx, PRIBIO, "mddestroy", hz / 10); mtx_unlock(&sc->queue_mtx); - mtx_destroy(&sc->stat_mtx); mtx_destroy(&sc->queue_mtx); if (sc->vnode != NULL) { vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY); sc->vnode->v_vflag &= ~VV_MD; VOP_UNLOCK(sc->vnode, 0); (void)vn_close(sc->vnode, sc->flags & MD_READONLY ? FREAD : (FREAD|FWRITE), sc->cred, td); } if (sc->cred != NULL) crfree(sc->cred); if (sc->object != NULL) vm_object_deallocate(sc->object); if (sc->indir) destroy_indir(sc, sc->indir); if (sc->uma) uma_zdestroy(sc->uma); LIST_REMOVE(sc, list); free_unr(md_uh, sc->unit); free(sc, M_MD); return (0); } static int mdresize(struct md_s *sc, struct md_req *mdr) { int error, res; vm_pindex_t oldpages, newpages; switch (sc->type) { case MD_VNODE: case MD_NULL: break; case MD_SWAP: if (mdr->md_mediasize <= 0 || (mdr->md_mediasize % PAGE_SIZE) != 0) return (EDOM); oldpages = OFF_TO_IDX(round_page(sc->mediasize)); newpages = OFF_TO_IDX(round_page(mdr->md_mediasize)); if (newpages < oldpages) { VM_OBJECT_WLOCK(sc->object); vm_object_page_remove(sc->object, newpages, 0, 0); swap_pager_freespace(sc->object, newpages, oldpages - newpages); swap_release_by_cred(IDX_TO_OFF(oldpages - newpages), sc->cred); sc->object->charge = IDX_TO_OFF(newpages); sc->object->size = newpages; VM_OBJECT_WUNLOCK(sc->object); } else if (newpages > oldpages) { res = swap_reserve_by_cred(IDX_TO_OFF(newpages - oldpages), sc->cred); if (!res) return (ENOMEM); if ((mdr->md_options & MD_RESERVE) || (sc->flags & MD_RESERVE)) { error = swap_pager_reserve(sc->object, oldpages, newpages - oldpages); if (error < 0) { swap_release_by_cred( IDX_TO_OFF(newpages - oldpages), sc->cred); return (EDOM); } } VM_OBJECT_WLOCK(sc->object); sc->object->charge = IDX_TO_OFF(newpages); sc->object->size = newpages; VM_OBJECT_WUNLOCK(sc->object); } break; default: return (EOPNOTSUPP); } sc->mediasize = mdr->md_mediasize; g_topology_lock(); g_resize_provider(sc->pp, sc->mediasize); g_topology_unlock(); return (0); } static int mdcreate_swap(struct md_s *sc, struct md_req *mdr, struct thread *td) { vm_ooffset_t npage; int error; /* * Range check. Disallow negative sizes and sizes not being * multiple of page size. */ if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0) return (EDOM); /* * Allocate an OBJT_SWAP object. * * Note the truncation. */ if ((mdr->md_options & MD_VERIFY) != 0) return (EINVAL); npage = mdr->md_mediasize / PAGE_SIZE; if (mdr->md_fwsectors != 0) sc->fwsectors = mdr->md_fwsectors; if (mdr->md_fwheads != 0) sc->fwheads = mdr->md_fwheads; sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage, VM_PROT_DEFAULT, 0, td->td_ucred); if (sc->object == NULL) return (ENOMEM); sc->flags = mdr->md_options & (MD_FORCE | MD_RESERVE); if (mdr->md_options & MD_RESERVE) { if (swap_pager_reserve(sc->object, 0, npage) < 0) { error = EDOM; goto finish; } } error = mdsetcred(sc, td->td_ucred); finish: if (error != 0) { vm_object_deallocate(sc->object); sc->object = NULL; } return (error); } static int mdcreate_null(struct md_s *sc, struct md_req *mdr, struct thread *td) { /* * Range check. Disallow negative sizes and sizes not being * multiple of page size. */ if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0) return (EDOM); return (0); } static int kern_mdattach_locked(struct thread *td, struct md_req *mdr) { struct md_s *sc; unsigned sectsize; int error, i; sx_assert(&md_sx, SA_XLOCKED); switch (mdr->md_type) { case MD_MALLOC: case MD_PRELOAD: case MD_VNODE: case MD_SWAP: case MD_NULL: break; default: return (EINVAL); } if (mdr->md_sectorsize == 0) sectsize = DEV_BSIZE; else sectsize = mdr->md_sectorsize; if (sectsize > MAXPHYS || mdr->md_mediasize < sectsize) return (EINVAL); if (mdr->md_options & MD_AUTOUNIT) sc = mdnew(-1, &error, mdr->md_type); else { if (mdr->md_unit > INT_MAX) return (EINVAL); sc = mdnew(mdr->md_unit, &error, mdr->md_type); } if (sc == NULL) return (error); if (mdr->md_label != NULL) error = copyinstr(mdr->md_label, sc->label, sizeof(sc->label), NULL); if (error != 0) goto err_after_new; if (mdr->md_options & MD_AUTOUNIT) mdr->md_unit = sc->unit; sc->mediasize = mdr->md_mediasize; sc->sectorsize = sectsize; error = EDOOFUS; switch (sc->type) { case MD_MALLOC: sc->start = mdstart_malloc; error = mdcreate_malloc(sc, mdr); break; case MD_PRELOAD: /* * We disallow attaching preloaded memory disks via * ioctl. Preloaded memory disks are automatically * attached in g_md_init(). */ error = EOPNOTSUPP; break; case MD_VNODE: sc->start = mdstart_vnode; error = mdcreate_vnode(sc, mdr, td); break; case MD_SWAP: sc->start = mdstart_swap; error = mdcreate_swap(sc, mdr, td); break; case MD_NULL: sc->start = mdstart_null; error = mdcreate_null(sc, mdr, td); break; } err_after_new: if (error != 0) { mddestroy(sc, td); return (error); } /* Prune off any residual fractional sector */ i = sc->mediasize % sc->sectorsize; sc->mediasize -= i; mdinit(sc); return (0); } static int kern_mdattach(struct thread *td, struct md_req *mdr) { int error; sx_xlock(&md_sx); error = kern_mdattach_locked(td, mdr); sx_xunlock(&md_sx); return (error); } static int kern_mddetach_locked(struct thread *td, struct md_req *mdr) { struct md_s *sc; sx_assert(&md_sx, SA_XLOCKED); if (mdr->md_mediasize != 0 || (mdr->md_options & ~MD_FORCE) != 0) return (EINVAL); sc = mdfind(mdr->md_unit); if (sc == NULL) return (ENOENT); if (sc->opencount != 0 && !(sc->flags & MD_FORCE) && !(mdr->md_options & MD_FORCE)) return (EBUSY); return (mddestroy(sc, td)); } static int kern_mddetach(struct thread *td, struct md_req *mdr) { int error; sx_xlock(&md_sx); error = kern_mddetach_locked(td, mdr); sx_xunlock(&md_sx); return (error); } static int kern_mdresize_locked(struct md_req *mdr) { struct md_s *sc; sx_assert(&md_sx, SA_XLOCKED); if ((mdr->md_options & ~(MD_FORCE | MD_RESERVE)) != 0) return (EINVAL); sc = mdfind(mdr->md_unit); if (sc == NULL) return (ENOENT); if (mdr->md_mediasize < sc->sectorsize) return (EINVAL); if (mdr->md_mediasize < sc->mediasize && !(sc->flags & MD_FORCE) && !(mdr->md_options & MD_FORCE)) return (EBUSY); return (mdresize(sc, mdr)); } static int kern_mdresize(struct md_req *mdr) { int error; sx_xlock(&md_sx); error = kern_mdresize_locked(mdr); sx_xunlock(&md_sx); return (error); } static int kern_mdquery_locked(struct md_req *mdr) { struct md_s *sc; int error; sx_assert(&md_sx, SA_XLOCKED); sc = mdfind(mdr->md_unit); if (sc == NULL) return (ENOENT); mdr->md_type = sc->type; mdr->md_options = sc->flags; mdr->md_mediasize = sc->mediasize; mdr->md_sectorsize = sc->sectorsize; error = 0; if (mdr->md_label != NULL) { error = copyout(sc->label, mdr->md_label, strlen(sc->label) + 1); if (error != 0) return (error); } if (sc->type == MD_VNODE || (sc->type == MD_PRELOAD && mdr->md_file != NULL)) error = copyout(sc->file, mdr->md_file, strlen(sc->file) + 1); return (error); } static int kern_mdquery(struct md_req *mdr) { int error; sx_xlock(&md_sx); error = kern_mdquery_locked(mdr); sx_xunlock(&md_sx); return (error); } /* Copy members that are not userspace pointers. */ #define MD_IOCTL2REQ(mdio, mdr) do { \ (mdr)->md_unit = (mdio)->md_unit; \ (mdr)->md_type = (mdio)->md_type; \ (mdr)->md_mediasize = (mdio)->md_mediasize; \ (mdr)->md_sectorsize = (mdio)->md_sectorsize; \ (mdr)->md_options = (mdio)->md_options; \ (mdr)->md_fwheads = (mdio)->md_fwheads; \ (mdr)->md_fwsectors = (mdio)->md_fwsectors; \ (mdr)->md_units = &(mdio)->md_pad[0]; \ (mdr)->md_units_nitems = nitems((mdio)->md_pad); \ } while(0) /* Copy members that might have been updated */ #define MD_REQ2IOCTL(mdr, mdio) do { \ (mdio)->md_unit = (mdr)->md_unit; \ (mdio)->md_type = (mdr)->md_type; \ (mdio)->md_mediasize = (mdr)->md_mediasize; \ (mdio)->md_sectorsize = (mdr)->md_sectorsize; \ (mdio)->md_options = (mdr)->md_options; \ (mdio)->md_fwheads = (mdr)->md_fwheads; \ (mdio)->md_fwsectors = (mdr)->md_fwsectors; \ } while(0) static int mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) { struct md_req mdr; int error; if (md_debug) printf("mdctlioctl(%s %lx %p %x %p)\n", devtoname(dev), cmd, addr, flags, td); bzero(&mdr, sizeof(mdr)); switch (cmd) { case MDIOCATTACH: case MDIOCDETACH: case MDIOCRESIZE: case MDIOCQUERY: { struct md_ioctl *mdio = (struct md_ioctl *)addr; if (mdio->md_version != MDIOVERSION) return (EINVAL); MD_IOCTL2REQ(mdio, &mdr); mdr.md_file = mdio->md_file; mdr.md_file_seg = UIO_USERSPACE; /* If the file is adjacent to the md_ioctl it's in kernel. */ if ((void *)mdio->md_file == (void *)(mdio + 1)) mdr.md_file_seg = UIO_SYSSPACE; mdr.md_label = mdio->md_label; break; } #ifdef COMPAT_FREEBSD32 case MDIOCATTACH_32: case MDIOCDETACH_32: case MDIOCRESIZE_32: case MDIOCQUERY_32: { struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr; if (mdio->md_version != MDIOVERSION) return (EINVAL); MD_IOCTL2REQ(mdio, &mdr); mdr.md_file = (void *)(uintptr_t)mdio->md_file; mdr.md_file_seg = UIO_USERSPACE; mdr.md_label = (void *)(uintptr_t)mdio->md_label; break; } #endif default: /* Fall through to handler switch. */ break; } error = 0; switch (cmd) { case MDIOCATTACH: #ifdef COMPAT_FREEBSD32 case MDIOCATTACH_32: #endif error = kern_mdattach(td, &mdr); break; case MDIOCDETACH: #ifdef COMPAT_FREEBSD32 case MDIOCDETACH_32: #endif error = kern_mddetach(td, &mdr); break; case MDIOCRESIZE: #ifdef COMPAT_FREEBSD32 case MDIOCRESIZE_32: #endif error = kern_mdresize(&mdr); break; case MDIOCQUERY: #ifdef COMPAT_FREEBSD32 case MDIOCQUERY_32: #endif error = kern_mdquery(&mdr); break; default: error = ENOIOCTL; } switch (cmd) { case MDIOCATTACH: case MDIOCQUERY: { struct md_ioctl *mdio = (struct md_ioctl *)addr; MD_REQ2IOCTL(&mdr, mdio); break; } #ifdef COMPAT_FREEBSD32 case MDIOCATTACH_32: case MDIOCQUERY_32: { struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr; MD_REQ2IOCTL(&mdr, mdio); break; } #endif default: /* Other commands to not alter mdr. */ break; } return (error); } static void md_preloaded(u_char *image, size_t length, const char *name) { struct md_s *sc; int error; sc = mdnew(-1, &error, MD_PRELOAD); if (sc == NULL) return; sc->mediasize = length; sc->sectorsize = DEV_BSIZE; sc->pl_ptr = image; sc->pl_len = length; sc->start = mdstart_preload; if (name != NULL) strlcpy(sc->file, name, sizeof(sc->file)); #ifdef MD_ROOT if (sc->unit == 0) { #ifndef ROOTDEVNAME rootdevnames[0] = MD_ROOT_FSTYPE ":/dev/md0"; #endif #ifdef MD_ROOT_READONLY sc->flags |= MD_READONLY; #endif } #endif mdinit(sc); if (name != NULL) { printf("%s%d: Preloaded image <%s> %zd bytes at %p\n", MD_NAME, sc->unit, name, length, image); } else { printf("%s%d: Embedded image %zd bytes at %p\n", MD_NAME, sc->unit, length, image); } } static void g_md_init(struct g_class *mp __unused) { caddr_t mod; u_char *ptr, *name, *type; unsigned len; int i; /* figure out log2(NINDIR) */ for (i = NINDIR, nshift = -1; i; nshift++) i >>= 1; mod = NULL; sx_init(&md_sx, "MD config lock"); g_topology_unlock(); md_uh = new_unrhdr(0, INT_MAX, NULL); #ifdef MD_ROOT if (mfs_root_size != 0) { sx_xlock(&md_sx); #ifdef MD_ROOT_MEM md_preloaded(mfs_root, mfs_root_size, NULL); #else md_preloaded(__DEVOLATILE(u_char *, &mfs_root), mfs_root_size, NULL); #endif sx_xunlock(&md_sx); } #endif /* XXX: are preload_* static or do they need Giant ? */ while ((mod = preload_search_next_name(mod)) != NULL) { name = (char *)preload_search_info(mod, MODINFO_NAME); if (name == NULL) continue; type = (char *)preload_search_info(mod, MODINFO_TYPE); if (type == NULL) continue; if (strcmp(type, "md_image") && strcmp(type, "mfs_root")) continue; ptr = preload_fetch_addr(mod); len = preload_fetch_size(mod); if (ptr != NULL && len != 0) { sx_xlock(&md_sx); md_preloaded(ptr, len, name); sx_xunlock(&md_sx); } } md_pbuf_zone = pbuf_zsecond_create("mdpbuf", nswbuf / 10); status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL, 0600, MDCTL_NAME); g_topology_lock(); } static void g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp) { struct md_s *mp; char *type; mp = gp->softc; if (mp == NULL) return; switch (mp->type) { case MD_MALLOC: type = "malloc"; break; case MD_PRELOAD: type = "preload"; break; case MD_VNODE: type = "vnode"; break; case MD_SWAP: type = "swap"; break; case MD_NULL: type = "null"; break; default: type = "unknown"; break; } if (pp != NULL) { if (indent == NULL) { sbuf_printf(sb, " u %d", mp->unit); sbuf_printf(sb, " s %ju", (uintmax_t) mp->sectorsize); sbuf_printf(sb, " f %ju", (uintmax_t) mp->fwheads); sbuf_printf(sb, " fs %ju", (uintmax_t) mp->fwsectors); sbuf_printf(sb, " l %ju", (uintmax_t) mp->mediasize); sbuf_printf(sb, " t %s", type); if ((mp->type == MD_VNODE && mp->vnode != NULL) || (mp->type == MD_PRELOAD && mp->file[0] != '\0')) sbuf_printf(sb, " file %s", mp->file); sbuf_printf(sb, " label %s", mp->label); } else { sbuf_printf(sb, "%s%d\n", indent, mp->unit); sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t) mp->sectorsize); sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t) mp->fwheads); sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t) mp->fwsectors); if (mp->ident[0] != '\0') { sbuf_printf(sb, "%s", indent); g_conf_printf_escaped(sb, "%s", mp->ident); sbuf_printf(sb, "\n"); } sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t) mp->mediasize); sbuf_printf(sb, "%s%s\n", indent, (mp->flags & MD_COMPRESS) == 0 ? "off": "on"); sbuf_printf(sb, "%s%s\n", indent, (mp->flags & MD_READONLY) == 0 ? "read-write": "read-only"); sbuf_printf(sb, "%s%s\n", indent, type); if ((mp->type == MD_VNODE && mp->vnode != NULL) || (mp->type == MD_PRELOAD && mp->file[0] != '\0')) { sbuf_printf(sb, "%s", indent); g_conf_printf_escaped(sb, "%s", mp->file); sbuf_printf(sb, "\n"); } if (mp->type == MD_VNODE) sbuf_printf(sb, "%s%s\n", indent, (mp->flags & MD_CACHE) == 0 ? "off": "on"); sbuf_printf(sb, "%s\n"); } } } static void g_md_fini(struct g_class *mp __unused) { sx_destroy(&md_sx); if (status_dev != NULL) destroy_dev(status_dev); uma_zdestroy(md_pbuf_zone); delete_unrhdr(md_uh); } Index: head/sys/dev/nvdimm/nvdimm_spa.c =================================================================== --- head/sys/dev/nvdimm/nvdimm_spa.c (revision 356199) +++ head/sys/dev/nvdimm/nvdimm_spa.c (revision 356200) @@ -1,627 +1,622 @@ /*- * Copyright (c) 2017, 2018 The FreeBSD Foundation * All rights reserved. * Copyright (c) 2018, 2019 Intel Corporation * * This software was developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define UUID_INITIALIZER_VOLATILE_MEMORY \ {0x7305944f,0xfdda,0x44e3,0xb1,0x6c,{0x3f,0x22,0xd2,0x52,0xe5,0xd0}} #define UUID_INITIALIZER_PERSISTENT_MEMORY \ {0x66f0d379,0xb4f3,0x4074,0xac,0x43,{0x0d,0x33,0x18,0xb7,0x8c,0xdb}} #define UUID_INITIALIZER_CONTROL_REGION \ {0x92f701f6,0x13b4,0x405d,0x91,0x0b,{0x29,0x93,0x67,0xe8,0x23,0x4c}} #define UUID_INITIALIZER_DATA_REGION \ {0x91af0530,0x5d86,0x470e,0xa6,0xb0,{0x0a,0x2d,0xb9,0x40,0x82,0x49}} #define UUID_INITIALIZER_VOLATILE_VIRTUAL_DISK \ {0x77ab535a,0x45fc,0x624b,0x55,0x60,{0xf7,0xb2,0x81,0xd1,0xf9,0x6e}} #define UUID_INITIALIZER_VOLATILE_VIRTUAL_CD \ {0x3d5abd30,0x4175,0x87ce,0x6d,0x64,{0xd2,0xad,0xe5,0x23,0xc4,0xbb}} #define UUID_INITIALIZER_PERSISTENT_VIRTUAL_DISK \ {0x5cea02c9,0x4d07,0x69d3,0x26,0x9f,{0x44,0x96,0xfb,0xe0,0x96,0xf9}} #define UUID_INITIALIZER_PERSISTENT_VIRTUAL_CD \ {0x08018188,0x42cd,0xbb48,0x10,0x0f,{0x53,0x87,0xd5,0x3d,0xed,0x3d}} static struct nvdimm_SPA_uuid_list_elm { const char *u_name; struct uuid u_id; const bool u_usr_acc; } nvdimm_SPA_uuid_list[] = { [SPA_TYPE_VOLATILE_MEMORY] = { .u_name = "VOLA MEM ", .u_id = UUID_INITIALIZER_VOLATILE_MEMORY, .u_usr_acc = true, }, [SPA_TYPE_PERSISTENT_MEMORY] = { .u_name = "PERS MEM", .u_id = UUID_INITIALIZER_PERSISTENT_MEMORY, .u_usr_acc = true, }, [SPA_TYPE_CONTROL_REGION] = { .u_name = "CTRL RG ", .u_id = UUID_INITIALIZER_CONTROL_REGION, .u_usr_acc = false, }, [SPA_TYPE_DATA_REGION] = { .u_name = "DATA RG ", .u_id = UUID_INITIALIZER_DATA_REGION, .u_usr_acc = true, }, [SPA_TYPE_VOLATILE_VIRTUAL_DISK] = { .u_name = "VIRT DSK", .u_id = UUID_INITIALIZER_VOLATILE_VIRTUAL_DISK, .u_usr_acc = true, }, [SPA_TYPE_VOLATILE_VIRTUAL_CD] = { .u_name = "VIRT CD ", .u_id = UUID_INITIALIZER_VOLATILE_VIRTUAL_CD, .u_usr_acc = true, }, [SPA_TYPE_PERSISTENT_VIRTUAL_DISK] = { .u_name = "PV DSK ", .u_id = UUID_INITIALIZER_PERSISTENT_VIRTUAL_DISK, .u_usr_acc = true, }, [SPA_TYPE_PERSISTENT_VIRTUAL_CD] = { .u_name = "PV CD ", .u_id = UUID_INITIALIZER_PERSISTENT_VIRTUAL_CD, .u_usr_acc = true, }, }; enum SPA_mapping_type nvdimm_spa_type_from_name(const char *name) { int j; for (j = 0; j < nitems(nvdimm_SPA_uuid_list); j++) { if (strcmp(name, nvdimm_SPA_uuid_list[j].u_name) != 0) continue; return (j); } return (SPA_TYPE_UNKNOWN); } enum SPA_mapping_type nvdimm_spa_type_from_uuid(struct uuid *uuid) { int j; for (j = 0; j < nitems(nvdimm_SPA_uuid_list); j++) { if (uuidcmp(uuid, &nvdimm_SPA_uuid_list[j].u_id) != 0) continue; return (j); } return (SPA_TYPE_UNKNOWN); } bool nvdimm_spa_type_user_accessible(enum SPA_mapping_type spa_type) { if ((int)spa_type < 0 || spa_type >= nitems(nvdimm_SPA_uuid_list)) return (false); return (nvdimm_SPA_uuid_list[spa_type].u_usr_acc); } static vm_memattr_t nvdimm_spa_memattr(uint64_t efi_mem_flags) { vm_memattr_t mode; if ((efi_mem_flags & EFI_MD_ATTR_WB) != 0) mode = VM_MEMATTR_WRITE_BACK; else if ((efi_mem_flags & EFI_MD_ATTR_WT) != 0) mode = VM_MEMATTR_WRITE_THROUGH; else if ((efi_mem_flags & EFI_MD_ATTR_WC) != 0) mode = VM_MEMATTR_WRITE_COMBINING; else if ((efi_mem_flags & EFI_MD_ATTR_WP) != 0) mode = VM_MEMATTR_WRITE_PROTECTED; else if ((efi_mem_flags & EFI_MD_ATTR_UC) != 0) mode = VM_MEMATTR_UNCACHEABLE; else { if (bootverbose) printf("SPA mapping attr %#lx unsupported\n", efi_mem_flags); mode = VM_MEMATTR_UNCACHEABLE; } return (mode); } static int nvdimm_spa_uio(struct nvdimm_spa_dev *dev, struct uio *uio) { struct vm_page m, *ma; off_t off; vm_memattr_t mattr; int error, n; error = 0; if (dev->spa_kva == NULL) { mattr = dev->spa_memattr; bzero(&m, sizeof(m)); vm_page_initfake(&m, 0, mattr); ma = &m; while (uio->uio_resid > 0) { if (uio->uio_offset >= dev->spa_len) break; off = dev->spa_phys_base + uio->uio_offset; vm_page_updatefake(&m, trunc_page(off), mattr); n = PAGE_SIZE; if (n > uio->uio_resid) n = uio->uio_resid; error = uiomove_fromphys(&ma, off & PAGE_MASK, n, uio); if (error != 0) break; } } else { while (uio->uio_resid > 0) { if (uio->uio_offset >= dev->spa_len) break; n = INT_MAX; if (n > uio->uio_resid) n = uio->uio_resid; if (uio->uio_offset + n > dev->spa_len) n = dev->spa_len - uio->uio_offset; error = uiomove((char *)dev->spa_kva + uio->uio_offset, n, uio); if (error != 0) break; } } return (error); } static int nvdimm_spa_rw(struct cdev *dev, struct uio *uio, int ioflag) { return (nvdimm_spa_uio(dev->si_drv1, uio)); } static int nvdimm_spa_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct nvdimm_spa_dev *dev; int error; dev = cdev->si_drv1; error = 0; switch (cmd) { case DIOCGSECTORSIZE: *(u_int *)data = DEV_BSIZE; break; case DIOCGMEDIASIZE: *(off_t *)data = dev->spa_len; break; default: error = ENOTTY; break; } return (error); } static int nvdimm_spa_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t size, vm_object_t *objp, int nprot) { struct nvdimm_spa_dev *dev; dev = cdev->si_drv1; if (dev->spa_obj == NULL) return (ENXIO); if (*offset >= dev->spa_len || *offset + size < *offset || *offset + size > dev->spa_len) return (EINVAL); vm_object_reference(dev->spa_obj); *objp = dev->spa_obj; return (0); } static struct cdevsw spa_cdevsw = { .d_version = D_VERSION, .d_flags = D_DISK, .d_name = "nvdimm_spa", .d_read = nvdimm_spa_rw, .d_write = nvdimm_spa_rw, .d_ioctl = nvdimm_spa_ioctl, .d_mmap_single = nvdimm_spa_mmap_single, }; static void nvdimm_spa_g_all_unmapped(struct nvdimm_spa_dev *dev, struct bio *bp, int rw) { struct vm_page maa[bp->bio_ma_n]; vm_page_t ma[bp->bio_ma_n]; vm_memattr_t mattr; int i; mattr = dev->spa_memattr; for (i = 0; i < nitems(ma); i++) { bzero(&maa[i], sizeof(maa[i])); vm_page_initfake(&maa[i], dev->spa_phys_base + trunc_page(bp->bio_offset) + PAGE_SIZE * i, mattr); ma[i] = &maa[i]; } if (rw == BIO_READ) pmap_copy_pages(ma, bp->bio_offset & PAGE_MASK, bp->bio_ma, bp->bio_ma_offset, bp->bio_length); else pmap_copy_pages(bp->bio_ma, bp->bio_ma_offset, ma, bp->bio_offset & PAGE_MASK, bp->bio_length); } static void nvdimm_spa_g_thread(void *arg) { struct g_spa *sc; struct bio *bp; struct uio auio; struct iovec aiovec; int error; sc = arg; for (;;) { mtx_lock(&sc->spa_g_mtx); for (;;) { bp = bioq_takefirst(&sc->spa_g_queue); if (bp != NULL) break; msleep(&sc->spa_g_queue, &sc->spa_g_mtx, PRIBIO, "spa_g", 0); if (!sc->spa_g_proc_run) { sc->spa_g_proc_exiting = true; wakeup(&sc->spa_g_queue); mtx_unlock(&sc->spa_g_mtx); kproc_exit(0); } continue; } mtx_unlock(&sc->spa_g_mtx); if (bp->bio_cmd != BIO_READ && bp->bio_cmd != BIO_WRITE && bp->bio_cmd != BIO_FLUSH) { error = EOPNOTSUPP; goto completed; } error = 0; if (bp->bio_cmd == BIO_FLUSH) { if (sc->dev->spa_kva != NULL) { pmap_large_map_wb(sc->dev->spa_kva, sc->dev->spa_len); } else { pmap_flush_cache_phys_range( (vm_paddr_t)sc->dev->spa_phys_base, (vm_paddr_t)sc->dev->spa_phys_base + sc->dev->spa_len, sc->dev->spa_memattr); } /* * XXX flush IMC */ goto completed; } if ((bp->bio_flags & BIO_UNMAPPED) != 0) { if (sc->dev->spa_kva != NULL) { aiovec.iov_base = (char *)sc->dev->spa_kva + bp->bio_offset; aiovec.iov_len = bp->bio_length; auio.uio_iov = &aiovec; auio.uio_iovcnt = 1; auio.uio_resid = bp->bio_length; auio.uio_offset = bp->bio_offset; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = bp->bio_cmd == BIO_READ ? UIO_WRITE : UIO_READ; auio.uio_td = curthread; error = uiomove_fromphys(bp->bio_ma, bp->bio_ma_offset, bp->bio_length, &auio); bp->bio_resid = auio.uio_resid; } else { nvdimm_spa_g_all_unmapped(sc->dev, bp, bp->bio_cmd); bp->bio_resid = bp->bio_length; error = 0; } } else { aiovec.iov_base = bp->bio_data; aiovec.iov_len = bp->bio_length; auio.uio_iov = &aiovec; auio.uio_iovcnt = 1; auio.uio_resid = bp->bio_length; auio.uio_offset = bp->bio_offset; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = bp->bio_cmd == BIO_READ ? UIO_READ : UIO_WRITE; auio.uio_td = curthread; error = nvdimm_spa_uio(sc->dev, &auio); bp->bio_resid = auio.uio_resid; } bp->bio_bcount = bp->bio_length; devstat_end_transaction_bio(sc->spa_g_devstat, bp); completed: bp->bio_completed = bp->bio_length; g_io_deliver(bp, error); } } static void nvdimm_spa_g_start(struct bio *bp) { struct g_spa *sc; sc = bp->bio_to->geom->softc; if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) { - mtx_lock(&sc->spa_g_stat_mtx); devstat_start_transaction_bio(sc->spa_g_devstat, bp); - mtx_unlock(&sc->spa_g_stat_mtx); } mtx_lock(&sc->spa_g_mtx); bioq_disksort(&sc->spa_g_queue, bp); wakeup(&sc->spa_g_queue); mtx_unlock(&sc->spa_g_mtx); } static int nvdimm_spa_g_access(struct g_provider *pp, int r, int w, int e) { return (0); } static struct g_geom * nvdimm_spa_g_create(struct nvdimm_spa_dev *dev, const char *name); static g_ctl_destroy_geom_t nvdimm_spa_g_destroy_geom; struct g_class nvdimm_spa_g_class = { .name = "SPA", .version = G_VERSION, .start = nvdimm_spa_g_start, .access = nvdimm_spa_g_access, .destroy_geom = nvdimm_spa_g_destroy_geom, }; DECLARE_GEOM_CLASS(nvdimm_spa_g_class, g_spa); int nvdimm_spa_init(struct SPA_mapping *spa, ACPI_NFIT_SYSTEM_ADDRESS *nfitaddr, enum SPA_mapping_type spa_type) { char *name; int error; spa->spa_type = spa_type; spa->spa_nfit_idx = nfitaddr->RangeIndex; spa->dev.spa_domain = ((nfitaddr->Flags & ACPI_NFIT_PROXIMITY_VALID) != 0) ? nfitaddr->ProximityDomain : -1; spa->dev.spa_phys_base = nfitaddr->Address; spa->dev.spa_len = nfitaddr->Length; spa->dev.spa_efi_mem_flags = nfitaddr->MemoryMapping; if (bootverbose) { printf("NVDIMM SPA%d base %#016jx len %#016jx %s fl %#jx\n", spa->spa_nfit_idx, (uintmax_t)spa->dev.spa_phys_base, (uintmax_t)spa->dev.spa_len, nvdimm_SPA_uuid_list[spa_type].u_name, spa->dev.spa_efi_mem_flags); } spa->dev.spa_memattr = nvdimm_spa_memattr(nfitaddr->MemoryMapping); if (!nvdimm_SPA_uuid_list[spa_type].u_usr_acc) return (0); asprintf(&name, M_NVDIMM, "spa%d", spa->spa_nfit_idx); error = nvdimm_spa_dev_init(&spa->dev, name, spa->spa_nfit_idx); free(name, M_NVDIMM); return (error); } int nvdimm_spa_dev_init(struct nvdimm_spa_dev *dev, const char *name, int unit) { struct make_dev_args mda; struct sglist *spa_sg; char *devname; int error, error1; error1 = pmap_large_map(dev->spa_phys_base, dev->spa_len, &dev->spa_kva, dev->spa_memattr); if (error1 != 0) { printf("NVDIMM %s cannot map into KVA, error %d\n", name, error1); dev->spa_kva = NULL; } spa_sg = sglist_alloc(1, M_WAITOK); error = sglist_append_phys(spa_sg, dev->spa_phys_base, dev->spa_len); if (error == 0) { dev->spa_obj = vm_pager_allocate(OBJT_SG, spa_sg, dev->spa_len, VM_PROT_ALL, 0, NULL); if (dev->spa_obj == NULL) { printf("NVDIMM %s failed to alloc vm object", name); sglist_free(spa_sg); } } else { printf("NVDIMM %s failed to init sglist, error %d", name, error); sglist_free(spa_sg); } make_dev_args_init(&mda); mda.mda_flags = MAKEDEV_WAITOK | MAKEDEV_CHECKNAME; mda.mda_devsw = &spa_cdevsw; mda.mda_cr = NULL; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_OPERATOR; mda.mda_mode = 0660; mda.mda_si_drv1 = dev; mda.mda_unit = unit; asprintf(&devname, M_NVDIMM, "nvdimm_%s", name); error = make_dev_s(&mda, &dev->spa_dev, "%s", devname); free(devname, M_NVDIMM); if (error != 0) { printf("NVDIMM %s cannot create devfs node, error %d\n", name, error); if (error1 == 0) error1 = error; } dev->spa_g = nvdimm_spa_g_create(dev, name); if (dev->spa_g == NULL && error1 == 0) error1 = ENXIO; return (error1); } static struct g_geom * nvdimm_spa_g_create(struct nvdimm_spa_dev *dev, const char *name) { struct g_geom *gp; struct g_spa *sc; int error; gp = NULL; sc = malloc(sizeof(struct g_spa), M_NVDIMM, M_WAITOK | M_ZERO); sc->dev = dev; bioq_init(&sc->spa_g_queue); mtx_init(&sc->spa_g_mtx, "spag", NULL, MTX_DEF); - mtx_init(&sc->spa_g_stat_mtx, "spagst", NULL, MTX_DEF); sc->spa_g_proc_run = true; sc->spa_g_proc_exiting = false; error = kproc_create(nvdimm_spa_g_thread, sc, &sc->spa_g_proc, 0, 0, "g_spa"); if (error != 0) { mtx_destroy(&sc->spa_g_mtx); - mtx_destroy(&sc->spa_g_stat_mtx); free(sc, M_NVDIMM); printf("NVDIMM %s cannot create geom worker, error %d\n", name, error); } else { g_topology_lock(); gp = g_new_geomf(&nvdimm_spa_g_class, "%s", name); gp->softc = sc; sc->spa_p = g_new_providerf(gp, "%s", name); sc->spa_p->mediasize = dev->spa_len; sc->spa_p->sectorsize = DEV_BSIZE; sc->spa_p->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE | G_PF_ACCEPT_UNMAPPED; g_error_provider(sc->spa_p, 0); sc->spa_g_devstat = devstat_new_entry("spa", -1, DEV_BSIZE, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); g_topology_unlock(); } return (gp); } void nvdimm_spa_fini(struct SPA_mapping *spa) { nvdimm_spa_dev_fini(&spa->dev); } void nvdimm_spa_dev_fini(struct nvdimm_spa_dev *dev) { if (dev->spa_g != NULL) { g_topology_lock(); nvdimm_spa_g_destroy_geom(NULL, dev->spa_g->class, dev->spa_g); g_topology_unlock(); } if (dev->spa_dev != NULL) { destroy_dev(dev->spa_dev); dev->spa_dev = NULL; } vm_object_deallocate(dev->spa_obj); if (dev->spa_kva != NULL) { pmap_large_unmap(dev->spa_kva, dev->spa_len); dev->spa_kva = NULL; } } static int nvdimm_spa_g_destroy_geom(struct gctl_req *req, struct g_class *cp, struct g_geom *gp) { struct g_spa *sc; sc = gp->softc; mtx_lock(&sc->spa_g_mtx); sc->spa_g_proc_run = false; wakeup(&sc->spa_g_queue); while (!sc->spa_g_proc_exiting) msleep(&sc->spa_g_queue, &sc->spa_g_mtx, PRIBIO, "spa_e", 0); mtx_unlock(&sc->spa_g_mtx); g_topology_assert(); g_wither_geom(gp, ENXIO); sc->spa_p = NULL; if (sc->spa_g_devstat != NULL) { devstat_remove_entry(sc->spa_g_devstat); sc->spa_g_devstat = NULL; } mtx_destroy(&sc->spa_g_mtx); - mtx_destroy(&sc->spa_g_stat_mtx); free(sc, M_NVDIMM); return (0); } Index: head/sys/dev/nvdimm/nvdimm_var.h =================================================================== --- head/sys/dev/nvdimm/nvdimm_var.h (revision 356199) +++ head/sys/dev/nvdimm/nvdimm_var.h (revision 356200) @@ -1,179 +1,178 @@ /*- * Copyright (c) 2017 The FreeBSD Foundation * All rights reserved. * Copyright (c) 2018, 2019 Intel Corporation * * This software was developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __DEV_NVDIMM_VAR_H__ #define __DEV_NVDIMM_VAR_H__ #define NVDIMM_INDEX_BLOCK_SIGNATURE "NAMESPACE_INDEX" struct nvdimm_label_index { char signature[16]; uint8_t flags[3]; uint8_t label_size; uint32_t seq; uint64_t this_offset; uint64_t this_size; uint64_t other_offset; uint64_t label_offset; uint32_t slot_cnt; uint16_t rev_major; uint16_t rev_minor; uint64_t checksum; uint8_t free[0]; }; struct nvdimm_label { struct uuid uuid; char name[64]; uint32_t flags; uint16_t nlabel; uint16_t position; uint64_t set_cookie; uint64_t lba_size; uint64_t dimm_phys_addr; uint64_t raw_size; uint32_t slot; uint8_t alignment; uint8_t reserved[3]; struct uuid type_guid; struct uuid address_abstraction_guid; uint8_t reserved1[88]; uint64_t checksum; }; struct nvdimm_label_entry { SLIST_ENTRY(nvdimm_label_entry) link; struct nvdimm_label label; }; _Static_assert(sizeof(struct nvdimm_label_index) == 72, "Incorrect layout"); _Static_assert(sizeof(struct nvdimm_label) == 256, "Incorrect layout"); typedef uint32_t nfit_handle_t; enum nvdimm_acpi_ivar { NVDIMM_ROOT_IVAR_ACPI_HANDLE, NVDIMM_ROOT_IVAR_DEVICE_HANDLE, NVDIMM_ROOT_IVAR_MAX, }; __BUS_ACCESSOR(nvdimm_root, acpi_handle, NVDIMM_ROOT, ACPI_HANDLE, ACPI_HANDLE) __BUS_ACCESSOR(nvdimm_root, device_handle, NVDIMM_ROOT, DEVICE_HANDLE, nfit_handle_t) struct nvdimm_dev { device_t nv_dev; nfit_handle_t nv_handle; uint64_t **nv_flush_addr; int nv_flush_addr_cnt; uint32_t label_area_size; uint32_t max_label_xfer; struct nvdimm_label_index *label_index; SLIST_HEAD(, nvdimm_label_entry) labels; }; enum SPA_mapping_type { SPA_TYPE_VOLATILE_MEMORY = 0, SPA_TYPE_PERSISTENT_MEMORY = 1, SPA_TYPE_CONTROL_REGION = 2, SPA_TYPE_DATA_REGION = 3, SPA_TYPE_VOLATILE_VIRTUAL_DISK = 4, SPA_TYPE_VOLATILE_VIRTUAL_CD = 5, SPA_TYPE_PERSISTENT_VIRTUAL_DISK= 6, SPA_TYPE_PERSISTENT_VIRTUAL_CD = 7, SPA_TYPE_UNKNOWN = 127, }; struct nvdimm_spa_dev { int spa_domain; vm_memattr_t spa_memattr; uint64_t spa_phys_base; uint64_t spa_len; uint64_t spa_efi_mem_flags; void *spa_kva; struct vm_object *spa_obj; struct cdev *spa_dev; struct g_geom *spa_g; }; struct g_spa { struct nvdimm_spa_dev *dev; struct g_provider *spa_p; struct bio_queue_head spa_g_queue; struct mtx spa_g_mtx; - struct mtx spa_g_stat_mtx; struct devstat *spa_g_devstat; struct proc *spa_g_proc; bool spa_g_proc_run; bool spa_g_proc_exiting; }; struct nvdimm_namespace { SLIST_ENTRY(nvdimm_namespace) link; struct SPA_mapping *spa; struct nvdimm_spa_dev dev; }; struct SPA_mapping { SLIST_ENTRY(SPA_mapping) link; enum SPA_mapping_type spa_type; int spa_nfit_idx; struct nvdimm_spa_dev dev; SLIST_HEAD(, nvdimm_namespace) namespaces; }; MALLOC_DECLARE(M_NVDIMM); void acpi_nfit_get_dimm_ids(ACPI_TABLE_NFIT *nfitbl, nfit_handle_t **listp, int *countp); void acpi_nfit_get_spa_range(ACPI_TABLE_NFIT *nfitbl, uint16_t range_index, ACPI_NFIT_SYSTEM_ADDRESS **spa); void acpi_nfit_get_spa_ranges(ACPI_TABLE_NFIT *nfitbl, ACPI_NFIT_SYSTEM_ADDRESS ***listp, int *countp); void acpi_nfit_get_region_mappings_by_spa_range(ACPI_TABLE_NFIT *nfitbl, uint16_t spa_range_index, ACPI_NFIT_MEMORY_MAP ***listp, int *countp); void acpi_nfit_get_control_region(ACPI_TABLE_NFIT *nfitbl, uint16_t control_region_index, ACPI_NFIT_CONTROL_REGION **out); void acpi_nfit_get_flush_addrs(ACPI_TABLE_NFIT *nfitbl, nfit_handle_t dimm, uint64_t ***listp, int *countp); enum SPA_mapping_type nvdimm_spa_type_from_name(const char *); enum SPA_mapping_type nvdimm_spa_type_from_uuid(struct uuid *); bool nvdimm_spa_type_user_accessible(enum SPA_mapping_type); struct nvdimm_dev *nvdimm_find_by_handle(nfit_handle_t nv_handle); int nvdimm_spa_init(struct SPA_mapping *spa, ACPI_NFIT_SYSTEM_ADDRESS *nfitaddr, enum SPA_mapping_type spa_type); void nvdimm_spa_fini(struct SPA_mapping *spa); int nvdimm_spa_dev_init(struct nvdimm_spa_dev *dev, const char *name, int unit); void nvdimm_spa_dev_fini(struct nvdimm_spa_dev *dev); int nvdimm_create_namespaces(struct SPA_mapping *spa, ACPI_TABLE_NFIT *nfitbl); void nvdimm_destroy_namespaces(struct SPA_mapping *spa); #endif /* __DEV_NVDIMM_VAR_H__ */ Index: head/sys/geom/geom_disk.c =================================================================== --- head/sys/geom/geom_disk.c (revision 356199) +++ head/sys/geom/geom_disk.c (revision 356200) @@ -1,1086 +1,1079 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_geom.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct g_disk_softc { - struct mtx done_mtx; struct disk *dp; struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; char led[64]; uint32_t state; - struct mtx start_mtx; + struct mtx done_mtx; }; static g_access_t g_disk_access; static g_start_t g_disk_start; static g_ioctl_t g_disk_ioctl; static g_dumpconf_t g_disk_dumpconf; static g_provgone_t g_disk_providergone; static int g_disk_sysctl_flags(SYSCTL_HANDLER_ARGS); static struct g_class g_disk_class = { .name = G_DISK_CLASS_NAME, .version = G_VERSION, .start = g_disk_start, .access = g_disk_access, .ioctl = g_disk_ioctl, .providergone = g_disk_providergone, .dumpconf = g_disk_dumpconf, }; SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, disk, CTLFLAG_RW, 0, "GEOM_DISK stuff"); DECLARE_GEOM_CLASS(g_disk_class, g_disk); static int g_disk_access(struct g_provider *pp, int r, int w, int e) { struct disk *dp; struct g_disk_softc *sc; int error; g_trace(G_T_ACCESS, "g_disk_access(%s, %d, %d, %d)", pp->name, r, w, e); g_topology_assert(); sc = pp->private; if ((dp = sc->dp) == NULL || dp->d_destroyed) { /* * Allow decreasing access count even if disk is not * available anymore. */ if (r <= 0 && w <= 0 && e <= 0) return (0); return (ENXIO); } r += pp->acr; w += pp->acw; e += pp->ace; error = 0; if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) { /* * It would be better to defer this decision to d_open if * it was able to take flags. */ if (w > 0 && (dp->d_flags & DISKFLAG_WRITE_PROTECT) != 0) error = EROFS; if (error == 0 && dp->d_open != NULL) error = dp->d_open(dp); if (bootverbose && error != 0) printf("Opened disk %s -> %d\n", pp->name, error); if (error != 0) return (error); pp->sectorsize = dp->d_sectorsize; if (dp->d_maxsize == 0) { printf("WARNING: Disk drive %s%d has no d_maxsize\n", dp->d_name, dp->d_unit); dp->d_maxsize = DFLTPHYS; } if (dp->d_delmaxsize == 0) { if (bootverbose && dp->d_flags & DISKFLAG_CANDELETE) { printf("WARNING: Disk drive %s%d has no " "d_delmaxsize\n", dp->d_name, dp->d_unit); } dp->d_delmaxsize = dp->d_maxsize; } pp->stripeoffset = dp->d_stripeoffset; pp->stripesize = dp->d_stripesize; dp->d_flags |= DISKFLAG_OPEN; /* * Do not invoke resize event when initial size was zero. * Some disks report its size only after first opening. */ if (pp->mediasize == 0) pp->mediasize = dp->d_mediasize; else g_resize_provider(pp, dp->d_mediasize); } else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) { if (dp->d_close != NULL) { error = dp->d_close(dp); if (error != 0) printf("Closed disk %s -> %d\n", pp->name, error); } sc->state = G_STATE_ACTIVE; if (sc->led[0] != 0) led_set(sc->led, "0"); dp->d_flags &= ~DISKFLAG_OPEN; } return (error); } static void g_disk_kerneldump(struct bio *bp, struct disk *dp) { struct g_kerneldump *gkd; struct g_geom *gp; gkd = (struct g_kerneldump*)bp->bio_data; gp = bp->bio_to->geom; g_trace(G_T_TOPOLOGY, "g_disk_kerneldump(%s, %jd, %jd)", gp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length); if (dp->d_dump == NULL) { g_io_deliver(bp, ENODEV); return; } gkd->di.dumper = dp->d_dump; gkd->di.priv = dp; gkd->di.blocksize = dp->d_sectorsize; gkd->di.maxiosize = dp->d_maxsize; gkd->di.mediaoffset = gkd->offset; if ((gkd->offset + gkd->length) > dp->d_mediasize) gkd->length = dp->d_mediasize - gkd->offset; gkd->di.mediasize = gkd->length; g_io_deliver(bp, 0); } static void g_disk_setstate(struct bio *bp, struct g_disk_softc *sc) { const char *cmd; memcpy(&sc->state, bp->bio_data, sizeof(sc->state)); if (sc->led[0] != 0) { switch (sc->state) { case G_STATE_FAILED: cmd = "1"; break; case G_STATE_REBUILD: cmd = "f5"; break; case G_STATE_RESYNC: cmd = "f1"; break; default: cmd = "0"; break; } led_set(sc->led, cmd); } g_io_deliver(bp, 0); } static void g_disk_done(struct bio *bp) { struct bintime now; struct bio *bp2; struct g_disk_softc *sc; /* See "notes" for why we need a mutex here */ /* XXX: will witness accept a mix of Giant/unGiant drivers here ? */ bp2 = bp->bio_parent; sc = bp2->bio_to->private; bp->bio_completed = bp->bio_length - bp->bio_resid; binuptime(&now); mtx_lock(&sc->done_mtx); if (bp2->bio_error == 0) bp2->bio_error = bp->bio_error; bp2->bio_completed += bp->bio_completed; switch (bp->bio_cmd) { case BIO_ZONE: bcopy(&bp->bio_zone, &bp2->bio_zone, sizeof(bp->bio_zone)); /*FALLTHROUGH*/ case BIO_READ: case BIO_WRITE: case BIO_DELETE: case BIO_FLUSH: devstat_end_transaction_bio_bt(sc->dp->d_devstat, bp, &now); break; default: break; } bp2->bio_inbed++; if (bp2->bio_children == bp2->bio_inbed) { mtx_unlock(&sc->done_mtx); bp2->bio_resid = bp2->bio_bcount - bp2->bio_completed; g_io_deliver(bp2, bp2->bio_error); } else mtx_unlock(&sc->done_mtx); g_destroy_bio(bp); } static int g_disk_ioctl(struct g_provider *pp, u_long cmd, void * data, int fflag, struct thread *td) { struct disk *dp; struct g_disk_softc *sc; int error; sc = pp->private; dp = sc->dp; KASSERT(dp != NULL && !dp->d_destroyed, ("g_disk_ioctl(%lx) on destroyed disk %s", cmd, pp->name)); if (dp->d_ioctl == NULL) return (ENOIOCTL); error = dp->d_ioctl(dp, cmd, data, fflag, td); return (error); } static off_t g_disk_maxsize(struct disk *dp, struct bio *bp) { if (bp->bio_cmd == BIO_DELETE) return (dp->d_delmaxsize); return (dp->d_maxsize); } static int g_disk_maxsegs(struct disk *dp, struct bio *bp) { return ((g_disk_maxsize(dp, bp) / PAGE_SIZE) + 1); } static void g_disk_advance(struct disk *dp, struct bio *bp, off_t off) { bp->bio_offset += off; bp->bio_length -= off; if ((bp->bio_flags & BIO_VLIST) != 0) { bus_dma_segment_t *seg, *end; seg = (bus_dma_segment_t *)bp->bio_data; end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n; off += bp->bio_ma_offset; while (off >= seg->ds_len) { KASSERT((seg != end), ("vlist request runs off the end")); off -= seg->ds_len; seg++; } bp->bio_ma_offset = off; bp->bio_ma_n = end - seg; bp->bio_data = (void *)seg; } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma += off / PAGE_SIZE; bp->bio_ma_offset += off; bp->bio_ma_offset %= PAGE_SIZE; bp->bio_ma_n -= off / PAGE_SIZE; } else { bp->bio_data += off; } } static void g_disk_seg_limit(bus_dma_segment_t *seg, off_t *poffset, off_t *plength, int *ppages) { uintptr_t seg_page_base; uintptr_t seg_page_end; off_t offset; off_t length; int seg_pages; offset = *poffset; length = *plength; if (length > seg->ds_len - offset) length = seg->ds_len - offset; seg_page_base = trunc_page(seg->ds_addr + offset); seg_page_end = round_page(seg->ds_addr + offset + length); seg_pages = (seg_page_end - seg_page_base) >> PAGE_SHIFT; if (seg_pages > *ppages) { seg_pages = *ppages; length = (seg_page_base + (seg_pages << PAGE_SHIFT)) - (seg->ds_addr + offset); } *poffset = 0; *plength -= length; *ppages -= seg_pages; } static off_t g_disk_vlist_limit(struct disk *dp, struct bio *bp, bus_dma_segment_t **pendseg) { bus_dma_segment_t *seg, *end; off_t residual; off_t offset; int pages; seg = (bus_dma_segment_t *)bp->bio_data; end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n; residual = bp->bio_length; offset = bp->bio_ma_offset; pages = g_disk_maxsegs(dp, bp); while (residual != 0 && pages != 0) { KASSERT((seg != end), ("vlist limit runs off the end")); g_disk_seg_limit(seg, &offset, &residual, &pages); seg++; } if (pendseg != NULL) *pendseg = seg; return (residual); } static bool g_disk_limit(struct disk *dp, struct bio *bp) { bool limited = false; off_t maxsz; maxsz = g_disk_maxsize(dp, bp); /* * XXX: If we have a stripesize we should really use it here. * Care should be taken in the delete case if this is done * as deletes can be very sensitive to size given how they * are processed. */ if (bp->bio_length > maxsz) { bp->bio_length = maxsz; limited = true; } if ((bp->bio_flags & BIO_VLIST) != 0) { bus_dma_segment_t *firstseg, *endseg; off_t residual; firstseg = (bus_dma_segment_t*)bp->bio_data; residual = g_disk_vlist_limit(dp, bp, &endseg); if (residual != 0) { bp->bio_ma_n = endseg - firstseg; bp->bio_length -= residual; limited = true; } } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma_n = howmany(bp->bio_ma_offset + bp->bio_length, PAGE_SIZE); } return (limited); } static void g_disk_start(struct bio *bp) { struct bio *bp2, *bp3; struct disk *dp; struct g_disk_softc *sc; int error; off_t off; biotrack(bp, __func__); sc = bp->bio_to->private; dp = sc->dp; KASSERT(dp != NULL && !dp->d_destroyed, ("g_disk_start(%p) on destroyed disk %s", bp, bp->bio_to->name)); error = EJUSTRETURN; switch(bp->bio_cmd) { case BIO_DELETE: if (!(dp->d_flags & DISKFLAG_CANDELETE)) { error = EOPNOTSUPP; break; } /* fall-through */ case BIO_READ: case BIO_WRITE: KASSERT((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0 || (bp->bio_flags & BIO_UNMAPPED) == 0, ("unmapped bio not supported by disk %s", dp->d_name)); off = 0; bp3 = NULL; bp2 = g_clone_bio(bp); if (bp2 == NULL) { error = ENOMEM; break; } for (;;) { if (g_disk_limit(dp, bp2)) { off += bp2->bio_length; /* * To avoid a race, we need to grab the next bio * before we schedule this one. See "notes". */ bp3 = g_clone_bio(bp); if (bp3 == NULL) bp->bio_error = ENOMEM; } bp2->bio_done = g_disk_done; bp2->bio_pblkno = bp2->bio_offset / dp->d_sectorsize; bp2->bio_bcount = bp2->bio_length; bp2->bio_disk = dp; - mtx_lock(&sc->start_mtx); devstat_start_transaction_bio(dp->d_devstat, bp2); - mtx_unlock(&sc->start_mtx); dp->d_strategy(bp2); if (bp3 == NULL) break; bp2 = bp3; bp3 = NULL; g_disk_advance(dp, bp2, off); } break; case BIO_GETATTR: /* Give the driver a chance to override */ if (dp->d_getattr != NULL) { if (bp->bio_disk == NULL) bp->bio_disk = dp; error = dp->d_getattr(bp); if (error != -1) break; error = EJUSTRETURN; } if (g_handleattr_int(bp, "GEOM::candelete", (dp->d_flags & DISKFLAG_CANDELETE) != 0)) break; else if (g_handleattr_int(bp, "GEOM::fwsectors", dp->d_fwsectors)) break; else if (g_handleattr_int(bp, "GEOM::fwheads", dp->d_fwheads)) break; else if (g_handleattr_off_t(bp, "GEOM::frontstuff", 0)) break; else if (g_handleattr_str(bp, "GEOM::ident", dp->d_ident)) break; else if (g_handleattr_str(bp, "GEOM::descr", dp->d_descr)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_vendor", dp->d_hba_vendor)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_device", dp->d_hba_device)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_subvendor", dp->d_hba_subvendor)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_subdevice", dp->d_hba_subdevice)) break; else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump")) g_disk_kerneldump(bp, dp); else if (!strcmp(bp->bio_attribute, "GEOM::setstate")) g_disk_setstate(bp, sc); else if (g_handleattr_uint16_t(bp, "GEOM::rotation_rate", dp->d_rotation_rate)) break; else if (g_handleattr_str(bp, "GEOM::attachment", dp->d_attachment)) break; else error = ENOIOCTL; break; case BIO_FLUSH: g_trace(G_T_BIO, "g_disk_flushcache(%s)", bp->bio_to->name); if (!(dp->d_flags & DISKFLAG_CANFLUSHCACHE)) { error = EOPNOTSUPP; break; } /*FALLTHROUGH*/ case BIO_ZONE: if (bp->bio_cmd == BIO_ZONE) { if (!(dp->d_flags & DISKFLAG_CANZONE)) { error = EOPNOTSUPP; break; } g_trace(G_T_BIO, "g_disk_zone(%s)", bp->bio_to->name); } bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } bp2->bio_done = g_disk_done; bp2->bio_disk = dp; - mtx_lock(&sc->start_mtx); devstat_start_transaction_bio(dp->d_devstat, bp2); - mtx_unlock(&sc->start_mtx); dp->d_strategy(bp2); break; default: error = EOPNOTSUPP; break; } if (error != EJUSTRETURN) g_io_deliver(bp, error); return; } static void g_disk_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct bio *bp; struct disk *dp; struct g_disk_softc *sc; char *buf; int res = 0; sc = gp->softc; if (sc == NULL || (dp = sc->dp) == NULL) return; if (indent == NULL) { sbuf_printf(sb, " hd %u", dp->d_fwheads); sbuf_printf(sb, " sc %u", dp->d_fwsectors); return; } if (pp != NULL) { sbuf_printf(sb, "%s%u\n", indent, dp->d_fwheads); sbuf_printf(sb, "%s%u\n", indent, dp->d_fwsectors); /* * "rotationrate" is a little complicated, because the value * returned by the drive might not be the RPM; 0 and 1 are * special cases, and there's also a valid range. */ sbuf_printf(sb, "%s", indent); if (dp->d_rotation_rate == DISK_RR_UNKNOWN) /* Old drives */ sbuf_cat(sb, "unknown"); /* don't report RPM. */ else if (dp->d_rotation_rate == DISK_RR_NON_ROTATING) sbuf_cat(sb, "0"); else if ((dp->d_rotation_rate >= DISK_RR_MIN) && (dp->d_rotation_rate <= DISK_RR_MAX)) sbuf_printf(sb, "%u", dp->d_rotation_rate); else sbuf_cat(sb, "invalid"); sbuf_cat(sb, "\n"); if (dp->d_getattr != NULL) { buf = g_malloc(DISK_IDENT_SIZE, M_WAITOK); bp = g_alloc_bio(); bp->bio_disk = dp; bp->bio_attribute = "GEOM::ident"; bp->bio_length = DISK_IDENT_SIZE; bp->bio_data = buf; res = dp->d_getattr(bp); sbuf_printf(sb, "%s", indent); g_conf_cat_escaped(sb, res == 0 ? buf : dp->d_ident); sbuf_cat(sb, "\n"); bp->bio_attribute = "GEOM::lunid"; bp->bio_length = DISK_IDENT_SIZE; bp->bio_data = buf; if (dp->d_getattr(bp) == 0) { sbuf_printf(sb, "%s", indent); g_conf_cat_escaped(sb, buf); sbuf_cat(sb, "\n"); } bp->bio_attribute = "GEOM::lunname"; bp->bio_length = DISK_IDENT_SIZE; bp->bio_data = buf; if (dp->d_getattr(bp) == 0) { sbuf_printf(sb, "%s", indent); g_conf_cat_escaped(sb, buf); sbuf_cat(sb, "\n"); } g_destroy_bio(bp); g_free(buf); } else { sbuf_printf(sb, "%s", indent); g_conf_cat_escaped(sb, dp->d_ident); sbuf_cat(sb, "\n"); } sbuf_printf(sb, "%s", indent); g_conf_cat_escaped(sb, dp->d_descr); sbuf_cat(sb, "\n"); } } static void g_disk_resize(void *ptr, int flag) { struct disk *dp; struct g_geom *gp; struct g_provider *pp; if (flag == EV_CANCEL) return; g_topology_assert(); dp = ptr; gp = dp->d_geom; if (dp->d_destroyed || gp == NULL) return; LIST_FOREACH(pp, &gp->provider, provider) { if (pp->sectorsize != 0 && pp->sectorsize != dp->d_sectorsize) g_wither_provider(pp, ENXIO); else g_resize_provider(pp, dp->d_mediasize); } } static void g_disk_create(void *arg, int flag) { struct g_geom *gp; struct g_provider *pp; struct disk *dp; struct g_disk_softc *sc; struct disk_alias *dap; char tmpstr[80]; if (flag == EV_CANCEL) return; g_topology_assert(); dp = arg; mtx_pool_lock(mtxpool_sleep, dp); dp->d_init_level = DISK_INIT_START; /* * If the disk has already gone away, we can just stop here and * call the user's callback to tell him we've cleaned things up. */ if (dp->d_goneflag != 0) { mtx_pool_unlock(mtxpool_sleep, dp); if (dp->d_gone != NULL) dp->d_gone(dp); return; } mtx_pool_unlock(mtxpool_sleep, dp); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); - mtx_init(&sc->start_mtx, "g_disk_start", NULL, MTX_DEF); mtx_init(&sc->done_mtx, "g_disk_done", NULL, MTX_DEF); sc->dp = dp; gp = g_new_geomf(&g_disk_class, "%s%d", dp->d_name, dp->d_unit); gp->softc = sc; LIST_FOREACH(dap, &dp->d_aliases, da_next) { snprintf(tmpstr, sizeof(tmpstr), "%s%d", dap->da_alias, dp->d_unit); g_geom_add_alias(gp, tmpstr); } pp = g_new_providerf(gp, "%s", gp->name); devstat_remove_entry(pp->stat); pp->stat = NULL; dp->d_devstat->id = pp; pp->mediasize = dp->d_mediasize; pp->sectorsize = dp->d_sectorsize; pp->stripeoffset = dp->d_stripeoffset; pp->stripesize = dp->d_stripesize; if ((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0) pp->flags |= G_PF_ACCEPT_UNMAPPED; if ((dp->d_flags & DISKFLAG_DIRECT_COMPLETION) != 0) pp->flags |= G_PF_DIRECT_SEND; pp->flags |= G_PF_DIRECT_RECEIVE; if (bootverbose) printf("GEOM: new disk %s\n", gp->name); sysctl_ctx_init(&sc->sysctl_ctx); snprintf(tmpstr, sizeof(tmpstr), "GEOM disk %s", gp->name); sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kern_geom_disk), OID_AUTO, gp->name, CTLFLAG_RD, 0, tmpstr); if (sc->sysctl_tree != NULL) { SYSCTL_ADD_STRING(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree), OID_AUTO, "led", CTLFLAG_RWTUN, sc->led, sizeof(sc->led), "LED name"); SYSCTL_ADD_PROC(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree), OID_AUTO, "flags", CTLTYPE_STRING | CTLFLAG_RD, dp, 0, g_disk_sysctl_flags, "A", "Report disk flags"); } pp->private = sc; dp->d_geom = gp; g_error_provider(pp, 0); mtx_pool_lock(mtxpool_sleep, dp); dp->d_init_level = DISK_INIT_DONE; /* * If the disk has gone away at this stage, start the withering * process for it. */ if (dp->d_goneflag != 0) { mtx_pool_unlock(mtxpool_sleep, dp); g_wither_provider(pp, ENXIO); return; } mtx_pool_unlock(mtxpool_sleep, dp); } /* * We get this callback after all of the consumers have gone away, and just * before the provider is freed. If the disk driver provided a d_gone * callback, let them know that it is okay to free resources -- they won't * be getting any more accesses from GEOM. */ static void g_disk_providergone(struct g_provider *pp) { struct disk *dp; struct g_disk_softc *sc; sc = (struct g_disk_softc *)pp->private; dp = sc->dp; if (dp != NULL && dp->d_gone != NULL) dp->d_gone(dp); if (sc->sysctl_tree != NULL) { sysctl_ctx_free(&sc->sysctl_ctx); sc->sysctl_tree = NULL; } if (sc->led[0] != 0) { led_set(sc->led, "0"); sc->led[0] = 0; } pp->private = NULL; pp->geom->softc = NULL; mtx_destroy(&sc->done_mtx); - mtx_destroy(&sc->start_mtx); g_free(sc); } static void g_disk_destroy(void *ptr, int flag) { struct disk *dp; struct g_geom *gp; struct g_disk_softc *sc; struct disk_alias *dap, *daptmp; g_topology_assert(); dp = ptr; gp = dp->d_geom; if (gp != NULL) { sc = gp->softc; if (sc != NULL) sc->dp = NULL; dp->d_geom = NULL; g_wither_geom(gp, ENXIO); } LIST_FOREACH_SAFE(dap, &dp->d_aliases, da_next, daptmp) g_free(dap); g_free(dp); } /* * We only allow printable characters in disk ident, * the rest is converted to 'x'. */ static void g_disk_ident_adjust(char *ident, size_t size) { char *p, tmp[4], newid[DISK_IDENT_SIZE]; newid[0] = '\0'; for (p = ident; *p != '\0'; p++) { if (isprint(*p)) { tmp[0] = *p; tmp[1] = '\0'; } else { snprintf(tmp, sizeof(tmp), "x%02hhx", *(unsigned char *)p); } if (strlcat(newid, tmp, sizeof(newid)) >= sizeof(newid)) break; } bzero(ident, size); strlcpy(ident, newid, size); } struct disk * disk_alloc(void) { struct disk *dp; dp = g_malloc(sizeof(struct disk), M_WAITOK | M_ZERO); LIST_INIT(&dp->d_aliases); return (dp); } void disk_create(struct disk *dp, int version) { if (version != DISK_VERSION) { printf("WARNING: Attempt to add disk %s%d %s", dp->d_name, dp->d_unit, " using incompatible ABI version of disk(9)\n"); printf("WARNING: Ignoring disk %s%d\n", dp->d_name, dp->d_unit); return; } if (dp->d_flags & DISKFLAG_RESERVED) { printf("WARNING: Attempt to add non-MPSAFE disk %s%d\n", dp->d_name, dp->d_unit); printf("WARNING: Ignoring disk %s%d\n", dp->d_name, dp->d_unit); return; } KASSERT(dp->d_strategy != NULL, ("disk_create need d_strategy")); KASSERT(dp->d_name != NULL, ("disk_create need d_name")); KASSERT(*dp->d_name != 0, ("disk_create need d_name")); KASSERT(strlen(dp->d_name) < SPECNAMELEN - 4, ("disk name too long")); if (dp->d_devstat == NULL) dp->d_devstat = devstat_new_entry(dp->d_name, dp->d_unit, dp->d_sectorsize, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); dp->d_geom = NULL; dp->d_init_level = DISK_INIT_NONE; g_disk_ident_adjust(dp->d_ident, sizeof(dp->d_ident)); g_post_event(g_disk_create, dp, M_WAITOK, dp, NULL); } void disk_destroy(struct disk *dp) { disk_gone(dp); dp->d_destroyed = 1; g_cancel_event(dp); if (dp->d_devstat != NULL) devstat_remove_entry(dp->d_devstat); g_post_event(g_disk_destroy, dp, M_WAITOK, NULL); } void disk_add_alias(struct disk *dp, const char *name) { struct disk_alias *dap; dap = (struct disk_alias *)g_malloc( sizeof(struct disk_alias) + strlen(name) + 1, M_WAITOK); strcpy((char *)(dap + 1), name); dap->da_alias = (const char *)(dap + 1); LIST_INSERT_HEAD(&dp->d_aliases, dap, da_next); } void disk_gone(struct disk *dp) { struct g_geom *gp; struct g_provider *pp; mtx_pool_lock(mtxpool_sleep, dp); /* * Second wither call makes no sense, plus we can not access the list * of providers without topology lock after calling wither once. */ if (dp->d_goneflag != 0) { mtx_pool_unlock(mtxpool_sleep, dp); return; } dp->d_goneflag = 1; /* * If we're still in the process of creating this disk (the * g_disk_create() function is still queued, or is in * progress), the init level will not yet be DISK_INIT_DONE. * * If that is the case, g_disk_create() will see d_goneflag * and take care of cleaning things up. * * If the disk has already been created, we default to * withering the provider as usual below. * * If the caller has not set a d_gone() callback, he will * not be any worse off by returning here, because the geom * has not been fully setup in any case. */ if (dp->d_init_level < DISK_INIT_DONE) { mtx_pool_unlock(mtxpool_sleep, dp); return; } mtx_pool_unlock(mtxpool_sleep, dp); gp = dp->d_geom; pp = LIST_FIRST(&gp->provider); if (pp != NULL) { KASSERT(LIST_NEXT(pp, provider) == NULL, ("geom %p has more than one provider", gp)); g_wither_provider(pp, ENXIO); } } void disk_attr_changed(struct disk *dp, const char *attr, int flag) { struct g_geom *gp; struct g_provider *pp; char devnamebuf[128]; gp = dp->d_geom; if (gp != NULL) LIST_FOREACH(pp, &gp->provider, provider) (void)g_attr_changed(pp, attr, flag); snprintf(devnamebuf, sizeof(devnamebuf), "devname=%s%d", dp->d_name, dp->d_unit); devctl_notify("GEOM", "disk", attr, devnamebuf); } void disk_media_changed(struct disk *dp, int flag) { struct g_geom *gp; struct g_provider *pp; gp = dp->d_geom; if (gp != NULL) { pp = LIST_FIRST(&gp->provider); if (pp != NULL) { KASSERT(LIST_NEXT(pp, provider) == NULL, ("geom %p has more than one provider", gp)); g_media_changed(pp, flag); } } } void disk_media_gone(struct disk *dp, int flag) { struct g_geom *gp; struct g_provider *pp; gp = dp->d_geom; if (gp != NULL) { pp = LIST_FIRST(&gp->provider); if (pp != NULL) { KASSERT(LIST_NEXT(pp, provider) == NULL, ("geom %p has more than one provider", gp)); g_media_gone(pp, flag); } } } int disk_resize(struct disk *dp, int flag) { if (dp->d_destroyed || dp->d_geom == NULL) return (0); return (g_post_event(g_disk_resize, dp, flag, NULL)); } static void g_kern_disks(void *p, int flag __unused) { struct sbuf *sb; struct g_geom *gp; char *sp; sb = p; sp = ""; g_topology_assert(); LIST_FOREACH(gp, &g_disk_class.geom, geom) { sbuf_printf(sb, "%s%s", sp, gp->name); sp = " "; } sbuf_finish(sb); } static int g_disk_sysctl_flags(SYSCTL_HANDLER_ARGS) { struct disk *dp; struct sbuf *sb; int error; sb = sbuf_new_auto(); dp = (struct disk *)arg1; sbuf_printf(sb, "%b", dp->d_flags, "\20" "\2OPEN" "\3CANDELETE" "\4CANFLUSHCACHE" "\5UNMAPPEDBIO" "\6DIRECTCOMPLETION" "\10CANZONE" "\11WRITEPROTECT"); sbuf_finish(sb); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return (error); } static int sysctl_disks(SYSCTL_HANDLER_ARGS) { int error; struct sbuf *sb; sb = sbuf_new_auto(); g_waitfor_event(g_kern_disks, sb, M_WAITOK, NULL); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return error; } SYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_disks, "A", "names of available disks"); Index: head/sys/geom/geom_io.c =================================================================== --- head/sys/geom/geom_io.c (revision 356199) +++ head/sys/geom/geom_io.c (revision 356200) @@ -1,1088 +1,1079 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int g_io_transient_map_bio(struct bio *bp); static struct g_bioq g_bio_run_down; static struct g_bioq g_bio_run_up; /* * Pace is a hint that we've had some trouble recently allocating * bios, so we should back off trying to send I/O down the stack * a bit to let the problem resolve. When pacing, we also turn * off direct dispatch to also reduce memory pressure from I/Os * there, at the expxense of some added latency while the memory * pressures exist. See g_io_schedule_down() for more details * and limitations. */ static volatile u_int __read_mostly pace; static uma_zone_t __read_mostly biozone; #include static void g_bioq_lock(struct g_bioq *bq) { mtx_lock(&bq->bio_queue_lock); } static void g_bioq_unlock(struct g_bioq *bq) { mtx_unlock(&bq->bio_queue_lock); } #if 0 static void g_bioq_destroy(struct g_bioq *bq) { mtx_destroy(&bq->bio_queue_lock); } #endif static void g_bioq_init(struct g_bioq *bq) { TAILQ_INIT(&bq->bio_queue); mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF); } static struct bio * g_bioq_first(struct g_bioq *bq) { struct bio *bp; bp = TAILQ_FIRST(&bq->bio_queue); if (bp != NULL) { KASSERT((bp->bio_flags & BIO_ONQUEUE), ("Bio not on queue bp=%p target %p", bp, bq)); bp->bio_flags &= ~BIO_ONQUEUE; TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue); bq->bio_queue_length--; } return (bp); } struct bio * g_new_bio(void) { struct bio *bp; bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO); #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_new_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif return (bp); } struct bio * g_alloc_bio(void) { struct bio *bp; bp = uma_zalloc(biozone, M_WAITOK | M_ZERO); #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif return (bp); } void g_destroy_bio(struct bio *bp) { #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif uma_zfree(biozone, bp); } struct bio * g_clone_bio(struct bio *bp) { struct bio *bp2; bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO); if (bp2 != NULL) { bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; /* * BIO_ORDERED flag may be used by disk drivers to enforce * ordering restrictions, so this flag needs to be cloned. * BIO_UNMAPPED and BIO_VLIST should be inherited, to properly * indicate which way the buffer is passed. * Other bio flags are not suitable for cloning. */ bp2->bio_flags = bp->bio_flags & (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST); bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; bp2->bio_ma = bp->bio_ma; bp2->bio_ma_n = bp->bio_ma_n; bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; if (bp->bio_cmd == BIO_ZONE) bcopy(&bp->bio_zone, &bp2->bio_zone, sizeof(bp->bio_zone)); #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) bp2->bio_track_bp = bp->bio_track_bp; #endif bp->bio_children++; } #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif return(bp2); } struct bio * g_duplicate_bio(struct bio *bp) { struct bio *bp2; bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO); bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST); bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; bp2->bio_ma = bp->bio_ma; bp2->bio_ma_n = bp->bio_ma_n; bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; bp->bio_children++; #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, bp2); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif return(bp2); } void g_reset_bio(struct bio *bp) { bzero(bp, sizeof(*bp)); } void g_io_init() { g_bioq_init(&g_bio_run_down); g_bioq_init(&g_bio_run_up); biozone = uma_zcreate("g_bio", sizeof (struct bio), NULL, NULL, NULL, NULL, 0, 0); } int g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_getattr(%s)", attr); bp = g_alloc_bio(); bp->bio_cmd = BIO_GETATTR; bp->bio_done = NULL; bp->bio_attribute = attr; bp->bio_length = *len; bp->bio_data = ptr; g_io_request(bp, cp); error = biowait(bp, "ggetattr"); *len = bp->bio_completed; g_destroy_bio(bp); return (error); } int g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_zone(%d)", zone_args->zone_cmd); bp = g_alloc_bio(); bp->bio_cmd = BIO_ZONE; bp->bio_done = NULL; /* * XXX KDM need to handle report zone data. */ bcopy(zone_args, &bp->bio_zone, sizeof(*zone_args)); if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES) bp->bio_length = zone_args->zone_params.report.entries_allocated * sizeof(struct disk_zone_rep_entry); else bp->bio_length = 0; g_io_request(bp, cp); error = biowait(bp, "gzone"); bcopy(&bp->bio_zone, zone_args, sizeof(*zone_args)); g_destroy_bio(bp); return (error); } /* * Send a BIO_SPEEDUP down the stack. This is used to tell the lower layers that * the upper layers have detected a resource shortage. The lower layers are * advised to stop delaying I/O that they might be holding for performance * reasons and to schedule it (non-trims) or complete it successfully (trims) as * quickly as it can. bio_length is the amount of the shortage. This call * should be non-blocking. bio_resid is used to communicate back if the lower * layers couldn't find bio_length worth of I/O to schedule or discard. A length * of 0 means to do as much as you can (schedule the h/w queues full, discard * all trims). flags are a hint from the upper layers to the lower layers what * operation should be done. */ int g_io_speedup(size_t shortage, u_int flags, size_t *resid, struct g_consumer *cp) { struct bio *bp; int error; KASSERT((flags & (BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE)) != 0, ("Invalid flags passed to g_io_speedup: %#x", flags)); g_trace(G_T_BIO, "bio_speedup(%s, %zu, %#x)", cp->provider->name, shortage, flags); bp = g_new_bio(); if (bp == NULL) return (ENOMEM); bp->bio_cmd = BIO_SPEEDUP; bp->bio_length = shortage; bp->bio_done = NULL; bp->bio_flags |= flags; g_io_request(bp, cp); error = biowait(bp, "gflush"); *resid = bp->bio_resid; g_destroy_bio(bp); return (error); } int g_io_flush(struct g_consumer *cp) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name); bp = g_alloc_bio(); bp->bio_cmd = BIO_FLUSH; bp->bio_flags |= BIO_ORDERED; bp->bio_done = NULL; bp->bio_attribute = NULL; bp->bio_offset = cp->provider->mediasize; bp->bio_length = 0; bp->bio_data = NULL; g_io_request(bp, cp); error = biowait(bp, "gflush"); g_destroy_bio(bp); return (error); } static int g_io_check(struct bio *bp) { struct g_consumer *cp; struct g_provider *pp; off_t excess; int error; biotrack(bp, __func__); cp = bp->bio_from; pp = bp->bio_to; /* Fail if access counters dont allow the operation */ switch(bp->bio_cmd) { case BIO_READ: case BIO_GETATTR: if (cp->acr == 0) return (EPERM); break; case BIO_WRITE: case BIO_DELETE: case BIO_FLUSH: if (cp->acw == 0) return (EPERM); break; case BIO_ZONE: if ((bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES) || (bp->bio_zone.zone_cmd == DISK_ZONE_GET_PARAMS)) { if (cp->acr == 0) return (EPERM); } else if (cp->acw == 0) return (EPERM); break; default: return (EPERM); } /* if provider is marked for error, don't disturb. */ if (pp->error) return (pp->error); if (cp->flags & G_CF_ORPHAN) return (ENXIO); switch(bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: /* Zero sectorsize or mediasize is probably a lack of media. */ if (pp->sectorsize == 0 || pp->mediasize == 0) return (ENXIO); /* Reject I/O not on sector boundary */ if (bp->bio_offset % pp->sectorsize) return (EINVAL); /* Reject I/O not integral sector long */ if (bp->bio_length % pp->sectorsize) return (EINVAL); /* Reject requests before or past the end of media. */ if (bp->bio_offset < 0) return (EIO); if (bp->bio_offset > pp->mediasize) return (EIO); /* Truncate requests to the end of providers media. */ excess = bp->bio_offset + bp->bio_length; if (excess > bp->bio_to->mediasize) { KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 || round_page(bp->bio_ma_offset + bp->bio_length) / PAGE_SIZE == bp->bio_ma_n, ("excess bio %p too short", bp)); excess -= bp->bio_to->mediasize; bp->bio_length -= excess; if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma_n = round_page(bp->bio_ma_offset + bp->bio_length) / PAGE_SIZE; } if (excess > 0) CTR3(KTR_GEOM, "g_down truncated bio " "%p provider %s by %d", bp, bp->bio_to->name, excess); } /* Deliver zero length transfers right here. */ if (bp->bio_length == 0) { CTR2(KTR_GEOM, "g_down terminated 0-length " "bp %p provider %s", bp, bp->bio_to->name); return (0); } if ((bp->bio_flags & BIO_UNMAPPED) != 0 && (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 && (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) { if ((error = g_io_transient_map_bio(bp)) >= 0) return (error); } break; default: break; } return (EJUSTRETURN); } void g_io_request(struct bio *bp, struct g_consumer *cp) { struct g_provider *pp; - struct mtx *mtxp; int direct, error, first; uint8_t cmd; biotrack(bp, __func__); KASSERT(cp != NULL, ("NULL cp in g_io_request")); KASSERT(bp != NULL, ("NULL bp in g_io_request")); pp = cp->provider; KASSERT(pp != NULL, ("consumer not attached in g_io_request")); #ifdef DIAGNOSTIC KASSERT(bp->bio_driver1 == NULL, ("bio_driver1 used by the consumer (geom %s)", cp->geom->name)); KASSERT(bp->bio_driver2 == NULL, ("bio_driver2 used by the consumer (geom %s)", cp->geom->name)); KASSERT(bp->bio_pflags == 0, ("bio_pflags used by the consumer (geom %s)", cp->geom->name)); /* * Remember consumer's private fields, so we can detect if they were * modified by the provider. */ bp->_bio_caller1 = bp->bio_caller1; bp->_bio_caller2 = bp->bio_caller2; bp->_bio_cflags = bp->bio_cflags; #endif cmd = bp->bio_cmd; if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_GETATTR) { KASSERT(bp->bio_data != NULL, ("NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd)); } if (cmd == BIO_DELETE || cmd == BIO_FLUSH) { KASSERT(bp->bio_data == NULL, ("non-NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd)); } if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_DELETE) { KASSERT(bp->bio_offset % cp->provider->sectorsize == 0, ("wrong offset %jd for sectorsize %u", bp->bio_offset, cp->provider->sectorsize)); KASSERT(bp->bio_length % cp->provider->sectorsize == 0, ("wrong length %jd for sectorsize %u", bp->bio_length, cp->provider->sectorsize)); } g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d", bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd); bp->bio_from = cp; bp->bio_to = pp; bp->bio_error = 0; bp->bio_completed = 0; KASSERT(!(bp->bio_flags & BIO_ONQUEUE), ("Bio already on queue bp=%p", bp)); + if ((g_collectstats & G_STATS_CONSUMERS) != 0 || ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL)) binuptime(&bp->bio_t0); else getbinuptime(&bp->bio_t0); + if (g_collectstats & G_STATS_CONSUMERS) + devstat_start_transaction(cp->stat, &bp->bio_t0); + if (g_collectstats & G_STATS_PROVIDERS) + devstat_start_transaction(pp->stat, &bp->bio_t0); +#ifdef INVARIANTS + atomic_add_int(&cp->nstart, 1); +#endif #ifdef GET_STACK_USAGE direct = (cp->flags & G_CF_DIRECT_SEND) != 0 && (pp->flags & G_PF_DIRECT_RECEIVE) != 0 && !g_is_geom_thread(curthread) && ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 || (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) && pace == 0; if (direct) { /* Block direct execution if less then half of stack left. */ size_t st, su; GET_STACK_USAGE(st, su); if (su * 2 > st) direct = 0; } #else direct = 0; #endif - - /* - * The statistics collection is lockless, as such, but we - * can not update one instance of the statistics from more - * than one thread at a time, so grab the lock first. - */ - mtxp = mtx_pool_find(mtxpool_sleep, pp); - mtx_lock(mtxp); - if (g_collectstats & G_STATS_PROVIDERS) - devstat_start_transaction(pp->stat, &bp->bio_t0); - if (g_collectstats & G_STATS_CONSUMERS) - devstat_start_transaction(cp->stat, &bp->bio_t0); -#ifdef INVARIANTS - cp->nstart++; -#endif - mtx_unlock(mtxp); if (direct) { error = g_io_check(bp); if (error >= 0) { CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p " "provider %s returned %d", bp, bp->bio_to->name, error); g_io_deliver(bp, error); return; } bp->bio_to->geom->start(bp); } else { g_bioq_lock(&g_bio_run_down); first = TAILQ_EMPTY(&g_bio_run_down.bio_queue); TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue); bp->bio_flags |= BIO_ONQUEUE; g_bio_run_down.bio_queue_length++; g_bioq_unlock(&g_bio_run_down); /* Pass it on down. */ if (first) wakeup(&g_wait_down); } } void g_io_deliver(struct bio *bp, int error) { struct bintime now; struct g_consumer *cp; struct g_provider *pp; struct mtx *mtxp; int direct, first; biotrack(bp, __func__); KASSERT(bp != NULL, ("NULL bp in g_io_deliver")); pp = bp->bio_to; KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver")); cp = bp->bio_from; if (cp == NULL) { bp->bio_error = error; bp->bio_done(bp); return; } KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver")); KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver")); #ifdef DIAGNOSTIC /* * Some classes - GJournal in particular - can modify bio's * private fields while the bio is in transit; G_GEOM_VOLATILE_BIO * flag means it's an expected behaviour for that particular geom. */ if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) { KASSERT(bp->bio_caller1 == bp->_bio_caller1, ("bio_caller1 used by the provider %s", pp->name)); KASSERT(bp->bio_caller2 == bp->_bio_caller2, ("bio_caller2 used by the provider %s", pp->name)); KASSERT(bp->bio_cflags == bp->_bio_cflags, ("bio_cflags used by the provider %s", pp->name)); } #endif KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0")); KASSERT(bp->bio_completed <= bp->bio_length, ("bio_completed can't be greater than bio_length")); g_trace(G_T_BIO, "g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd", bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error, (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length); KASSERT(!(bp->bio_flags & BIO_ONQUEUE), ("Bio already on queue bp=%p", bp)); /* * XXX: next two doesn't belong here */ bp->bio_bcount = bp->bio_length; bp->bio_resid = bp->bio_bcount - bp->bio_completed; #ifdef GET_STACK_USAGE direct = (pp->flags & G_PF_DIRECT_SEND) && (cp->flags & G_CF_DIRECT_RECEIVE) && !g_is_geom_thread(curthread); if (direct) { /* Block direct execution if less then half of stack left. */ size_t st, su; GET_STACK_USAGE(st, su); if (su * 2 > st) direct = 0; } #else direct = 0; #endif /* * The statistics collection is lockless, as such, but we * can not update one instance of the statistics from more * than one thread at a time, so grab the lock first. */ if ((g_collectstats & G_STATS_CONSUMERS) != 0 || ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL)) binuptime(&now); mtxp = mtx_pool_find(mtxpool_sleep, cp); mtx_lock(mtxp); if (g_collectstats & G_STATS_PROVIDERS) devstat_end_transaction_bio_bt(pp->stat, bp, &now); if (g_collectstats & G_STATS_CONSUMERS) devstat_end_transaction_bio_bt(cp->stat, bp, &now); #ifdef INVARIANTS cp->nend++; #endif mtx_unlock(mtxp); if (error != ENOMEM) { bp->bio_error = error; if (direct) { biodone(bp); } else { g_bioq_lock(&g_bio_run_up); first = TAILQ_EMPTY(&g_bio_run_up.bio_queue); TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue); bp->bio_flags |= BIO_ONQUEUE; g_bio_run_up.bio_queue_length++; g_bioq_unlock(&g_bio_run_up); if (first) wakeup(&g_wait_up); } return; } if (bootverbose) printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name); bp->bio_children = 0; bp->bio_inbed = 0; bp->bio_driver1 = NULL; bp->bio_driver2 = NULL; bp->bio_pflags = 0; g_io_request(bp, cp); pace = 1; return; } SYSCTL_DECL(_kern_geom); static long transient_maps; SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD, &transient_maps, 0, "Total count of the transient mapping requests"); u_int transient_map_retries = 10; SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW, &transient_map_retries, 0, "Max count of retries used before giving up on creating transient map"); int transient_map_hard_failures; SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD, &transient_map_hard_failures, 0, "Failures to establish the transient mapping due to retry attempts " "exhausted"); int transient_map_soft_failures; SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD, &transient_map_soft_failures, 0, "Count of retried failures to establish the transient mapping"); int inflight_transient_maps; SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD, &inflight_transient_maps, 0, "Current count of the active transient maps"); static int g_io_transient_map_bio(struct bio *bp) { vm_offset_t addr; long size; u_int retried; KASSERT(unmapped_buf_allowed, ("unmapped disabled")); size = round_page(bp->bio_ma_offset + bp->bio_length); KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp)); addr = 0; retried = 0; atomic_add_long(&transient_maps, 1); retry: if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &addr)) { if (transient_map_retries != 0 && retried >= transient_map_retries) { CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s", bp, bp->bio_to->name); atomic_add_int(&transient_map_hard_failures, 1); return (EDEADLK/* XXXKIB */); } else { /* * Naive attempt to quisce the I/O to get more * in-flight requests completed and defragment * the transient_arena. */ CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d", bp, bp->bio_to->name, retried); pause("g_d_tra", hz / 10); retried++; atomic_add_int(&transient_map_soft_failures, 1); goto retry; } } atomic_add_int(&inflight_transient_maps, 1); pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size)); bp->bio_data = (caddr_t)addr + bp->bio_ma_offset; bp->bio_flags |= BIO_TRANSIENT_MAPPING; bp->bio_flags &= ~BIO_UNMAPPED; return (EJUSTRETURN); } void g_io_schedule_down(struct thread *tp __unused) { struct bio *bp; int error; for(;;) { g_bioq_lock(&g_bio_run_down); bp = g_bioq_first(&g_bio_run_down); if (bp == NULL) { CTR0(KTR_GEOM, "g_down going to sleep"); msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock, PRIBIO | PDROP, "-", 0); continue; } CTR0(KTR_GEOM, "g_down has work to do"); g_bioq_unlock(&g_bio_run_down); biotrack(bp, __func__); if (pace != 0) { /* * There has been at least one memory allocation * failure since the last I/O completed. Pause 1ms to * give the system a chance to free up memory. We only * do this once because a large number of allocations * can fail in the direct dispatch case and there's no * relationship between the number of these failures and * the length of the outage. If there's still an outage, * we'll pause again and again until it's * resolved. Older versions paused longer and once per * allocation failure. This was OK for a single threaded * g_down, but with direct dispatch would lead to max of * 10 IOPs for minutes at a time when transient memory * issues prevented allocation for a batch of requests * from the upper layers. * * XXX This pacing is really lame. It needs to be solved * by other methods. This is OK only because the worst * case scenario is so rare. In the worst case scenario * all memory is tied up waiting for I/O to complete * which can never happen since we can't allocate bios * for that I/O. */ CTR0(KTR_GEOM, "g_down pacing self"); pause("g_down", min(hz/1000, 1)); pace = 0; } CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp, bp->bio_to->name); error = g_io_check(bp); if (error >= 0) { CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider " "%s returned %d", bp, bp->bio_to->name, error); g_io_deliver(bp, error); continue; } THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld " "len %ld", bp, bp->bio_to->name, bp->bio_offset, bp->bio_length); bp->bio_to->geom->start(bp); THREAD_SLEEPING_OK(); } } void g_io_schedule_up(struct thread *tp __unused) { struct bio *bp; for(;;) { g_bioq_lock(&g_bio_run_up); bp = g_bioq_first(&g_bio_run_up); if (bp == NULL) { CTR0(KTR_GEOM, "g_up going to sleep"); msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock, PRIBIO | PDROP, "-", 0); continue; } g_bioq_unlock(&g_bio_run_up); THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off " "%jd len %ld", bp, bp->bio_to->name, bp->bio_offset, bp->bio_length); biodone(bp); THREAD_SLEEPING_OK(); } } void * g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error) { struct bio *bp; void *ptr; int errorc; KASSERT(length > 0 && length >= cp->provider->sectorsize && length <= MAXPHYS, ("g_read_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_READ; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; ptr = g_malloc(length, M_WAITOK); bp->bio_data = ptr; g_io_request(bp, cp); errorc = biowait(bp, "gread"); if (error != NULL) *error = errorc; g_destroy_bio(bp); if (errorc) { g_free(ptr); ptr = NULL; } return (ptr); } /* * A read function for use by ffs_sbget when used by GEOM-layer routines. */ int g_use_g_read_data(void *devfd, off_t loc, void **bufp, int size) { struct g_consumer *cp; KASSERT(*bufp == NULL, ("g_use_g_read_data: non-NULL *bufp %p\n", *bufp)); cp = (struct g_consumer *)devfd; /* * Take care not to issue an invalid I/O request. The offset of * the superblock candidate must be multiples of the provider's * sector size, otherwise an FFS can't exist on the provider * anyway. */ if (loc % cp->provider->sectorsize != 0) return (ENOENT); *bufp = g_read_data(cp, loc, size, NULL); if (*bufp == NULL) return (ENOENT); return (0); } int g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length) { struct bio *bp; int error; KASSERT(length > 0 && length >= cp->provider->sectorsize && length <= MAXPHYS, ("g_write_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_WRITE; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; bp->bio_data = ptr; g_io_request(bp, cp); error = biowait(bp, "gwrite"); g_destroy_bio(bp); return (error); } /* * A write function for use by ffs_sbput when used by GEOM-layer routines. */ int g_use_g_write_data(void *devfd, off_t loc, void *buf, int size) { return (g_write_data((struct g_consumer *)devfd, loc, buf, size)); } int g_delete_data(struct g_consumer *cp, off_t offset, off_t length) { struct bio *bp; int error; KASSERT(length > 0 && length >= cp->provider->sectorsize, ("g_delete_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_DELETE; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; bp->bio_data = NULL; g_io_request(bp, cp); error = biowait(bp, "gdelete"); g_destroy_bio(bp); return (error); } void g_print_bio(const char *prefix, const struct bio *bp, const char *fmtsuffix, ...) { #ifndef PRINTF_BUFR_SIZE #define PRINTF_BUFR_SIZE 64 #endif char bufr[PRINTF_BUFR_SIZE]; struct sbuf sb, *sbp __unused; va_list ap; sbp = sbuf_new(&sb, bufr, sizeof(bufr), SBUF_FIXEDLEN); KASSERT(sbp != NULL, ("sbuf_new misused?")); sbuf_set_drain(&sb, sbuf_printf_drain, NULL); sbuf_cat(&sb, prefix); g_format_bio(&sb, bp); va_start(ap, fmtsuffix); sbuf_vprintf(&sb, fmtsuffix, ap); va_end(ap); sbuf_nl_terminate(&sb); sbuf_finish(&sb); sbuf_delete(&sb); } void g_format_bio(struct sbuf *sb, const struct bio *bp) { const char *pname, *cmd = NULL; if (bp->bio_to != NULL) pname = bp->bio_to->name; else pname = "[unknown]"; switch (bp->bio_cmd) { case BIO_GETATTR: cmd = "GETATTR"; sbuf_printf(sb, "%s[%s(attr=%s)]", pname, cmd, bp->bio_attribute); return; case BIO_FLUSH: cmd = "FLUSH"; sbuf_printf(sb, "%s[%s]", pname, cmd); return; case BIO_ZONE: { char *subcmd = NULL; cmd = "ZONE"; switch (bp->bio_zone.zone_cmd) { case DISK_ZONE_OPEN: subcmd = "OPEN"; break; case DISK_ZONE_CLOSE: subcmd = "CLOSE"; break; case DISK_ZONE_FINISH: subcmd = "FINISH"; break; case DISK_ZONE_RWP: subcmd = "RWP"; break; case DISK_ZONE_REPORT_ZONES: subcmd = "REPORT ZONES"; break; case DISK_ZONE_GET_PARAMS: subcmd = "GET PARAMS"; break; default: subcmd = "UNKNOWN"; break; } sbuf_printf(sb, "%s[%s,%s]", pname, cmd, subcmd); return; } case BIO_READ: cmd = "READ"; break; case BIO_WRITE: cmd = "WRITE"; break; case BIO_DELETE: cmd = "DELETE"; break; default: cmd = "UNKNOWN"; sbuf_printf(sb, "%s[%s()]", pname, cmd); return; } sbuf_printf(sb, "%s[%s(offset=%jd, length=%jd)]", pname, cmd, (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length); } Index: head/sys/kern/subr_devstat.c =================================================================== --- head/sys/kern/subr_devstat.c (revision 356199) +++ head/sys/kern/subr_devstat.c (revision 356200) @@ -1,587 +1,582 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include SDT_PROVIDER_DEFINE(io); SDT_PROBE_DEFINE2(io, , , start, "struct bio *", "struct devstat *"); SDT_PROBE_DEFINE2(io, , , done, "struct bio *", "struct devstat *"); SDT_PROBE_DEFINE2(io, , , wait__start, "struct bio *", "struct devstat *"); SDT_PROBE_DEFINE2(io, , , wait__done, "struct bio *", "struct devstat *"); #define DTRACE_DEVSTAT_START() SDT_PROBE2(io, , , start, NULL, ds) #define DTRACE_DEVSTAT_BIO_START() SDT_PROBE2(io, , , start, bp, ds) #define DTRACE_DEVSTAT_DONE() SDT_PROBE2(io, , , done, NULL, ds) #define DTRACE_DEVSTAT_BIO_DONE() SDT_PROBE2(io, , , done, bp, ds) #define DTRACE_DEVSTAT_WAIT_START() SDT_PROBE2(io, , , wait__start, NULL, ds) #define DTRACE_DEVSTAT_WAIT_DONE() SDT_PROBE2(io, , , wait__done, NULL, ds) static int devstat_num_devs; static long devstat_generation = 1; static int devstat_version = DEVSTAT_VERSION; static int devstat_current_devnumber; static struct mtx devstat_mutex; MTX_SYSINIT(devstat_mutex, &devstat_mutex, "devstat", MTX_DEF); static struct devstatlist device_statq = STAILQ_HEAD_INITIALIZER(device_statq); static struct devstat *devstat_alloc(void); static void devstat_free(struct devstat *); static void devstat_add_entry(struct devstat *ds, const void *dev_name, int unit_number, uint32_t block_size, devstat_support_flags flags, devstat_type_flags device_type, devstat_priority priority); /* * Allocate a devstat and initialize it */ struct devstat * devstat_new_entry(const void *dev_name, int unit_number, uint32_t block_size, devstat_support_flags flags, devstat_type_flags device_type, devstat_priority priority) { struct devstat *ds; mtx_assert(&devstat_mutex, MA_NOTOWNED); ds = devstat_alloc(); mtx_lock(&devstat_mutex); if (unit_number == -1) { ds->unit_number = unit_number; ds->id = dev_name; binuptime(&ds->creation_time); devstat_generation++; } else { devstat_add_entry(ds, dev_name, unit_number, block_size, flags, device_type, priority); } mtx_unlock(&devstat_mutex); return (ds); } /* * Take a malloced and zeroed devstat structure given to us, fill it in * and add it to the queue of devices. */ static void devstat_add_entry(struct devstat *ds, const void *dev_name, int unit_number, uint32_t block_size, devstat_support_flags flags, devstat_type_flags device_type, devstat_priority priority) { struct devstatlist *devstat_head; struct devstat *ds_tmp; mtx_assert(&devstat_mutex, MA_OWNED); devstat_num_devs++; devstat_head = &device_statq; /* * Priority sort. Each driver passes in its priority when it adds * its devstat entry. Drivers are sorted first by priority, and * then by probe order. * * For the first device, we just insert it, since the priority * doesn't really matter yet. Subsequent devices are inserted into * the list using the order outlined above. */ if (devstat_num_devs == 1) STAILQ_INSERT_TAIL(devstat_head, ds, dev_links); else { STAILQ_FOREACH(ds_tmp, devstat_head, dev_links) { struct devstat *ds_next; ds_next = STAILQ_NEXT(ds_tmp, dev_links); /* * If we find a break between higher and lower * priority items, and if this item fits in the * break, insert it. This also applies if the * "lower priority item" is the end of the list. */ if ((priority <= ds_tmp->priority) && ((ds_next == NULL) || (priority > ds_next->priority))) { STAILQ_INSERT_AFTER(devstat_head, ds_tmp, ds, dev_links); break; } else if (priority > ds_tmp->priority) { /* * If this is the case, we should be able * to insert ourselves at the head of the * list. If we can't, something is wrong. */ if (ds_tmp == STAILQ_FIRST(devstat_head)) { STAILQ_INSERT_HEAD(devstat_head, ds, dev_links); break; } else { STAILQ_INSERT_TAIL(devstat_head, ds, dev_links); printf("devstat_add_entry: HELP! " "sorting problem detected " "for name %p unit %d\n", dev_name, unit_number); break; } } } } ds->device_number = devstat_current_devnumber++; ds->unit_number = unit_number; strlcpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN); ds->block_size = block_size; ds->flags = flags; ds->device_type = device_type; ds->priority = priority; binuptime(&ds->creation_time); devstat_generation++; } /* * Remove a devstat structure from the list of devices. */ void devstat_remove_entry(struct devstat *ds) { struct devstatlist *devstat_head; mtx_assert(&devstat_mutex, MA_NOTOWNED); if (ds == NULL) return; mtx_lock(&devstat_mutex); devstat_head = &device_statq; /* Remove this entry from the devstat queue */ atomic_add_acq_int(&ds->sequence1, 1); if (ds->unit_number != -1) { devstat_num_devs--; STAILQ_REMOVE(devstat_head, ds, devstat, dev_links); } devstat_free(ds); devstat_generation++; mtx_unlock(&devstat_mutex); } /* * Record a transaction start. * * See comments for devstat_end_transaction(). Ordering is very important * here. */ void devstat_start_transaction(struct devstat *ds, const struct bintime *now) { - mtx_assert(&devstat_mutex, MA_NOTOWNED); - /* sanity check */ if (ds == NULL) return; atomic_add_acq_int(&ds->sequence1, 1); /* * We only want to set the start time when we are going from idle * to busy. The start time is really the start of the latest busy * period. */ - if (ds->start_count == ds->end_count) { + if (atomic_fetchadd_int(&ds->start_count, 1) == ds->end_count) { if (now != NULL) ds->busy_from = *now; else binuptime(&ds->busy_from); } - ds->start_count++; atomic_add_rel_int(&ds->sequence0, 1); DTRACE_DEVSTAT_START(); } void devstat_start_transaction_bio(struct devstat *ds, struct bio *bp) { - - mtx_assert(&devstat_mutex, MA_NOTOWNED); /* sanity check */ if (ds == NULL) return; binuptime(&bp->bio_t0); devstat_start_transaction(ds, &bp->bio_t0); DTRACE_DEVSTAT_BIO_START(); } /* * Record the ending of a transaction, and incrment the various counters. * * Ordering in this function, and in devstat_start_transaction() is VERY * important. The idea here is to run without locks, so we are very * careful to only modify some fields on the way "down" (i.e. at * transaction start) and some fields on the way "up" (i.e. at transaction * completion). One exception is busy_from, which we only modify in * devstat_start_transaction() when there are no outstanding transactions, * and thus it can't be modified in devstat_end_transaction() * simultaneously. * * The sequence0 and sequence1 fields are provided to enable an application * spying on the structures with mmap(2) to tell when a structure is in a * consistent state or not. * * For this to work 100% reliably, it is important that the two fields * are at opposite ends of the structure and that they are incremented * in the opposite order of how a memcpy(3) in userland would copy them. * We assume that the copying happens front to back, but there is actually * no way short of writing your own memcpy(3) replacement to guarantee * this will be the case. * * In addition to this, being a kind of locks, they must be updated with * atomic instructions using appropriate memory barriers. */ void devstat_end_transaction(struct devstat *ds, uint32_t bytes, devstat_tag_type tag_type, devstat_trans_flags flags, const struct bintime *now, const struct bintime *then) { struct bintime dt, lnow; /* sanity check */ if (ds == NULL) return; if (now == NULL) { binuptime(&lnow); now = &lnow; } atomic_add_acq_int(&ds->sequence1, 1); /* Update byte and operations counts */ ds->bytes[flags] += bytes; ds->operations[flags]++; /* * Keep a count of the various tag types sent. */ if ((ds->flags & DEVSTAT_NO_ORDERED_TAGS) == 0 && tag_type != DEVSTAT_TAG_NONE) ds->tag_types[tag_type]++; if (then != NULL) { /* Update duration of operations */ dt = *now; bintime_sub(&dt, then); bintime_add(&ds->duration[flags], &dt); } /* Accumulate busy time */ dt = *now; bintime_sub(&dt, &ds->busy_from); bintime_add(&ds->busy_time, &dt); ds->busy_from = *now; ds->end_count++; atomic_add_rel_int(&ds->sequence0, 1); DTRACE_DEVSTAT_DONE(); } void devstat_end_transaction_bio(struct devstat *ds, const struct bio *bp) { devstat_end_transaction_bio_bt(ds, bp, NULL); } void devstat_end_transaction_bio_bt(struct devstat *ds, const struct bio *bp, const struct bintime *now) { devstat_trans_flags flg; devstat_tag_type tag; /* sanity check */ if (ds == NULL) return; if (bp->bio_flags & BIO_ORDERED) tag = DEVSTAT_TAG_ORDERED; else tag = DEVSTAT_TAG_SIMPLE; if (bp->bio_cmd == BIO_DELETE) flg = DEVSTAT_FREE; else if ((bp->bio_cmd == BIO_READ) || ((bp->bio_cmd == BIO_ZONE) && (bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES))) flg = DEVSTAT_READ; else if (bp->bio_cmd == BIO_WRITE) flg = DEVSTAT_WRITE; else flg = DEVSTAT_NO_DATA; devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid, tag, flg, now, &bp->bio_t0); DTRACE_DEVSTAT_BIO_DONE(); } /* * This is the sysctl handler for the devstat package. The data pushed out * on the kern.devstat.all sysctl variable consists of the current devstat * generation number, and then an array of devstat structures, one for each * device in the system. * * This is more cryptic that obvious, but basically we neither can nor * want to hold the devstat_mutex for any amount of time, so we grab it * only when we need to and keep an eye on devstat_generation all the time. */ static int sysctl_devstat(SYSCTL_HANDLER_ARGS) { int error; long mygen; struct devstat *nds; mtx_assert(&devstat_mutex, MA_NOTOWNED); /* * XXX devstat_generation should really be "volatile" but that * XXX freaks out the sysctl macro below. The places where we * XXX change it and inspect it are bracketed in the mutex which * XXX guarantees us proper write barriers. I don't believe the * XXX compiler is allowed to optimize mygen away across calls * XXX to other functions, so the following is belived to be safe. */ mygen = devstat_generation; error = SYSCTL_OUT(req, &mygen, sizeof(mygen)); if (devstat_num_devs == 0) return(0); if (error != 0) return (error); mtx_lock(&devstat_mutex); nds = STAILQ_FIRST(&device_statq); if (mygen != devstat_generation) error = EBUSY; mtx_unlock(&devstat_mutex); if (error != 0) return (error); for (;nds != NULL;) { error = SYSCTL_OUT(req, nds, sizeof(struct devstat)); if (error != 0) return (error); mtx_lock(&devstat_mutex); if (mygen != devstat_generation) error = EBUSY; else nds = STAILQ_NEXT(nds, dev_links); mtx_unlock(&devstat_mutex); if (error != 0) return (error); } return(error); } /* * Sysctl entries for devstat. The first one is a node that all the rest * hang off of. */ static SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD, NULL, "Device Statistics"); SYSCTL_PROC(_kern_devstat, OID_AUTO, all, CTLFLAG_RD|CTLTYPE_OPAQUE, NULL, 0, sysctl_devstat, "S,devstat", "All devices in the devstat list"); /* * Export the number of devices in the system so that userland utilities * can determine how much memory to allocate to hold all the devices. */ SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD, &devstat_num_devs, 0, "Number of devices in the devstat list"); SYSCTL_LONG(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD, &devstat_generation, 0, "Devstat list generation"); SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD, &devstat_version, 0, "Devstat list version number"); /* * Allocator for struct devstat structures. We sub-allocate these from pages * which we get from malloc. These pages are exported for mmap(2)'ing through * a miniature device driver */ #define statsperpage (PAGE_SIZE / sizeof(struct devstat)) static d_mmap_t devstat_mmap; static struct cdevsw devstat_cdevsw = { .d_version = D_VERSION, .d_mmap = devstat_mmap, .d_name = "devstat", }; struct statspage { TAILQ_ENTRY(statspage) list; struct devstat *stat; u_int nfree; }; static TAILQ_HEAD(, statspage) pagelist = TAILQ_HEAD_INITIALIZER(pagelist); static MALLOC_DEFINE(M_DEVSTAT, "devstat", "Device statistics"); static int devstat_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr) { struct statspage *spp; if (nprot != VM_PROT_READ) return (-1); mtx_lock(&devstat_mutex); TAILQ_FOREACH(spp, &pagelist, list) { if (offset == 0) { *paddr = vtophys(spp->stat); mtx_unlock(&devstat_mutex); return (0); } offset -= PAGE_SIZE; } mtx_unlock(&devstat_mutex); return (-1); } static struct devstat * devstat_alloc(void) { struct devstat *dsp; struct statspage *spp, *spp2; u_int u; static int once; mtx_assert(&devstat_mutex, MA_NOTOWNED); if (!once) { make_dev_credf(MAKEDEV_ETERNAL | MAKEDEV_CHECKNAME, &devstat_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0444, DEVSTAT_DEVICE_NAME); once = 1; } spp2 = NULL; mtx_lock(&devstat_mutex); for (;;) { TAILQ_FOREACH(spp, &pagelist, list) { if (spp->nfree > 0) break; } if (spp != NULL) break; mtx_unlock(&devstat_mutex); spp2 = malloc(sizeof *spp, M_DEVSTAT, M_ZERO | M_WAITOK); spp2->stat = malloc(PAGE_SIZE, M_DEVSTAT, M_ZERO | M_WAITOK); spp2->nfree = statsperpage; /* * If free statspages were added while the lock was released * just reuse them. */ mtx_lock(&devstat_mutex); TAILQ_FOREACH(spp, &pagelist, list) if (spp->nfree > 0) break; if (spp == NULL) { spp = spp2; /* * It would make more sense to add the new page at the * head but the order on the list determine the * sequence of the mapping so we can't do that. */ TAILQ_INSERT_TAIL(&pagelist, spp, list); } else break; } dsp = spp->stat; for (u = 0; u < statsperpage; u++) { if (dsp->allocated == 0) break; dsp++; } spp->nfree--; dsp->allocated = 1; mtx_unlock(&devstat_mutex); if (spp2 != NULL && spp2 != spp) { free(spp2->stat, M_DEVSTAT); free(spp2, M_DEVSTAT); } return (dsp); } static void devstat_free(struct devstat *dsp) { struct statspage *spp; mtx_assert(&devstat_mutex, MA_OWNED); bzero(dsp, sizeof *dsp); TAILQ_FOREACH(spp, &pagelist, list) { if (dsp >= spp->stat && dsp < (spp->stat + statsperpage)) { spp->nfree++; return; } } } SYSCTL_INT(_debug_sizeof, OID_AUTO, devstat, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct devstat), "sizeof(struct devstat)");