Index: stable/12/sys/cam/cam_xpt.c =================================================================== --- stable/12/sys/cam/cam_xpt.c (revision 355609) +++ stable/12/sys/cam/cam_xpt.c (revision 355610) @@ -1,5633 +1,5633 @@ /*- * Implementation of the Common Access Method Transport (XPT) layer. * * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997, 1998, 1999 Justin T. Gibbs. * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_printf.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* geometry translation */ #include /* for xpt_print below */ #include "opt_cam.h" /* Wild guess based on not wanting to grow the stack too much */ #define XPT_PRINT_MAXLEN 512 #ifdef PRINTF_BUFR_SIZE #define XPT_PRINT_LEN PRINTF_BUFR_SIZE #else #define XPT_PRINT_LEN 128 #endif _Static_assert(XPT_PRINT_LEN <= XPT_PRINT_MAXLEN, "XPT_PRINT_LEN is too large"); /* * This is the maximum number of high powered commands (e.g. start unit) * that can be outstanding at a particular time. */ #ifndef CAM_MAX_HIGHPOWER #define CAM_MAX_HIGHPOWER 4 #endif /* Datastructures internal to the xpt layer */ MALLOC_DEFINE(M_CAMXPT, "CAM XPT", "CAM XPT buffers"); MALLOC_DEFINE(M_CAMDEV, "CAM DEV", "CAM devices"); MALLOC_DEFINE(M_CAMCCB, "CAM CCB", "CAM CCBs"); MALLOC_DEFINE(M_CAMPATH, "CAM path", "CAM paths"); struct xpt_softc { uint32_t xpt_generation; /* number of high powered commands that can go through right now */ struct mtx xpt_highpower_lock; STAILQ_HEAD(highpowerlist, cam_ed) highpowerq; int num_highpower; /* queue for handling async rescan requests. */ TAILQ_HEAD(, ccb_hdr) ccb_scanq; int buses_to_config; int buses_config_done; int announce_nosbuf; /* * Registered buses * * N.B., "busses" is an archaic spelling of "buses". In new code * "buses" is preferred. */ TAILQ_HEAD(,cam_eb) xpt_busses; u_int bus_generation; int boot_delay; struct callout boot_callout; struct task boot_task; struct root_hold_token xpt_rootmount; struct mtx xpt_topo_lock; struct taskqueue *xpt_taskq; }; typedef enum { DM_RET_COPY = 0x01, DM_RET_FLAG_MASK = 0x0f, DM_RET_NONE = 0x00, DM_RET_STOP = 0x10, DM_RET_DESCEND = 0x20, DM_RET_ERROR = 0x30, DM_RET_ACTION_MASK = 0xf0 } dev_match_ret; typedef enum { XPT_DEPTH_BUS, XPT_DEPTH_TARGET, XPT_DEPTH_DEVICE, XPT_DEPTH_PERIPH } xpt_traverse_depth; struct xpt_traverse_config { xpt_traverse_depth depth; void *tr_func; void *tr_arg; }; typedef int xpt_busfunc_t (struct cam_eb *bus, void *arg); typedef int xpt_targetfunc_t (struct cam_et *target, void *arg); typedef int xpt_devicefunc_t (struct cam_ed *device, void *arg); typedef int xpt_periphfunc_t (struct cam_periph *periph, void *arg); typedef int xpt_pdrvfunc_t (struct periph_driver **pdrv, void *arg); /* Transport layer configuration information */ static struct xpt_softc xsoftc; MTX_SYSINIT(xpt_topo_init, &xsoftc.xpt_topo_lock, "XPT topology lock", MTX_DEF); SYSCTL_INT(_kern_cam, OID_AUTO, boot_delay, CTLFLAG_RDTUN, &xsoftc.boot_delay, 0, "Bus registration wait time"); SYSCTL_UINT(_kern_cam, OID_AUTO, xpt_generation, CTLFLAG_RD, &xsoftc.xpt_generation, 0, "CAM peripheral generation count"); SYSCTL_INT(_kern_cam, OID_AUTO, announce_nosbuf, CTLFLAG_RWTUN, &xsoftc.announce_nosbuf, 0, "Don't use sbuf for announcements"); struct cam_doneq { struct mtx_padalign cam_doneq_mtx; STAILQ_HEAD(, ccb_hdr) cam_doneq; int cam_doneq_sleep; }; static struct cam_doneq cam_doneqs[MAXCPU]; static int cam_num_doneqs; static struct proc *cam_proc; SYSCTL_INT(_kern_cam, OID_AUTO, num_doneqs, CTLFLAG_RDTUN, &cam_num_doneqs, 0, "Number of completion queues/threads"); struct cam_periph *xpt_periph; static periph_init_t xpt_periph_init; static struct periph_driver xpt_driver = { xpt_periph_init, "xpt", TAILQ_HEAD_INITIALIZER(xpt_driver.units), /* generation */ 0, CAM_PERIPH_DRV_EARLY }; PERIPHDRIVER_DECLARE(xpt, xpt_driver); static d_open_t xptopen; static d_close_t xptclose; static d_ioctl_t xptioctl; static d_ioctl_t xptdoioctl; static struct cdevsw xpt_cdevsw = { .d_version = D_VERSION, .d_flags = 0, .d_open = xptopen, .d_close = xptclose, .d_ioctl = xptioctl, .d_name = "xpt", }; /* Storage for debugging datastructures */ struct cam_path *cam_dpath; -u_int32_t cam_dflags = CAM_DEBUG_FLAGS; +u_int32_t __read_mostly cam_dflags = CAM_DEBUG_FLAGS; SYSCTL_UINT(_kern_cam, OID_AUTO, dflags, CTLFLAG_RWTUN, &cam_dflags, 0, "Enabled debug flags"); u_int32_t cam_debug_delay = CAM_DEBUG_DELAY; SYSCTL_UINT(_kern_cam, OID_AUTO, debug_delay, CTLFLAG_RWTUN, &cam_debug_delay, 0, "Delay in us after each debug message"); /* Our boot-time initialization hook */ static int cam_module_event_handler(module_t, int /*modeventtype_t*/, void *); static moduledata_t cam_moduledata = { "cam", cam_module_event_handler, NULL }; static int xpt_init(void *); DECLARE_MODULE(cam, cam_moduledata, SI_SUB_CONFIGURE, SI_ORDER_SECOND); MODULE_VERSION(cam, 1); static void xpt_async_bcast(struct async_list *async_head, u_int32_t async_code, struct cam_path *path, void *async_arg); static path_id_t xptnextfreepathid(void); static path_id_t xptpathid(const char *sim_name, int sim_unit, int sim_bus); static union ccb *xpt_get_ccb(struct cam_periph *periph); static union ccb *xpt_get_ccb_nowait(struct cam_periph *periph); static void xpt_run_allocq(struct cam_periph *periph, int sleep); static void xpt_run_allocq_task(void *context, int pending); static void xpt_run_devq(struct cam_devq *devq); static timeout_t xpt_release_devq_timeout; static void xpt_release_simq_timeout(void *arg) __unused; static void xpt_acquire_bus(struct cam_eb *bus); static void xpt_release_bus(struct cam_eb *bus); static uint32_t xpt_freeze_devq_device(struct cam_ed *dev, u_int count); static int xpt_release_devq_device(struct cam_ed *dev, u_int count, int run_queue); static struct cam_et* xpt_alloc_target(struct cam_eb *bus, target_id_t target_id); static void xpt_acquire_target(struct cam_et *target); static void xpt_release_target(struct cam_et *target); static struct cam_eb* xpt_find_bus(path_id_t path_id); static struct cam_et* xpt_find_target(struct cam_eb *bus, target_id_t target_id); static struct cam_ed* xpt_find_device(struct cam_et *target, lun_id_t lun_id); static void xpt_config(void *arg); static void xpt_hold_boot_locked(void); static int xpt_schedule_dev(struct camq *queue, cam_pinfo *dev_pinfo, u_int32_t new_priority); static xpt_devicefunc_t xptpassannouncefunc; static void xptaction(struct cam_sim *sim, union ccb *work_ccb); static void xptpoll(struct cam_sim *sim); static void camisr_runqueue(void); static void xpt_done_process(struct ccb_hdr *ccb_h); static void xpt_done_td(void *); static dev_match_ret xptbusmatch(struct dev_match_pattern *patterns, u_int num_patterns, struct cam_eb *bus); static dev_match_ret xptdevicematch(struct dev_match_pattern *patterns, u_int num_patterns, struct cam_ed *device); static dev_match_ret xptperiphmatch(struct dev_match_pattern *patterns, u_int num_patterns, struct cam_periph *periph); static xpt_busfunc_t xptedtbusfunc; static xpt_targetfunc_t xptedttargetfunc; static xpt_devicefunc_t xptedtdevicefunc; static xpt_periphfunc_t xptedtperiphfunc; static xpt_pdrvfunc_t xptplistpdrvfunc; static xpt_periphfunc_t xptplistperiphfunc; static int xptedtmatch(struct ccb_dev_match *cdm); static int xptperiphlistmatch(struct ccb_dev_match *cdm); static int xptbustraverse(struct cam_eb *start_bus, xpt_busfunc_t *tr_func, void *arg); static int xpttargettraverse(struct cam_eb *bus, struct cam_et *start_target, xpt_targetfunc_t *tr_func, void *arg); static int xptdevicetraverse(struct cam_et *target, struct cam_ed *start_device, xpt_devicefunc_t *tr_func, void *arg); static int xptperiphtraverse(struct cam_ed *device, struct cam_periph *start_periph, xpt_periphfunc_t *tr_func, void *arg); static int xptpdrvtraverse(struct periph_driver **start_pdrv, xpt_pdrvfunc_t *tr_func, void *arg); static int xptpdperiphtraverse(struct periph_driver **pdrv, struct cam_periph *start_periph, xpt_periphfunc_t *tr_func, void *arg); static xpt_busfunc_t xptdefbusfunc; static xpt_targetfunc_t xptdeftargetfunc; static xpt_devicefunc_t xptdefdevicefunc; static xpt_periphfunc_t xptdefperiphfunc; static void xpt_finishconfig_task(void *context, int pending); static void xpt_dev_async_default(u_int32_t async_code, struct cam_eb *bus, struct cam_et *target, struct cam_ed *device, void *async_arg); static struct cam_ed * xpt_alloc_device_default(struct cam_eb *bus, struct cam_et *target, lun_id_t lun_id); static xpt_devicefunc_t xptsetasyncfunc; static xpt_busfunc_t xptsetasyncbusfunc; static cam_status xptregister(struct cam_periph *periph, void *arg); static __inline int device_is_queued(struct cam_ed *device); static __inline int xpt_schedule_devq(struct cam_devq *devq, struct cam_ed *dev) { int retval; mtx_assert(&devq->send_mtx, MA_OWNED); if ((dev->ccbq.queue.entries > 0) && (dev->ccbq.dev_openings > 0) && (dev->ccbq.queue.qfrozen_cnt == 0)) { /* * The priority of a device waiting for controller * resources is that of the highest priority CCB * enqueued. */ retval = xpt_schedule_dev(&devq->send_queue, &dev->devq_entry, CAMQ_GET_PRIO(&dev->ccbq.queue)); } else { retval = 0; } return (retval); } static __inline int device_is_queued(struct cam_ed *device) { return (device->devq_entry.index != CAM_UNQUEUED_INDEX); } static void xpt_periph_init() { make_dev(&xpt_cdevsw, 0, UID_ROOT, GID_OPERATOR, 0600, "xpt0"); } static int xptopen(struct cdev *dev, int flags, int fmt, struct thread *td) { /* * Only allow read-write access. */ if (((flags & FWRITE) == 0) || ((flags & FREAD) == 0)) return(EPERM); /* * We don't allow nonblocking access. */ if ((flags & O_NONBLOCK) != 0) { printf("%s: can't do nonblocking access\n", devtoname(dev)); return(ENODEV); } return(0); } static int xptclose(struct cdev *dev, int flag, int fmt, struct thread *td) { return(0); } /* * Don't automatically grab the xpt softc lock here even though this is going * through the xpt device. The xpt device is really just a back door for * accessing other devices and SIMs, so the right thing to do is to grab * the appropriate SIM lock once the bus/SIM is located. */ static int xptioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { int error; if ((error = xptdoioctl(dev, cmd, addr, flag, td)) == ENOTTY) { error = cam_compat_ioctl(dev, cmd, addr, flag, td, xptdoioctl); } return (error); } static int xptdoioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { int error; error = 0; switch(cmd) { /* * For the transport layer CAMIOCOMMAND ioctl, we really only want * to accept CCB types that don't quite make sense to send through a * passthrough driver. XPT_PATH_INQ is an exception to this, as stated * in the CAM spec. */ case CAMIOCOMMAND: { union ccb *ccb; union ccb *inccb; struct cam_eb *bus; inccb = (union ccb *)addr; #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) if (inccb->ccb_h.func_code == XPT_SCSI_IO) inccb->csio.bio = NULL; #endif if (inccb->ccb_h.flags & CAM_UNLOCKED) return (EINVAL); bus = xpt_find_bus(inccb->ccb_h.path_id); if (bus == NULL) return (EINVAL); switch (inccb->ccb_h.func_code) { case XPT_SCAN_BUS: case XPT_RESET_BUS: if (inccb->ccb_h.target_id != CAM_TARGET_WILDCARD || inccb->ccb_h.target_lun != CAM_LUN_WILDCARD) { xpt_release_bus(bus); return (EINVAL); } break; case XPT_SCAN_TGT: if (inccb->ccb_h.target_id == CAM_TARGET_WILDCARD || inccb->ccb_h.target_lun != CAM_LUN_WILDCARD) { xpt_release_bus(bus); return (EINVAL); } break; default: break; } switch(inccb->ccb_h.func_code) { case XPT_SCAN_BUS: case XPT_RESET_BUS: case XPT_PATH_INQ: case XPT_ENG_INQ: case XPT_SCAN_LUN: case XPT_SCAN_TGT: ccb = xpt_alloc_ccb(); /* * Create a path using the bus, target, and lun the * user passed in. */ if (xpt_create_path(&ccb->ccb_h.path, NULL, inccb->ccb_h.path_id, inccb->ccb_h.target_id, inccb->ccb_h.target_lun) != CAM_REQ_CMP){ error = EINVAL; xpt_free_ccb(ccb); break; } /* Ensure all of our fields are correct */ xpt_setup_ccb(&ccb->ccb_h, ccb->ccb_h.path, inccb->ccb_h.pinfo.priority); xpt_merge_ccb(ccb, inccb); xpt_path_lock(ccb->ccb_h.path); cam_periph_runccb(ccb, NULL, 0, 0, NULL); xpt_path_unlock(ccb->ccb_h.path); bcopy(ccb, inccb, sizeof(union ccb)); xpt_free_path(ccb->ccb_h.path); xpt_free_ccb(ccb); break; case XPT_DEBUG: { union ccb ccb; /* * This is an immediate CCB, so it's okay to * allocate it on the stack. */ /* * Create a path using the bus, target, and lun the * user passed in. */ if (xpt_create_path(&ccb.ccb_h.path, NULL, inccb->ccb_h.path_id, inccb->ccb_h.target_id, inccb->ccb_h.target_lun) != CAM_REQ_CMP){ error = EINVAL; break; } /* Ensure all of our fields are correct */ xpt_setup_ccb(&ccb.ccb_h, ccb.ccb_h.path, inccb->ccb_h.pinfo.priority); xpt_merge_ccb(&ccb, inccb); xpt_action(&ccb); bcopy(&ccb, inccb, sizeof(union ccb)); xpt_free_path(ccb.ccb_h.path); break; } case XPT_DEV_MATCH: { struct cam_periph_map_info mapinfo; struct cam_path *old_path; /* * We can't deal with physical addresses for this * type of transaction. */ if ((inccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR) { error = EINVAL; break; } /* * Save this in case the caller had it set to * something in particular. */ old_path = inccb->ccb_h.path; /* * We really don't need a path for the matching * code. The path is needed because of the * debugging statements in xpt_action(). They * assume that the CCB has a valid path. */ inccb->ccb_h.path = xpt_periph->path; bzero(&mapinfo, sizeof(mapinfo)); /* * Map the pattern and match buffers into kernel * virtual address space. */ error = cam_periph_mapmem(inccb, &mapinfo, MAXPHYS); if (error) { inccb->ccb_h.path = old_path; break; } /* * This is an immediate CCB, we can send it on directly. */ xpt_action(inccb); /* * Map the buffers back into user space. */ cam_periph_unmapmem(inccb, &mapinfo); inccb->ccb_h.path = old_path; error = 0; break; } default: error = ENOTSUP; break; } xpt_release_bus(bus); break; } /* * This is the getpassthru ioctl. It takes a XPT_GDEVLIST ccb as input, * with the periphal driver name and unit name filled in. The other * fields don't really matter as input. The passthrough driver name * ("pass"), and unit number are passed back in the ccb. The current * device generation number, and the index into the device peripheral * driver list, and the status are also passed back. Note that * since we do everything in one pass, unlike the XPT_GDEVLIST ccb, * we never return a status of CAM_GDEVLIST_LIST_CHANGED. It is * (or rather should be) impossible for the device peripheral driver * list to change since we look at the whole thing in one pass, and * we do it with lock protection. * */ case CAMGETPASSTHRU: { union ccb *ccb; struct cam_periph *periph; struct periph_driver **p_drv; char *name; u_int unit; int base_periph_found; ccb = (union ccb *)addr; unit = ccb->cgdl.unit_number; name = ccb->cgdl.periph_name; base_periph_found = 0; #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) if (ccb->ccb_h.func_code == XPT_SCSI_IO) ccb->csio.bio = NULL; #endif /* * Sanity check -- make sure we don't get a null peripheral * driver name. */ if (*ccb->cgdl.periph_name == '\0') { error = EINVAL; break; } /* Keep the list from changing while we traverse it */ xpt_lock_buses(); /* first find our driver in the list of drivers */ for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) if (strcmp((*p_drv)->driver_name, name) == 0) break; if (*p_drv == NULL) { xpt_unlock_buses(); ccb->ccb_h.status = CAM_REQ_CMP_ERR; ccb->cgdl.status = CAM_GDEVLIST_ERROR; *ccb->cgdl.periph_name = '\0'; ccb->cgdl.unit_number = 0; error = ENOENT; break; } /* * Run through every peripheral instance of this driver * and check to see whether it matches the unit passed * in by the user. If it does, get out of the loops and * find the passthrough driver associated with that * peripheral driver. */ for (periph = TAILQ_FIRST(&(*p_drv)->units); periph != NULL; periph = TAILQ_NEXT(periph, unit_links)) { if (periph->unit_number == unit) break; } /* * If we found the peripheral driver that the user passed * in, go through all of the peripheral drivers for that * particular device and look for a passthrough driver. */ if (periph != NULL) { struct cam_ed *device; int i; base_periph_found = 1; device = periph->path->device; for (i = 0, periph = SLIST_FIRST(&device->periphs); periph != NULL; periph = SLIST_NEXT(periph, periph_links), i++) { /* * Check to see whether we have a * passthrough device or not. */ if (strcmp(periph->periph_name, "pass") == 0) { /* * Fill in the getdevlist fields. */ strlcpy(ccb->cgdl.periph_name, periph->periph_name, sizeof(ccb->cgdl.periph_name)); ccb->cgdl.unit_number = periph->unit_number; if (SLIST_NEXT(periph, periph_links)) ccb->cgdl.status = CAM_GDEVLIST_MORE_DEVS; else ccb->cgdl.status = CAM_GDEVLIST_LAST_DEVICE; ccb->cgdl.generation = device->generation; ccb->cgdl.index = i; /* * Fill in some CCB header fields * that the user may want. */ ccb->ccb_h.path_id = periph->path->bus->path_id; ccb->ccb_h.target_id = periph->path->target->target_id; ccb->ccb_h.target_lun = periph->path->device->lun_id; ccb->ccb_h.status = CAM_REQ_CMP; break; } } } /* * If the periph is null here, one of two things has * happened. The first possibility is that we couldn't * find the unit number of the particular peripheral driver * that the user is asking about. e.g. the user asks for * the passthrough driver for "da11". We find the list of * "da" peripherals all right, but there is no unit 11. * The other possibility is that we went through the list * of peripheral drivers attached to the device structure, * but didn't find one with the name "pass". Either way, * we return ENOENT, since we couldn't find something. */ if (periph == NULL) { ccb->ccb_h.status = CAM_REQ_CMP_ERR; ccb->cgdl.status = CAM_GDEVLIST_ERROR; *ccb->cgdl.periph_name = '\0'; ccb->cgdl.unit_number = 0; error = ENOENT; /* * It is unfortunate that this is even necessary, * but there are many, many clueless users out there. * If this is true, the user is looking for the * passthrough driver, but doesn't have one in his * kernel. */ if (base_periph_found == 1) { printf("xptioctl: pass driver is not in the " "kernel\n"); printf("xptioctl: put \"device pass\" in " "your kernel config file\n"); } } xpt_unlock_buses(); break; } default: error = ENOTTY; break; } return(error); } static int cam_module_event_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: if ((error = xpt_init(NULL)) != 0) return (error); break; case MOD_UNLOAD: return EBUSY; default: return EOPNOTSUPP; } return 0; } static struct xpt_proto * xpt_proto_find(cam_proto proto) { struct xpt_proto **pp; SET_FOREACH(pp, cam_xpt_proto_set) { if ((*pp)->proto == proto) return *pp; } return NULL; } static void xpt_rescan_done(struct cam_periph *periph, union ccb *done_ccb) { if (done_ccb->ccb_h.ppriv_ptr1 == NULL) { xpt_free_path(done_ccb->ccb_h.path); xpt_free_ccb(done_ccb); } else { done_ccb->ccb_h.cbfcnp = done_ccb->ccb_h.ppriv_ptr1; (*done_ccb->ccb_h.cbfcnp)(periph, done_ccb); } xpt_release_boot(); } /* thread to handle bus rescans */ static void xpt_scanner_thread(void *dummy) { union ccb *ccb; struct cam_path path; xpt_lock_buses(); for (;;) { if (TAILQ_EMPTY(&xsoftc.ccb_scanq)) msleep(&xsoftc.ccb_scanq, &xsoftc.xpt_topo_lock, PRIBIO, "-", 0); if ((ccb = (union ccb *)TAILQ_FIRST(&xsoftc.ccb_scanq)) != NULL) { TAILQ_REMOVE(&xsoftc.ccb_scanq, &ccb->ccb_h, sim_links.tqe); xpt_unlock_buses(); /* * Since lock can be dropped inside and path freed * by completion callback even before return here, * take our own path copy for reference. */ xpt_copy_path(&path, ccb->ccb_h.path); xpt_path_lock(&path); xpt_action(ccb); xpt_path_unlock(&path); xpt_release_path(&path); xpt_lock_buses(); } } } void xpt_rescan(union ccb *ccb) { struct ccb_hdr *hdr; /* Prepare request */ if (ccb->ccb_h.path->target->target_id == CAM_TARGET_WILDCARD && ccb->ccb_h.path->device->lun_id == CAM_LUN_WILDCARD) ccb->ccb_h.func_code = XPT_SCAN_BUS; else if (ccb->ccb_h.path->target->target_id != CAM_TARGET_WILDCARD && ccb->ccb_h.path->device->lun_id == CAM_LUN_WILDCARD) ccb->ccb_h.func_code = XPT_SCAN_TGT; else if (ccb->ccb_h.path->target->target_id != CAM_TARGET_WILDCARD && ccb->ccb_h.path->device->lun_id != CAM_LUN_WILDCARD) ccb->ccb_h.func_code = XPT_SCAN_LUN; else { xpt_print(ccb->ccb_h.path, "illegal scan path\n"); xpt_free_path(ccb->ccb_h.path); xpt_free_ccb(ccb); return; } CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE, ("xpt_rescan: func %#x %s\n", ccb->ccb_h.func_code, xpt_action_name(ccb->ccb_h.func_code))); ccb->ccb_h.ppriv_ptr1 = ccb->ccb_h.cbfcnp; ccb->ccb_h.cbfcnp = xpt_rescan_done; xpt_setup_ccb(&ccb->ccb_h, ccb->ccb_h.path, CAM_PRIORITY_XPT); /* Don't make duplicate entries for the same paths. */ xpt_lock_buses(); if (ccb->ccb_h.ppriv_ptr1 == NULL) { TAILQ_FOREACH(hdr, &xsoftc.ccb_scanq, sim_links.tqe) { if (xpt_path_comp(hdr->path, ccb->ccb_h.path) == 0) { wakeup(&xsoftc.ccb_scanq); xpt_unlock_buses(); xpt_print(ccb->ccb_h.path, "rescan already queued\n"); xpt_free_path(ccb->ccb_h.path); xpt_free_ccb(ccb); return; } } } TAILQ_INSERT_TAIL(&xsoftc.ccb_scanq, &ccb->ccb_h, sim_links.tqe); xpt_hold_boot_locked(); wakeup(&xsoftc.ccb_scanq); xpt_unlock_buses(); } /* Functions accessed by the peripheral drivers */ static int xpt_init(void *dummy) { struct cam_sim *xpt_sim; struct cam_path *path; struct cam_devq *devq; cam_status status; int error, i; TAILQ_INIT(&xsoftc.xpt_busses); TAILQ_INIT(&xsoftc.ccb_scanq); STAILQ_INIT(&xsoftc.highpowerq); xsoftc.num_highpower = CAM_MAX_HIGHPOWER; mtx_init(&xsoftc.xpt_highpower_lock, "XPT highpower lock", NULL, MTX_DEF); xsoftc.xpt_taskq = taskqueue_create("CAM XPT task", M_WAITOK, taskqueue_thread_enqueue, /*context*/&xsoftc.xpt_taskq); #ifdef CAM_BOOT_DELAY /* * Override this value at compile time to assist our users * who don't use loader to boot a kernel. */ xsoftc.boot_delay = CAM_BOOT_DELAY; #endif /* * The xpt layer is, itself, the equivalent of a SIM. * Allow 16 ccbs in the ccb pool for it. This should * give decent parallelism when we probe buses and * perform other XPT functions. */ devq = cam_simq_alloc(16); xpt_sim = cam_sim_alloc(xptaction, xptpoll, "xpt", /*softc*/NULL, /*unit*/0, /*mtx*/NULL, /*max_dev_transactions*/0, /*max_tagged_dev_transactions*/0, devq); if (xpt_sim == NULL) return (ENOMEM); if ((status = xpt_bus_register(xpt_sim, NULL, 0)) != CAM_SUCCESS) { printf("xpt_init: xpt_bus_register failed with status %#x," " failing attach\n", status); return (EINVAL); } /* * Looking at the XPT from the SIM layer, the XPT is * the equivalent of a peripheral driver. Allocate * a peripheral driver entry for us. */ if ((status = xpt_create_path(&path, NULL, CAM_XPT_PATH_ID, CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD)) != CAM_REQ_CMP) { printf("xpt_init: xpt_create_path failed with status %#x," " failing attach\n", status); return (EINVAL); } xpt_path_lock(path); cam_periph_alloc(xptregister, NULL, NULL, NULL, "xpt", CAM_PERIPH_BIO, path, NULL, 0, xpt_sim); xpt_path_unlock(path); xpt_free_path(path); if (cam_num_doneqs < 1) cam_num_doneqs = 1 + mp_ncpus / 6; else if (cam_num_doneqs > MAXCPU) cam_num_doneqs = MAXCPU; for (i = 0; i < cam_num_doneqs; i++) { mtx_init(&cam_doneqs[i].cam_doneq_mtx, "CAM doneq", NULL, MTX_DEF); STAILQ_INIT(&cam_doneqs[i].cam_doneq); error = kproc_kthread_add(xpt_done_td, &cam_doneqs[i], &cam_proc, NULL, 0, 0, "cam", "doneq%d", i); if (error != 0) { cam_num_doneqs = i; break; } } if (cam_num_doneqs < 1) { printf("xpt_init: Cannot init completion queues " "- failing attach\n"); return (ENOMEM); } /* * Register a callback for when interrupts are enabled. */ config_intrhook_oneshot(xpt_config, NULL); return (0); } static cam_status xptregister(struct cam_periph *periph, void *arg) { struct cam_sim *xpt_sim; if (periph == NULL) { printf("xptregister: periph was NULL!!\n"); return(CAM_REQ_CMP_ERR); } xpt_sim = (struct cam_sim *)arg; xpt_sim->softc = periph; xpt_periph = periph; periph->softc = NULL; return(CAM_REQ_CMP); } int32_t xpt_add_periph(struct cam_periph *periph) { struct cam_ed *device; int32_t status; TASK_INIT(&periph->periph_run_task, 0, xpt_run_allocq_task, periph); device = periph->path->device; status = CAM_REQ_CMP; if (device != NULL) { mtx_lock(&device->target->bus->eb_mtx); device->generation++; SLIST_INSERT_HEAD(&device->periphs, periph, periph_links); mtx_unlock(&device->target->bus->eb_mtx); atomic_add_32(&xsoftc.xpt_generation, 1); } return (status); } void xpt_remove_periph(struct cam_periph *periph) { struct cam_ed *device; device = periph->path->device; if (device != NULL) { mtx_lock(&device->target->bus->eb_mtx); device->generation++; SLIST_REMOVE(&device->periphs, periph, cam_periph, periph_links); mtx_unlock(&device->target->bus->eb_mtx); atomic_add_32(&xsoftc.xpt_generation, 1); } } void xpt_announce_periph(struct cam_periph *periph, char *announce_string) { struct cam_path *path = periph->path; struct xpt_proto *proto; cam_periph_assert(periph, MA_OWNED); periph->flags |= CAM_PERIPH_ANNOUNCED; printf("%s%d at %s%d bus %d scbus%d target %d lun %jx\n", periph->periph_name, periph->unit_number, path->bus->sim->sim_name, path->bus->sim->unit_number, path->bus->sim->bus_id, path->bus->path_id, path->target->target_id, (uintmax_t)path->device->lun_id); printf("%s%d: ", periph->periph_name, periph->unit_number); proto = xpt_proto_find(path->device->protocol); if (proto) proto->ops->announce(path->device); else printf("%s%d: Unknown protocol device %d\n", periph->periph_name, periph->unit_number, path->device->protocol); if (path->device->serial_num_len > 0) { /* Don't wrap the screen - print only the first 60 chars */ printf("%s%d: Serial Number %.60s\n", periph->periph_name, periph->unit_number, path->device->serial_num); } /* Announce transport details. */ path->bus->xport->ops->announce(periph); /* Announce command queueing. */ if (path->device->inq_flags & SID_CmdQue || path->device->flags & CAM_DEV_TAG_AFTER_COUNT) { printf("%s%d: Command Queueing enabled\n", periph->periph_name, periph->unit_number); } /* Announce caller's details if they've passed in. */ if (announce_string != NULL) printf("%s%d: %s\n", periph->periph_name, periph->unit_number, announce_string); } void xpt_announce_periph_sbuf(struct cam_periph *periph, struct sbuf *sb, char *announce_string) { struct cam_path *path = periph->path; struct xpt_proto *proto; cam_periph_assert(periph, MA_OWNED); periph->flags |= CAM_PERIPH_ANNOUNCED; /* Fall back to the non-sbuf method if necessary */ if (xsoftc.announce_nosbuf != 0) { xpt_announce_periph(periph, announce_string); return; } proto = xpt_proto_find(path->device->protocol); if (((proto != NULL) && (proto->ops->announce_sbuf == NULL)) || (path->bus->xport->ops->announce_sbuf == NULL)) { xpt_announce_periph(periph, announce_string); return; } sbuf_printf(sb, "%s%d at %s%d bus %d scbus%d target %d lun %jx\n", periph->periph_name, periph->unit_number, path->bus->sim->sim_name, path->bus->sim->unit_number, path->bus->sim->bus_id, path->bus->path_id, path->target->target_id, (uintmax_t)path->device->lun_id); sbuf_printf(sb, "%s%d: ", periph->periph_name, periph->unit_number); if (proto) proto->ops->announce_sbuf(path->device, sb); else sbuf_printf(sb, "%s%d: Unknown protocol device %d\n", periph->periph_name, periph->unit_number, path->device->protocol); if (path->device->serial_num_len > 0) { /* Don't wrap the screen - print only the first 60 chars */ sbuf_printf(sb, "%s%d: Serial Number %.60s\n", periph->periph_name, periph->unit_number, path->device->serial_num); } /* Announce transport details. */ path->bus->xport->ops->announce_sbuf(periph, sb); /* Announce command queueing. */ if (path->device->inq_flags & SID_CmdQue || path->device->flags & CAM_DEV_TAG_AFTER_COUNT) { sbuf_printf(sb, "%s%d: Command Queueing enabled\n", periph->periph_name, periph->unit_number); } /* Announce caller's details if they've passed in. */ if (announce_string != NULL) sbuf_printf(sb, "%s%d: %s\n", periph->periph_name, periph->unit_number, announce_string); } void xpt_announce_quirks(struct cam_periph *periph, int quirks, char *bit_string) { if (quirks != 0) { printf("%s%d: quirks=0x%b\n", periph->periph_name, periph->unit_number, quirks, bit_string); } } void xpt_announce_quirks_sbuf(struct cam_periph *periph, struct sbuf *sb, int quirks, char *bit_string) { if (xsoftc.announce_nosbuf != 0) { xpt_announce_quirks(periph, quirks, bit_string); return; } if (quirks != 0) { sbuf_printf(sb, "%s%d: quirks=0x%b\n", periph->periph_name, periph->unit_number, quirks, bit_string); } } void xpt_denounce_periph(struct cam_periph *periph) { struct cam_path *path = periph->path; struct xpt_proto *proto; cam_periph_assert(periph, MA_OWNED); printf("%s%d at %s%d bus %d scbus%d target %d lun %jx\n", periph->periph_name, periph->unit_number, path->bus->sim->sim_name, path->bus->sim->unit_number, path->bus->sim->bus_id, path->bus->path_id, path->target->target_id, (uintmax_t)path->device->lun_id); printf("%s%d: ", periph->periph_name, periph->unit_number); proto = xpt_proto_find(path->device->protocol); if (proto) proto->ops->denounce(path->device); else printf("%s%d: Unknown protocol device %d\n", periph->periph_name, periph->unit_number, path->device->protocol); if (path->device->serial_num_len > 0) printf(" s/n %.60s", path->device->serial_num); printf(" detached\n"); } void xpt_denounce_periph_sbuf(struct cam_periph *periph, struct sbuf *sb) { struct cam_path *path = periph->path; struct xpt_proto *proto; cam_periph_assert(periph, MA_OWNED); /* Fall back to the non-sbuf method if necessary */ if (xsoftc.announce_nosbuf != 0) { xpt_denounce_periph(periph); return; } proto = xpt_proto_find(path->device->protocol); if ((proto != NULL) && (proto->ops->denounce_sbuf == NULL)) { xpt_denounce_periph(periph); return; } sbuf_printf(sb, "%s%d at %s%d bus %d scbus%d target %d lun %jx\n", periph->periph_name, periph->unit_number, path->bus->sim->sim_name, path->bus->sim->unit_number, path->bus->sim->bus_id, path->bus->path_id, path->target->target_id, (uintmax_t)path->device->lun_id); sbuf_printf(sb, "%s%d: ", periph->periph_name, periph->unit_number); if (proto) proto->ops->denounce_sbuf(path->device, sb); else sbuf_printf(sb, "%s%d: Unknown protocol device %d\n", periph->periph_name, periph->unit_number, path->device->protocol); if (path->device->serial_num_len > 0) sbuf_printf(sb, " s/n %.60s", path->device->serial_num); sbuf_printf(sb, " detached\n"); } int xpt_getattr(char *buf, size_t len, const char *attr, struct cam_path *path) { int ret = -1, l, o; struct ccb_dev_advinfo cdai; struct scsi_vpd_device_id *did; struct scsi_vpd_id_descriptor *idd; xpt_path_assert(path, MA_OWNED); memset(&cdai, 0, sizeof(cdai)); xpt_setup_ccb(&cdai.ccb_h, path, CAM_PRIORITY_NORMAL); cdai.ccb_h.func_code = XPT_DEV_ADVINFO; cdai.flags = CDAI_FLAG_NONE; cdai.bufsiz = len; cdai.buf = buf; if (!strcmp(attr, "GEOM::ident")) cdai.buftype = CDAI_TYPE_SERIAL_NUM; else if (!strcmp(attr, "GEOM::physpath")) cdai.buftype = CDAI_TYPE_PHYS_PATH; else if (strcmp(attr, "GEOM::lunid") == 0 || strcmp(attr, "GEOM::lunname") == 0) { cdai.buftype = CDAI_TYPE_SCSI_DEVID; cdai.bufsiz = CAM_SCSI_DEVID_MAXLEN; cdai.buf = malloc(cdai.bufsiz, M_CAMXPT, M_NOWAIT); if (cdai.buf == NULL) { ret = ENOMEM; goto out; } } else goto out; xpt_action((union ccb *)&cdai); /* can only be synchronous */ if ((cdai.ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(cdai.ccb_h.path, 0, 0, 0, FALSE); if (cdai.provsiz == 0) goto out; switch(cdai.buftype) { case CDAI_TYPE_SCSI_DEVID: did = (struct scsi_vpd_device_id *)cdai.buf; if (strcmp(attr, "GEOM::lunid") == 0) { idd = scsi_get_devid(did, cdai.provsiz, scsi_devid_is_lun_naa); if (idd == NULL) idd = scsi_get_devid(did, cdai.provsiz, scsi_devid_is_lun_eui64); if (idd == NULL) idd = scsi_get_devid(did, cdai.provsiz, scsi_devid_is_lun_uuid); if (idd == NULL) idd = scsi_get_devid(did, cdai.provsiz, scsi_devid_is_lun_md5); } else idd = NULL; if (idd == NULL) idd = scsi_get_devid(did, cdai.provsiz, scsi_devid_is_lun_t10); if (idd == NULL) idd = scsi_get_devid(did, cdai.provsiz, scsi_devid_is_lun_name); if (idd == NULL) break; ret = 0; if ((idd->proto_codeset & SVPD_ID_CODESET_MASK) == SVPD_ID_CODESET_ASCII) { if (idd->length < len) { for (l = 0; l < idd->length; l++) buf[l] = idd->identifier[l] ? idd->identifier[l] : ' '; buf[l] = 0; } else ret = EFAULT; break; } if ((idd->proto_codeset & SVPD_ID_CODESET_MASK) == SVPD_ID_CODESET_UTF8) { l = strnlen(idd->identifier, idd->length); if (l < len) { bcopy(idd->identifier, buf, l); buf[l] = 0; } else ret = EFAULT; break; } if ((idd->id_type & SVPD_ID_TYPE_MASK) == SVPD_ID_TYPE_UUID && idd->identifier[0] == 0x10) { if ((idd->length - 2) * 2 + 4 >= len) { ret = EFAULT; break; } for (l = 2, o = 0; l < idd->length; l++) { if (l == 6 || l == 8 || l == 10 || l == 12) o += sprintf(buf + o, "-"); o += sprintf(buf + o, "%02x", idd->identifier[l]); } break; } if (idd->length * 2 < len) { for (l = 0; l < idd->length; l++) sprintf(buf + l * 2, "%02x", idd->identifier[l]); } else ret = EFAULT; break; default: if (cdai.provsiz < len) { cdai.buf[cdai.provsiz] = 0; ret = 0; } else ret = EFAULT; break; } out: if ((char *)cdai.buf != buf) free(cdai.buf, M_CAMXPT); return ret; } static dev_match_ret xptbusmatch(struct dev_match_pattern *patterns, u_int num_patterns, struct cam_eb *bus) { dev_match_ret retval; u_int i; retval = DM_RET_NONE; /* * If we aren't given something to match against, that's an error. */ if (bus == NULL) return(DM_RET_ERROR); /* * If there are no match entries, then this bus matches no * matter what. */ if ((patterns == NULL) || (num_patterns == 0)) return(DM_RET_DESCEND | DM_RET_COPY); for (i = 0; i < num_patterns; i++) { struct bus_match_pattern *cur_pattern; /* * If the pattern in question isn't for a bus node, we * aren't interested. However, we do indicate to the * calling routine that we should continue descending the * tree, since the user wants to match against lower-level * EDT elements. */ if (patterns[i].type != DEV_MATCH_BUS) { if ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE) retval |= DM_RET_DESCEND; continue; } cur_pattern = &patterns[i].pattern.bus_pattern; /* * If they want to match any bus node, we give them any * device node. */ if (cur_pattern->flags == BUS_MATCH_ANY) { /* set the copy flag */ retval |= DM_RET_COPY; /* * If we've already decided on an action, go ahead * and return. */ if ((retval & DM_RET_ACTION_MASK) != DM_RET_NONE) return(retval); } /* * Not sure why someone would do this... */ if (cur_pattern->flags == BUS_MATCH_NONE) continue; if (((cur_pattern->flags & BUS_MATCH_PATH) != 0) && (cur_pattern->path_id != bus->path_id)) continue; if (((cur_pattern->flags & BUS_MATCH_BUS_ID) != 0) && (cur_pattern->bus_id != bus->sim->bus_id)) continue; if (((cur_pattern->flags & BUS_MATCH_UNIT) != 0) && (cur_pattern->unit_number != bus->sim->unit_number)) continue; if (((cur_pattern->flags & BUS_MATCH_NAME) != 0) && (strncmp(cur_pattern->dev_name, bus->sim->sim_name, DEV_IDLEN) != 0)) continue; /* * If we get to this point, the user definitely wants * information on this bus. So tell the caller to copy the * data out. */ retval |= DM_RET_COPY; /* * If the return action has been set to descend, then we * know that we've already seen a non-bus matching * expression, therefore we need to further descend the tree. * This won't change by continuing around the loop, so we * go ahead and return. If we haven't seen a non-bus * matching expression, we keep going around the loop until * we exhaust the matching expressions. We'll set the stop * flag once we fall out of the loop. */ if ((retval & DM_RET_ACTION_MASK) == DM_RET_DESCEND) return(retval); } /* * If the return action hasn't been set to descend yet, that means * we haven't seen anything other than bus matching patterns. So * tell the caller to stop descending the tree -- the user doesn't * want to match against lower level tree elements. */ if ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE) retval |= DM_RET_STOP; return(retval); } static dev_match_ret xptdevicematch(struct dev_match_pattern *patterns, u_int num_patterns, struct cam_ed *device) { dev_match_ret retval; u_int i; retval = DM_RET_NONE; /* * If we aren't given something to match against, that's an error. */ if (device == NULL) return(DM_RET_ERROR); /* * If there are no match entries, then this device matches no * matter what. */ if ((patterns == NULL) || (num_patterns == 0)) return(DM_RET_DESCEND | DM_RET_COPY); for (i = 0; i < num_patterns; i++) { struct device_match_pattern *cur_pattern; struct scsi_vpd_device_id *device_id_page; /* * If the pattern in question isn't for a device node, we * aren't interested. */ if (patterns[i].type != DEV_MATCH_DEVICE) { if ((patterns[i].type == DEV_MATCH_PERIPH) && ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE)) retval |= DM_RET_DESCEND; continue; } cur_pattern = &patterns[i].pattern.device_pattern; /* Error out if mutually exclusive options are specified. */ if ((cur_pattern->flags & (DEV_MATCH_INQUIRY|DEV_MATCH_DEVID)) == (DEV_MATCH_INQUIRY|DEV_MATCH_DEVID)) return(DM_RET_ERROR); /* * If they want to match any device node, we give them any * device node. */ if (cur_pattern->flags == DEV_MATCH_ANY) goto copy_dev_node; /* * Not sure why someone would do this... */ if (cur_pattern->flags == DEV_MATCH_NONE) continue; if (((cur_pattern->flags & DEV_MATCH_PATH) != 0) && (cur_pattern->path_id != device->target->bus->path_id)) continue; if (((cur_pattern->flags & DEV_MATCH_TARGET) != 0) && (cur_pattern->target_id != device->target->target_id)) continue; if (((cur_pattern->flags & DEV_MATCH_LUN) != 0) && (cur_pattern->target_lun != device->lun_id)) continue; if (((cur_pattern->flags & DEV_MATCH_INQUIRY) != 0) && (cam_quirkmatch((caddr_t)&device->inq_data, (caddr_t)&cur_pattern->data.inq_pat, 1, sizeof(cur_pattern->data.inq_pat), scsi_static_inquiry_match) == NULL)) continue; device_id_page = (struct scsi_vpd_device_id *)device->device_id; if (((cur_pattern->flags & DEV_MATCH_DEVID) != 0) && (device->device_id_len < SVPD_DEVICE_ID_HDR_LEN || scsi_devid_match((uint8_t *)device_id_page->desc_list, device->device_id_len - SVPD_DEVICE_ID_HDR_LEN, cur_pattern->data.devid_pat.id, cur_pattern->data.devid_pat.id_len) != 0)) continue; copy_dev_node: /* * If we get to this point, the user definitely wants * information on this device. So tell the caller to copy * the data out. */ retval |= DM_RET_COPY; /* * If the return action has been set to descend, then we * know that we've already seen a peripheral matching * expression, therefore we need to further descend the tree. * This won't change by continuing around the loop, so we * go ahead and return. If we haven't seen a peripheral * matching expression, we keep going around the loop until * we exhaust the matching expressions. We'll set the stop * flag once we fall out of the loop. */ if ((retval & DM_RET_ACTION_MASK) == DM_RET_DESCEND) return(retval); } /* * If the return action hasn't been set to descend yet, that means * we haven't seen any peripheral matching patterns. So tell the * caller to stop descending the tree -- the user doesn't want to * match against lower level tree elements. */ if ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE) retval |= DM_RET_STOP; return(retval); } /* * Match a single peripheral against any number of match patterns. */ static dev_match_ret xptperiphmatch(struct dev_match_pattern *patterns, u_int num_patterns, struct cam_periph *periph) { dev_match_ret retval; u_int i; /* * If we aren't given something to match against, that's an error. */ if (periph == NULL) return(DM_RET_ERROR); /* * If there are no match entries, then this peripheral matches no * matter what. */ if ((patterns == NULL) || (num_patterns == 0)) return(DM_RET_STOP | DM_RET_COPY); /* * There aren't any nodes below a peripheral node, so there's no * reason to descend the tree any further. */ retval = DM_RET_STOP; for (i = 0; i < num_patterns; i++) { struct periph_match_pattern *cur_pattern; /* * If the pattern in question isn't for a peripheral, we * aren't interested. */ if (patterns[i].type != DEV_MATCH_PERIPH) continue; cur_pattern = &patterns[i].pattern.periph_pattern; /* * If they want to match on anything, then we will do so. */ if (cur_pattern->flags == PERIPH_MATCH_ANY) { /* set the copy flag */ retval |= DM_RET_COPY; /* * We've already set the return action to stop, * since there are no nodes below peripherals in * the tree. */ return(retval); } /* * Not sure why someone would do this... */ if (cur_pattern->flags == PERIPH_MATCH_NONE) continue; if (((cur_pattern->flags & PERIPH_MATCH_PATH) != 0) && (cur_pattern->path_id != periph->path->bus->path_id)) continue; /* * For the target and lun id's, we have to make sure the * target and lun pointers aren't NULL. The xpt peripheral * has a wildcard target and device. */ if (((cur_pattern->flags & PERIPH_MATCH_TARGET) != 0) && ((periph->path->target == NULL) ||(cur_pattern->target_id != periph->path->target->target_id))) continue; if (((cur_pattern->flags & PERIPH_MATCH_LUN) != 0) && ((periph->path->device == NULL) || (cur_pattern->target_lun != periph->path->device->lun_id))) continue; if (((cur_pattern->flags & PERIPH_MATCH_UNIT) != 0) && (cur_pattern->unit_number != periph->unit_number)) continue; if (((cur_pattern->flags & PERIPH_MATCH_NAME) != 0) && (strncmp(cur_pattern->periph_name, periph->periph_name, DEV_IDLEN) != 0)) continue; /* * If we get to this point, the user definitely wants * information on this peripheral. So tell the caller to * copy the data out. */ retval |= DM_RET_COPY; /* * The return action has already been set to stop, since * peripherals don't have any nodes below them in the EDT. */ return(retval); } /* * If we get to this point, the peripheral that was passed in * doesn't match any of the patterns. */ return(retval); } static int xptedtbusfunc(struct cam_eb *bus, void *arg) { struct ccb_dev_match *cdm; struct cam_et *target; dev_match_ret retval; cdm = (struct ccb_dev_match *)arg; /* * If our position is for something deeper in the tree, that means * that we've already seen this node. So, we keep going down. */ if ((cdm->pos.position_type & CAM_DEV_POS_BUS) && (cdm->pos.cookie.bus == bus) && (cdm->pos.position_type & CAM_DEV_POS_TARGET) && (cdm->pos.cookie.target != NULL)) retval = DM_RET_DESCEND; else retval = xptbusmatch(cdm->patterns, cdm->num_patterns, bus); /* * If we got an error, bail out of the search. */ if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) { cdm->status = CAM_DEV_MATCH_ERROR; return(0); } /* * If the copy flag is set, copy this bus out. */ if (retval & DM_RET_COPY) { int spaceleft, j; spaceleft = cdm->match_buf_len - (cdm->num_matches * sizeof(struct dev_match_result)); /* * If we don't have enough space to put in another * match result, save our position and tell the * user there are more devices to check. */ if (spaceleft < sizeof(struct dev_match_result)) { bzero(&cdm->pos, sizeof(cdm->pos)); cdm->pos.position_type = CAM_DEV_POS_EDT | CAM_DEV_POS_BUS; cdm->pos.cookie.bus = bus; cdm->pos.generations[CAM_BUS_GENERATION]= xsoftc.bus_generation; cdm->status = CAM_DEV_MATCH_MORE; return(0); } j = cdm->num_matches; cdm->num_matches++; cdm->matches[j].type = DEV_MATCH_BUS; cdm->matches[j].result.bus_result.path_id = bus->path_id; cdm->matches[j].result.bus_result.bus_id = bus->sim->bus_id; cdm->matches[j].result.bus_result.unit_number = bus->sim->unit_number; strlcpy(cdm->matches[j].result.bus_result.dev_name, bus->sim->sim_name, sizeof(cdm->matches[j].result.bus_result.dev_name)); } /* * If the user is only interested in buses, there's no * reason to descend to the next level in the tree. */ if ((retval & DM_RET_ACTION_MASK) == DM_RET_STOP) return(1); /* * If there is a target generation recorded, check it to * make sure the target list hasn't changed. */ mtx_lock(&bus->eb_mtx); if ((cdm->pos.position_type & CAM_DEV_POS_BUS) && (cdm->pos.cookie.bus == bus) && (cdm->pos.position_type & CAM_DEV_POS_TARGET) && (cdm->pos.cookie.target != NULL)) { if ((cdm->pos.generations[CAM_TARGET_GENERATION] != bus->generation)) { mtx_unlock(&bus->eb_mtx); cdm->status = CAM_DEV_MATCH_LIST_CHANGED; return (0); } target = (struct cam_et *)cdm->pos.cookie.target; target->refcount++; } else target = NULL; mtx_unlock(&bus->eb_mtx); return (xpttargettraverse(bus, target, xptedttargetfunc, arg)); } static int xptedttargetfunc(struct cam_et *target, void *arg) { struct ccb_dev_match *cdm; struct cam_eb *bus; struct cam_ed *device; cdm = (struct ccb_dev_match *)arg; bus = target->bus; /* * If there is a device list generation recorded, check it to * make sure the device list hasn't changed. */ mtx_lock(&bus->eb_mtx); if ((cdm->pos.position_type & CAM_DEV_POS_BUS) && (cdm->pos.cookie.bus == bus) && (cdm->pos.position_type & CAM_DEV_POS_TARGET) && (cdm->pos.cookie.target == target) && (cdm->pos.position_type & CAM_DEV_POS_DEVICE) && (cdm->pos.cookie.device != NULL)) { if (cdm->pos.generations[CAM_DEV_GENERATION] != target->generation) { mtx_unlock(&bus->eb_mtx); cdm->status = CAM_DEV_MATCH_LIST_CHANGED; return(0); } device = (struct cam_ed *)cdm->pos.cookie.device; device->refcount++; } else device = NULL; mtx_unlock(&bus->eb_mtx); return (xptdevicetraverse(target, device, xptedtdevicefunc, arg)); } static int xptedtdevicefunc(struct cam_ed *device, void *arg) { struct cam_eb *bus; struct cam_periph *periph; struct ccb_dev_match *cdm; dev_match_ret retval; cdm = (struct ccb_dev_match *)arg; bus = device->target->bus; /* * If our position is for something deeper in the tree, that means * that we've already seen this node. So, we keep going down. */ if ((cdm->pos.position_type & CAM_DEV_POS_DEVICE) && (cdm->pos.cookie.device == device) && (cdm->pos.position_type & CAM_DEV_POS_PERIPH) && (cdm->pos.cookie.periph != NULL)) retval = DM_RET_DESCEND; else retval = xptdevicematch(cdm->patterns, cdm->num_patterns, device); if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) { cdm->status = CAM_DEV_MATCH_ERROR; return(0); } /* * If the copy flag is set, copy this device out. */ if (retval & DM_RET_COPY) { int spaceleft, j; spaceleft = cdm->match_buf_len - (cdm->num_matches * sizeof(struct dev_match_result)); /* * If we don't have enough space to put in another * match result, save our position and tell the * user there are more devices to check. */ if (spaceleft < sizeof(struct dev_match_result)) { bzero(&cdm->pos, sizeof(cdm->pos)); cdm->pos.position_type = CAM_DEV_POS_EDT | CAM_DEV_POS_BUS | CAM_DEV_POS_TARGET | CAM_DEV_POS_DEVICE; cdm->pos.cookie.bus = device->target->bus; cdm->pos.generations[CAM_BUS_GENERATION]= xsoftc.bus_generation; cdm->pos.cookie.target = device->target; cdm->pos.generations[CAM_TARGET_GENERATION] = device->target->bus->generation; cdm->pos.cookie.device = device; cdm->pos.generations[CAM_DEV_GENERATION] = device->target->generation; cdm->status = CAM_DEV_MATCH_MORE; return(0); } j = cdm->num_matches; cdm->num_matches++; cdm->matches[j].type = DEV_MATCH_DEVICE; cdm->matches[j].result.device_result.path_id = device->target->bus->path_id; cdm->matches[j].result.device_result.target_id = device->target->target_id; cdm->matches[j].result.device_result.target_lun = device->lun_id; cdm->matches[j].result.device_result.protocol = device->protocol; bcopy(&device->inq_data, &cdm->matches[j].result.device_result.inq_data, sizeof(struct scsi_inquiry_data)); bcopy(&device->ident_data, &cdm->matches[j].result.device_result.ident_data, sizeof(struct ata_params)); /* Let the user know whether this device is unconfigured */ if (device->flags & CAM_DEV_UNCONFIGURED) cdm->matches[j].result.device_result.flags = DEV_RESULT_UNCONFIGURED; else cdm->matches[j].result.device_result.flags = DEV_RESULT_NOFLAG; } /* * If the user isn't interested in peripherals, don't descend * the tree any further. */ if ((retval & DM_RET_ACTION_MASK) == DM_RET_STOP) return(1); /* * If there is a peripheral list generation recorded, make sure * it hasn't changed. */ xpt_lock_buses(); mtx_lock(&bus->eb_mtx); if ((cdm->pos.position_type & CAM_DEV_POS_BUS) && (cdm->pos.cookie.bus == bus) && (cdm->pos.position_type & CAM_DEV_POS_TARGET) && (cdm->pos.cookie.target == device->target) && (cdm->pos.position_type & CAM_DEV_POS_DEVICE) && (cdm->pos.cookie.device == device) && (cdm->pos.position_type & CAM_DEV_POS_PERIPH) && (cdm->pos.cookie.periph != NULL)) { if (cdm->pos.generations[CAM_PERIPH_GENERATION] != device->generation) { mtx_unlock(&bus->eb_mtx); xpt_unlock_buses(); cdm->status = CAM_DEV_MATCH_LIST_CHANGED; return(0); } periph = (struct cam_periph *)cdm->pos.cookie.periph; periph->refcount++; } else periph = NULL; mtx_unlock(&bus->eb_mtx); xpt_unlock_buses(); return (xptperiphtraverse(device, periph, xptedtperiphfunc, arg)); } static int xptedtperiphfunc(struct cam_periph *periph, void *arg) { struct ccb_dev_match *cdm; dev_match_ret retval; cdm = (struct ccb_dev_match *)arg; retval = xptperiphmatch(cdm->patterns, cdm->num_patterns, periph); if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) { cdm->status = CAM_DEV_MATCH_ERROR; return(0); } /* * If the copy flag is set, copy this peripheral out. */ if (retval & DM_RET_COPY) { int spaceleft, j; size_t l; spaceleft = cdm->match_buf_len - (cdm->num_matches * sizeof(struct dev_match_result)); /* * If we don't have enough space to put in another * match result, save our position and tell the * user there are more devices to check. */ if (spaceleft < sizeof(struct dev_match_result)) { bzero(&cdm->pos, sizeof(cdm->pos)); cdm->pos.position_type = CAM_DEV_POS_EDT | CAM_DEV_POS_BUS | CAM_DEV_POS_TARGET | CAM_DEV_POS_DEVICE | CAM_DEV_POS_PERIPH; cdm->pos.cookie.bus = periph->path->bus; cdm->pos.generations[CAM_BUS_GENERATION]= xsoftc.bus_generation; cdm->pos.cookie.target = periph->path->target; cdm->pos.generations[CAM_TARGET_GENERATION] = periph->path->bus->generation; cdm->pos.cookie.device = periph->path->device; cdm->pos.generations[CAM_DEV_GENERATION] = periph->path->target->generation; cdm->pos.cookie.periph = periph; cdm->pos.generations[CAM_PERIPH_GENERATION] = periph->path->device->generation; cdm->status = CAM_DEV_MATCH_MORE; return(0); } j = cdm->num_matches; cdm->num_matches++; cdm->matches[j].type = DEV_MATCH_PERIPH; cdm->matches[j].result.periph_result.path_id = periph->path->bus->path_id; cdm->matches[j].result.periph_result.target_id = periph->path->target->target_id; cdm->matches[j].result.periph_result.target_lun = periph->path->device->lun_id; cdm->matches[j].result.periph_result.unit_number = periph->unit_number; l = sizeof(cdm->matches[j].result.periph_result.periph_name); strlcpy(cdm->matches[j].result.periph_result.periph_name, periph->periph_name, l); } return(1); } static int xptedtmatch(struct ccb_dev_match *cdm) { struct cam_eb *bus; int ret; cdm->num_matches = 0; /* * Check the bus list generation. If it has changed, the user * needs to reset everything and start over. */ xpt_lock_buses(); if ((cdm->pos.position_type & CAM_DEV_POS_BUS) && (cdm->pos.cookie.bus != NULL)) { if (cdm->pos.generations[CAM_BUS_GENERATION] != xsoftc.bus_generation) { xpt_unlock_buses(); cdm->status = CAM_DEV_MATCH_LIST_CHANGED; return(0); } bus = (struct cam_eb *)cdm->pos.cookie.bus; bus->refcount++; } else bus = NULL; xpt_unlock_buses(); ret = xptbustraverse(bus, xptedtbusfunc, cdm); /* * If we get back 0, that means that we had to stop before fully * traversing the EDT. It also means that one of the subroutines * has set the status field to the proper value. If we get back 1, * we've fully traversed the EDT and copied out any matching entries. */ if (ret == 1) cdm->status = CAM_DEV_MATCH_LAST; return(ret); } static int xptplistpdrvfunc(struct periph_driver **pdrv, void *arg) { struct cam_periph *periph; struct ccb_dev_match *cdm; cdm = (struct ccb_dev_match *)arg; xpt_lock_buses(); if ((cdm->pos.position_type & CAM_DEV_POS_PDPTR) && (cdm->pos.cookie.pdrv == pdrv) && (cdm->pos.position_type & CAM_DEV_POS_PERIPH) && (cdm->pos.cookie.periph != NULL)) { if (cdm->pos.generations[CAM_PERIPH_GENERATION] != (*pdrv)->generation) { xpt_unlock_buses(); cdm->status = CAM_DEV_MATCH_LIST_CHANGED; return(0); } periph = (struct cam_periph *)cdm->pos.cookie.periph; periph->refcount++; } else periph = NULL; xpt_unlock_buses(); return (xptpdperiphtraverse(pdrv, periph, xptplistperiphfunc, arg)); } static int xptplistperiphfunc(struct cam_periph *periph, void *arg) { struct ccb_dev_match *cdm; dev_match_ret retval; cdm = (struct ccb_dev_match *)arg; retval = xptperiphmatch(cdm->patterns, cdm->num_patterns, periph); if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) { cdm->status = CAM_DEV_MATCH_ERROR; return(0); } /* * If the copy flag is set, copy this peripheral out. */ if (retval & DM_RET_COPY) { int spaceleft, j; size_t l; spaceleft = cdm->match_buf_len - (cdm->num_matches * sizeof(struct dev_match_result)); /* * If we don't have enough space to put in another * match result, save our position and tell the * user there are more devices to check. */ if (spaceleft < sizeof(struct dev_match_result)) { struct periph_driver **pdrv; pdrv = NULL; bzero(&cdm->pos, sizeof(cdm->pos)); cdm->pos.position_type = CAM_DEV_POS_PDRV | CAM_DEV_POS_PDPTR | CAM_DEV_POS_PERIPH; /* * This may look a bit non-sensical, but it is * actually quite logical. There are very few * peripheral drivers, and bloating every peripheral * structure with a pointer back to its parent * peripheral driver linker set entry would cost * more in the long run than doing this quick lookup. */ for (pdrv = periph_drivers; *pdrv != NULL; pdrv++) { if (strcmp((*pdrv)->driver_name, periph->periph_name) == 0) break; } if (*pdrv == NULL) { cdm->status = CAM_DEV_MATCH_ERROR; return(0); } cdm->pos.cookie.pdrv = pdrv; /* * The periph generation slot does double duty, as * does the periph pointer slot. They are used for * both edt and pdrv lookups and positioning. */ cdm->pos.cookie.periph = periph; cdm->pos.generations[CAM_PERIPH_GENERATION] = (*pdrv)->generation; cdm->status = CAM_DEV_MATCH_MORE; return(0); } j = cdm->num_matches; cdm->num_matches++; cdm->matches[j].type = DEV_MATCH_PERIPH; cdm->matches[j].result.periph_result.path_id = periph->path->bus->path_id; /* * The transport layer peripheral doesn't have a target or * lun. */ if (periph->path->target) cdm->matches[j].result.periph_result.target_id = periph->path->target->target_id; else cdm->matches[j].result.periph_result.target_id = CAM_TARGET_WILDCARD; if (periph->path->device) cdm->matches[j].result.periph_result.target_lun = periph->path->device->lun_id; else cdm->matches[j].result.periph_result.target_lun = CAM_LUN_WILDCARD; cdm->matches[j].result.periph_result.unit_number = periph->unit_number; l = sizeof(cdm->matches[j].result.periph_result.periph_name); strlcpy(cdm->matches[j].result.periph_result.periph_name, periph->periph_name, l); } return(1); } static int xptperiphlistmatch(struct ccb_dev_match *cdm) { int ret; cdm->num_matches = 0; /* * At this point in the edt traversal function, we check the bus * list generation to make sure that no buses have been added or * removed since the user last sent a XPT_DEV_MATCH ccb through. * For the peripheral driver list traversal function, however, we * don't have to worry about new peripheral driver types coming or * going; they're in a linker set, and therefore can't change * without a recompile. */ if ((cdm->pos.position_type & CAM_DEV_POS_PDPTR) && (cdm->pos.cookie.pdrv != NULL)) ret = xptpdrvtraverse( (struct periph_driver **)cdm->pos.cookie.pdrv, xptplistpdrvfunc, cdm); else ret = xptpdrvtraverse(NULL, xptplistpdrvfunc, cdm); /* * If we get back 0, that means that we had to stop before fully * traversing the peripheral driver tree. It also means that one of * the subroutines has set the status field to the proper value. If * we get back 1, we've fully traversed the EDT and copied out any * matching entries. */ if (ret == 1) cdm->status = CAM_DEV_MATCH_LAST; return(ret); } static int xptbustraverse(struct cam_eb *start_bus, xpt_busfunc_t *tr_func, void *arg) { struct cam_eb *bus, *next_bus; int retval; retval = 1; if (start_bus) bus = start_bus; else { xpt_lock_buses(); bus = TAILQ_FIRST(&xsoftc.xpt_busses); if (bus == NULL) { xpt_unlock_buses(); return (retval); } bus->refcount++; xpt_unlock_buses(); } for (; bus != NULL; bus = next_bus) { retval = tr_func(bus, arg); if (retval == 0) { xpt_release_bus(bus); break; } xpt_lock_buses(); next_bus = TAILQ_NEXT(bus, links); if (next_bus) next_bus->refcount++; xpt_unlock_buses(); xpt_release_bus(bus); } return(retval); } static int xpttargettraverse(struct cam_eb *bus, struct cam_et *start_target, xpt_targetfunc_t *tr_func, void *arg) { struct cam_et *target, *next_target; int retval; retval = 1; if (start_target) target = start_target; else { mtx_lock(&bus->eb_mtx); target = TAILQ_FIRST(&bus->et_entries); if (target == NULL) { mtx_unlock(&bus->eb_mtx); return (retval); } target->refcount++; mtx_unlock(&bus->eb_mtx); } for (; target != NULL; target = next_target) { retval = tr_func(target, arg); if (retval == 0) { xpt_release_target(target); break; } mtx_lock(&bus->eb_mtx); next_target = TAILQ_NEXT(target, links); if (next_target) next_target->refcount++; mtx_unlock(&bus->eb_mtx); xpt_release_target(target); } return(retval); } static int xptdevicetraverse(struct cam_et *target, struct cam_ed *start_device, xpt_devicefunc_t *tr_func, void *arg) { struct cam_eb *bus; struct cam_ed *device, *next_device; int retval; retval = 1; bus = target->bus; if (start_device) device = start_device; else { mtx_lock(&bus->eb_mtx); device = TAILQ_FIRST(&target->ed_entries); if (device == NULL) { mtx_unlock(&bus->eb_mtx); return (retval); } device->refcount++; mtx_unlock(&bus->eb_mtx); } for (; device != NULL; device = next_device) { mtx_lock(&device->device_mtx); retval = tr_func(device, arg); mtx_unlock(&device->device_mtx); if (retval == 0) { xpt_release_device(device); break; } mtx_lock(&bus->eb_mtx); next_device = TAILQ_NEXT(device, links); if (next_device) next_device->refcount++; mtx_unlock(&bus->eb_mtx); xpt_release_device(device); } return(retval); } static int xptperiphtraverse(struct cam_ed *device, struct cam_periph *start_periph, xpt_periphfunc_t *tr_func, void *arg) { struct cam_eb *bus; struct cam_periph *periph, *next_periph; int retval; retval = 1; bus = device->target->bus; if (start_periph) periph = start_periph; else { xpt_lock_buses(); mtx_lock(&bus->eb_mtx); periph = SLIST_FIRST(&device->periphs); while (periph != NULL && (periph->flags & CAM_PERIPH_FREE) != 0) periph = SLIST_NEXT(periph, periph_links); if (periph == NULL) { mtx_unlock(&bus->eb_mtx); xpt_unlock_buses(); return (retval); } periph->refcount++; mtx_unlock(&bus->eb_mtx); xpt_unlock_buses(); } for (; periph != NULL; periph = next_periph) { retval = tr_func(periph, arg); if (retval == 0) { cam_periph_release_locked(periph); break; } xpt_lock_buses(); mtx_lock(&bus->eb_mtx); next_periph = SLIST_NEXT(periph, periph_links); while (next_periph != NULL && (next_periph->flags & CAM_PERIPH_FREE) != 0) next_periph = SLIST_NEXT(next_periph, periph_links); if (next_periph) next_periph->refcount++; mtx_unlock(&bus->eb_mtx); xpt_unlock_buses(); cam_periph_release_locked(periph); } return(retval); } static int xptpdrvtraverse(struct periph_driver **start_pdrv, xpt_pdrvfunc_t *tr_func, void *arg) { struct periph_driver **pdrv; int retval; retval = 1; /* * We don't traverse the peripheral driver list like we do the * other lists, because it is a linker set, and therefore cannot be * changed during runtime. If the peripheral driver list is ever * re-done to be something other than a linker set (i.e. it can * change while the system is running), the list traversal should * be modified to work like the other traversal functions. */ for (pdrv = (start_pdrv ? start_pdrv : periph_drivers); *pdrv != NULL; pdrv++) { retval = tr_func(pdrv, arg); if (retval == 0) return(retval); } return(retval); } static int xptpdperiphtraverse(struct periph_driver **pdrv, struct cam_periph *start_periph, xpt_periphfunc_t *tr_func, void *arg) { struct cam_periph *periph, *next_periph; int retval; retval = 1; if (start_periph) periph = start_periph; else { xpt_lock_buses(); periph = TAILQ_FIRST(&(*pdrv)->units); while (periph != NULL && (periph->flags & CAM_PERIPH_FREE) != 0) periph = TAILQ_NEXT(periph, unit_links); if (periph == NULL) { xpt_unlock_buses(); return (retval); } periph->refcount++; xpt_unlock_buses(); } for (; periph != NULL; periph = next_periph) { cam_periph_lock(periph); retval = tr_func(periph, arg); cam_periph_unlock(periph); if (retval == 0) { cam_periph_release(periph); break; } xpt_lock_buses(); next_periph = TAILQ_NEXT(periph, unit_links); while (next_periph != NULL && (next_periph->flags & CAM_PERIPH_FREE) != 0) next_periph = TAILQ_NEXT(next_periph, unit_links); if (next_periph) next_periph->refcount++; xpt_unlock_buses(); cam_periph_release(periph); } return(retval); } static int xptdefbusfunc(struct cam_eb *bus, void *arg) { struct xpt_traverse_config *tr_config; tr_config = (struct xpt_traverse_config *)arg; if (tr_config->depth == XPT_DEPTH_BUS) { xpt_busfunc_t *tr_func; tr_func = (xpt_busfunc_t *)tr_config->tr_func; return(tr_func(bus, tr_config->tr_arg)); } else return(xpttargettraverse(bus, NULL, xptdeftargetfunc, arg)); } static int xptdeftargetfunc(struct cam_et *target, void *arg) { struct xpt_traverse_config *tr_config; tr_config = (struct xpt_traverse_config *)arg; if (tr_config->depth == XPT_DEPTH_TARGET) { xpt_targetfunc_t *tr_func; tr_func = (xpt_targetfunc_t *)tr_config->tr_func; return(tr_func(target, tr_config->tr_arg)); } else return(xptdevicetraverse(target, NULL, xptdefdevicefunc, arg)); } static int xptdefdevicefunc(struct cam_ed *device, void *arg) { struct xpt_traverse_config *tr_config; tr_config = (struct xpt_traverse_config *)arg; if (tr_config->depth == XPT_DEPTH_DEVICE) { xpt_devicefunc_t *tr_func; tr_func = (xpt_devicefunc_t *)tr_config->tr_func; return(tr_func(device, tr_config->tr_arg)); } else return(xptperiphtraverse(device, NULL, xptdefperiphfunc, arg)); } static int xptdefperiphfunc(struct cam_periph *periph, void *arg) { struct xpt_traverse_config *tr_config; xpt_periphfunc_t *tr_func; tr_config = (struct xpt_traverse_config *)arg; tr_func = (xpt_periphfunc_t *)tr_config->tr_func; /* * Unlike the other default functions, we don't check for depth * here. The peripheral driver level is the last level in the EDT, * so if we're here, we should execute the function in question. */ return(tr_func(periph, tr_config->tr_arg)); } /* * Execute the given function for every bus in the EDT. */ static int xpt_for_all_busses(xpt_busfunc_t *tr_func, void *arg) { struct xpt_traverse_config tr_config; tr_config.depth = XPT_DEPTH_BUS; tr_config.tr_func = tr_func; tr_config.tr_arg = arg; return(xptbustraverse(NULL, xptdefbusfunc, &tr_config)); } /* * Execute the given function for every device in the EDT. */ static int xpt_for_all_devices(xpt_devicefunc_t *tr_func, void *arg) { struct xpt_traverse_config tr_config; tr_config.depth = XPT_DEPTH_DEVICE; tr_config.tr_func = tr_func; tr_config.tr_arg = arg; return(xptbustraverse(NULL, xptdefbusfunc, &tr_config)); } static int xptsetasyncfunc(struct cam_ed *device, void *arg) { struct cam_path path; struct ccb_getdev cgd; struct ccb_setasync *csa = (struct ccb_setasync *)arg; /* * Don't report unconfigured devices (Wildcard devs, * devices only for target mode, device instances * that have been invalidated but are waiting for * their last reference count to be released). */ if ((device->flags & CAM_DEV_UNCONFIGURED) != 0) return (1); xpt_compile_path(&path, NULL, device->target->bus->path_id, device->target->target_id, device->lun_id); xpt_setup_ccb(&cgd.ccb_h, &path, CAM_PRIORITY_NORMAL); cgd.ccb_h.func_code = XPT_GDEV_TYPE; xpt_action((union ccb *)&cgd); csa->callback(csa->callback_arg, AC_FOUND_DEVICE, &path, &cgd); xpt_release_path(&path); return(1); } static int xptsetasyncbusfunc(struct cam_eb *bus, void *arg) { struct cam_path path; struct ccb_pathinq cpi; struct ccb_setasync *csa = (struct ccb_setasync *)arg; xpt_compile_path(&path, /*periph*/NULL, bus->path_id, CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD); xpt_path_lock(&path); xpt_path_inq(&cpi, &path); csa->callback(csa->callback_arg, AC_PATH_REGISTERED, &path, &cpi); xpt_path_unlock(&path); xpt_release_path(&path); return(1); } void xpt_action(union ccb *start_ccb) { CAM_DEBUG(start_ccb->ccb_h.path, CAM_DEBUG_TRACE, ("xpt_action: func %#x %s\n", start_ccb->ccb_h.func_code, xpt_action_name(start_ccb->ccb_h.func_code))); start_ccb->ccb_h.status = CAM_REQ_INPROG; (*(start_ccb->ccb_h.path->bus->xport->ops->action))(start_ccb); } void xpt_action_default(union ccb *start_ccb) { struct cam_path *path; struct cam_sim *sim; struct mtx *mtx; path = start_ccb->ccb_h.path; CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_action_default: func %#x %s\n", start_ccb->ccb_h.func_code, xpt_action_name(start_ccb->ccb_h.func_code))); switch (start_ccb->ccb_h.func_code) { case XPT_SCSI_IO: { struct cam_ed *device; /* * For the sake of compatibility with SCSI-1 * devices that may not understand the identify * message, we include lun information in the * second byte of all commands. SCSI-1 specifies * that luns are a 3 bit value and reserves only 3 * bits for lun information in the CDB. Later * revisions of the SCSI spec allow for more than 8 * luns, but have deprecated lun information in the * CDB. So, if the lun won't fit, we must omit. * * Also be aware that during initial probing for devices, * the inquiry information is unknown but initialized to 0. * This means that this code will be exercised while probing * devices with an ANSI revision greater than 2. */ device = path->device; if (device->protocol_version <= SCSI_REV_2 && start_ccb->ccb_h.target_lun < 8 && (start_ccb->ccb_h.flags & CAM_CDB_POINTER) == 0) { start_ccb->csio.cdb_io.cdb_bytes[1] |= start_ccb->ccb_h.target_lun << 5; } start_ccb->csio.scsi_status = SCSI_STATUS_OK; } /* FALLTHROUGH */ case XPT_TARGET_IO: case XPT_CONT_TARGET_IO: start_ccb->csio.sense_resid = 0; start_ccb->csio.resid = 0; /* FALLTHROUGH */ case XPT_ATA_IO: if (start_ccb->ccb_h.func_code == XPT_ATA_IO) start_ccb->ataio.resid = 0; /* FALLTHROUGH */ case XPT_NVME_IO: /* FALLTHROUGH */ case XPT_NVME_ADMIN: /* FALLTHROUGH */ case XPT_MMC_IO: /* XXX just like nmve_io? */ case XPT_RESET_DEV: case XPT_ENG_EXEC: case XPT_SMP_IO: { struct cam_devq *devq; devq = path->bus->sim->devq; mtx_lock(&devq->send_mtx); cam_ccbq_insert_ccb(&path->device->ccbq, start_ccb); if (xpt_schedule_devq(devq, path->device) != 0) xpt_run_devq(devq); mtx_unlock(&devq->send_mtx); break; } case XPT_CALC_GEOMETRY: /* Filter out garbage */ if (start_ccb->ccg.block_size == 0 || start_ccb->ccg.volume_size == 0) { start_ccb->ccg.cylinders = 0; start_ccb->ccg.heads = 0; start_ccb->ccg.secs_per_track = 0; start_ccb->ccb_h.status = CAM_REQ_CMP; break; } #if defined(__sparc64__) /* * For sparc64, we may need adjust the geometry of large * disks in order to fit the limitations of the 16-bit * fields of the VTOC8 disk label. */ if (scsi_da_bios_params(&start_ccb->ccg) != 0) { start_ccb->ccb_h.status = CAM_REQ_CMP; break; } #endif goto call_sim; case XPT_ABORT: { union ccb* abort_ccb; abort_ccb = start_ccb->cab.abort_ccb; if (XPT_FC_IS_DEV_QUEUED(abort_ccb)) { struct cam_ed *device; struct cam_devq *devq; device = abort_ccb->ccb_h.path->device; devq = device->sim->devq; mtx_lock(&devq->send_mtx); if (abort_ccb->ccb_h.pinfo.index > 0) { cam_ccbq_remove_ccb(&device->ccbq, abort_ccb); abort_ccb->ccb_h.status = CAM_REQ_ABORTED|CAM_DEV_QFRZN; xpt_freeze_devq_device(device, 1); mtx_unlock(&devq->send_mtx); xpt_done(abort_ccb); start_ccb->ccb_h.status = CAM_REQ_CMP; break; } mtx_unlock(&devq->send_mtx); if (abort_ccb->ccb_h.pinfo.index == CAM_UNQUEUED_INDEX && (abort_ccb->ccb_h.status & CAM_SIM_QUEUED) == 0) { /* * We've caught this ccb en route to * the SIM. Flag it for abort and the * SIM will do so just before starting * real work on the CCB. */ abort_ccb->ccb_h.status = CAM_REQ_ABORTED|CAM_DEV_QFRZN; xpt_freeze_devq(abort_ccb->ccb_h.path, 1); start_ccb->ccb_h.status = CAM_REQ_CMP; break; } } if (XPT_FC_IS_QUEUED(abort_ccb) && (abort_ccb->ccb_h.pinfo.index == CAM_DONEQ_INDEX)) { /* * It's already completed but waiting * for our SWI to get to it. */ start_ccb->ccb_h.status = CAM_UA_ABORT; break; } /* * If we weren't able to take care of the abort request * in the XPT, pass the request down to the SIM for processing. */ } /* FALLTHROUGH */ case XPT_ACCEPT_TARGET_IO: case XPT_EN_LUN: case XPT_IMMED_NOTIFY: case XPT_NOTIFY_ACK: case XPT_RESET_BUS: case XPT_IMMEDIATE_NOTIFY: case XPT_NOTIFY_ACKNOWLEDGE: case XPT_GET_SIM_KNOB_OLD: case XPT_GET_SIM_KNOB: case XPT_SET_SIM_KNOB: case XPT_GET_TRAN_SETTINGS: case XPT_SET_TRAN_SETTINGS: case XPT_PATH_INQ: call_sim: sim = path->bus->sim; mtx = sim->mtx; if (mtx && !mtx_owned(mtx)) mtx_lock(mtx); else mtx = NULL; CAM_DEBUG(path, CAM_DEBUG_TRACE, ("Calling sim->sim_action(): func=%#x\n", start_ccb->ccb_h.func_code)); (*(sim->sim_action))(sim, start_ccb); CAM_DEBUG(path, CAM_DEBUG_TRACE, ("sim->sim_action returned: status=%#x\n", start_ccb->ccb_h.status)); if (mtx) mtx_unlock(mtx); break; case XPT_PATH_STATS: start_ccb->cpis.last_reset = path->bus->last_reset; start_ccb->ccb_h.status = CAM_REQ_CMP; break; case XPT_GDEV_TYPE: { struct cam_ed *dev; dev = path->device; if ((dev->flags & CAM_DEV_UNCONFIGURED) != 0) { start_ccb->ccb_h.status = CAM_DEV_NOT_THERE; } else { struct ccb_getdev *cgd; cgd = &start_ccb->cgd; cgd->protocol = dev->protocol; cgd->inq_data = dev->inq_data; cgd->ident_data = dev->ident_data; cgd->inq_flags = dev->inq_flags; cgd->ccb_h.status = CAM_REQ_CMP; cgd->serial_num_len = dev->serial_num_len; if ((dev->serial_num_len > 0) && (dev->serial_num != NULL)) bcopy(dev->serial_num, cgd->serial_num, dev->serial_num_len); } break; } case XPT_GDEV_STATS: { struct ccb_getdevstats *cgds = &start_ccb->cgds; struct cam_ed *dev = path->device; struct cam_eb *bus = path->bus; struct cam_et *tar = path->target; struct cam_devq *devq = bus->sim->devq; mtx_lock(&devq->send_mtx); cgds->dev_openings = dev->ccbq.dev_openings; cgds->dev_active = dev->ccbq.dev_active; cgds->allocated = dev->ccbq.allocated; cgds->queued = cam_ccbq_pending_ccb_count(&dev->ccbq); cgds->held = cgds->allocated - cgds->dev_active - cgds->queued; cgds->last_reset = tar->last_reset; cgds->maxtags = dev->maxtags; cgds->mintags = dev->mintags; if (timevalcmp(&tar->last_reset, &bus->last_reset, <)) cgds->last_reset = bus->last_reset; mtx_unlock(&devq->send_mtx); cgds->ccb_h.status = CAM_REQ_CMP; break; } case XPT_GDEVLIST: { struct cam_periph *nperiph; struct periph_list *periph_head; struct ccb_getdevlist *cgdl; u_int i; struct cam_ed *device; int found; found = 0; /* * Don't want anyone mucking with our data. */ device = path->device; periph_head = &device->periphs; cgdl = &start_ccb->cgdl; /* * Check and see if the list has changed since the user * last requested a list member. If so, tell them that the * list has changed, and therefore they need to start over * from the beginning. */ if ((cgdl->index != 0) && (cgdl->generation != device->generation)) { cgdl->status = CAM_GDEVLIST_LIST_CHANGED; break; } /* * Traverse the list of peripherals and attempt to find * the requested peripheral. */ for (nperiph = SLIST_FIRST(periph_head), i = 0; (nperiph != NULL) && (i <= cgdl->index); nperiph = SLIST_NEXT(nperiph, periph_links), i++) { if (i == cgdl->index) { strlcpy(cgdl->periph_name, nperiph->periph_name, sizeof(cgdl->periph_name)); cgdl->unit_number = nperiph->unit_number; found = 1; } } if (found == 0) { cgdl->status = CAM_GDEVLIST_ERROR; break; } if (nperiph == NULL) cgdl->status = CAM_GDEVLIST_LAST_DEVICE; else cgdl->status = CAM_GDEVLIST_MORE_DEVS; cgdl->index++; cgdl->generation = device->generation; cgdl->ccb_h.status = CAM_REQ_CMP; break; } case XPT_DEV_MATCH: { dev_pos_type position_type; struct ccb_dev_match *cdm; cdm = &start_ccb->cdm; /* * There are two ways of getting at information in the EDT. * The first way is via the primary EDT tree. It starts * with a list of buses, then a list of targets on a bus, * then devices/luns on a target, and then peripherals on a * device/lun. The "other" way is by the peripheral driver * lists. The peripheral driver lists are organized by * peripheral driver. (obviously) So it makes sense to * use the peripheral driver list if the user is looking * for something like "da1", or all "da" devices. If the * user is looking for something on a particular bus/target * or lun, it's generally better to go through the EDT tree. */ if (cdm->pos.position_type != CAM_DEV_POS_NONE) position_type = cdm->pos.position_type; else { u_int i; position_type = CAM_DEV_POS_NONE; for (i = 0; i < cdm->num_patterns; i++) { if ((cdm->patterns[i].type == DEV_MATCH_BUS) ||(cdm->patterns[i].type == DEV_MATCH_DEVICE)){ position_type = CAM_DEV_POS_EDT; break; } } if (cdm->num_patterns == 0) position_type = CAM_DEV_POS_EDT; else if (position_type == CAM_DEV_POS_NONE) position_type = CAM_DEV_POS_PDRV; } switch(position_type & CAM_DEV_POS_TYPEMASK) { case CAM_DEV_POS_EDT: xptedtmatch(cdm); break; case CAM_DEV_POS_PDRV: xptperiphlistmatch(cdm); break; default: cdm->status = CAM_DEV_MATCH_ERROR; break; } if (cdm->status == CAM_DEV_MATCH_ERROR) start_ccb->ccb_h.status = CAM_REQ_CMP_ERR; else start_ccb->ccb_h.status = CAM_REQ_CMP; break; } case XPT_SASYNC_CB: { struct ccb_setasync *csa; struct async_node *cur_entry; struct async_list *async_head; u_int32_t added; csa = &start_ccb->csa; added = csa->event_enable; async_head = &path->device->asyncs; /* * If there is already an entry for us, simply * update it. */ cur_entry = SLIST_FIRST(async_head); while (cur_entry != NULL) { if ((cur_entry->callback_arg == csa->callback_arg) && (cur_entry->callback == csa->callback)) break; cur_entry = SLIST_NEXT(cur_entry, links); } if (cur_entry != NULL) { /* * If the request has no flags set, * remove the entry. */ added &= ~cur_entry->event_enable; if (csa->event_enable == 0) { SLIST_REMOVE(async_head, cur_entry, async_node, links); xpt_release_device(path->device); free(cur_entry, M_CAMXPT); } else { cur_entry->event_enable = csa->event_enable; } csa->event_enable = added; } else { cur_entry = malloc(sizeof(*cur_entry), M_CAMXPT, M_NOWAIT); if (cur_entry == NULL) { csa->ccb_h.status = CAM_RESRC_UNAVAIL; break; } cur_entry->event_enable = csa->event_enable; cur_entry->event_lock = (path->bus->sim->mtx && mtx_owned(path->bus->sim->mtx)) ? 1 : 0; cur_entry->callback_arg = csa->callback_arg; cur_entry->callback = csa->callback; SLIST_INSERT_HEAD(async_head, cur_entry, links); xpt_acquire_device(path->device); } start_ccb->ccb_h.status = CAM_REQ_CMP; break; } case XPT_REL_SIMQ: { struct ccb_relsim *crs; struct cam_ed *dev; crs = &start_ccb->crs; dev = path->device; if (dev == NULL) { crs->ccb_h.status = CAM_DEV_NOT_THERE; break; } if ((crs->release_flags & RELSIM_ADJUST_OPENINGS) != 0) { /* Don't ever go below one opening */ if (crs->openings > 0) { xpt_dev_ccbq_resize(path, crs->openings); if (bootverbose) { xpt_print(path, "number of openings is now %d\n", crs->openings); } } } mtx_lock(&dev->sim->devq->send_mtx); if ((crs->release_flags & RELSIM_RELEASE_AFTER_TIMEOUT) != 0) { if ((dev->flags & CAM_DEV_REL_TIMEOUT_PENDING) != 0) { /* * Just extend the old timeout and decrement * the freeze count so that a single timeout * is sufficient for releasing the queue. */ start_ccb->ccb_h.flags &= ~CAM_DEV_QFREEZE; callout_stop(&dev->callout); } else { start_ccb->ccb_h.flags |= CAM_DEV_QFREEZE; } callout_reset_sbt(&dev->callout, SBT_1MS * crs->release_timeout, 0, xpt_release_devq_timeout, dev, 0); dev->flags |= CAM_DEV_REL_TIMEOUT_PENDING; } if ((crs->release_flags & RELSIM_RELEASE_AFTER_CMDCMPLT) != 0) { if ((dev->flags & CAM_DEV_REL_ON_COMPLETE) != 0) { /* * Decrement the freeze count so that a single * completion is still sufficient to unfreeze * the queue. */ start_ccb->ccb_h.flags &= ~CAM_DEV_QFREEZE; } else { dev->flags |= CAM_DEV_REL_ON_COMPLETE; start_ccb->ccb_h.flags |= CAM_DEV_QFREEZE; } } if ((crs->release_flags & RELSIM_RELEASE_AFTER_QEMPTY) != 0) { if ((dev->flags & CAM_DEV_REL_ON_QUEUE_EMPTY) != 0 || (dev->ccbq.dev_active == 0)) { start_ccb->ccb_h.flags &= ~CAM_DEV_QFREEZE; } else { dev->flags |= CAM_DEV_REL_ON_QUEUE_EMPTY; start_ccb->ccb_h.flags |= CAM_DEV_QFREEZE; } } mtx_unlock(&dev->sim->devq->send_mtx); if ((start_ccb->ccb_h.flags & CAM_DEV_QFREEZE) == 0) xpt_release_devq(path, /*count*/1, /*run_queue*/TRUE); start_ccb->crs.qfrozen_cnt = dev->ccbq.queue.qfrozen_cnt; start_ccb->ccb_h.status = CAM_REQ_CMP; break; } case XPT_DEBUG: { struct cam_path *oldpath; /* Check that all request bits are supported. */ if (start_ccb->cdbg.flags & ~(CAM_DEBUG_COMPILE)) { start_ccb->ccb_h.status = CAM_FUNC_NOTAVAIL; break; } cam_dflags = CAM_DEBUG_NONE; if (cam_dpath != NULL) { oldpath = cam_dpath; cam_dpath = NULL; xpt_free_path(oldpath); } if (start_ccb->cdbg.flags != CAM_DEBUG_NONE) { if (xpt_create_path(&cam_dpath, NULL, start_ccb->ccb_h.path_id, start_ccb->ccb_h.target_id, start_ccb->ccb_h.target_lun) != CAM_REQ_CMP) { start_ccb->ccb_h.status = CAM_RESRC_UNAVAIL; } else { cam_dflags = start_ccb->cdbg.flags; start_ccb->ccb_h.status = CAM_REQ_CMP; xpt_print(cam_dpath, "debugging flags now %x\n", cam_dflags); } } else start_ccb->ccb_h.status = CAM_REQ_CMP; break; } case XPT_NOOP: if ((start_ccb->ccb_h.flags & CAM_DEV_QFREEZE) != 0) xpt_freeze_devq(path, 1); start_ccb->ccb_h.status = CAM_REQ_CMP; break; case XPT_REPROBE_LUN: xpt_async(AC_INQ_CHANGED, path, NULL); start_ccb->ccb_h.status = CAM_REQ_CMP; xpt_done(start_ccb); break; default: case XPT_SDEV_TYPE: case XPT_TERM_IO: case XPT_ENG_INQ: /* XXX Implement */ xpt_print(start_ccb->ccb_h.path, "%s: CCB type %#x %s not supported\n", __func__, start_ccb->ccb_h.func_code, xpt_action_name(start_ccb->ccb_h.func_code)); start_ccb->ccb_h.status = CAM_PROVIDE_FAIL; if (start_ccb->ccb_h.func_code & XPT_FC_DEV_QUEUED) { xpt_done(start_ccb); } break; } CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_action_default: func= %#x %s status %#x\n", start_ccb->ccb_h.func_code, xpt_action_name(start_ccb->ccb_h.func_code), start_ccb->ccb_h.status)); } /* * Call the sim poll routine to allow the sim to complete * any inflight requests, then call camisr_runqueue to * complete any CCB that the polling completed. */ void xpt_sim_poll(struct cam_sim *sim) { struct mtx *mtx; mtx = sim->mtx; if (mtx) mtx_lock(mtx); (*(sim->sim_poll))(sim); if (mtx) mtx_unlock(mtx); camisr_runqueue(); } uint32_t xpt_poll_setup(union ccb *start_ccb) { u_int32_t timeout; struct cam_sim *sim; struct cam_devq *devq; struct cam_ed *dev; timeout = start_ccb->ccb_h.timeout * 10; sim = start_ccb->ccb_h.path->bus->sim; devq = sim->devq; dev = start_ccb->ccb_h.path->device; /* * Steal an opening so that no other queued requests * can get it before us while we simulate interrupts. */ mtx_lock(&devq->send_mtx); dev->ccbq.dev_openings--; while((devq->send_openings <= 0 || dev->ccbq.dev_openings < 0) && (--timeout > 0)) { mtx_unlock(&devq->send_mtx); DELAY(100); xpt_sim_poll(sim); mtx_lock(&devq->send_mtx); } dev->ccbq.dev_openings++; mtx_unlock(&devq->send_mtx); return (timeout); } void xpt_pollwait(union ccb *start_ccb, uint32_t timeout) { while (--timeout > 0) { xpt_sim_poll(start_ccb->ccb_h.path->bus->sim); if ((start_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_INPROG) break; DELAY(100); } if (timeout == 0) { /* * XXX Is it worth adding a sim_timeout entry * point so we can attempt recovery? If * this is only used for dumps, I don't think * it is. */ start_ccb->ccb_h.status = CAM_CMD_TIMEOUT; } } void xpt_polled_action(union ccb *start_ccb) { uint32_t timeout; struct cam_ed *dev; timeout = start_ccb->ccb_h.timeout * 10; dev = start_ccb->ccb_h.path->device; mtx_unlock(&dev->device_mtx); timeout = xpt_poll_setup(start_ccb); if (timeout > 0) { xpt_action(start_ccb); xpt_pollwait(start_ccb, timeout); } else { start_ccb->ccb_h.status = CAM_RESRC_UNAVAIL; } mtx_lock(&dev->device_mtx); } /* * Schedule a peripheral driver to receive a ccb when its * target device has space for more transactions. */ void xpt_schedule(struct cam_periph *periph, u_int32_t new_priority) { CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("xpt_schedule\n")); cam_periph_assert(periph, MA_OWNED); if (new_priority < periph->scheduled_priority) { periph->scheduled_priority = new_priority; xpt_run_allocq(periph, 0); } } /* * Schedule a device to run on a given queue. * If the device was inserted as a new entry on the queue, * return 1 meaning the device queue should be run. If we * were already queued, implying someone else has already * started the queue, return 0 so the caller doesn't attempt * to run the queue. */ static int xpt_schedule_dev(struct camq *queue, cam_pinfo *pinfo, u_int32_t new_priority) { int retval; u_int32_t old_priority; CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_schedule_dev\n")); old_priority = pinfo->priority; /* * Are we already queued? */ if (pinfo->index != CAM_UNQUEUED_INDEX) { /* Simply reorder based on new priority */ if (new_priority < old_priority) { camq_change_priority(queue, pinfo->index, new_priority); CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("changed priority to %d\n", new_priority)); retval = 1; } else retval = 0; } else { /* New entry on the queue */ if (new_priority < old_priority) pinfo->priority = new_priority; CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("Inserting onto queue\n")); pinfo->generation = ++queue->generation; camq_insert(queue, pinfo); retval = 1; } return (retval); } static void xpt_run_allocq_task(void *context, int pending) { struct cam_periph *periph = context; cam_periph_lock(periph); periph->flags &= ~CAM_PERIPH_RUN_TASK; xpt_run_allocq(periph, 1); cam_periph_unlock(periph); cam_periph_release(periph); } static void xpt_run_allocq(struct cam_periph *periph, int sleep) { struct cam_ed *device; union ccb *ccb; uint32_t prio; cam_periph_assert(periph, MA_OWNED); if (periph->periph_allocating) return; cam_periph_doacquire(periph); periph->periph_allocating = 1; CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_run_allocq(%p)\n", periph)); device = periph->path->device; ccb = NULL; restart: while ((prio = min(periph->scheduled_priority, periph->immediate_priority)) != CAM_PRIORITY_NONE && (periph->periph_allocated - (ccb != NULL ? 1 : 0) < device->ccbq.total_openings || prio <= CAM_PRIORITY_OOB)) { if (ccb == NULL && (ccb = xpt_get_ccb_nowait(periph)) == NULL) { if (sleep) { ccb = xpt_get_ccb(periph); goto restart; } if (periph->flags & CAM_PERIPH_RUN_TASK) break; cam_periph_doacquire(periph); periph->flags |= CAM_PERIPH_RUN_TASK; taskqueue_enqueue(xsoftc.xpt_taskq, &periph->periph_run_task); break; } xpt_setup_ccb(&ccb->ccb_h, periph->path, prio); if (prio == periph->immediate_priority) { periph->immediate_priority = CAM_PRIORITY_NONE; CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("waking cam_periph_getccb()\n")); SLIST_INSERT_HEAD(&periph->ccb_list, &ccb->ccb_h, periph_links.sle); wakeup(&periph->ccb_list); } else { periph->scheduled_priority = CAM_PRIORITY_NONE; CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("calling periph_start()\n")); periph->periph_start(periph, ccb); } ccb = NULL; } if (ccb != NULL) xpt_release_ccb(ccb); periph->periph_allocating = 0; cam_periph_release_locked(periph); } static void xpt_run_devq(struct cam_devq *devq) { struct mtx *mtx; CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_run_devq\n")); devq->send_queue.qfrozen_cnt++; while ((devq->send_queue.entries > 0) && (devq->send_openings > 0) && (devq->send_queue.qfrozen_cnt <= 1)) { struct cam_ed *device; union ccb *work_ccb; struct cam_sim *sim; struct xpt_proto *proto; device = (struct cam_ed *)camq_remove(&devq->send_queue, CAMQ_HEAD); CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("running device %p\n", device)); work_ccb = cam_ccbq_peek_ccb(&device->ccbq, CAMQ_HEAD); if (work_ccb == NULL) { printf("device on run queue with no ccbs???\n"); continue; } if ((work_ccb->ccb_h.flags & CAM_HIGH_POWER) != 0) { mtx_lock(&xsoftc.xpt_highpower_lock); if (xsoftc.num_highpower <= 0) { /* * We got a high power command, but we * don't have any available slots. Freeze * the device queue until we have a slot * available. */ xpt_freeze_devq_device(device, 1); STAILQ_INSERT_TAIL(&xsoftc.highpowerq, device, highpowerq_entry); mtx_unlock(&xsoftc.xpt_highpower_lock); continue; } else { /* * Consume a high power slot while * this ccb runs. */ xsoftc.num_highpower--; } mtx_unlock(&xsoftc.xpt_highpower_lock); } cam_ccbq_remove_ccb(&device->ccbq, work_ccb); cam_ccbq_send_ccb(&device->ccbq, work_ccb); devq->send_openings--; devq->send_active++; xpt_schedule_devq(devq, device); mtx_unlock(&devq->send_mtx); if ((work_ccb->ccb_h.flags & CAM_DEV_QFREEZE) != 0) { /* * The client wants to freeze the queue * after this CCB is sent. */ xpt_freeze_devq(work_ccb->ccb_h.path, 1); } /* In Target mode, the peripheral driver knows best... */ if (work_ccb->ccb_h.func_code == XPT_SCSI_IO) { if ((device->inq_flags & SID_CmdQue) != 0 && work_ccb->csio.tag_action != CAM_TAG_ACTION_NONE) work_ccb->ccb_h.flags |= CAM_TAG_ACTION_VALID; else /* * Clear this in case of a retried CCB that * failed due to a rejected tag. */ work_ccb->ccb_h.flags &= ~CAM_TAG_ACTION_VALID; } KASSERT(device == work_ccb->ccb_h.path->device, ("device (%p) / path->device (%p) mismatch", device, work_ccb->ccb_h.path->device)); proto = xpt_proto_find(device->protocol); if (proto && proto->ops->debug_out) proto->ops->debug_out(work_ccb); /* * Device queues can be shared among multiple SIM instances * that reside on different buses. Use the SIM from the * queued device, rather than the one from the calling bus. */ sim = device->sim; mtx = sim->mtx; if (mtx && !mtx_owned(mtx)) mtx_lock(mtx); else mtx = NULL; work_ccb->ccb_h.qos.periph_data = cam_iosched_now(); (*(sim->sim_action))(sim, work_ccb); if (mtx) mtx_unlock(mtx); mtx_lock(&devq->send_mtx); } devq->send_queue.qfrozen_cnt--; } /* * This function merges stuff from the slave ccb into the master ccb, while * keeping important fields in the master ccb constant. */ void xpt_merge_ccb(union ccb *master_ccb, union ccb *slave_ccb) { /* * Pull fields that are valid for peripheral drivers to set * into the master CCB along with the CCB "payload". */ master_ccb->ccb_h.retry_count = slave_ccb->ccb_h.retry_count; master_ccb->ccb_h.func_code = slave_ccb->ccb_h.func_code; master_ccb->ccb_h.timeout = slave_ccb->ccb_h.timeout; master_ccb->ccb_h.flags = slave_ccb->ccb_h.flags; bcopy(&(&slave_ccb->ccb_h)[1], &(&master_ccb->ccb_h)[1], sizeof(union ccb) - sizeof(struct ccb_hdr)); } void xpt_setup_ccb_flags(struct ccb_hdr *ccb_h, struct cam_path *path, u_int32_t priority, u_int32_t flags) { CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_setup_ccb\n")); ccb_h->pinfo.priority = priority; ccb_h->path = path; ccb_h->path_id = path->bus->path_id; if (path->target) ccb_h->target_id = path->target->target_id; else ccb_h->target_id = CAM_TARGET_WILDCARD; if (path->device) { ccb_h->target_lun = path->device->lun_id; ccb_h->pinfo.generation = ++path->device->ccbq.queue.generation; } else { ccb_h->target_lun = CAM_TARGET_WILDCARD; } ccb_h->pinfo.index = CAM_UNQUEUED_INDEX; ccb_h->flags = flags; ccb_h->xflags = 0; } void xpt_setup_ccb(struct ccb_hdr *ccb_h, struct cam_path *path, u_int32_t priority) { xpt_setup_ccb_flags(ccb_h, path, priority, /*flags*/ 0); } /* Path manipulation functions */ cam_status xpt_create_path(struct cam_path **new_path_ptr, struct cam_periph *perph, path_id_t path_id, target_id_t target_id, lun_id_t lun_id) { struct cam_path *path; cam_status status; path = (struct cam_path *)malloc(sizeof(*path), M_CAMPATH, M_NOWAIT); if (path == NULL) { status = CAM_RESRC_UNAVAIL; return(status); } status = xpt_compile_path(path, perph, path_id, target_id, lun_id); if (status != CAM_REQ_CMP) { free(path, M_CAMPATH); path = NULL; } *new_path_ptr = path; return (status); } cam_status xpt_create_path_unlocked(struct cam_path **new_path_ptr, struct cam_periph *periph, path_id_t path_id, target_id_t target_id, lun_id_t lun_id) { return (xpt_create_path(new_path_ptr, periph, path_id, target_id, lun_id)); } cam_status xpt_compile_path(struct cam_path *new_path, struct cam_periph *perph, path_id_t path_id, target_id_t target_id, lun_id_t lun_id) { struct cam_eb *bus; struct cam_et *target; struct cam_ed *device; cam_status status; status = CAM_REQ_CMP; /* Completed without error */ target = NULL; /* Wildcarded */ device = NULL; /* Wildcarded */ /* * We will potentially modify the EDT, so block interrupts * that may attempt to create cam paths. */ bus = xpt_find_bus(path_id); if (bus == NULL) { status = CAM_PATH_INVALID; } else { xpt_lock_buses(); mtx_lock(&bus->eb_mtx); target = xpt_find_target(bus, target_id); if (target == NULL) { /* Create one */ struct cam_et *new_target; new_target = xpt_alloc_target(bus, target_id); if (new_target == NULL) { status = CAM_RESRC_UNAVAIL; } else { target = new_target; } } xpt_unlock_buses(); if (target != NULL) { device = xpt_find_device(target, lun_id); if (device == NULL) { /* Create one */ struct cam_ed *new_device; new_device = (*(bus->xport->ops->alloc_device))(bus, target, lun_id); if (new_device == NULL) { status = CAM_RESRC_UNAVAIL; } else { device = new_device; } } } mtx_unlock(&bus->eb_mtx); } /* * Only touch the user's data if we are successful. */ if (status == CAM_REQ_CMP) { new_path->periph = perph; new_path->bus = bus; new_path->target = target; new_path->device = device; CAM_DEBUG(new_path, CAM_DEBUG_TRACE, ("xpt_compile_path\n")); } else { if (device != NULL) xpt_release_device(device); if (target != NULL) xpt_release_target(target); if (bus != NULL) xpt_release_bus(bus); } return (status); } cam_status xpt_clone_path(struct cam_path **new_path_ptr, struct cam_path *path) { struct cam_path *new_path; new_path = (struct cam_path *)malloc(sizeof(*path), M_CAMPATH, M_NOWAIT); if (new_path == NULL) return(CAM_RESRC_UNAVAIL); xpt_copy_path(new_path, path); *new_path_ptr = new_path; return (CAM_REQ_CMP); } void xpt_copy_path(struct cam_path *new_path, struct cam_path *path) { *new_path = *path; if (path->bus != NULL) xpt_acquire_bus(path->bus); if (path->target != NULL) xpt_acquire_target(path->target); if (path->device != NULL) xpt_acquire_device(path->device); } void xpt_release_path(struct cam_path *path) { CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_release_path\n")); if (path->device != NULL) { xpt_release_device(path->device); path->device = NULL; } if (path->target != NULL) { xpt_release_target(path->target); path->target = NULL; } if (path->bus != NULL) { xpt_release_bus(path->bus); path->bus = NULL; } } void xpt_free_path(struct cam_path *path) { CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_free_path\n")); xpt_release_path(path); free(path, M_CAMPATH); } void xpt_path_counts(struct cam_path *path, uint32_t *bus_ref, uint32_t *periph_ref, uint32_t *target_ref, uint32_t *device_ref) { xpt_lock_buses(); if (bus_ref) { if (path->bus) *bus_ref = path->bus->refcount; else *bus_ref = 0; } if (periph_ref) { if (path->periph) *periph_ref = path->periph->refcount; else *periph_ref = 0; } xpt_unlock_buses(); if (target_ref) { if (path->target) *target_ref = path->target->refcount; else *target_ref = 0; } if (device_ref) { if (path->device) *device_ref = path->device->refcount; else *device_ref = 0; } } /* * Return -1 for failure, 0 for exact match, 1 for match with wildcards * in path1, 2 for match with wildcards in path2. */ int xpt_path_comp(struct cam_path *path1, struct cam_path *path2) { int retval = 0; if (path1->bus != path2->bus) { if (path1->bus->path_id == CAM_BUS_WILDCARD) retval = 1; else if (path2->bus->path_id == CAM_BUS_WILDCARD) retval = 2; else return (-1); } if (path1->target != path2->target) { if (path1->target->target_id == CAM_TARGET_WILDCARD) { if (retval == 0) retval = 1; } else if (path2->target->target_id == CAM_TARGET_WILDCARD) retval = 2; else return (-1); } if (path1->device != path2->device) { if (path1->device->lun_id == CAM_LUN_WILDCARD) { if (retval == 0) retval = 1; } else if (path2->device->lun_id == CAM_LUN_WILDCARD) retval = 2; else return (-1); } return (retval); } int xpt_path_comp_dev(struct cam_path *path, struct cam_ed *dev) { int retval = 0; if (path->bus != dev->target->bus) { if (path->bus->path_id == CAM_BUS_WILDCARD) retval = 1; else if (dev->target->bus->path_id == CAM_BUS_WILDCARD) retval = 2; else return (-1); } if (path->target != dev->target) { if (path->target->target_id == CAM_TARGET_WILDCARD) { if (retval == 0) retval = 1; } else if (dev->target->target_id == CAM_TARGET_WILDCARD) retval = 2; else return (-1); } if (path->device != dev) { if (path->device->lun_id == CAM_LUN_WILDCARD) { if (retval == 0) retval = 1; } else if (dev->lun_id == CAM_LUN_WILDCARD) retval = 2; else return (-1); } return (retval); } void xpt_print_path(struct cam_path *path) { struct sbuf sb; char buffer[XPT_PRINT_LEN]; sbuf_new(&sb, buffer, XPT_PRINT_LEN, SBUF_FIXEDLEN); xpt_path_sbuf(path, &sb); sbuf_finish(&sb); printf("%s", sbuf_data(&sb)); sbuf_delete(&sb); } void xpt_print_device(struct cam_ed *device) { if (device == NULL) printf("(nopath): "); else { printf("(noperiph:%s%d:%d:%d:%jx): ", device->sim->sim_name, device->sim->unit_number, device->sim->bus_id, device->target->target_id, (uintmax_t)device->lun_id); } } void xpt_print(struct cam_path *path, const char *fmt, ...) { va_list ap; struct sbuf sb; char buffer[XPT_PRINT_LEN]; sbuf_new(&sb, buffer, XPT_PRINT_LEN, SBUF_FIXEDLEN); xpt_path_sbuf(path, &sb); va_start(ap, fmt); sbuf_vprintf(&sb, fmt, ap); va_end(ap); sbuf_finish(&sb); printf("%s", sbuf_data(&sb)); sbuf_delete(&sb); } int xpt_path_string(struct cam_path *path, char *str, size_t str_len) { struct sbuf sb; int len; sbuf_new(&sb, str, str_len, 0); len = xpt_path_sbuf(path, &sb); sbuf_finish(&sb); return (len); } int xpt_path_sbuf(struct cam_path *path, struct sbuf *sb) { if (path == NULL) sbuf_printf(sb, "(nopath): "); else { if (path->periph != NULL) sbuf_printf(sb, "(%s%d:", path->periph->periph_name, path->periph->unit_number); else sbuf_printf(sb, "(noperiph:"); if (path->bus != NULL) sbuf_printf(sb, "%s%d:%d:", path->bus->sim->sim_name, path->bus->sim->unit_number, path->bus->sim->bus_id); else sbuf_printf(sb, "nobus:"); if (path->target != NULL) sbuf_printf(sb, "%d:", path->target->target_id); else sbuf_printf(sb, "X:"); if (path->device != NULL) sbuf_printf(sb, "%jx): ", (uintmax_t)path->device->lun_id); else sbuf_printf(sb, "X): "); } return(sbuf_len(sb)); } path_id_t xpt_path_path_id(struct cam_path *path) { return(path->bus->path_id); } target_id_t xpt_path_target_id(struct cam_path *path) { if (path->target != NULL) return (path->target->target_id); else return (CAM_TARGET_WILDCARD); } lun_id_t xpt_path_lun_id(struct cam_path *path) { if (path->device != NULL) return (path->device->lun_id); else return (CAM_LUN_WILDCARD); } struct cam_sim * xpt_path_sim(struct cam_path *path) { return (path->bus->sim); } struct cam_periph* xpt_path_periph(struct cam_path *path) { return (path->periph); } /* * Release a CAM control block for the caller. Remit the cost of the structure * to the device referenced by the path. If the this device had no 'credits' * and peripheral drivers have registered async callbacks for this notification * call them now. */ void xpt_release_ccb(union ccb *free_ccb) { struct cam_ed *device; struct cam_periph *periph; CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_release_ccb\n")); xpt_path_assert(free_ccb->ccb_h.path, MA_OWNED); device = free_ccb->ccb_h.path->device; periph = free_ccb->ccb_h.path->periph; xpt_free_ccb(free_ccb); periph->periph_allocated--; cam_ccbq_release_opening(&device->ccbq); xpt_run_allocq(periph, 0); } /* Functions accessed by SIM drivers */ static struct xpt_xport_ops xport_default_ops = { .alloc_device = xpt_alloc_device_default, .action = xpt_action_default, .async = xpt_dev_async_default, }; static struct xpt_xport xport_default = { .xport = XPORT_UNKNOWN, .name = "unknown", .ops = &xport_default_ops, }; CAM_XPT_XPORT(xport_default); /* * A sim structure, listing the SIM entry points and instance * identification info is passed to xpt_bus_register to hook the SIM * into the CAM framework. xpt_bus_register creates a cam_eb entry * for this new bus and places it in the array of buses and assigns * it a path_id. The path_id may be influenced by "hard wiring" * information specified by the user. Once interrupt services are * available, the bus will be probed. */ int32_t xpt_bus_register(struct cam_sim *sim, device_t parent, u_int32_t bus) { struct cam_eb *new_bus; struct cam_eb *old_bus; struct ccb_pathinq cpi; struct cam_path *path; cam_status status; sim->bus_id = bus; new_bus = (struct cam_eb *)malloc(sizeof(*new_bus), M_CAMXPT, M_NOWAIT|M_ZERO); if (new_bus == NULL) { /* Couldn't satisfy request */ return (CAM_RESRC_UNAVAIL); } mtx_init(&new_bus->eb_mtx, "CAM bus lock", NULL, MTX_DEF); TAILQ_INIT(&new_bus->et_entries); cam_sim_hold(sim); new_bus->sim = sim; timevalclear(&new_bus->last_reset); new_bus->flags = 0; new_bus->refcount = 1; /* Held until a bus_deregister event */ new_bus->generation = 0; xpt_lock_buses(); sim->path_id = new_bus->path_id = xptpathid(sim->sim_name, sim->unit_number, sim->bus_id); old_bus = TAILQ_FIRST(&xsoftc.xpt_busses); while (old_bus != NULL && old_bus->path_id < new_bus->path_id) old_bus = TAILQ_NEXT(old_bus, links); if (old_bus != NULL) TAILQ_INSERT_BEFORE(old_bus, new_bus, links); else TAILQ_INSERT_TAIL(&xsoftc.xpt_busses, new_bus, links); xsoftc.bus_generation++; xpt_unlock_buses(); /* * Set a default transport so that a PATH_INQ can be issued to * the SIM. This will then allow for probing and attaching of * a more appropriate transport. */ new_bus->xport = &xport_default; status = xpt_create_path(&path, /*periph*/NULL, sim->path_id, CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD); if (status != CAM_REQ_CMP) { xpt_release_bus(new_bus); return (CAM_RESRC_UNAVAIL); } xpt_path_inq(&cpi, path); if (cpi.ccb_h.status == CAM_REQ_CMP) { struct xpt_xport **xpt; SET_FOREACH(xpt, cam_xpt_xport_set) { if ((*xpt)->xport == cpi.transport) { new_bus->xport = *xpt; break; } } if (new_bus->xport == NULL) { xpt_print(path, "No transport found for %d\n", cpi.transport); xpt_release_bus(new_bus); free(path, M_CAMXPT); return (CAM_RESRC_UNAVAIL); } } /* Notify interested parties */ if (sim->path_id != CAM_XPT_PATH_ID) { xpt_async(AC_PATH_REGISTERED, path, &cpi); if ((cpi.hba_misc & PIM_NOSCAN) == 0) { union ccb *scan_ccb; /* Initiate bus rescan. */ scan_ccb = xpt_alloc_ccb_nowait(); if (scan_ccb != NULL) { scan_ccb->ccb_h.path = path; scan_ccb->ccb_h.func_code = XPT_SCAN_BUS; scan_ccb->crcn.flags = 0; xpt_rescan(scan_ccb); } else { xpt_print(path, "Can't allocate CCB to scan bus\n"); xpt_free_path(path); } } else xpt_free_path(path); } else xpt_free_path(path); return (CAM_SUCCESS); } int32_t xpt_bus_deregister(path_id_t pathid) { struct cam_path bus_path; cam_status status; status = xpt_compile_path(&bus_path, NULL, pathid, CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD); if (status != CAM_REQ_CMP) return (status); xpt_async(AC_LOST_DEVICE, &bus_path, NULL); xpt_async(AC_PATH_DEREGISTERED, &bus_path, NULL); /* Release the reference count held while registered. */ xpt_release_bus(bus_path.bus); xpt_release_path(&bus_path); return (CAM_REQ_CMP); } static path_id_t xptnextfreepathid(void) { struct cam_eb *bus; path_id_t pathid; const char *strval; mtx_assert(&xsoftc.xpt_topo_lock, MA_OWNED); pathid = 0; bus = TAILQ_FIRST(&xsoftc.xpt_busses); retry: /* Find an unoccupied pathid */ while (bus != NULL && bus->path_id <= pathid) { if (bus->path_id == pathid) pathid++; bus = TAILQ_NEXT(bus, links); } /* * Ensure that this pathid is not reserved for * a bus that may be registered in the future. */ if (resource_string_value("scbus", pathid, "at", &strval) == 0) { ++pathid; /* Start the search over */ goto retry; } return (pathid); } static path_id_t xptpathid(const char *sim_name, int sim_unit, int sim_bus) { path_id_t pathid; int i, dunit, val; char buf[32]; const char *dname; pathid = CAM_XPT_PATH_ID; snprintf(buf, sizeof(buf), "%s%d", sim_name, sim_unit); if (strcmp(buf, "xpt0") == 0 && sim_bus == 0) return (pathid); i = 0; while ((resource_find_match(&i, &dname, &dunit, "at", buf)) == 0) { if (strcmp(dname, "scbus")) { /* Avoid a bit of foot shooting. */ continue; } if (dunit < 0) /* unwired?! */ continue; if (resource_int_value("scbus", dunit, "bus", &val) == 0) { if (sim_bus == val) { pathid = dunit; break; } } else if (sim_bus == 0) { /* Unspecified matches bus 0 */ pathid = dunit; break; } else { printf("Ambiguous scbus configuration for %s%d " "bus %d, cannot wire down. The kernel " "config entry for scbus%d should " "specify a controller bus.\n" "Scbus will be assigned dynamically.\n", sim_name, sim_unit, sim_bus, dunit); break; } } if (pathid == CAM_XPT_PATH_ID) pathid = xptnextfreepathid(); return (pathid); } static const char * xpt_async_string(u_int32_t async_code) { switch (async_code) { case AC_BUS_RESET: return ("AC_BUS_RESET"); case AC_UNSOL_RESEL: return ("AC_UNSOL_RESEL"); case AC_SCSI_AEN: return ("AC_SCSI_AEN"); case AC_SENT_BDR: return ("AC_SENT_BDR"); case AC_PATH_REGISTERED: return ("AC_PATH_REGISTERED"); case AC_PATH_DEREGISTERED: return ("AC_PATH_DEREGISTERED"); case AC_FOUND_DEVICE: return ("AC_FOUND_DEVICE"); case AC_LOST_DEVICE: return ("AC_LOST_DEVICE"); case AC_TRANSFER_NEG: return ("AC_TRANSFER_NEG"); case AC_INQ_CHANGED: return ("AC_INQ_CHANGED"); case AC_GETDEV_CHANGED: return ("AC_GETDEV_CHANGED"); case AC_CONTRACT: return ("AC_CONTRACT"); case AC_ADVINFO_CHANGED: return ("AC_ADVINFO_CHANGED"); case AC_UNIT_ATTENTION: return ("AC_UNIT_ATTENTION"); } return ("AC_UNKNOWN"); } static int xpt_async_size(u_int32_t async_code) { switch (async_code) { case AC_BUS_RESET: return (0); case AC_UNSOL_RESEL: return (0); case AC_SCSI_AEN: return (0); case AC_SENT_BDR: return (0); case AC_PATH_REGISTERED: return (sizeof(struct ccb_pathinq)); case AC_PATH_DEREGISTERED: return (0); case AC_FOUND_DEVICE: return (sizeof(struct ccb_getdev)); case AC_LOST_DEVICE: return (0); case AC_TRANSFER_NEG: return (sizeof(struct ccb_trans_settings)); case AC_INQ_CHANGED: return (0); case AC_GETDEV_CHANGED: return (0); case AC_CONTRACT: return (sizeof(struct ac_contract)); case AC_ADVINFO_CHANGED: return (-1); case AC_UNIT_ATTENTION: return (sizeof(struct ccb_scsiio)); } return (0); } static int xpt_async_process_dev(struct cam_ed *device, void *arg) { union ccb *ccb = arg; struct cam_path *path = ccb->ccb_h.path; void *async_arg = ccb->casync.async_arg_ptr; u_int32_t async_code = ccb->casync.async_code; int relock; if (path->device != device && path->device->lun_id != CAM_LUN_WILDCARD && device->lun_id != CAM_LUN_WILDCARD) return (1); /* * The async callback could free the device. * If it is a broadcast async, it doesn't hold * device reference, so take our own reference. */ xpt_acquire_device(device); /* * If async for specific device is to be delivered to * the wildcard client, take the specific device lock. * XXX: We may need a way for client to specify it. */ if ((device->lun_id == CAM_LUN_WILDCARD && path->device->lun_id != CAM_LUN_WILDCARD) || (device->target->target_id == CAM_TARGET_WILDCARD && path->target->target_id != CAM_TARGET_WILDCARD) || (device->target->bus->path_id == CAM_BUS_WILDCARD && path->target->bus->path_id != CAM_BUS_WILDCARD)) { mtx_unlock(&device->device_mtx); xpt_path_lock(path); relock = 1; } else relock = 0; (*(device->target->bus->xport->ops->async))(async_code, device->target->bus, device->target, device, async_arg); xpt_async_bcast(&device->asyncs, async_code, path, async_arg); if (relock) { xpt_path_unlock(path); mtx_lock(&device->device_mtx); } xpt_release_device(device); return (1); } static int xpt_async_process_tgt(struct cam_et *target, void *arg) { union ccb *ccb = arg; struct cam_path *path = ccb->ccb_h.path; if (path->target != target && path->target->target_id != CAM_TARGET_WILDCARD && target->target_id != CAM_TARGET_WILDCARD) return (1); if (ccb->casync.async_code == AC_SENT_BDR) { /* Update our notion of when the last reset occurred */ microtime(&target->last_reset); } return (xptdevicetraverse(target, NULL, xpt_async_process_dev, ccb)); } static void xpt_async_process(struct cam_periph *periph, union ccb *ccb) { struct cam_eb *bus; struct cam_path *path; void *async_arg; u_int32_t async_code; path = ccb->ccb_h.path; async_code = ccb->casync.async_code; async_arg = ccb->casync.async_arg_ptr; CAM_DEBUG(path, CAM_DEBUG_TRACE | CAM_DEBUG_INFO, ("xpt_async(%s)\n", xpt_async_string(async_code))); bus = path->bus; if (async_code == AC_BUS_RESET) { /* Update our notion of when the last reset occurred */ microtime(&bus->last_reset); } xpttargettraverse(bus, NULL, xpt_async_process_tgt, ccb); /* * If this wasn't a fully wildcarded async, tell all * clients that want all async events. */ if (bus != xpt_periph->path->bus) { xpt_path_lock(xpt_periph->path); xpt_async_process_dev(xpt_periph->path->device, ccb); xpt_path_unlock(xpt_periph->path); } if (path->device != NULL && path->device->lun_id != CAM_LUN_WILDCARD) xpt_release_devq(path, 1, TRUE); else xpt_release_simq(path->bus->sim, TRUE); if (ccb->casync.async_arg_size > 0) free(async_arg, M_CAMXPT); xpt_free_path(path); xpt_free_ccb(ccb); } static void xpt_async_bcast(struct async_list *async_head, u_int32_t async_code, struct cam_path *path, void *async_arg) { struct async_node *cur_entry; struct mtx *mtx; cur_entry = SLIST_FIRST(async_head); while (cur_entry != NULL) { struct async_node *next_entry; /* * Grab the next list entry before we call the current * entry's callback. This is because the callback function * can delete its async callback entry. */ next_entry = SLIST_NEXT(cur_entry, links); if ((cur_entry->event_enable & async_code) != 0) { mtx = cur_entry->event_lock ? path->device->sim->mtx : NULL; if (mtx) mtx_lock(mtx); cur_entry->callback(cur_entry->callback_arg, async_code, path, async_arg); if (mtx) mtx_unlock(mtx); } cur_entry = next_entry; } } void xpt_async(u_int32_t async_code, struct cam_path *path, void *async_arg) { union ccb *ccb; int size; ccb = xpt_alloc_ccb_nowait(); if (ccb == NULL) { xpt_print(path, "Can't allocate CCB to send %s\n", xpt_async_string(async_code)); return; } if (xpt_clone_path(&ccb->ccb_h.path, path) != CAM_REQ_CMP) { xpt_print(path, "Can't allocate path to send %s\n", xpt_async_string(async_code)); xpt_free_ccb(ccb); return; } ccb->ccb_h.path->periph = NULL; ccb->ccb_h.func_code = XPT_ASYNC; ccb->ccb_h.cbfcnp = xpt_async_process; ccb->ccb_h.flags |= CAM_UNLOCKED; ccb->casync.async_code = async_code; ccb->casync.async_arg_size = 0; size = xpt_async_size(async_code); CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE, ("xpt_async: func %#x %s aync_code %d %s\n", ccb->ccb_h.func_code, xpt_action_name(ccb->ccb_h.func_code), async_code, xpt_async_string(async_code))); if (size > 0 && async_arg != NULL) { ccb->casync.async_arg_ptr = malloc(size, M_CAMXPT, M_NOWAIT); if (ccb->casync.async_arg_ptr == NULL) { xpt_print(path, "Can't allocate argument to send %s\n", xpt_async_string(async_code)); xpt_free_path(ccb->ccb_h.path); xpt_free_ccb(ccb); return; } memcpy(ccb->casync.async_arg_ptr, async_arg, size); ccb->casync.async_arg_size = size; } else if (size < 0) { ccb->casync.async_arg_ptr = async_arg; ccb->casync.async_arg_size = size; } if (path->device != NULL && path->device->lun_id != CAM_LUN_WILDCARD) xpt_freeze_devq(path, 1); else xpt_freeze_simq(path->bus->sim, 1); xpt_done(ccb); } static void xpt_dev_async_default(u_int32_t async_code, struct cam_eb *bus, struct cam_et *target, struct cam_ed *device, void *async_arg) { /* * We only need to handle events for real devices. */ if (target->target_id == CAM_TARGET_WILDCARD || device->lun_id == CAM_LUN_WILDCARD) return; printf("%s called\n", __func__); } static uint32_t xpt_freeze_devq_device(struct cam_ed *dev, u_int count) { struct cam_devq *devq; uint32_t freeze; devq = dev->sim->devq; mtx_assert(&devq->send_mtx, MA_OWNED); CAM_DEBUG_DEV(dev, CAM_DEBUG_TRACE, ("xpt_freeze_devq_device(%d) %u->%u\n", count, dev->ccbq.queue.qfrozen_cnt, dev->ccbq.queue.qfrozen_cnt + count)); freeze = (dev->ccbq.queue.qfrozen_cnt += count); /* Remove frozen device from sendq. */ if (device_is_queued(dev)) camq_remove(&devq->send_queue, dev->devq_entry.index); return (freeze); } u_int32_t xpt_freeze_devq(struct cam_path *path, u_int count) { struct cam_ed *dev = path->device; struct cam_devq *devq; uint32_t freeze; devq = dev->sim->devq; mtx_lock(&devq->send_mtx); CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_freeze_devq(%d)\n", count)); freeze = xpt_freeze_devq_device(dev, count); mtx_unlock(&devq->send_mtx); return (freeze); } u_int32_t xpt_freeze_simq(struct cam_sim *sim, u_int count) { struct cam_devq *devq; uint32_t freeze; devq = sim->devq; mtx_lock(&devq->send_mtx); freeze = (devq->send_queue.qfrozen_cnt += count); mtx_unlock(&devq->send_mtx); return (freeze); } static void xpt_release_devq_timeout(void *arg) { struct cam_ed *dev; struct cam_devq *devq; dev = (struct cam_ed *)arg; CAM_DEBUG_DEV(dev, CAM_DEBUG_TRACE, ("xpt_release_devq_timeout\n")); devq = dev->sim->devq; mtx_assert(&devq->send_mtx, MA_OWNED); if (xpt_release_devq_device(dev, /*count*/1, /*run_queue*/TRUE)) xpt_run_devq(devq); } void xpt_release_devq(struct cam_path *path, u_int count, int run_queue) { struct cam_ed *dev; struct cam_devq *devq; CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_release_devq(%d, %d)\n", count, run_queue)); dev = path->device; devq = dev->sim->devq; mtx_lock(&devq->send_mtx); if (xpt_release_devq_device(dev, count, run_queue)) xpt_run_devq(dev->sim->devq); mtx_unlock(&devq->send_mtx); } static int xpt_release_devq_device(struct cam_ed *dev, u_int count, int run_queue) { mtx_assert(&dev->sim->devq->send_mtx, MA_OWNED); CAM_DEBUG_DEV(dev, CAM_DEBUG_TRACE, ("xpt_release_devq_device(%d, %d) %u->%u\n", count, run_queue, dev->ccbq.queue.qfrozen_cnt, dev->ccbq.queue.qfrozen_cnt - count)); if (count > dev->ccbq.queue.qfrozen_cnt) { #ifdef INVARIANTS printf("xpt_release_devq(): requested %u > present %u\n", count, dev->ccbq.queue.qfrozen_cnt); #endif count = dev->ccbq.queue.qfrozen_cnt; } dev->ccbq.queue.qfrozen_cnt -= count; if (dev->ccbq.queue.qfrozen_cnt == 0) { /* * No longer need to wait for a successful * command completion. */ dev->flags &= ~CAM_DEV_REL_ON_COMPLETE; /* * Remove any timeouts that might be scheduled * to release this queue. */ if ((dev->flags & CAM_DEV_REL_TIMEOUT_PENDING) != 0) { callout_stop(&dev->callout); dev->flags &= ~CAM_DEV_REL_TIMEOUT_PENDING; } /* * Now that we are unfrozen schedule the * device so any pending transactions are * run. */ xpt_schedule_devq(dev->sim->devq, dev); } else run_queue = 0; return (run_queue); } void xpt_release_simq(struct cam_sim *sim, int run_queue) { struct cam_devq *devq; devq = sim->devq; mtx_lock(&devq->send_mtx); if (devq->send_queue.qfrozen_cnt <= 0) { #ifdef INVARIANTS printf("xpt_release_simq: requested 1 > present %u\n", devq->send_queue.qfrozen_cnt); #endif } else devq->send_queue.qfrozen_cnt--; if (devq->send_queue.qfrozen_cnt == 0) { /* * If there is a timeout scheduled to release this * sim queue, remove it. The queue frozen count is * already at 0. */ if ((sim->flags & CAM_SIM_REL_TIMEOUT_PENDING) != 0){ callout_stop(&sim->callout); sim->flags &= ~CAM_SIM_REL_TIMEOUT_PENDING; } if (run_queue) { /* * Now that we are unfrozen run the send queue. */ xpt_run_devq(sim->devq); } } mtx_unlock(&devq->send_mtx); } /* * XXX Appears to be unused. */ static void xpt_release_simq_timeout(void *arg) { struct cam_sim *sim; sim = (struct cam_sim *)arg; xpt_release_simq(sim, /* run_queue */ TRUE); } void xpt_done(union ccb *done_ccb) { struct cam_doneq *queue; int run, hash; #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) if (done_ccb->ccb_h.func_code == XPT_SCSI_IO && done_ccb->csio.bio != NULL) biotrack(done_ccb->csio.bio, __func__); #endif CAM_DEBUG(done_ccb->ccb_h.path, CAM_DEBUG_TRACE, ("xpt_done: func= %#x %s status %#x\n", done_ccb->ccb_h.func_code, xpt_action_name(done_ccb->ccb_h.func_code), done_ccb->ccb_h.status)); if ((done_ccb->ccb_h.func_code & XPT_FC_QUEUED) == 0) return; /* Store the time the ccb was in the sim */ done_ccb->ccb_h.qos.periph_data = cam_iosched_delta_t(done_ccb->ccb_h.qos.periph_data); hash = (done_ccb->ccb_h.path_id + done_ccb->ccb_h.target_id + done_ccb->ccb_h.target_lun) % cam_num_doneqs; queue = &cam_doneqs[hash]; mtx_lock(&queue->cam_doneq_mtx); run = (queue->cam_doneq_sleep && STAILQ_EMPTY(&queue->cam_doneq)); STAILQ_INSERT_TAIL(&queue->cam_doneq, &done_ccb->ccb_h, sim_links.stqe); done_ccb->ccb_h.pinfo.index = CAM_DONEQ_INDEX; mtx_unlock(&queue->cam_doneq_mtx); if (run) wakeup(&queue->cam_doneq); } void xpt_done_direct(union ccb *done_ccb) { CAM_DEBUG(done_ccb->ccb_h.path, CAM_DEBUG_TRACE, ("xpt_done_direct: status %#x\n", done_ccb->ccb_h.status)); if ((done_ccb->ccb_h.func_code & XPT_FC_QUEUED) == 0) return; /* Store the time the ccb was in the sim */ done_ccb->ccb_h.qos.periph_data = cam_iosched_delta_t(done_ccb->ccb_h.qos.periph_data); xpt_done_process(&done_ccb->ccb_h); } union ccb * xpt_alloc_ccb() { union ccb *new_ccb; new_ccb = malloc(sizeof(*new_ccb), M_CAMCCB, M_ZERO|M_WAITOK); return (new_ccb); } union ccb * xpt_alloc_ccb_nowait() { union ccb *new_ccb; new_ccb = malloc(sizeof(*new_ccb), M_CAMCCB, M_ZERO|M_NOWAIT); return (new_ccb); } void xpt_free_ccb(union ccb *free_ccb) { free(free_ccb, M_CAMCCB); } /* Private XPT functions */ /* * Get a CAM control block for the caller. Charge the structure to the device * referenced by the path. If we don't have sufficient resources to allocate * more ccbs, we return NULL. */ static union ccb * xpt_get_ccb_nowait(struct cam_periph *periph) { union ccb *new_ccb; new_ccb = malloc(sizeof(*new_ccb), M_CAMCCB, M_ZERO|M_NOWAIT); if (new_ccb == NULL) return (NULL); periph->periph_allocated++; cam_ccbq_take_opening(&periph->path->device->ccbq); return (new_ccb); } static union ccb * xpt_get_ccb(struct cam_periph *periph) { union ccb *new_ccb; cam_periph_unlock(periph); new_ccb = malloc(sizeof(*new_ccb), M_CAMCCB, M_ZERO|M_WAITOK); cam_periph_lock(periph); periph->periph_allocated++; cam_ccbq_take_opening(&periph->path->device->ccbq); return (new_ccb); } union ccb * cam_periph_getccb(struct cam_periph *periph, u_int32_t priority) { struct ccb_hdr *ccb_h; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("cam_periph_getccb\n")); cam_periph_assert(periph, MA_OWNED); while ((ccb_h = SLIST_FIRST(&periph->ccb_list)) == NULL || ccb_h->pinfo.priority != priority) { if (priority < periph->immediate_priority) { periph->immediate_priority = priority; xpt_run_allocq(periph, 0); } else cam_periph_sleep(periph, &periph->ccb_list, PRIBIO, "cgticb", 0); } SLIST_REMOVE_HEAD(&periph->ccb_list, periph_links.sle); return ((union ccb *)ccb_h); } static void xpt_acquire_bus(struct cam_eb *bus) { xpt_lock_buses(); bus->refcount++; xpt_unlock_buses(); } static void xpt_release_bus(struct cam_eb *bus) { xpt_lock_buses(); KASSERT(bus->refcount >= 1, ("bus->refcount >= 1")); if (--bus->refcount > 0) { xpt_unlock_buses(); return; } TAILQ_REMOVE(&xsoftc.xpt_busses, bus, links); xsoftc.bus_generation++; xpt_unlock_buses(); KASSERT(TAILQ_EMPTY(&bus->et_entries), ("destroying bus, but target list is not empty")); cam_sim_release(bus->sim); mtx_destroy(&bus->eb_mtx); free(bus, M_CAMXPT); } static struct cam_et * xpt_alloc_target(struct cam_eb *bus, target_id_t target_id) { struct cam_et *cur_target, *target; mtx_assert(&xsoftc.xpt_topo_lock, MA_OWNED); mtx_assert(&bus->eb_mtx, MA_OWNED); target = (struct cam_et *)malloc(sizeof(*target), M_CAMXPT, M_NOWAIT|M_ZERO); if (target == NULL) return (NULL); TAILQ_INIT(&target->ed_entries); target->bus = bus; target->target_id = target_id; target->refcount = 1; target->generation = 0; target->luns = NULL; mtx_init(&target->luns_mtx, "CAM LUNs lock", NULL, MTX_DEF); timevalclear(&target->last_reset); /* * Hold a reference to our parent bus so it * will not go away before we do. */ bus->refcount++; /* Insertion sort into our bus's target list */ cur_target = TAILQ_FIRST(&bus->et_entries); while (cur_target != NULL && cur_target->target_id < target_id) cur_target = TAILQ_NEXT(cur_target, links); if (cur_target != NULL) { TAILQ_INSERT_BEFORE(cur_target, target, links); } else { TAILQ_INSERT_TAIL(&bus->et_entries, target, links); } bus->generation++; return (target); } static void xpt_acquire_target(struct cam_et *target) { struct cam_eb *bus = target->bus; mtx_lock(&bus->eb_mtx); target->refcount++; mtx_unlock(&bus->eb_mtx); } static void xpt_release_target(struct cam_et *target) { struct cam_eb *bus = target->bus; mtx_lock(&bus->eb_mtx); if (--target->refcount > 0) { mtx_unlock(&bus->eb_mtx); return; } TAILQ_REMOVE(&bus->et_entries, target, links); bus->generation++; mtx_unlock(&bus->eb_mtx); KASSERT(TAILQ_EMPTY(&target->ed_entries), ("destroying target, but device list is not empty")); xpt_release_bus(bus); mtx_destroy(&target->luns_mtx); if (target->luns) free(target->luns, M_CAMXPT); free(target, M_CAMXPT); } static struct cam_ed * xpt_alloc_device_default(struct cam_eb *bus, struct cam_et *target, lun_id_t lun_id) { struct cam_ed *device; device = xpt_alloc_device(bus, target, lun_id); if (device == NULL) return (NULL); device->mintags = 1; device->maxtags = 1; return (device); } static void xpt_destroy_device(void *context, int pending) { struct cam_ed *device = context; mtx_lock(&device->device_mtx); mtx_destroy(&device->device_mtx); free(device, M_CAMDEV); } struct cam_ed * xpt_alloc_device(struct cam_eb *bus, struct cam_et *target, lun_id_t lun_id) { struct cam_ed *cur_device, *device; struct cam_devq *devq; cam_status status; mtx_assert(&bus->eb_mtx, MA_OWNED); /* Make space for us in the device queue on our bus */ devq = bus->sim->devq; mtx_lock(&devq->send_mtx); status = cam_devq_resize(devq, devq->send_queue.array_size + 1); mtx_unlock(&devq->send_mtx); if (status != CAM_REQ_CMP) return (NULL); device = (struct cam_ed *)malloc(sizeof(*device), M_CAMDEV, M_NOWAIT|M_ZERO); if (device == NULL) return (NULL); cam_init_pinfo(&device->devq_entry); device->target = target; device->lun_id = lun_id; device->sim = bus->sim; if (cam_ccbq_init(&device->ccbq, bus->sim->max_dev_openings) != 0) { free(device, M_CAMDEV); return (NULL); } SLIST_INIT(&device->asyncs); SLIST_INIT(&device->periphs); device->generation = 0; device->flags = CAM_DEV_UNCONFIGURED; device->tag_delay_count = 0; device->tag_saved_openings = 0; device->refcount = 1; mtx_init(&device->device_mtx, "CAM device lock", NULL, MTX_DEF); callout_init_mtx(&device->callout, &devq->send_mtx, 0); TASK_INIT(&device->device_destroy_task, 0, xpt_destroy_device, device); /* * Hold a reference to our parent bus so it * will not go away before we do. */ target->refcount++; cur_device = TAILQ_FIRST(&target->ed_entries); while (cur_device != NULL && cur_device->lun_id < lun_id) cur_device = TAILQ_NEXT(cur_device, links); if (cur_device != NULL) TAILQ_INSERT_BEFORE(cur_device, device, links); else TAILQ_INSERT_TAIL(&target->ed_entries, device, links); target->generation++; return (device); } void xpt_acquire_device(struct cam_ed *device) { struct cam_eb *bus = device->target->bus; mtx_lock(&bus->eb_mtx); device->refcount++; mtx_unlock(&bus->eb_mtx); } void xpt_release_device(struct cam_ed *device) { struct cam_eb *bus = device->target->bus; struct cam_devq *devq; mtx_lock(&bus->eb_mtx); if (--device->refcount > 0) { mtx_unlock(&bus->eb_mtx); return; } TAILQ_REMOVE(&device->target->ed_entries, device,links); device->target->generation++; mtx_unlock(&bus->eb_mtx); /* Release our slot in the devq */ devq = bus->sim->devq; mtx_lock(&devq->send_mtx); cam_devq_resize(devq, devq->send_queue.array_size - 1); mtx_unlock(&devq->send_mtx); KASSERT(SLIST_EMPTY(&device->periphs), ("destroying device, but periphs list is not empty")); KASSERT(device->devq_entry.index == CAM_UNQUEUED_INDEX, ("destroying device while still queued for ccbs")); if ((device->flags & CAM_DEV_REL_TIMEOUT_PENDING) != 0) callout_stop(&device->callout); xpt_release_target(device->target); cam_ccbq_fini(&device->ccbq); /* * Free allocated memory. free(9) does nothing if the * supplied pointer is NULL, so it is safe to call without * checking. */ free(device->supported_vpds, M_CAMXPT); free(device->device_id, M_CAMXPT); free(device->ext_inq, M_CAMXPT); free(device->physpath, M_CAMXPT); free(device->rcap_buf, M_CAMXPT); free(device->serial_num, M_CAMXPT); free(device->nvme_data, M_CAMXPT); free(device->nvme_cdata, M_CAMXPT); taskqueue_enqueue(xsoftc.xpt_taskq, &device->device_destroy_task); } u_int32_t xpt_dev_ccbq_resize(struct cam_path *path, int newopenings) { int result; struct cam_ed *dev; dev = path->device; mtx_lock(&dev->sim->devq->send_mtx); result = cam_ccbq_resize(&dev->ccbq, newopenings); mtx_unlock(&dev->sim->devq->send_mtx); if ((dev->flags & CAM_DEV_TAG_AFTER_COUNT) != 0 || (dev->inq_flags & SID_CmdQue) != 0) dev->tag_saved_openings = newopenings; return (result); } static struct cam_eb * xpt_find_bus(path_id_t path_id) { struct cam_eb *bus; xpt_lock_buses(); for (bus = TAILQ_FIRST(&xsoftc.xpt_busses); bus != NULL; bus = TAILQ_NEXT(bus, links)) { if (bus->path_id == path_id) { bus->refcount++; break; } } xpt_unlock_buses(); return (bus); } static struct cam_et * xpt_find_target(struct cam_eb *bus, target_id_t target_id) { struct cam_et *target; mtx_assert(&bus->eb_mtx, MA_OWNED); for (target = TAILQ_FIRST(&bus->et_entries); target != NULL; target = TAILQ_NEXT(target, links)) { if (target->target_id == target_id) { target->refcount++; break; } } return (target); } static struct cam_ed * xpt_find_device(struct cam_et *target, lun_id_t lun_id) { struct cam_ed *device; mtx_assert(&target->bus->eb_mtx, MA_OWNED); for (device = TAILQ_FIRST(&target->ed_entries); device != NULL; device = TAILQ_NEXT(device, links)) { if (device->lun_id == lun_id) { device->refcount++; break; } } return (device); } void xpt_start_tags(struct cam_path *path) { struct ccb_relsim crs; struct cam_ed *device; struct cam_sim *sim; int newopenings; device = path->device; sim = path->bus->sim; device->flags &= ~CAM_DEV_TAG_AFTER_COUNT; xpt_freeze_devq(path, /*count*/1); device->inq_flags |= SID_CmdQue; if (device->tag_saved_openings != 0) newopenings = device->tag_saved_openings; else newopenings = min(device->maxtags, sim->max_tagged_dev_openings); xpt_dev_ccbq_resize(path, newopenings); xpt_async(AC_GETDEV_CHANGED, path, NULL); xpt_setup_ccb(&crs.ccb_h, path, CAM_PRIORITY_NORMAL); crs.ccb_h.func_code = XPT_REL_SIMQ; crs.release_flags = RELSIM_RELEASE_AFTER_QEMPTY; crs.openings = crs.release_timeout = crs.qfrozen_cnt = 0; xpt_action((union ccb *)&crs); } void xpt_stop_tags(struct cam_path *path) { struct ccb_relsim crs; struct cam_ed *device; struct cam_sim *sim; device = path->device; sim = path->bus->sim; device->flags &= ~CAM_DEV_TAG_AFTER_COUNT; device->tag_delay_count = 0; xpt_freeze_devq(path, /*count*/1); device->inq_flags &= ~SID_CmdQue; xpt_dev_ccbq_resize(path, sim->max_dev_openings); xpt_async(AC_GETDEV_CHANGED, path, NULL); xpt_setup_ccb(&crs.ccb_h, path, CAM_PRIORITY_NORMAL); crs.ccb_h.func_code = XPT_REL_SIMQ; crs.release_flags = RELSIM_RELEASE_AFTER_QEMPTY; crs.openings = crs.release_timeout = crs.qfrozen_cnt = 0; xpt_action((union ccb *)&crs); } /* * Assume all possible buses are detected by this time, so allow boot * as soon as they all are scanned. */ static void xpt_boot_delay(void *arg) { xpt_release_boot(); } /* * Now that all config hooks have completed, start boot_delay timer, * waiting for possibly still undetected buses (USB) to appear. */ static void xpt_ch_done(void *arg) { callout_init(&xsoftc.boot_callout, 1); callout_reset_sbt(&xsoftc.boot_callout, SBT_1MS * xsoftc.boot_delay, 0, xpt_boot_delay, NULL, 0); } SYSINIT(xpt_hw_delay, SI_SUB_INT_CONFIG_HOOKS, SI_ORDER_ANY, xpt_ch_done, NULL); /* * Now that interrupts are enabled, go find our devices */ static void xpt_config(void *arg) { if (taskqueue_start_threads(&xsoftc.xpt_taskq, 1, PRIBIO, "CAM taskq")) printf("xpt_config: failed to create taskqueue thread.\n"); /* Setup debugging path */ if (cam_dflags != CAM_DEBUG_NONE) { if (xpt_create_path(&cam_dpath, NULL, CAM_DEBUG_BUS, CAM_DEBUG_TARGET, CAM_DEBUG_LUN) != CAM_REQ_CMP) { printf("xpt_config: xpt_create_path() failed for debug" " target %d:%d:%d, debugging disabled\n", CAM_DEBUG_BUS, CAM_DEBUG_TARGET, CAM_DEBUG_LUN); cam_dflags = CAM_DEBUG_NONE; } } else cam_dpath = NULL; periphdriver_init(1); xpt_hold_boot(); /* Fire up rescan thread. */ if (kproc_kthread_add(xpt_scanner_thread, NULL, &cam_proc, NULL, 0, 0, "cam", "scanner")) { printf("xpt_config: failed to create rescan thread.\n"); } } void xpt_hold_boot_locked(void) { if (xsoftc.buses_to_config++ == 0) root_mount_hold_token("CAM", &xsoftc.xpt_rootmount); } void xpt_hold_boot(void) { xpt_lock_buses(); xpt_hold_boot_locked(); xpt_unlock_buses(); } void xpt_release_boot(void) { xpt_lock_buses(); if (--xsoftc.buses_to_config == 0) { if (xsoftc.buses_config_done == 0) { xsoftc.buses_config_done = 1; xsoftc.buses_to_config++; TASK_INIT(&xsoftc.boot_task, 0, xpt_finishconfig_task, NULL); taskqueue_enqueue(taskqueue_thread, &xsoftc.boot_task); } else root_mount_rel(&xsoftc.xpt_rootmount); } xpt_unlock_buses(); } /* * If the given device only has one peripheral attached to it, and if that * peripheral is the passthrough driver, announce it. This insures that the * user sees some sort of announcement for every peripheral in their system. */ static int xptpassannouncefunc(struct cam_ed *device, void *arg) { struct cam_periph *periph; int i; for (periph = SLIST_FIRST(&device->periphs), i = 0; periph != NULL; periph = SLIST_NEXT(periph, periph_links), i++); periph = SLIST_FIRST(&device->periphs); if ((i == 1) && (strncmp(periph->periph_name, "pass", 4) == 0)) xpt_announce_periph(periph, NULL); return(1); } static void xpt_finishconfig_task(void *context, int pending) { periphdriver_init(2); /* * Check for devices with no "standard" peripheral driver * attached. For any devices like that, announce the * passthrough driver so the user will see something. */ if (!bootverbose) xpt_for_all_devices(xptpassannouncefunc, NULL); xpt_release_boot(); } cam_status xpt_register_async(int event, ac_callback_t *cbfunc, void *cbarg, struct cam_path *path) { struct ccb_setasync csa; cam_status status; int xptpath = 0; if (path == NULL) { status = xpt_create_path(&path, /*periph*/NULL, CAM_XPT_PATH_ID, CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD); if (status != CAM_REQ_CMP) return (status); xpt_path_lock(path); xptpath = 1; } xpt_setup_ccb(&csa.ccb_h, path, CAM_PRIORITY_NORMAL); csa.ccb_h.func_code = XPT_SASYNC_CB; csa.event_enable = event; csa.callback = cbfunc; csa.callback_arg = cbarg; xpt_action((union ccb *)&csa); status = csa.ccb_h.status; CAM_DEBUG(csa.ccb_h.path, CAM_DEBUG_TRACE, ("xpt_register_async: func %p\n", cbfunc)); if (xptpath) { xpt_path_unlock(path); xpt_free_path(path); } if ((status == CAM_REQ_CMP) && (csa.event_enable & AC_FOUND_DEVICE)) { /* * Get this peripheral up to date with all * the currently existing devices. */ xpt_for_all_devices(xptsetasyncfunc, &csa); } if ((status == CAM_REQ_CMP) && (csa.event_enable & AC_PATH_REGISTERED)) { /* * Get this peripheral up to date with all * the currently existing buses. */ xpt_for_all_busses(xptsetasyncbusfunc, &csa); } return (status); } static void xptaction(struct cam_sim *sim, union ccb *work_ccb) { CAM_DEBUG(work_ccb->ccb_h.path, CAM_DEBUG_TRACE, ("xptaction\n")); switch (work_ccb->ccb_h.func_code) { /* Common cases first */ case XPT_PATH_INQ: /* Path routing inquiry */ { struct ccb_pathinq *cpi; cpi = &work_ccb->cpi; cpi->version_num = 1; /* XXX??? */ cpi->hba_inquiry = 0; cpi->target_sprt = 0; cpi->hba_misc = 0; cpi->hba_eng_cnt = 0; cpi->max_target = 0; cpi->max_lun = 0; cpi->initiator_id = 0; strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN); strlcpy(cpi->hba_vid, "", HBA_IDLEN); strlcpy(cpi->dev_name, sim->sim_name, DEV_IDLEN); cpi->unit_number = sim->unit_number; cpi->bus_id = sim->bus_id; cpi->base_transfer_speed = 0; cpi->protocol = PROTO_UNSPECIFIED; cpi->protocol_version = PROTO_VERSION_UNSPECIFIED; cpi->transport = XPORT_UNSPECIFIED; cpi->transport_version = XPORT_VERSION_UNSPECIFIED; cpi->ccb_h.status = CAM_REQ_CMP; xpt_done(work_ccb); break; } default: work_ccb->ccb_h.status = CAM_REQ_INVALID; xpt_done(work_ccb); break; } } /* * The xpt as a "controller" has no interrupt sources, so polling * is a no-op. */ static void xptpoll(struct cam_sim *sim) { } void xpt_lock_buses(void) { mtx_lock(&xsoftc.xpt_topo_lock); } void xpt_unlock_buses(void) { mtx_unlock(&xsoftc.xpt_topo_lock); } struct mtx * xpt_path_mtx(struct cam_path *path) { return (&path->device->device_mtx); } static void xpt_done_process(struct ccb_hdr *ccb_h) { struct cam_sim *sim = NULL; struct cam_devq *devq = NULL; struct mtx *mtx = NULL; #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) struct ccb_scsiio *csio; if (ccb_h->func_code == XPT_SCSI_IO) { csio = &((union ccb *)ccb_h)->csio; if (csio->bio != NULL) biotrack(csio->bio, __func__); } #endif if (ccb_h->flags & CAM_HIGH_POWER) { struct highpowerlist *hphead; struct cam_ed *device; mtx_lock(&xsoftc.xpt_highpower_lock); hphead = &xsoftc.highpowerq; device = STAILQ_FIRST(hphead); /* * Increment the count since this command is done. */ xsoftc.num_highpower++; /* * Any high powered commands queued up? */ if (device != NULL) { STAILQ_REMOVE_HEAD(hphead, highpowerq_entry); mtx_unlock(&xsoftc.xpt_highpower_lock); mtx_lock(&device->sim->devq->send_mtx); xpt_release_devq_device(device, /*count*/1, /*runqueue*/TRUE); mtx_unlock(&device->sim->devq->send_mtx); } else mtx_unlock(&xsoftc.xpt_highpower_lock); } /* * Insulate against a race where the periph is destroyed but CCBs are * still not all processed. This shouldn't happen, but allows us better * bug diagnostic when it does. */ if (ccb_h->path->bus) sim = ccb_h->path->bus->sim; if (ccb_h->status & CAM_RELEASE_SIMQ) { KASSERT(sim, ("sim missing for CAM_RELEASE_SIMQ request")); xpt_release_simq(sim, /*run_queue*/FALSE); ccb_h->status &= ~CAM_RELEASE_SIMQ; } if ((ccb_h->flags & CAM_DEV_QFRZDIS) && (ccb_h->status & CAM_DEV_QFRZN)) { xpt_release_devq(ccb_h->path, /*count*/1, /*run_queue*/TRUE); ccb_h->status &= ~CAM_DEV_QFRZN; } if ((ccb_h->func_code & XPT_FC_USER_CCB) == 0) { struct cam_ed *dev = ccb_h->path->device; if (sim) devq = sim->devq; KASSERT(devq, ("Periph disappeared with request pending.")); mtx_lock(&devq->send_mtx); devq->send_active--; devq->send_openings++; cam_ccbq_ccb_done(&dev->ccbq, (union ccb *)ccb_h); if (((dev->flags & CAM_DEV_REL_ON_QUEUE_EMPTY) != 0 && (dev->ccbq.dev_active == 0))) { dev->flags &= ~CAM_DEV_REL_ON_QUEUE_EMPTY; xpt_release_devq_device(dev, /*count*/1, /*run_queue*/FALSE); } if (((dev->flags & CAM_DEV_REL_ON_COMPLETE) != 0 && (ccb_h->status&CAM_STATUS_MASK) != CAM_REQUEUE_REQ)) { dev->flags &= ~CAM_DEV_REL_ON_COMPLETE; xpt_release_devq_device(dev, /*count*/1, /*run_queue*/FALSE); } if (!device_is_queued(dev)) (void)xpt_schedule_devq(devq, dev); xpt_run_devq(devq); mtx_unlock(&devq->send_mtx); if ((dev->flags & CAM_DEV_TAG_AFTER_COUNT) != 0) { mtx = xpt_path_mtx(ccb_h->path); mtx_lock(mtx); if ((dev->flags & CAM_DEV_TAG_AFTER_COUNT) != 0 && (--dev->tag_delay_count == 0)) xpt_start_tags(ccb_h->path); } } if ((ccb_h->flags & CAM_UNLOCKED) == 0) { if (mtx == NULL) { mtx = xpt_path_mtx(ccb_h->path); mtx_lock(mtx); } } else { if (mtx != NULL) { mtx_unlock(mtx); mtx = NULL; } } /* Call the peripheral driver's callback */ ccb_h->pinfo.index = CAM_UNQUEUED_INDEX; (*ccb_h->cbfcnp)(ccb_h->path->periph, (union ccb *)ccb_h); if (mtx != NULL) mtx_unlock(mtx); } void xpt_done_td(void *arg) { struct cam_doneq *queue = arg; struct ccb_hdr *ccb_h; STAILQ_HEAD(, ccb_hdr) doneq; STAILQ_INIT(&doneq); mtx_lock(&queue->cam_doneq_mtx); while (1) { while (STAILQ_EMPTY(&queue->cam_doneq)) { queue->cam_doneq_sleep = 1; msleep(&queue->cam_doneq, &queue->cam_doneq_mtx, PRIBIO, "-", 0); queue->cam_doneq_sleep = 0; } STAILQ_CONCAT(&doneq, &queue->cam_doneq); mtx_unlock(&queue->cam_doneq_mtx); THREAD_NO_SLEEPING(); while ((ccb_h = STAILQ_FIRST(&doneq)) != NULL) { STAILQ_REMOVE_HEAD(&doneq, sim_links.stqe); xpt_done_process(ccb_h); } THREAD_SLEEPING_OK(); mtx_lock(&queue->cam_doneq_mtx); } } static void camisr_runqueue(void) { struct ccb_hdr *ccb_h; struct cam_doneq *queue; int i; /* Process global queues. */ for (i = 0; i < cam_num_doneqs; i++) { queue = &cam_doneqs[i]; mtx_lock(&queue->cam_doneq_mtx); while ((ccb_h = STAILQ_FIRST(&queue->cam_doneq)) != NULL) { STAILQ_REMOVE_HEAD(&queue->cam_doneq, sim_links.stqe); mtx_unlock(&queue->cam_doneq_mtx); xpt_done_process(ccb_h); mtx_lock(&queue->cam_doneq_mtx); } mtx_unlock(&queue->cam_doneq_mtx); } } struct kv { uint32_t v; const char *name; }; static struct kv map[] = { { XPT_NOOP, "XPT_NOOP" }, { XPT_SCSI_IO, "XPT_SCSI_IO" }, { XPT_GDEV_TYPE, "XPT_GDEV_TYPE" }, { XPT_GDEVLIST, "XPT_GDEVLIST" }, { XPT_PATH_INQ, "XPT_PATH_INQ" }, { XPT_REL_SIMQ, "XPT_REL_SIMQ" }, { XPT_SASYNC_CB, "XPT_SASYNC_CB" }, { XPT_SDEV_TYPE, "XPT_SDEV_TYPE" }, { XPT_SCAN_BUS, "XPT_SCAN_BUS" }, { XPT_DEV_MATCH, "XPT_DEV_MATCH" }, { XPT_DEBUG, "XPT_DEBUG" }, { XPT_PATH_STATS, "XPT_PATH_STATS" }, { XPT_GDEV_STATS, "XPT_GDEV_STATS" }, { XPT_DEV_ADVINFO, "XPT_DEV_ADVINFO" }, { XPT_ASYNC, "XPT_ASYNC" }, { XPT_ABORT, "XPT_ABORT" }, { XPT_RESET_BUS, "XPT_RESET_BUS" }, { XPT_RESET_DEV, "XPT_RESET_DEV" }, { XPT_TERM_IO, "XPT_TERM_IO" }, { XPT_SCAN_LUN, "XPT_SCAN_LUN" }, { XPT_GET_TRAN_SETTINGS, "XPT_GET_TRAN_SETTINGS" }, { XPT_SET_TRAN_SETTINGS, "XPT_SET_TRAN_SETTINGS" }, { XPT_CALC_GEOMETRY, "XPT_CALC_GEOMETRY" }, { XPT_ATA_IO, "XPT_ATA_IO" }, { XPT_GET_SIM_KNOB, "XPT_GET_SIM_KNOB" }, { XPT_SET_SIM_KNOB, "XPT_SET_SIM_KNOB" }, { XPT_NVME_IO, "XPT_NVME_IO" }, { XPT_MMC_IO, "XPT_MMC_IO" }, { XPT_SMP_IO, "XPT_SMP_IO" }, { XPT_SCAN_TGT, "XPT_SCAN_TGT" }, { XPT_NVME_ADMIN, "XPT_NVME_ADMIN" }, { XPT_ENG_INQ, "XPT_ENG_INQ" }, { XPT_ENG_EXEC, "XPT_ENG_EXEC" }, { XPT_EN_LUN, "XPT_EN_LUN" }, { XPT_TARGET_IO, "XPT_TARGET_IO" }, { XPT_ACCEPT_TARGET_IO, "XPT_ACCEPT_TARGET_IO" }, { XPT_CONT_TARGET_IO, "XPT_CONT_TARGET_IO" }, { XPT_IMMED_NOTIFY, "XPT_IMMED_NOTIFY" }, { XPT_NOTIFY_ACK, "XPT_NOTIFY_ACK" }, { XPT_IMMEDIATE_NOTIFY, "XPT_IMMEDIATE_NOTIFY" }, { XPT_NOTIFY_ACKNOWLEDGE, "XPT_NOTIFY_ACKNOWLEDGE" }, { 0, 0 } }; const char * xpt_action_name(uint32_t action) { static char buffer[32]; /* Only for unknown messages -- racy */ struct kv *walker = map; while (walker->name != NULL) { if (walker->v == action) return (walker->name); walker++; } snprintf(buffer, sizeof(buffer), "%#x", action); return (buffer); } Index: stable/12/sys/geom/geom_io.c =================================================================== --- stable/12/sys/geom/geom_io.c (revision 355609) +++ stable/12/sys/geom/geom_io.c (revision 355610) @@ -1,1095 +1,1095 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int g_io_transient_map_bio(struct bio *bp); static struct g_bioq g_bio_run_down; static struct g_bioq g_bio_run_up; /* * Pace is a hint that we've had some trouble recently allocating * bios, so we should back off trying to send I/O down the stack * a bit to let the problem resolve. When pacing, we also turn * off direct dispatch to also reduce memory pressure from I/Os * there, at the expxense of some added latency while the memory * pressures exist. See g_io_schedule_down() for more details * and limitations. */ -static volatile u_int pace; +static volatile u_int __read_mostly pace; -static uma_zone_t biozone; +static uma_zone_t __read_mostly biozone; /* * The head of the list of classifiers used in g_io_request. * Use g_register_classifier() and g_unregister_classifier() * to add/remove entries to the list. * Classifiers are invoked in registration order. */ -static TAILQ_HEAD(g_classifier_tailq, g_classifier_hook) - g_classifier_tailq = TAILQ_HEAD_INITIALIZER(g_classifier_tailq); +static TAILQ_HEAD(, g_classifier_hook) g_classifier_tailq __read_mostly = + TAILQ_HEAD_INITIALIZER(g_classifier_tailq); #include static void g_bioq_lock(struct g_bioq *bq) { mtx_lock(&bq->bio_queue_lock); } static void g_bioq_unlock(struct g_bioq *bq) { mtx_unlock(&bq->bio_queue_lock); } #if 0 static void g_bioq_destroy(struct g_bioq *bq) { mtx_destroy(&bq->bio_queue_lock); } #endif static void g_bioq_init(struct g_bioq *bq) { TAILQ_INIT(&bq->bio_queue); mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF); } static struct bio * g_bioq_first(struct g_bioq *bq) { struct bio *bp; bp = TAILQ_FIRST(&bq->bio_queue); if (bp != NULL) { KASSERT((bp->bio_flags & BIO_ONQUEUE), ("Bio not on queue bp=%p target %p", bp, bq)); bp->bio_flags &= ~BIO_ONQUEUE; TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue); bq->bio_queue_length--; } return (bp); } struct bio * g_new_bio(void) { struct bio *bp; bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO); #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_new_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif return (bp); } struct bio * g_alloc_bio(void) { struct bio *bp; bp = uma_zalloc(biozone, M_WAITOK | M_ZERO); #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif return (bp); } void g_destroy_bio(struct bio *bp) { #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif uma_zfree(biozone, bp); } struct bio * g_clone_bio(struct bio *bp) { struct bio *bp2; bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO); if (bp2 != NULL) { bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; /* * BIO_ORDERED flag may be used by disk drivers to enforce * ordering restrictions, so this flag needs to be cloned. * BIO_UNMAPPED and BIO_VLIST should be inherited, to properly * indicate which way the buffer is passed. * Other bio flags are not suitable for cloning. */ bp2->bio_flags = bp->bio_flags & (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST); bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; bp2->bio_ma = bp->bio_ma; bp2->bio_ma_n = bp->bio_ma_n; bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; if (bp->bio_cmd == BIO_ZONE) bcopy(&bp->bio_zone, &bp2->bio_zone, sizeof(bp->bio_zone)); /* Inherit classification info from the parent */ bp2->bio_classifier1 = bp->bio_classifier1; bp2->bio_classifier2 = bp->bio_classifier2; #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) bp2->bio_track_bp = bp->bio_track_bp; #endif bp->bio_children++; } #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif return(bp2); } struct bio * g_duplicate_bio(struct bio *bp) { struct bio *bp2; bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO); bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST); bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; bp2->bio_ma = bp->bio_ma; bp2->bio_ma_n = bp->bio_ma_n; bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; bp->bio_children++; #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, bp2); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif return(bp2); } void g_reset_bio(struct bio *bp) { bzero(bp, sizeof(*bp)); } void g_io_init() { g_bioq_init(&g_bio_run_down); g_bioq_init(&g_bio_run_up); biozone = uma_zcreate("g_bio", sizeof (struct bio), NULL, NULL, NULL, NULL, 0, 0); } int g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_getattr(%s)", attr); bp = g_alloc_bio(); bp->bio_cmd = BIO_GETATTR; bp->bio_done = NULL; bp->bio_attribute = attr; bp->bio_length = *len; bp->bio_data = ptr; g_io_request(bp, cp); error = biowait(bp, "ggetattr"); *len = bp->bio_completed; g_destroy_bio(bp); return (error); } int g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_zone(%d)", zone_args->zone_cmd); bp = g_alloc_bio(); bp->bio_cmd = BIO_ZONE; bp->bio_done = NULL; /* * XXX KDM need to handle report zone data. */ bcopy(zone_args, &bp->bio_zone, sizeof(*zone_args)); if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES) bp->bio_length = zone_args->zone_params.report.entries_allocated * sizeof(struct disk_zone_rep_entry); else bp->bio_length = 0; g_io_request(bp, cp); error = biowait(bp, "gzone"); bcopy(&bp->bio_zone, zone_args, sizeof(*zone_args)); g_destroy_bio(bp); return (error); } int g_io_flush(struct g_consumer *cp) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name); bp = g_alloc_bio(); bp->bio_cmd = BIO_FLUSH; bp->bio_flags |= BIO_ORDERED; bp->bio_done = NULL; bp->bio_attribute = NULL; bp->bio_offset = cp->provider->mediasize; bp->bio_length = 0; bp->bio_data = NULL; g_io_request(bp, cp); error = biowait(bp, "gflush"); g_destroy_bio(bp); return (error); } static int g_io_check(struct bio *bp) { struct g_consumer *cp; struct g_provider *pp; off_t excess; int error; biotrack(bp, __func__); cp = bp->bio_from; pp = bp->bio_to; /* Fail if access counters dont allow the operation */ switch(bp->bio_cmd) { case BIO_READ: case BIO_GETATTR: if (cp->acr == 0) return (EPERM); break; case BIO_WRITE: case BIO_DELETE: case BIO_FLUSH: if (cp->acw == 0) return (EPERM); break; case BIO_ZONE: if ((bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES) || (bp->bio_zone.zone_cmd == DISK_ZONE_GET_PARAMS)) { if (cp->acr == 0) return (EPERM); } else if (cp->acw == 0) return (EPERM); break; default: return (EPERM); } /* if provider is marked for error, don't disturb. */ if (pp->error) return (pp->error); if (cp->flags & G_CF_ORPHAN) return (ENXIO); switch(bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: /* Zero sectorsize or mediasize is probably a lack of media. */ if (pp->sectorsize == 0 || pp->mediasize == 0) return (ENXIO); /* Reject I/O not on sector boundary */ if (bp->bio_offset % pp->sectorsize) return (EINVAL); /* Reject I/O not integral sector long */ if (bp->bio_length % pp->sectorsize) return (EINVAL); /* Reject requests before or past the end of media. */ if (bp->bio_offset < 0) return (EIO); if (bp->bio_offset > pp->mediasize) return (EIO); /* Truncate requests to the end of providers media. */ excess = bp->bio_offset + bp->bio_length; if (excess > bp->bio_to->mediasize) { KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 || round_page(bp->bio_ma_offset + bp->bio_length) / PAGE_SIZE == bp->bio_ma_n, ("excess bio %p too short", bp)); excess -= bp->bio_to->mediasize; bp->bio_length -= excess; if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma_n = round_page(bp->bio_ma_offset + bp->bio_length) / PAGE_SIZE; } if (excess > 0) CTR3(KTR_GEOM, "g_down truncated bio " "%p provider %s by %d", bp, bp->bio_to->name, excess); } /* Deliver zero length transfers right here. */ if (bp->bio_length == 0) { CTR2(KTR_GEOM, "g_down terminated 0-length " "bp %p provider %s", bp, bp->bio_to->name); return (0); } if ((bp->bio_flags & BIO_UNMAPPED) != 0 && (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 && (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) { if ((error = g_io_transient_map_bio(bp)) >= 0) return (error); } break; default: break; } return (EJUSTRETURN); } /* * bio classification support. * * g_register_classifier() and g_unregister_classifier() * are used to add/remove a classifier from the list. * The list is protected using the g_bio_run_down lock, * because the classifiers are called in this path. * * g_io_request() passes bio's that are not already classified * (i.e. those with bio_classifier1 == NULL) to g_run_classifiers(). * Classifiers can store their result in the two fields * bio_classifier1 and bio_classifier2. * A classifier that updates one of the fields should * return a non-zero value. * If no classifier updates the field, g_run_classifiers() sets * bio_classifier1 = BIO_NOTCLASSIFIED to avoid further calls. */ int g_register_classifier(struct g_classifier_hook *hook) { g_bioq_lock(&g_bio_run_down); TAILQ_INSERT_TAIL(&g_classifier_tailq, hook, link); g_bioq_unlock(&g_bio_run_down); return (0); } void g_unregister_classifier(struct g_classifier_hook *hook) { struct g_classifier_hook *entry; g_bioq_lock(&g_bio_run_down); TAILQ_FOREACH(entry, &g_classifier_tailq, link) { if (entry == hook) { TAILQ_REMOVE(&g_classifier_tailq, hook, link); break; } } g_bioq_unlock(&g_bio_run_down); } static void g_run_classifiers(struct bio *bp) { struct g_classifier_hook *hook; int classified = 0; biotrack(bp, __func__); TAILQ_FOREACH(hook, &g_classifier_tailq, link) classified |= hook->func(hook->arg, bp); if (!classified) bp->bio_classifier1 = BIO_NOTCLASSIFIED; } void g_io_request(struct bio *bp, struct g_consumer *cp) { struct g_provider *pp; struct mtx *mtxp; int direct, error, first; uint8_t cmd; biotrack(bp, __func__); KASSERT(cp != NULL, ("NULL cp in g_io_request")); KASSERT(bp != NULL, ("NULL bp in g_io_request")); pp = cp->provider; KASSERT(pp != NULL, ("consumer not attached in g_io_request")); #ifdef DIAGNOSTIC KASSERT(bp->bio_driver1 == NULL, ("bio_driver1 used by the consumer (geom %s)", cp->geom->name)); KASSERT(bp->bio_driver2 == NULL, ("bio_driver2 used by the consumer (geom %s)", cp->geom->name)); KASSERT(bp->bio_pflags == 0, ("bio_pflags used by the consumer (geom %s)", cp->geom->name)); /* * Remember consumer's private fields, so we can detect if they were * modified by the provider. */ bp->_bio_caller1 = bp->bio_caller1; bp->_bio_caller2 = bp->bio_caller2; bp->_bio_cflags = bp->bio_cflags; #endif cmd = bp->bio_cmd; if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_GETATTR) { KASSERT(bp->bio_data != NULL, ("NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd)); } if (cmd == BIO_DELETE || cmd == BIO_FLUSH) { KASSERT(bp->bio_data == NULL, ("non-NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd)); } if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_DELETE) { KASSERT(bp->bio_offset % cp->provider->sectorsize == 0, ("wrong offset %jd for sectorsize %u", bp->bio_offset, cp->provider->sectorsize)); KASSERT(bp->bio_length % cp->provider->sectorsize == 0, ("wrong length %jd for sectorsize %u", bp->bio_length, cp->provider->sectorsize)); } g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d", bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd); bp->bio_from = cp; bp->bio_to = pp; bp->bio_error = 0; bp->bio_completed = 0; KASSERT(!(bp->bio_flags & BIO_ONQUEUE), ("Bio already on queue bp=%p", bp)); if ((g_collectstats & G_STATS_CONSUMERS) != 0 || ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL)) binuptime(&bp->bio_t0); else getbinuptime(&bp->bio_t0); #ifdef GET_STACK_USAGE direct = (cp->flags & G_CF_DIRECT_SEND) != 0 && (pp->flags & G_PF_DIRECT_RECEIVE) != 0 && !g_is_geom_thread(curthread) && ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 || (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) && pace == 0; if (direct) { /* Block direct execution if less then half of stack left. */ size_t st, su; GET_STACK_USAGE(st, su); if (su * 2 > st) direct = 0; } #else direct = 0; #endif if (!TAILQ_EMPTY(&g_classifier_tailq) && !bp->bio_classifier1) { g_bioq_lock(&g_bio_run_down); g_run_classifiers(bp); g_bioq_unlock(&g_bio_run_down); } /* * The statistics collection is lockless, as such, but we * can not update one instance of the statistics from more * than one thread at a time, so grab the lock first. */ mtxp = mtx_pool_find(mtxpool_sleep, pp); mtx_lock(mtxp); if (g_collectstats & G_STATS_PROVIDERS) devstat_start_transaction(pp->stat, &bp->bio_t0); if (g_collectstats & G_STATS_CONSUMERS) devstat_start_transaction(cp->stat, &bp->bio_t0); pp->nstart++; cp->nstart++; mtx_unlock(mtxp); if (direct) { error = g_io_check(bp); if (error >= 0) { CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p " "provider %s returned %d", bp, bp->bio_to->name, error); g_io_deliver(bp, error); return; } bp->bio_to->geom->start(bp); } else { g_bioq_lock(&g_bio_run_down); first = TAILQ_EMPTY(&g_bio_run_down.bio_queue); TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue); bp->bio_flags |= BIO_ONQUEUE; g_bio_run_down.bio_queue_length++; g_bioq_unlock(&g_bio_run_down); /* Pass it on down. */ if (first) wakeup(&g_wait_down); } } void g_io_deliver(struct bio *bp, int error) { struct bintime now; struct g_consumer *cp; struct g_provider *pp; struct mtx *mtxp; int direct, first; biotrack(bp, __func__); KASSERT(bp != NULL, ("NULL bp in g_io_deliver")); pp = bp->bio_to; KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver")); cp = bp->bio_from; if (cp == NULL) { bp->bio_error = error; bp->bio_done(bp); return; } KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver")); KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver")); #ifdef DIAGNOSTIC /* * Some classes - GJournal in particular - can modify bio's * private fields while the bio is in transit; G_GEOM_VOLATILE_BIO * flag means it's an expected behaviour for that particular geom. */ if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) { KASSERT(bp->bio_caller1 == bp->_bio_caller1, ("bio_caller1 used by the provider %s", pp->name)); KASSERT(bp->bio_caller2 == bp->_bio_caller2, ("bio_caller2 used by the provider %s", pp->name)); KASSERT(bp->bio_cflags == bp->_bio_cflags, ("bio_cflags used by the provider %s", pp->name)); } #endif KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0")); KASSERT(bp->bio_completed <= bp->bio_length, ("bio_completed can't be greater than bio_length")); g_trace(G_T_BIO, "g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd", bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error, (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length); KASSERT(!(bp->bio_flags & BIO_ONQUEUE), ("Bio already on queue bp=%p", bp)); /* * XXX: next two doesn't belong here */ bp->bio_bcount = bp->bio_length; bp->bio_resid = bp->bio_bcount - bp->bio_completed; #ifdef GET_STACK_USAGE direct = (pp->flags & G_PF_DIRECT_SEND) && (cp->flags & G_CF_DIRECT_RECEIVE) && !g_is_geom_thread(curthread); if (direct) { /* Block direct execution if less then half of stack left. */ size_t st, su; GET_STACK_USAGE(st, su); if (su * 2 > st) direct = 0; } #else direct = 0; #endif /* * The statistics collection is lockless, as such, but we * can not update one instance of the statistics from more * than one thread at a time, so grab the lock first. */ if ((g_collectstats & G_STATS_CONSUMERS) != 0 || ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL)) binuptime(&now); mtxp = mtx_pool_find(mtxpool_sleep, cp); mtx_lock(mtxp); if (g_collectstats & G_STATS_PROVIDERS) devstat_end_transaction_bio_bt(pp->stat, bp, &now); if (g_collectstats & G_STATS_CONSUMERS) devstat_end_transaction_bio_bt(cp->stat, bp, &now); cp->nend++; pp->nend++; mtx_unlock(mtxp); if (error != ENOMEM) { bp->bio_error = error; if (direct) { biodone(bp); } else { g_bioq_lock(&g_bio_run_up); first = TAILQ_EMPTY(&g_bio_run_up.bio_queue); TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue); bp->bio_flags |= BIO_ONQUEUE; g_bio_run_up.bio_queue_length++; g_bioq_unlock(&g_bio_run_up); if (first) wakeup(&g_wait_up); } return; } if (bootverbose) printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name); bp->bio_children = 0; bp->bio_inbed = 0; bp->bio_driver1 = NULL; bp->bio_driver2 = NULL; bp->bio_pflags = 0; g_io_request(bp, cp); pace = 1; return; } SYSCTL_DECL(_kern_geom); static long transient_maps; SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD, &transient_maps, 0, "Total count of the transient mapping requests"); u_int transient_map_retries = 10; SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW, &transient_map_retries, 0, "Max count of retries used before giving up on creating transient map"); int transient_map_hard_failures; SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD, &transient_map_hard_failures, 0, "Failures to establish the transient mapping due to retry attempts " "exhausted"); int transient_map_soft_failures; SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD, &transient_map_soft_failures, 0, "Count of retried failures to establish the transient mapping"); int inflight_transient_maps; SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD, &inflight_transient_maps, 0, "Current count of the active transient maps"); static int g_io_transient_map_bio(struct bio *bp) { vm_offset_t addr; long size; u_int retried; KASSERT(unmapped_buf_allowed, ("unmapped disabled")); size = round_page(bp->bio_ma_offset + bp->bio_length); KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp)); addr = 0; retried = 0; atomic_add_long(&transient_maps, 1); retry: if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &addr)) { if (transient_map_retries != 0 && retried >= transient_map_retries) { CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s", bp, bp->bio_to->name); atomic_add_int(&transient_map_hard_failures, 1); return (EDEADLK/* XXXKIB */); } else { /* * Naive attempt to quisce the I/O to get more * in-flight requests completed and defragment * the transient_arena. */ CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d", bp, bp->bio_to->name, retried); pause("g_d_tra", hz / 10); retried++; atomic_add_int(&transient_map_soft_failures, 1); goto retry; } } atomic_add_int(&inflight_transient_maps, 1); pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size)); bp->bio_data = (caddr_t)addr + bp->bio_ma_offset; bp->bio_flags |= BIO_TRANSIENT_MAPPING; bp->bio_flags &= ~BIO_UNMAPPED; return (EJUSTRETURN); } void g_io_schedule_down(struct thread *tp __unused) { struct bio *bp; int error; for(;;) { g_bioq_lock(&g_bio_run_down); bp = g_bioq_first(&g_bio_run_down); if (bp == NULL) { CTR0(KTR_GEOM, "g_down going to sleep"); msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock, PRIBIO | PDROP, "-", 0); continue; } CTR0(KTR_GEOM, "g_down has work to do"); g_bioq_unlock(&g_bio_run_down); biotrack(bp, __func__); if (pace != 0) { /* * There has been at least one memory allocation * failure since the last I/O completed. Pause 1ms to * give the system a chance to free up memory. We only * do this once because a large number of allocations * can fail in the direct dispatch case and there's no * relationship between the number of these failures and * the length of the outage. If there's still an outage, * we'll pause again and again until it's * resolved. Older versions paused longer and once per * allocation failure. This was OK for a single threaded * g_down, but with direct dispatch would lead to max of * 10 IOPs for minutes at a time when transient memory * issues prevented allocation for a batch of requests * from the upper layers. * * XXX This pacing is really lame. It needs to be solved * by other methods. This is OK only because the worst * case scenario is so rare. In the worst case scenario * all memory is tied up waiting for I/O to complete * which can never happen since we can't allocate bios * for that I/O. */ CTR0(KTR_GEOM, "g_down pacing self"); pause("g_down", min(hz/1000, 1)); pace = 0; } CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp, bp->bio_to->name); error = g_io_check(bp); if (error >= 0) { CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider " "%s returned %d", bp, bp->bio_to->name, error); g_io_deliver(bp, error); continue; } THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld " "len %ld", bp, bp->bio_to->name, bp->bio_offset, bp->bio_length); bp->bio_to->geom->start(bp); THREAD_SLEEPING_OK(); } } void g_io_schedule_up(struct thread *tp __unused) { struct bio *bp; for(;;) { g_bioq_lock(&g_bio_run_up); bp = g_bioq_first(&g_bio_run_up); if (bp == NULL) { CTR0(KTR_GEOM, "g_up going to sleep"); msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock, PRIBIO | PDROP, "-", 0); continue; } g_bioq_unlock(&g_bio_run_up); THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off " "%jd len %ld", bp, bp->bio_to->name, bp->bio_offset, bp->bio_length); biodone(bp); THREAD_SLEEPING_OK(); } } void * g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error) { struct bio *bp; void *ptr; int errorc; KASSERT(length > 0 && length >= cp->provider->sectorsize && length <= MAXPHYS, ("g_read_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_READ; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; ptr = g_malloc(length, M_WAITOK); bp->bio_data = ptr; g_io_request(bp, cp); errorc = biowait(bp, "gread"); if (error != NULL) *error = errorc; g_destroy_bio(bp); if (errorc) { g_free(ptr); ptr = NULL; } return (ptr); } /* * A read function for use by ffs_sbget when used by GEOM-layer routines. */ int g_use_g_read_data(void *devfd, off_t loc, void **bufp, int size) { struct g_consumer *cp; KASSERT(*bufp == NULL, ("g_use_g_read_data: non-NULL *bufp %p\n", *bufp)); cp = (struct g_consumer *)devfd; /* * Take care not to issue an invalid I/O request. The offset of * the superblock candidate must be multiples of the provider's * sector size, otherwise an FFS can't exist on the provider * anyway. */ if (loc % cp->provider->sectorsize != 0) return (ENOENT); *bufp = g_read_data(cp, loc, size, NULL); if (*bufp == NULL) return (ENOENT); return (0); } int g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length) { struct bio *bp; int error; KASSERT(length > 0 && length >= cp->provider->sectorsize && length <= MAXPHYS, ("g_write_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_WRITE; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; bp->bio_data = ptr; g_io_request(bp, cp); error = biowait(bp, "gwrite"); g_destroy_bio(bp); return (error); } /* * A write function for use by ffs_sbput when used by GEOM-layer routines. */ int g_use_g_write_data(void *devfd, off_t loc, void *buf, int size) { return (g_write_data((struct g_consumer *)devfd, loc, buf, size)); } int g_delete_data(struct g_consumer *cp, off_t offset, off_t length) { struct bio *bp; int error; KASSERT(length > 0 && length >= cp->provider->sectorsize, ("g_delete_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_DELETE; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; bp->bio_data = NULL; g_io_request(bp, cp); error = biowait(bp, "gdelete"); g_destroy_bio(bp); return (error); } void g_print_bio(struct bio *bp) { const char *pname, *cmd = NULL; if (bp->bio_to != NULL) pname = bp->bio_to->name; else pname = "[unknown]"; switch (bp->bio_cmd) { case BIO_GETATTR: cmd = "GETATTR"; printf("%s[%s(attr=%s)]", pname, cmd, bp->bio_attribute); return; case BIO_FLUSH: cmd = "FLUSH"; printf("%s[%s]", pname, cmd); return; case BIO_ZONE: { char *subcmd = NULL; cmd = "ZONE"; switch (bp->bio_zone.zone_cmd) { case DISK_ZONE_OPEN: subcmd = "OPEN"; break; case DISK_ZONE_CLOSE: subcmd = "CLOSE"; break; case DISK_ZONE_FINISH: subcmd = "FINISH"; break; case DISK_ZONE_RWP: subcmd = "RWP"; break; case DISK_ZONE_REPORT_ZONES: subcmd = "REPORT ZONES"; break; case DISK_ZONE_GET_PARAMS: subcmd = "GET PARAMS"; break; default: subcmd = "UNKNOWN"; break; } printf("%s[%s,%s]", pname, cmd, subcmd); return; } case BIO_READ: cmd = "READ"; break; case BIO_WRITE: cmd = "WRITE"; break; case BIO_DELETE: cmd = "DELETE"; break; default: cmd = "UNKNOWN"; printf("%s[%s()]", pname, cmd); return; } printf("%s[%s(offset=%jd, length=%jd)]", pname, cmd, (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length); } Index: stable/12/sys/geom/geom_kern.c =================================================================== --- stable/12/sys/geom/geom_kern.c (revision 355609) +++ stable/12/sys/geom/geom_kern.c (revision 355610) @@ -1,240 +1,240 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_GEOM, "GEOM", "Geom data structures"); struct sx topology_lock; static struct proc *g_proc; -static struct thread *g_up_td; -static struct thread *g_down_td; -static struct thread *g_event_td; +static struct thread __read_mostly *g_up_td; +static struct thread __read_mostly *g_down_td; +static struct thread __read_mostly *g_event_td; -int g_debugflags; -int g_collectstats = 1; +int __read_mostly g_debugflags; +int __read_mostly g_collectstats = G_STATS_PROVIDERS; int g_shutdown; int g_notaste; /* * G_UP and G_DOWN are the two threads which push I/O through the * stack. * * Things are procesed in a FIFO order, but these threads could be * part of I/O prioritization by deciding which bios/bioqs to service * in what order. * * We have only one thread in each direction, it is believed that until * a very non-trivial workload in the UP/DOWN path this will be enough, * but more than one can actually be run without problems. * * Holding the "mymutex" is a debugging feature: It prevents people * from sleeping in the UP/DOWN I/O path by mistake or design (doing * so almost invariably result in deadlocks since it stalls all I/O * processing in the given direction. */ static void g_up_procbody(void *arg) { thread_lock(g_up_td); sched_prio(g_up_td, PRIBIO); thread_unlock(g_up_td); for(;;) { g_io_schedule_up(g_up_td); } } static void g_down_procbody(void *arg) { thread_lock(g_down_td); sched_prio(g_down_td, PRIBIO); thread_unlock(g_down_td); for(;;) { g_io_schedule_down(g_down_td); } } static void g_event_procbody(void *arg) { thread_lock(g_event_td); sched_prio(g_event_td, PRIBIO); thread_unlock(g_event_td); g_run_events(); /* NOTREACHED */ } int g_is_geom_thread(struct thread *td) { return (td == g_up_td || td == g_down_td || td == g_event_td); } static void geom_shutdown(void *foo __unused) { g_shutdown = 1; } void g_init(void) { g_trace(G_T_TOPOLOGY, "g_ignition"); sx_init(&topology_lock, "GEOM topology"); g_io_init(); g_event_init(); g_ctl_init(); kproc_kthread_add(g_event_procbody, NULL, &g_proc, &g_event_td, RFHIGHPID, 0, "geom", "g_event"); kproc_kthread_add(g_up_procbody, NULL, &g_proc, &g_up_td, RFHIGHPID, 0, "geom", "g_up"); kproc_kthread_add(g_down_procbody, NULL, &g_proc, &g_down_td, RFHIGHPID, 0, "geom", "g_down"); EVENTHANDLER_REGISTER(shutdown_pre_sync, geom_shutdown, NULL, SHUTDOWN_PRI_FIRST); } static int sysctl_kern_geom_confany(struct sysctl_req *req, g_event_t *func, size_t *hint) { size_t len = 0; int error = 0; struct sbuf *sb; if (req->oldptr == NULL) { sb = sbuf_new(NULL, NULL, PAGE_SIZE, SBUF_FIXEDLEN | SBUF_INCLUDENUL); sbuf_set_drain(sb, sbuf_count_drain, &len); g_waitfor_event(func, sb, M_WAITOK, NULL); req->oldidx = *hint = len; } else { sb = sbuf_new(NULL, NULL, *hint, SBUF_AUTOEXTEND | SBUF_INCLUDENUL); g_waitfor_event(func, sb, M_WAITOK, NULL); *hint = sbuf_len(sb); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); } sbuf_delete(sb); return error; } static int sysctl_kern_geom_conftxt(SYSCTL_HANDLER_ARGS) { static size_t hint = PAGE_SIZE; return (sysctl_kern_geom_confany(req, g_conftxt, &hint)); } static int sysctl_kern_geom_confdot(SYSCTL_HANDLER_ARGS) { static size_t hint = PAGE_SIZE; return (sysctl_kern_geom_confany(req, g_confdot, &hint)); } static int sysctl_kern_geom_confxml(SYSCTL_HANDLER_ARGS) { static size_t hint = PAGE_SIZE; return (sysctl_kern_geom_confany(req, g_confxml, &hint)); } SYSCTL_NODE(_kern, OID_AUTO, geom, CTLFLAG_RW, 0, "GEOMetry management"); SYSCTL_PROC(_kern_geom, OID_AUTO, confxml, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, sysctl_kern_geom_confxml, "", "Dump the GEOM config in XML"); SYSCTL_PROC(_kern_geom, OID_AUTO, confdot, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, sysctl_kern_geom_confdot, "", "Dump the GEOM config in dot"); SYSCTL_PROC(_kern_geom, OID_AUTO, conftxt, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, sysctl_kern_geom_conftxt, "", "Dump the GEOM config in txt"); SYSCTL_INT(_kern_geom, OID_AUTO, debugflags, CTLFLAG_RWTUN, &g_debugflags, 0, "Set various trace levels for GEOM debugging"); SYSCTL_INT(_kern_geom, OID_AUTO, notaste, CTLFLAG_RW, &g_notaste, 0, "Prevent GEOM tasting"); SYSCTL_INT(_kern_geom, OID_AUTO, collectstats, CTLFLAG_RW, &g_collectstats, 0, "Control statistics collection on GEOM providers and consumers"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_class, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_class), "sizeof(struct g_class)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_geom, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_geom), "sizeof(struct g_geom)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_provider, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_provider), "sizeof(struct g_provider)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_consumer, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_consumer), "sizeof(struct g_consumer)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_bioq, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_bioq), "sizeof(struct g_bioq)"); Index: stable/12/sys/kern/kern_mtxpool.c =================================================================== --- stable/12/sys/kern/kern_mtxpool.c (revision 355609) +++ stable/12/sys/kern/kern_mtxpool.c (revision 355610) @@ -1,190 +1,190 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2001 Matthew Dillon. All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* Mutex pool routines. These routines are designed to be used as short * term leaf mutexes (e.g. the last mutex you might acquire other then * calling msleep()). They operate using a shared pool. A mutex is chosen * from the pool based on the supplied pointer (which may or may not be * valid). * * Advantages: * - no structural overhead. Mutexes can be associated with structures * without adding bloat to the structures. * - mutexes can be obtained for invalid pointers, useful when uses * mutexes to interlock destructor ops. * - no initialization/destructor overhead. * - can be used with msleep. * * Disadvantages: * - should generally only be used as leaf mutexes. * - pool/pool dependency ordering cannot be depended on. * - possible L1 cache mastersip contention between cpus. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MTXPOOL, "mtx_pool", "mutex pool"); /* Pool sizes must be a power of two */ #ifndef MTX_POOL_SLEEP_SIZE #define MTX_POOL_SLEEP_SIZE 1024 #endif struct mtxpool_header { int mtxpool_size; int mtxpool_mask; int mtxpool_shift; int mtxpool_next __aligned(CACHE_LINE_SIZE); }; struct mtx_pool { struct mtxpool_header mtx_pool_header; struct mtx mtx_pool_ary[1]; }; #define mtx_pool_size mtx_pool_header.mtxpool_size #define mtx_pool_mask mtx_pool_header.mtxpool_mask #define mtx_pool_shift mtx_pool_header.mtxpool_shift #define mtx_pool_next mtx_pool_header.mtxpool_next -struct mtx_pool *mtxpool_sleep; +struct mtx_pool __read_frequently *mtxpool_sleep; #if UINTPTR_MAX == UINT64_MAX /* 64 bits */ # define POINTER_BITS 64 # define HASH_MULTIPLIER 11400714819323198485u /* (2^64)*(sqrt(5)-1)/2 */ #else /* assume 32 bits */ # define POINTER_BITS 32 # define HASH_MULTIPLIER 2654435769u /* (2^32)*(sqrt(5)-1)/2 */ #endif /* * Return the (shared) pool mutex associated with the specified address. * The returned mutex is a leaf level mutex, meaning that if you obtain it * you cannot obtain any other mutexes until you release it. You can * legally msleep() on the mutex. */ struct mtx * mtx_pool_find(struct mtx_pool *pool, void *ptr) { int p; KASSERT(pool != NULL, ("_mtx_pool_find(): null pool")); /* * Fibonacci hash, see Knuth's * _Art of Computer Programming, Volume 3 / Sorting and Searching_ */ p = ((HASH_MULTIPLIER * (uintptr_t)ptr) >> pool->mtx_pool_shift) & pool->mtx_pool_mask; return (&pool->mtx_pool_ary[p]); } static void mtx_pool_initialize(struct mtx_pool *pool, const char *mtx_name, int pool_size, int opts) { int i, maskbits; pool->mtx_pool_size = pool_size; pool->mtx_pool_mask = pool_size - 1; for (i = 1, maskbits = 0; (i & pool_size) == 0; i = i << 1) maskbits++; pool->mtx_pool_shift = POINTER_BITS - maskbits; pool->mtx_pool_next = 0; for (i = 0; i < pool_size; ++i) mtx_init(&pool->mtx_pool_ary[i], mtx_name, NULL, opts); } struct mtx_pool * mtx_pool_create(const char *mtx_name, int pool_size, int opts) { struct mtx_pool *pool; if (pool_size <= 0 || !powerof2(pool_size)) { printf("WARNING: %s pool size is not a power of 2.\n", mtx_name); pool_size = 128; } pool = malloc(sizeof (struct mtx_pool) + ((pool_size - 1) * sizeof (struct mtx)), M_MTXPOOL, M_WAITOK | M_ZERO); mtx_pool_initialize(pool, mtx_name, pool_size, opts); return pool; } void mtx_pool_destroy(struct mtx_pool **poolp) { int i; struct mtx_pool *pool = *poolp; for (i = pool->mtx_pool_size - 1; i >= 0; --i) mtx_destroy(&pool->mtx_pool_ary[i]); free(pool, M_MTXPOOL); *poolp = NULL; } static void mtx_pool_setup_dynamic(void *dummy __unused) { mtxpool_sleep = mtx_pool_create("sleep mtxpool", MTX_POOL_SLEEP_SIZE, MTX_DEF); } /* * Obtain a (shared) mutex from the pool. The returned mutex is a leaf * level mutex, meaning that if you obtain it you cannot obtain any other * mutexes until you release it. You can legally msleep() on the mutex. */ struct mtx * mtx_pool_alloc(struct mtx_pool *pool) { int i; KASSERT(pool != NULL, ("mtx_pool_alloc(): null pool")); /* * mtx_pool_next is unprotected against multiple accesses, * but simultaneous access by two CPUs should not be very * harmful. */ i = pool->mtx_pool_next; pool->mtx_pool_next = (i + 1) & pool->mtx_pool_mask; return (&pool->mtx_pool_ary[i]); } SYSINIT(mtxpooli2, SI_SUB_MTX_POOL_DYNAMIC, SI_ORDER_FIRST, mtx_pool_setup_dynamic, NULL); Index: stable/12/sys/kern/kern_shutdown.c =================================================================== --- stable/12/sys/kern/kern_shutdown.c (revision 355609) +++ stable/12/sys/kern/kern_shutdown.c (revision 355610) @@ -1,1588 +1,1588 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ekcd.h" #include "opt_kdb.h" #include "opt_panic.h" #include "opt_sched.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_DUMPER, "dumper", "dumper block buffer"); #ifndef PANIC_REBOOT_WAIT_TIME #define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ #endif static int panic_reboot_wait_time = PANIC_REBOOT_WAIT_TIME; SYSCTL_INT(_kern, OID_AUTO, panic_reboot_wait_time, CTLFLAG_RWTUN, &panic_reboot_wait_time, 0, "Seconds to wait before rebooting after a panic"); /* * Note that stdarg.h and the ANSI style va_start macro is used for both * ANSI and traditional C compilers. */ #include #ifdef KDB #ifdef KDB_UNATTENDED static int debugger_on_panic = 0; #else static int debugger_on_panic = 1; #endif SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RWTUN | CTLFLAG_SECURE, &debugger_on_panic, 0, "Run debugger on kernel panic"); int debugger_on_trap = 0; SYSCTL_INT(_debug, OID_AUTO, debugger_on_trap, CTLFLAG_RWTUN | CTLFLAG_SECURE, &debugger_on_trap, 0, "Run debugger on kernel trap before panic"); #ifdef KDB_TRACE static int trace_on_panic = 1; static bool trace_all_panics = true; #else static int trace_on_panic = 0; static bool trace_all_panics = false; #endif SYSCTL_INT(_debug, OID_AUTO, trace_on_panic, CTLFLAG_RWTUN | CTLFLAG_SECURE, &trace_on_panic, 0, "Print stack trace on kernel panic"); SYSCTL_BOOL(_debug, OID_AUTO, trace_all_panics, CTLFLAG_RWTUN, &trace_all_panics, 0, "Print stack traces on secondary kernel panics"); #endif /* KDB */ static int sync_on_panic = 0; SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RWTUN, &sync_on_panic, 0, "Do a sync before rebooting from a panic"); static bool poweroff_on_panic = 0; SYSCTL_BOOL(_kern, OID_AUTO, poweroff_on_panic, CTLFLAG_RWTUN, &poweroff_on_panic, 0, "Do a power off instead of a reboot on a panic"); static bool powercycle_on_panic = 0; SYSCTL_BOOL(_kern, OID_AUTO, powercycle_on_panic, CTLFLAG_RWTUN, &powercycle_on_panic, 0, "Do a power cycle instead of a reboot on a panic"); static SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0, "Shutdown environment"); #ifndef DIAGNOSTIC static int show_busybufs; #else static int show_busybufs = 1; #endif SYSCTL_INT(_kern_shutdown, OID_AUTO, show_busybufs, CTLFLAG_RW, &show_busybufs, 0, ""); int suspend_blocked = 0; SYSCTL_INT(_kern, OID_AUTO, suspend_blocked, CTLFLAG_RW, &suspend_blocked, 0, "Block suspend due to a pending shutdown"); #ifdef EKCD FEATURE(ekcd, "Encrypted kernel crash dumps support"); MALLOC_DEFINE(M_EKCD, "ekcd", "Encrypted kernel crash dumps data"); struct kerneldumpcrypto { uint8_t kdc_encryption; uint8_t kdc_iv[KERNELDUMP_IV_MAX_SIZE]; keyInstance kdc_ki; cipherInstance kdc_ci; uint32_t kdc_dumpkeysize; struct kerneldumpkey kdc_dumpkey[]; }; #endif struct kerneldumpcomp { uint8_t kdc_format; struct compressor *kdc_stream; uint8_t *kdc_buf; size_t kdc_resid; }; static struct kerneldumpcomp *kerneldumpcomp_create(struct dumperinfo *di, uint8_t compression); static void kerneldumpcomp_destroy(struct dumperinfo *di); static int kerneldumpcomp_write_cb(void *base, size_t len, off_t off, void *arg); static int kerneldump_gzlevel = 6; SYSCTL_INT(_kern, OID_AUTO, kerneldump_gzlevel, CTLFLAG_RWTUN, &kerneldump_gzlevel, 0, "Kernel crash dump compression level"); /* * Variable panicstr contains argument to first call to panic; used as flag * to indicate that the kernel has already called panic. */ -const char *panicstr; +const char __read_mostly *panicstr; -int dumping; /* system is dumping */ +int __read_mostly dumping; /* system is dumping */ int rebooting; /* system is rebooting */ static struct dumperinfo dumper; /* our selected dumper */ /* Context information for dump-debuggers. */ static struct pcb dumppcb; /* Registers. */ lwpid_t dumptid; /* Thread ID. */ static struct cdevsw reroot_cdevsw = { .d_version = D_VERSION, .d_name = "reroot", }; static void poweroff_wait(void *, int); static void shutdown_halt(void *junk, int howto); static void shutdown_panic(void *junk, int howto); static void shutdown_reset(void *junk, int howto); static int kern_reroot(void); /* register various local shutdown events */ static void shutdown_conf(void *unused) { EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL, SHUTDOWN_PRI_FIRST); EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL, SHUTDOWN_PRI_LAST + 200); } SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL); /* * The only reason this exists is to create the /dev/reroot/ directory, * used by reroot code in init(8) as a mountpoint for tmpfs. */ static void reroot_conf(void *unused) { int error; struct cdev *cdev; error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &cdev, &reroot_cdevsw, NULL, UID_ROOT, GID_WHEEL, 0600, "reroot/reroot"); if (error != 0) { printf("%s: failed to create device node, error %d", __func__, error); } } SYSINIT(reroot_conf, SI_SUB_DEVFS, SI_ORDER_ANY, reroot_conf, NULL); /* * The system call that results in a reboot. */ /* ARGSUSED */ int sys_reboot(struct thread *td, struct reboot_args *uap) { int error; error = 0; #ifdef MAC error = mac_system_check_reboot(td->td_ucred, uap->opt); #endif if (error == 0) error = priv_check(td, PRIV_REBOOT); if (error == 0) { if (uap->opt & RB_REROOT) error = kern_reroot(); else kern_reboot(uap->opt); } return (error); } static void shutdown_nice_task_fn(void *arg, int pending __unused) { int howto; howto = (uintptr_t)arg; /* Send a signal to init(8) and have it shutdown the world. */ PROC_LOCK(initproc); if (howto & RB_POWEROFF) kern_psignal(initproc, SIGUSR2); else if (howto & RB_POWERCYCLE) kern_psignal(initproc, SIGWINCH); else if (howto & RB_HALT) kern_psignal(initproc, SIGUSR1); else kern_psignal(initproc, SIGINT); PROC_UNLOCK(initproc); } static struct task shutdown_nice_task = TASK_INITIALIZER(0, &shutdown_nice_task_fn, NULL); /* * Called by events that want to shut down.. e.g on a PC */ void shutdown_nice(int howto) { if (initproc != NULL && !SCHEDULER_STOPPED()) { shutdown_nice_task.ta_context = (void *)(uintptr_t)howto; taskqueue_enqueue(taskqueue_fast, &shutdown_nice_task); } else { /* * No init(8) running, or scheduler would not allow it * to run, so simply reboot. */ kern_reboot(howto | RB_NOSYNC); } } static void print_uptime(void) { int f; struct timespec ts; getnanouptime(&ts); printf("Uptime: "); f = 0; if (ts.tv_sec >= 86400) { printf("%ldd", (long)ts.tv_sec / 86400); ts.tv_sec %= 86400; f = 1; } if (f || ts.tv_sec >= 3600) { printf("%ldh", (long)ts.tv_sec / 3600); ts.tv_sec %= 3600; f = 1; } if (f || ts.tv_sec >= 60) { printf("%ldm", (long)ts.tv_sec / 60); ts.tv_sec %= 60; f = 1; } printf("%lds\n", (long)ts.tv_sec); } int doadump(boolean_t textdump) { boolean_t coredump; int error; error = 0; if (dumping) return (EBUSY); if (dumper.dumper == NULL) return (ENXIO); savectx(&dumppcb); dumptid = curthread->td_tid; dumping++; coredump = TRUE; #ifdef DDB if (textdump && textdump_pending) { coredump = FALSE; textdump_dumpsys(&dumper); } #endif if (coredump) error = dumpsys(&dumper); dumping--; return (error); } /* * Shutdown the system cleanly to prepare for reboot, halt, or power off. */ void kern_reboot(int howto) { static int once = 0; /* * Normal paths here don't hold Giant, but we can wind up here * unexpectedly with it held. Drop it now so we don't have to * drop and pick it up elsewhere. The paths it is locking will * never be returned to, and it is preferable to preclude * deadlock than to lock against code that won't ever * continue. */ while (mtx_owned(&Giant)) mtx_unlock(&Giant); #if defined(SMP) /* * Bind us to the first CPU so that all shutdown code runs there. Some * systems don't shutdown properly (i.e., ACPI power off) if we * run on another processor. */ if (!SCHEDULER_STOPPED()) { thread_lock(curthread); sched_bind(curthread, CPU_FIRST()); thread_unlock(curthread); KASSERT(PCPU_GET(cpuid) == CPU_FIRST(), ("boot: not running on cpu 0")); } #endif /* We're in the process of rebooting. */ rebooting = 1; /* We are out of the debugger now. */ kdb_active = 0; /* * Do any callouts that should be done BEFORE syncing the filesystems. */ EVENTHANDLER_INVOKE(shutdown_pre_sync, howto); /* * Now sync filesystems */ if (!cold && (howto & RB_NOSYNC) == 0 && once == 0) { once = 1; bufshutdown(show_busybufs); } print_uptime(); cngrab(); /* * Ok, now do things that assume all filesystem activity has * been completed. */ EVENTHANDLER_INVOKE(shutdown_post_sync, howto); if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold && !dumping) doadump(TRUE); /* Now that we're going to really halt the system... */ EVENTHANDLER_INVOKE(shutdown_final, howto); for(;;) ; /* safety against shutdown_reset not working */ /* NOTREACHED */ } /* * The system call that results in changing the rootfs. */ static int kern_reroot(void) { struct vnode *oldrootvnode, *vp; struct mount *mp, *devmp; int error; if (curproc != initproc) return (EPERM); /* * Mark the filesystem containing currently-running executable * (the temporary copy of init(8)) busy. */ vp = curproc->p_textvp; error = vn_lock(vp, LK_SHARED); if (error != 0) return (error); mp = vp->v_mount; error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(vp, 0); error = vfs_busy(mp, 0); vn_lock(vp, LK_SHARED | LK_RETRY); vfs_rel(mp); if (error != 0) { VOP_UNLOCK(vp, 0); return (ENOENT); } if (vp->v_iflag & VI_DOOMED) { VOP_UNLOCK(vp, 0); vfs_unbusy(mp); return (ENOENT); } } VOP_UNLOCK(vp, 0); /* * Remove the filesystem containing currently-running executable * from the mount list, to prevent it from being unmounted * by vfs_unmountall(), and to avoid confusing vfs_mountroot(). * * Also preserve /dev - forcibly unmounting it could cause driver * reinitialization. */ vfs_ref(rootdevmp); devmp = rootdevmp; rootdevmp = NULL; mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); TAILQ_REMOVE(&mountlist, devmp, mnt_list); mtx_unlock(&mountlist_mtx); oldrootvnode = rootvnode; /* * Unmount everything except for the two filesystems preserved above. */ vfs_unmountall(); /* * Add /dev back; vfs_mountroot() will move it into its new place. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, devmp, mnt_list); mtx_unlock(&mountlist_mtx); rootdevmp = devmp; vfs_rel(rootdevmp); /* * Mount the new rootfs. */ vfs_mountroot(); /* * Update all references to the old rootvnode. */ mountcheckdirs(oldrootvnode, rootvnode); /* * Add the temporary filesystem back and unbusy it. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_unbusy(mp); return (0); } /* * If the shutdown was a clean halt, behave accordingly. */ static void shutdown_halt(void *junk, int howto) { if (howto & RB_HALT) { printf("\n"); printf("The operating system has halted.\n"); printf("Please press any key to reboot.\n\n"); wdog_kern_pat(WD_TO_NEVER); switch (cngetc()) { case -1: /* No console, just die */ cpu_halt(); /* NOTREACHED */ default: break; } } } /* * Check to see if the system paniced, pause and then reboot * according to the specified delay. */ static void shutdown_panic(void *junk, int howto) { int loop; if (howto & RB_DUMP) { if (panic_reboot_wait_time != 0) { if (panic_reboot_wait_time != -1) { printf("Automatic reboot in %d seconds - " "press a key on the console to abort\n", panic_reboot_wait_time); for (loop = panic_reboot_wait_time * 10; loop > 0; --loop) { DELAY(1000 * 100); /* 1/10th second */ /* Did user type a key? */ if (cncheckc() != -1) break; } if (!loop) return; } } else { /* zero time specified - reboot NOW */ return; } printf("--> Press a key on the console to reboot,\n"); printf("--> or switch off the system now.\n"); cngetc(); } } /* * Everything done, now reset */ static void shutdown_reset(void *junk, int howto) { printf("Rebooting...\n"); DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ /* * Acquiring smp_ipi_mtx here has a double effect: * - it disables interrupts avoiding CPU0 preemption * by fast handlers (thus deadlocking against other CPUs) * - it avoids deadlocks against smp_rendezvous() or, more * generally, threads busy-waiting, with this spinlock held, * and waiting for responses by threads on other CPUs * (ie. smp_tlb_shootdown()). * * For the !SMP case it just needs to handle the former problem. */ #ifdef SMP mtx_lock_spin(&smp_ipi_mtx); #else spinlock_enter(); #endif /* cpu_boot(howto); */ /* doesn't do anything at the moment */ cpu_reset(); /* NOTREACHED */ /* assuming reset worked */ } #if defined(WITNESS) || defined(INVARIANT_SUPPORT) static int kassert_warn_only = 0; #ifdef KDB static int kassert_do_kdb = 0; #endif #ifdef KTR static int kassert_do_ktr = 0; #endif static int kassert_do_log = 1; static int kassert_log_pps_limit = 4; static int kassert_log_mute_at = 0; static int kassert_log_panic_at = 0; static int kassert_suppress_in_panic = 0; static int kassert_warnings = 0; SYSCTL_NODE(_debug, OID_AUTO, kassert, CTLFLAG_RW, NULL, "kassert options"); #ifdef KASSERT_PANIC_OPTIONAL #define KASSERT_RWTUN CTLFLAG_RWTUN #else #define KASSERT_RWTUN CTLFLAG_RDTUN #endif SYSCTL_INT(_debug_kassert, OID_AUTO, warn_only, KASSERT_RWTUN, &kassert_warn_only, 0, "KASSERT triggers a panic (0) or just a warning (1)"); #ifdef KDB SYSCTL_INT(_debug_kassert, OID_AUTO, do_kdb, KASSERT_RWTUN, &kassert_do_kdb, 0, "KASSERT will enter the debugger"); #endif #ifdef KTR SYSCTL_UINT(_debug_kassert, OID_AUTO, do_ktr, KASSERT_RWTUN, &kassert_do_ktr, 0, "KASSERT does a KTR, set this to the KTRMASK you want"); #endif SYSCTL_INT(_debug_kassert, OID_AUTO, do_log, KASSERT_RWTUN, &kassert_do_log, 0, "If warn_only is enabled, log (1) or do not log (0) assertion violations"); SYSCTL_INT(_debug_kassert, OID_AUTO, warnings, CTLFLAG_RD | CTLFLAG_STATS, &kassert_warnings, 0, "number of KASSERTs that have been triggered"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_panic_at, KASSERT_RWTUN, &kassert_log_panic_at, 0, "max number of KASSERTS before we will panic"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_pps_limit, KASSERT_RWTUN, &kassert_log_pps_limit, 0, "limit number of log messages per second"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_mute_at, KASSERT_RWTUN, &kassert_log_mute_at, 0, "max number of KASSERTS to log"); SYSCTL_INT(_debug_kassert, OID_AUTO, suppress_in_panic, KASSERT_RWTUN, &kassert_suppress_in_panic, 0, "KASSERTs will be suppressed while handling a panic"); #undef KASSERT_RWTUN static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_debug_kassert, OID_AUTO, kassert, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0, kassert_sysctl_kassert, "I", "set to trigger a test kassert"); static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS) { int error, i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); KASSERT(0, ("kassert_sysctl_kassert triggered kassert %d", i)); return (0); } #ifdef KASSERT_PANIC_OPTIONAL /* * Called by KASSERT, this decides if we will panic * or if we will log via printf and/or ktr. */ void kassert_panic(const char *fmt, ...) { static char buf[256]; va_list ap; va_start(ap, fmt); (void)vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); /* * If we are suppressing secondary panics, log the warning but do not * re-enter panic/kdb. */ if (panicstr != NULL && kassert_suppress_in_panic) { if (kassert_do_log) { printf("KASSERT failed: %s\n", buf); #ifdef KDB if (trace_all_panics && trace_on_panic) kdb_backtrace(); #endif } return; } /* * panic if we're not just warning, or if we've exceeded * kassert_log_panic_at warnings. */ if (!kassert_warn_only || (kassert_log_panic_at > 0 && kassert_warnings >= kassert_log_panic_at)) { va_start(ap, fmt); vpanic(fmt, ap); /* NORETURN */ } #ifdef KTR if (kassert_do_ktr) CTR0(ktr_mask, buf); #endif /* KTR */ /* * log if we've not yet met the mute limit. */ if (kassert_do_log && (kassert_log_mute_at == 0 || kassert_warnings < kassert_log_mute_at)) { static struct timeval lasterr; static int curerr; if (ppsratecheck(&lasterr, &curerr, kassert_log_pps_limit)) { printf("KASSERT failed: %s\n", buf); kdb_backtrace(); } } #ifdef KDB if (kassert_do_kdb) { kdb_enter(KDB_WHY_KASSERT, buf); } #endif atomic_add_int(&kassert_warnings, 1); } #endif /* KASSERT_PANIC_OPTIONAL */ #endif /* * Panic is called on unresolvable fatal errors. It prints "panic: mesg", * and then reboots. If we are called twice, then we avoid trying to sync * the disks as this often leads to recursive panics. */ void panic(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vpanic(fmt, ap); } void vpanic(const char *fmt, va_list ap) { #ifdef SMP cpuset_t other_cpus; #endif struct thread *td = curthread; int bootopt, newpanic; static char buf[256]; spinlock_enter(); #ifdef SMP /* * stop_cpus_hard(other_cpus) should prevent multiple CPUs from * concurrently entering panic. Only the winner will proceed * further. */ if (panicstr == NULL && !kdb_active) { other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); stop_cpus_hard(other_cpus); } #endif /* * Ensure that the scheduler is stopped while panicking, even if panic * has been entered from kdb. */ td->td_stopsched = 1; bootopt = RB_AUTOBOOT; newpanic = 0; if (panicstr) bootopt |= RB_NOSYNC; else { bootopt |= RB_DUMP; panicstr = fmt; newpanic = 1; } if (newpanic) { (void)vsnprintf(buf, sizeof(buf), fmt, ap); panicstr = buf; cngrab(); printf("panic: %s\n", buf); } else { printf("panic: "); vprintf(fmt, ap); printf("\n"); } #ifdef SMP printf("cpuid = %d\n", PCPU_GET(cpuid)); #endif printf("time = %jd\n", (intmax_t )time_second); #ifdef KDB if ((newpanic || trace_all_panics) && trace_on_panic) kdb_backtrace(); if (debugger_on_panic) kdb_enter(KDB_WHY_PANIC, "panic"); #endif /*thread_lock(td); */ td->td_flags |= TDF_INPANIC; /* thread_unlock(td); */ if (!sync_on_panic) bootopt |= RB_NOSYNC; if (poweroff_on_panic) bootopt |= RB_POWEROFF; if (powercycle_on_panic) bootopt |= RB_POWERCYCLE; kern_reboot(bootopt); } /* * Support for poweroff delay. * * Please note that setting this delay too short might power off your machine * before the write cache on your hard disk has been flushed, leading to * soft-updates inconsistencies. */ #ifndef POWEROFF_DELAY # define POWEROFF_DELAY 5000 #endif static int poweroff_delay = POWEROFF_DELAY; SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW, &poweroff_delay, 0, "Delay before poweroff to write disk caches (msec)"); static void poweroff_wait(void *junk, int howto) { if ((howto & (RB_POWEROFF | RB_POWERCYCLE)) == 0 || poweroff_delay <= 0) return; DELAY(poweroff_delay * 1000); } /* * Some system processes (e.g. syncer) need to be stopped at appropriate * points in their main loops prior to a system shutdown, so that they * won't interfere with the shutdown process (e.g. by holding a disk buf * to cause sync to fail). For each of these system processes, register * shutdown_kproc() as a handler for one of shutdown events. */ static int kproc_shutdown_wait = 60; SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW, &kproc_shutdown_wait, 0, "Max wait time (sec) to stop for each process"); void kproc_shutdown(void *arg, int howto) { struct proc *p; int error; if (panicstr) return; p = (struct proc *)arg; printf("Waiting (max %d seconds) for system process `%s' to stop... ", kproc_shutdown_wait, p->p_comm); error = kproc_suspend(p, kproc_shutdown_wait * hz); if (error == EWOULDBLOCK) printf("timed out\n"); else printf("done\n"); } void kthread_shutdown(void *arg, int howto) { struct thread *td; int error; if (panicstr) return; td = (struct thread *)arg; printf("Waiting (max %d seconds) for system thread `%s' to stop... ", kproc_shutdown_wait, td->td_name); error = kthread_suspend(td, kproc_shutdown_wait * hz); if (error == EWOULDBLOCK) printf("timed out\n"); else printf("done\n"); } static char dumpdevname[sizeof(((struct cdev*)NULL)->si_name)]; SYSCTL_STRING(_kern_shutdown, OID_AUTO, dumpdevname, CTLFLAG_RD, dumpdevname, 0, "Device for kernel dumps"); static int _dump_append(struct dumperinfo *di, void *virtual, vm_offset_t physical, size_t length); #ifdef EKCD static struct kerneldumpcrypto * kerneldumpcrypto_create(size_t blocksize, uint8_t encryption, const uint8_t *key, uint32_t encryptedkeysize, const uint8_t *encryptedkey) { struct kerneldumpcrypto *kdc; struct kerneldumpkey *kdk; uint32_t dumpkeysize; dumpkeysize = roundup2(sizeof(*kdk) + encryptedkeysize, blocksize); kdc = malloc(sizeof(*kdc) + dumpkeysize, M_EKCD, M_WAITOK | M_ZERO); arc4rand(kdc->kdc_iv, sizeof(kdc->kdc_iv), 0); kdc->kdc_encryption = encryption; switch (kdc->kdc_encryption) { case KERNELDUMP_ENC_AES_256_CBC: if (rijndael_makeKey(&kdc->kdc_ki, DIR_ENCRYPT, 256, key) <= 0) goto failed; break; default: goto failed; } kdc->kdc_dumpkeysize = dumpkeysize; kdk = kdc->kdc_dumpkey; kdk->kdk_encryption = kdc->kdc_encryption; memcpy(kdk->kdk_iv, kdc->kdc_iv, sizeof(kdk->kdk_iv)); kdk->kdk_encryptedkeysize = htod32(encryptedkeysize); memcpy(kdk->kdk_encryptedkey, encryptedkey, encryptedkeysize); return (kdc); failed: explicit_bzero(kdc, sizeof(*kdc) + dumpkeysize); free(kdc, M_EKCD); return (NULL); } static int kerneldumpcrypto_init(struct kerneldumpcrypto *kdc) { uint8_t hash[SHA256_DIGEST_LENGTH]; SHA256_CTX ctx; struct kerneldumpkey *kdk; int error; error = 0; if (kdc == NULL) return (0); /* * When a user enters ddb it can write a crash dump multiple times. * Each time it should be encrypted using a different IV. */ SHA256_Init(&ctx); SHA256_Update(&ctx, kdc->kdc_iv, sizeof(kdc->kdc_iv)); SHA256_Final(hash, &ctx); bcopy(hash, kdc->kdc_iv, sizeof(kdc->kdc_iv)); switch (kdc->kdc_encryption) { case KERNELDUMP_ENC_AES_256_CBC: if (rijndael_cipherInit(&kdc->kdc_ci, MODE_CBC, kdc->kdc_iv) <= 0) { error = EINVAL; goto out; } break; default: error = EINVAL; goto out; } kdk = kdc->kdc_dumpkey; memcpy(kdk->kdk_iv, kdc->kdc_iv, sizeof(kdk->kdk_iv)); out: explicit_bzero(hash, sizeof(hash)); return (error); } static uint32_t kerneldumpcrypto_dumpkeysize(const struct kerneldumpcrypto *kdc) { if (kdc == NULL) return (0); return (kdc->kdc_dumpkeysize); } #endif /* EKCD */ static struct kerneldumpcomp * kerneldumpcomp_create(struct dumperinfo *di, uint8_t compression) { struct kerneldumpcomp *kdcomp; int format; switch (compression) { case KERNELDUMP_COMP_GZIP: format = COMPRESS_GZIP; break; case KERNELDUMP_COMP_ZSTD: format = COMPRESS_ZSTD; break; default: return (NULL); } kdcomp = malloc(sizeof(*kdcomp), M_DUMPER, M_WAITOK | M_ZERO); kdcomp->kdc_format = compression; kdcomp->kdc_stream = compressor_init(kerneldumpcomp_write_cb, format, di->maxiosize, kerneldump_gzlevel, di); if (kdcomp->kdc_stream == NULL) { free(kdcomp, M_DUMPER); return (NULL); } kdcomp->kdc_buf = malloc(di->maxiosize, M_DUMPER, M_WAITOK | M_NODUMP); return (kdcomp); } static void kerneldumpcomp_destroy(struct dumperinfo *di) { struct kerneldumpcomp *kdcomp; kdcomp = di->kdcomp; if (kdcomp == NULL) return; compressor_fini(kdcomp->kdc_stream); explicit_bzero(kdcomp->kdc_buf, di->maxiosize); free(kdcomp->kdc_buf, M_DUMPER); free(kdcomp, M_DUMPER); } /* Registration of dumpers */ int set_dumper(struct dumperinfo *di, const char *devname, struct thread *td, uint8_t compression, uint8_t encryption, const uint8_t *key, uint32_t encryptedkeysize, const uint8_t *encryptedkey) { size_t wantcopy; int error; error = priv_check(td, PRIV_SETDUMPER); if (error != 0) return (error); if (dumper.dumper != NULL) return (EBUSY); dumper = *di; dumper.blockbuf = NULL; dumper.kdcrypto = NULL; dumper.kdcomp = NULL; if (encryption != KERNELDUMP_ENC_NONE) { #ifdef EKCD dumper.kdcrypto = kerneldumpcrypto_create(di->blocksize, encryption, key, encryptedkeysize, encryptedkey); if (dumper.kdcrypto == NULL) { error = EINVAL; goto cleanup; } #else error = EOPNOTSUPP; goto cleanup; #endif } wantcopy = strlcpy(dumpdevname, devname, sizeof(dumpdevname)); if (wantcopy >= sizeof(dumpdevname)) { printf("set_dumper: device name truncated from '%s' -> '%s'\n", devname, dumpdevname); } if (compression != KERNELDUMP_COMP_NONE) { /* * We currently can't support simultaneous encryption and * compression. */ if (encryption != KERNELDUMP_ENC_NONE) { error = EOPNOTSUPP; goto cleanup; } dumper.kdcomp = kerneldumpcomp_create(&dumper, compression); if (dumper.kdcomp == NULL) { error = EINVAL; goto cleanup; } } dumper.blockbuf = malloc(di->blocksize, M_DUMPER, M_WAITOK | M_ZERO); return (0); cleanup: (void)clear_dumper(td); return (error); } int clear_dumper(struct thread *td) { int error; error = priv_check(td, PRIV_SETDUMPER); if (error != 0) return (error); #ifdef NETDUMP netdump_mbuf_drain(); #endif #ifdef EKCD if (dumper.kdcrypto != NULL) { explicit_bzero(dumper.kdcrypto, sizeof(*dumper.kdcrypto) + dumper.kdcrypto->kdc_dumpkeysize); free(dumper.kdcrypto, M_EKCD); } #endif kerneldumpcomp_destroy(&dumper); if (dumper.blockbuf != NULL) { explicit_bzero(dumper.blockbuf, dumper.blocksize); free(dumper.blockbuf, M_DUMPER); } explicit_bzero(&dumper, sizeof(dumper)); dumpdevname[0] = '\0'; return (0); } static int dump_check_bounds(struct dumperinfo *di, off_t offset, size_t length) { if (di->mediasize > 0 && length != 0 && (offset < di->mediaoffset || offset - di->mediaoffset + length > di->mediasize)) { if (di->kdcomp != NULL && offset >= di->mediaoffset) { printf( "Compressed dump failed to fit in device boundaries.\n"); return (E2BIG); } printf("Attempt to write outside dump device boundaries.\n" "offset(%jd), mediaoffset(%jd), length(%ju), mediasize(%jd).\n", (intmax_t)offset, (intmax_t)di->mediaoffset, (uintmax_t)length, (intmax_t)di->mediasize); return (ENOSPC); } if (length % di->blocksize != 0) { printf("Attempt to write partial block of length %ju.\n", (uintmax_t)length); return (EINVAL); } if (offset % di->blocksize != 0) { printf("Attempt to write at unaligned offset %jd.\n", (intmax_t)offset); return (EINVAL); } return (0); } #ifdef EKCD static int dump_encrypt(struct kerneldumpcrypto *kdc, uint8_t *buf, size_t size) { switch (kdc->kdc_encryption) { case KERNELDUMP_ENC_AES_256_CBC: if (rijndael_blockEncrypt(&kdc->kdc_ci, &kdc->kdc_ki, buf, 8 * size, buf) <= 0) { return (EIO); } if (rijndael_cipherInit(&kdc->kdc_ci, MODE_CBC, buf + size - 16 /* IV size for AES-256-CBC */) <= 0) { return (EIO); } break; default: return (EINVAL); } return (0); } /* Encrypt data and call dumper. */ static int dump_encrypted_write(struct dumperinfo *di, void *virtual, vm_offset_t physical, off_t offset, size_t length) { static uint8_t buf[KERNELDUMP_BUFFER_SIZE]; struct kerneldumpcrypto *kdc; int error; size_t nbytes; kdc = di->kdcrypto; while (length > 0) { nbytes = MIN(length, sizeof(buf)); bcopy(virtual, buf, nbytes); if (dump_encrypt(kdc, buf, nbytes) != 0) return (EIO); error = dump_write(di, buf, physical, offset, nbytes); if (error != 0) return (error); offset += nbytes; virtual = (void *)((uint8_t *)virtual + nbytes); length -= nbytes; } return (0); } #endif /* EKCD */ static int kerneldumpcomp_write_cb(void *base, size_t length, off_t offset, void *arg) { struct dumperinfo *di; size_t resid, rlength; int error; di = arg; if (length % di->blocksize != 0) { /* * This must be the final write after flushing the compression * stream. Write as many full blocks as possible and stash the * residual data in the dumper's block buffer. It will be * padded and written in dump_finish(). */ rlength = rounddown(length, di->blocksize); if (rlength != 0) { error = _dump_append(di, base, 0, rlength); if (error != 0) return (error); } resid = length - rlength; memmove(di->blockbuf, (uint8_t *)base + rlength, resid); di->kdcomp->kdc_resid = resid; return (EAGAIN); } return (_dump_append(di, base, 0, length)); } /* * Write kernel dump headers at the beginning and end of the dump extent. * Write the kernel dump encryption key after the leading header if we were * configured to do so. */ static int dump_write_headers(struct dumperinfo *di, struct kerneldumpheader *kdh) { #ifdef EKCD struct kerneldumpcrypto *kdc; #endif void *buf, *key; size_t hdrsz; uint64_t extent; uint32_t keysize; int error; hdrsz = sizeof(*kdh); if (hdrsz > di->blocksize) return (ENOMEM); #ifdef EKCD kdc = di->kdcrypto; key = kdc->kdc_dumpkey; keysize = kerneldumpcrypto_dumpkeysize(kdc); #else key = NULL; keysize = 0; #endif /* * If the dump device has special handling for headers, let it take care * of writing them out. */ if (di->dumper_hdr != NULL) return (di->dumper_hdr(di, kdh, key, keysize)); if (hdrsz == di->blocksize) buf = kdh; else { buf = di->blockbuf; memset(buf, 0, di->blocksize); memcpy(buf, kdh, hdrsz); } extent = dtoh64(kdh->dumpextent); #ifdef EKCD if (kdc != NULL) { error = dump_write(di, kdc->kdc_dumpkey, 0, di->mediaoffset + di->mediasize - di->blocksize - extent - keysize, keysize); if (error != 0) return (error); } #endif error = dump_write(di, buf, 0, di->mediaoffset + di->mediasize - 2 * di->blocksize - extent - keysize, di->blocksize); if (error == 0) error = dump_write(di, buf, 0, di->mediaoffset + di->mediasize - di->blocksize, di->blocksize); return (error); } /* * Don't touch the first SIZEOF_METADATA bytes on the dump device. This is to * protect us from metadata and metadata from us. */ #define SIZEOF_METADATA (64 * 1024) /* * Do some preliminary setup for a kernel dump: initialize state for encryption, * if requested, and make sure that we have enough space on the dump device. * * We set things up so that the dump ends before the last sector of the dump * device, at which the trailing header is written. * * +-----------+------+-----+----------------------------+------+ * | | lhdr | key | ... kernel dump ... | thdr | * +-----------+------+-----+----------------------------+------+ * 1 blk opt <------- dump extent --------> 1 blk * * Dumps written using dump_append() start at the beginning of the extent. * Uncompressed dumps will use the entire extent, but compressed dumps typically * will not. The true length of the dump is recorded in the leading and trailing * headers once the dump has been completed. * * The dump device may provide a callback, in which case it will initialize * dumpoff and take care of laying out the headers. */ int dump_start(struct dumperinfo *di, struct kerneldumpheader *kdh) { uint64_t dumpextent, span; uint32_t keysize; int error; #ifdef EKCD error = kerneldumpcrypto_init(di->kdcrypto); if (error != 0) return (error); keysize = kerneldumpcrypto_dumpkeysize(di->kdcrypto); #else error = 0; keysize = 0; #endif if (di->dumper_start != NULL) { error = di->dumper_start(di); } else { dumpextent = dtoh64(kdh->dumpextent); span = SIZEOF_METADATA + dumpextent + 2 * di->blocksize + keysize; if (di->mediasize < span) { if (di->kdcomp == NULL) return (E2BIG); /* * We don't yet know how much space the compressed dump * will occupy, so try to use the whole swap partition * (minus the first 64KB) in the hope that the * compressed dump will fit. If that doesn't turn out to * be enough, the bounds checking in dump_write() * will catch us and cause the dump to fail. */ dumpextent = di->mediasize - span + dumpextent; kdh->dumpextent = htod64(dumpextent); } /* * The offset at which to begin writing the dump. */ di->dumpoff = di->mediaoffset + di->mediasize - di->blocksize - dumpextent; } di->origdumpoff = di->dumpoff; return (error); } static int _dump_append(struct dumperinfo *di, void *virtual, vm_offset_t physical, size_t length) { int error; #ifdef EKCD if (di->kdcrypto != NULL) error = dump_encrypted_write(di, virtual, physical, di->dumpoff, length); else #endif error = dump_write(di, virtual, physical, di->dumpoff, length); if (error == 0) di->dumpoff += length; return (error); } /* * Write to the dump device starting at dumpoff. When compression is enabled, * writes to the device will be performed using a callback that gets invoked * when the compression stream's output buffer is full. */ int dump_append(struct dumperinfo *di, void *virtual, vm_offset_t physical, size_t length) { void *buf; if (di->kdcomp != NULL) { /* Bounce through a buffer to avoid CRC errors. */ if (length > di->maxiosize) return (EINVAL); buf = di->kdcomp->kdc_buf; memmove(buf, virtual, length); return (compressor_write(di->kdcomp->kdc_stream, buf, length)); } return (_dump_append(di, virtual, physical, length)); } /* * Write to the dump device at the specified offset. */ int dump_write(struct dumperinfo *di, void *virtual, vm_offset_t physical, off_t offset, size_t length) { int error; error = dump_check_bounds(di, offset, length); if (error != 0) return (error); return (di->dumper(di->priv, virtual, physical, offset, length)); } /* * Perform kernel dump finalization: flush the compression stream, if necessary, * write the leading and trailing kernel dump headers now that we know the true * length of the dump, and optionally write the encryption key following the * leading header. */ int dump_finish(struct dumperinfo *di, struct kerneldumpheader *kdh) { int error; if (di->kdcomp != NULL) { error = compressor_flush(di->kdcomp->kdc_stream); if (error == EAGAIN) { /* We have residual data in di->blockbuf. */ error = dump_write(di, di->blockbuf, 0, di->dumpoff, di->blocksize); di->dumpoff += di->kdcomp->kdc_resid; di->kdcomp->kdc_resid = 0; } if (error != 0) return (error); /* * We now know the size of the compressed dump, so update the * header accordingly and recompute parity. */ kdh->dumplength = htod64(di->dumpoff - di->origdumpoff); kdh->parity = 0; kdh->parity = kerneldump_parity(kdh); compressor_reset(di->kdcomp->kdc_stream); } error = dump_write_headers(di, kdh); if (error != 0) return (error); (void)dump_write(di, NULL, 0, 0, 0); return (0); } void dump_init_header(const struct dumperinfo *di, struct kerneldumpheader *kdh, char *magic, uint32_t archver, uint64_t dumplen) { size_t dstsize; bzero(kdh, sizeof(*kdh)); strlcpy(kdh->magic, magic, sizeof(kdh->magic)); strlcpy(kdh->architecture, MACHINE_ARCH, sizeof(kdh->architecture)); kdh->version = htod32(KERNELDUMPVERSION); kdh->architectureversion = htod32(archver); kdh->dumplength = htod64(dumplen); kdh->dumpextent = kdh->dumplength; kdh->dumptime = htod64(time_second); #ifdef EKCD kdh->dumpkeysize = htod32(kerneldumpcrypto_dumpkeysize(di->kdcrypto)); #else kdh->dumpkeysize = 0; #endif kdh->blocksize = htod32(di->blocksize); strlcpy(kdh->hostname, prison0.pr_hostname, sizeof(kdh->hostname)); dstsize = sizeof(kdh->versionstring); if (strlcpy(kdh->versionstring, version, dstsize) >= dstsize) kdh->versionstring[dstsize - 2] = '\n'; if (panicstr != NULL) strlcpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring)); if (di->kdcomp != NULL) kdh->compression = di->kdcomp->kdc_format; kdh->parity = kerneldump_parity(kdh); } #ifdef DDB DB_SHOW_COMMAND(panic, db_show_panic) { if (panicstr == NULL) db_printf("panicstr not set\n"); else db_printf("panic: %s\n", panicstr); } #endif Index: stable/12/sys/kern/kern_timeout.c =================================================================== --- stable/12/sys/kern/kern_timeout.c (revision 355609) +++ stable/12/sys/kern/kern_timeout.c (revision 355610) @@ -1,1675 +1,1676 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_callout_profiling.h" #include "opt_ddb.h" #if defined(__arm__) #include "opt_timer.h" #endif #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #ifdef SMP #include #endif #ifndef NO_EVENTTIMERS DPCPU_DECLARE(sbintime_t, hardclocktime); #endif SDT_PROVIDER_DEFINE(callout_execute); SDT_PROBE_DEFINE1(callout_execute, , , callout__start, "struct callout *"); SDT_PROBE_DEFINE1(callout_execute, , , callout__end, "struct callout *"); #ifdef CALLOUT_PROFILING static int avg_depth; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0, "Average number of items examined per softclock call. Units = 1/1000"); static int avg_gcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0, "Average number of Giant callouts made per softclock call. Units = 1/1000"); static int avg_lockcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0, "Average number of lock callouts made per softclock call. Units = 1/1000"); static int avg_mpcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, "Average number of MP callouts made per softclock call. Units = 1/1000"); static int avg_depth_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0, "Average number of direct callouts examined per callout_process call. " "Units = 1/1000"); static int avg_lockcalls_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD, &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per " "callout_process call. Units = 1/1000"); static int avg_mpcalls_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir, 0, "Average number of MP direct callouts made per callout_process call. " "Units = 1/1000"); #endif static int ncallout; SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &ncallout, 0, "Number of entries in callwheel and size of timeout() preallocation"); #ifdef RSS static int pin_default_swi = 1; static int pin_pcpu_swi = 1; #else static int pin_default_swi = 0; static int pin_pcpu_swi = 0; #endif SYSCTL_INT(_kern, OID_AUTO, pin_default_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_default_swi, 0, "Pin the default (non-per-cpu) swi (shared with PCPU 0 swi)"); SYSCTL_INT(_kern, OID_AUTO, pin_pcpu_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_pcpu_swi, 0, "Pin the per-CPU swis (except PCPU 0, which is also default"); /* * TODO: * allocate more timeout table slots when table overflows. */ -u_int callwheelsize, callwheelmask; +static u_int __read_mostly callwheelsize; +static u_int __read_mostly callwheelmask; /* * The callout cpu exec entities represent informations necessary for * describing the state of callouts currently running on the CPU and the ones * necessary for migrating callouts to the new callout cpu. In particular, * the first entry of the array cc_exec_entity holds informations for callout * running in SWI thread context, while the second one holds informations * for callout running directly from hardware interrupt context. * The cached informations are very important for deferring migration when * the migrating callout is already running. */ struct cc_exec { struct callout *cc_curr; void (*cc_drain)(void *); #ifdef SMP void (*ce_migration_func)(void *); void *ce_migration_arg; int ce_migration_cpu; sbintime_t ce_migration_time; sbintime_t ce_migration_prec; #endif bool cc_cancel; bool cc_waiting; }; /* * There is one struct callout_cpu per cpu, holding all relevant * state for the callout processing thread on the individual CPU. */ struct callout_cpu { struct mtx_padalign cc_lock; struct cc_exec cc_exec_entity[2]; struct callout *cc_next; struct callout *cc_callout; struct callout_list *cc_callwheel; struct callout_tailq cc_expireq; struct callout_slist cc_callfree; sbintime_t cc_firstevent; sbintime_t cc_lastscan; void *cc_cookie; u_int cc_bucket; u_int cc_inited; char cc_ktr_event_name[20]; }; #define callout_migrating(c) ((c)->c_iflags & CALLOUT_DFRMIGRATION) #define cc_exec_curr(cc, dir) cc->cc_exec_entity[dir].cc_curr #define cc_exec_drain(cc, dir) cc->cc_exec_entity[dir].cc_drain #define cc_exec_next(cc) cc->cc_next #define cc_exec_cancel(cc, dir) cc->cc_exec_entity[dir].cc_cancel #define cc_exec_waiting(cc, dir) cc->cc_exec_entity[dir].cc_waiting #ifdef SMP #define cc_migration_func(cc, dir) cc->cc_exec_entity[dir].ce_migration_func #define cc_migration_arg(cc, dir) cc->cc_exec_entity[dir].ce_migration_arg #define cc_migration_cpu(cc, dir) cc->cc_exec_entity[dir].ce_migration_cpu #define cc_migration_time(cc, dir) cc->cc_exec_entity[dir].ce_migration_time #define cc_migration_prec(cc, dir) cc->cc_exec_entity[dir].ce_migration_prec struct callout_cpu cc_cpu[MAXCPU]; #define CPUBLOCK MAXCPU #define CC_CPU(cpu) (&cc_cpu[(cpu)]) #define CC_SELF() CC_CPU(PCPU_GET(cpuid)) #else struct callout_cpu cc_cpu; #define CC_CPU(cpu) &cc_cpu #define CC_SELF() &cc_cpu #endif #define CC_LOCK(cc) mtx_lock_spin(&(cc)->cc_lock) #define CC_UNLOCK(cc) mtx_unlock_spin(&(cc)->cc_lock) #define CC_LOCK_ASSERT(cc) mtx_assert(&(cc)->cc_lock, MA_OWNED) -static int timeout_cpu; +static int __read_mostly timeout_cpu; static void callout_cpu_init(struct callout_cpu *cc, int cpu); static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, #ifdef CALLOUT_PROFILING int *mpcalls, int *lockcalls, int *gcalls, #endif int direct); static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures"); /** * Locked by cc_lock: * cc_curr - If a callout is in progress, it is cc_curr. * If cc_curr is non-NULL, threads waiting in * callout_drain() will be woken up as soon as the * relevant callout completes. * cc_cancel - Changing to 1 with both callout_lock and cc_lock held * guarantees that the current callout will not run. * The softclock() function sets this to 0 before it * drops callout_lock to acquire c_lock, and it calls * the handler only if curr_cancelled is still 0 after * cc_lock is successfully acquired. * cc_waiting - If a thread is waiting in callout_drain(), then * callout_wait is nonzero. Set only when * cc_curr is non-NULL. */ /* * Resets the execution entity tied to a specific callout cpu. */ static void cc_cce_cleanup(struct callout_cpu *cc, int direct) { cc_exec_curr(cc, direct) = NULL; cc_exec_cancel(cc, direct) = false; cc_exec_waiting(cc, direct) = false; #ifdef SMP cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif } /* * Checks if migration is requested by a specific callout cpu. */ static int cc_cce_migrating(struct callout_cpu *cc, int direct) { #ifdef SMP return (cc_migration_cpu(cc, direct) != CPUBLOCK); #else return (0); #endif } /* * Kernel low level callwheel initialization * called on the BSP during kernel startup. */ static void callout_callwheel_init(void *dummy) { struct callout_cpu *cc; /* * Calculate the size of the callout wheel and the preallocated * timeout() structures. * XXX: Clip callout to result of previous function of maxusers * maximum 384. This is still huge, but acceptable. */ memset(CC_CPU(curcpu), 0, sizeof(cc_cpu)); ncallout = imin(16 + maxproc + maxfiles, 18508); TUNABLE_INT_FETCH("kern.ncallout", &ncallout); /* * Calculate callout wheel size, should be next power of two higher * than 'ncallout'. */ callwheelsize = 1 << fls(ncallout); callwheelmask = callwheelsize - 1; /* * Fetch whether we're pinning the swi's or not. */ TUNABLE_INT_FETCH("kern.pin_default_swi", &pin_default_swi); TUNABLE_INT_FETCH("kern.pin_pcpu_swi", &pin_pcpu_swi); /* * Only BSP handles timeout(9) and receives a preallocation. * * XXX: Once all timeout(9) consumers are converted this can * be removed. */ timeout_cpu = PCPU_GET(cpuid); cc = CC_CPU(timeout_cpu); cc->cc_callout = malloc(ncallout * sizeof(struct callout), M_CALLOUT, M_WAITOK); callout_cpu_init(cc, timeout_cpu); } SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL); /* * Initialize the per-cpu callout structures. */ static void callout_cpu_init(struct callout_cpu *cc, int cpu) { struct callout *c; int i; mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE); SLIST_INIT(&cc->cc_callfree); cc->cc_inited = 1; cc->cc_callwheel = malloc_domainset(sizeof(struct callout_list) * callwheelsize, M_CALLOUT, DOMAINSET_PREF(pcpu_find(cpu)->pc_domain), M_WAITOK); for (i = 0; i < callwheelsize; i++) LIST_INIT(&cc->cc_callwheel[i]); TAILQ_INIT(&cc->cc_expireq); cc->cc_firstevent = SBT_MAX; for (i = 0; i < 2; i++) cc_cce_cleanup(cc, i); snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name), "callwheel cpu %d", cpu); if (cc->cc_callout == NULL) /* Only BSP handles timeout(9) */ return; for (i = 0; i < ncallout; i++) { c = &cc->cc_callout[i]; callout_init(c, 0); c->c_iflags = CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } } #ifdef SMP /* * Switches the cpu tied to a specific callout. * The function expects a locked incoming callout cpu and returns with * locked outcoming callout cpu. */ static struct callout_cpu * callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu) { struct callout_cpu *new_cc; MPASS(c != NULL && cc != NULL); CC_LOCK_ASSERT(cc); /* * Avoid interrupts and preemption firing after the callout cpu * is blocked in order to avoid deadlocks as the new thread * may be willing to acquire the callout cpu lock. */ c->c_cpu = CPUBLOCK; spinlock_enter(); CC_UNLOCK(cc); new_cc = CC_CPU(new_cpu); CC_LOCK(new_cc); spinlock_exit(); c->c_cpu = new_cpu; return (new_cc); } #endif /* * Start standard softclock thread. */ static void start_softclock(void *dummy) { struct callout_cpu *cc; char name[MAXCOMLEN]; #ifdef SMP int cpu; struct intr_event *ie; #endif cc = CC_CPU(timeout_cpu); snprintf(name, sizeof(name), "clock (%d)", timeout_cpu); if (swi_add(&clk_intr_event, name, softclock, cc, SWI_CLOCK, INTR_MPSAFE, &cc->cc_cookie)) panic("died while creating standard software ithreads"); if (pin_default_swi && (intr_event_bind(clk_intr_event, timeout_cpu) != 0)) { printf("%s: timeout clock couldn't be pinned to cpu %d\n", __func__, timeout_cpu); } #ifdef SMP CPU_FOREACH(cpu) { if (cpu == timeout_cpu) continue; cc = CC_CPU(cpu); cc->cc_callout = NULL; /* Only BSP handles timeout(9). */ callout_cpu_init(cc, cpu); snprintf(name, sizeof(name), "clock (%d)", cpu); ie = NULL; if (swi_add(&ie, name, softclock, cc, SWI_CLOCK, INTR_MPSAFE, &cc->cc_cookie)) panic("died while creating standard software ithreads"); if (pin_pcpu_swi && (intr_event_bind(ie, cpu) != 0)) { printf("%s: per-cpu clock couldn't be pinned to " "cpu %d\n", __func__, cpu); } } #endif } SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL); #define CC_HASH_SHIFT 8 static inline u_int callout_hash(sbintime_t sbt) { return (sbt >> (32 - CC_HASH_SHIFT)); } static inline u_int callout_get_bucket(sbintime_t sbt) { return (callout_hash(sbt) & callwheelmask); } void callout_process(sbintime_t now) { struct callout *tmp, *tmpn; struct callout_cpu *cc; struct callout_list *sc; sbintime_t first, last, max, tmp_max; uint32_t lookahead; u_int firstb, lastb, nowb; #ifdef CALLOUT_PROFILING int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0; #endif cc = CC_SELF(); mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); /* Compute the buckets of the last scan and present times. */ firstb = callout_hash(cc->cc_lastscan); cc->cc_lastscan = now; nowb = callout_hash(now); /* Compute the last bucket and minimum time of the bucket after it. */ if (nowb == firstb) lookahead = (SBT_1S / 16); else if (nowb - firstb == 1) lookahead = (SBT_1S / 8); else lookahead = (SBT_1S / 2); first = last = now; first += (lookahead / 2); last += lookahead; last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT)); lastb = callout_hash(last) - 1; max = last; /* * Check if we wrapped around the entire wheel from the last scan. * In case, we need to scan entirely the wheel for pending callouts. */ if (lastb - firstb >= callwheelsize) { lastb = firstb + callwheelsize - 1; if (nowb - firstb >= callwheelsize) nowb = lastb; } /* Iterate callwheel from firstb to nowb and then up to lastb. */ do { sc = &cc->cc_callwheel[firstb & callwheelmask]; tmp = LIST_FIRST(sc); while (tmp != NULL) { /* Run the callout if present time within allowed. */ if (tmp->c_time <= now) { /* * Consumer told us the callout may be run * directly from hardware interrupt context. */ if (tmp->c_iflags & CALLOUT_DIRECT) { #ifdef CALLOUT_PROFILING ++depth_dir; #endif cc_exec_next(cc) = LIST_NEXT(tmp, c_links.le); cc->cc_bucket = firstb & callwheelmask; LIST_REMOVE(tmp, c_links.le); softclock_call_cc(tmp, cc, #ifdef CALLOUT_PROFILING &mpcalls_dir, &lockcalls_dir, NULL, #endif 1); tmp = cc_exec_next(cc); cc_exec_next(cc) = NULL; } else { tmpn = LIST_NEXT(tmp, c_links.le); LIST_REMOVE(tmp, c_links.le); TAILQ_INSERT_TAIL(&cc->cc_expireq, tmp, c_links.tqe); tmp->c_iflags |= CALLOUT_PROCESSED; tmp = tmpn; } continue; } /* Skip events from distant future. */ if (tmp->c_time >= max) goto next; /* * Event minimal time is bigger than present maximal * time, so it cannot be aggregated. */ if (tmp->c_time > last) { lastb = nowb; goto next; } /* Update first and last time, respecting this event. */ if (tmp->c_time < first) first = tmp->c_time; tmp_max = tmp->c_time + tmp->c_precision; if (tmp_max < last) last = tmp_max; next: tmp = LIST_NEXT(tmp, c_links.le); } /* Proceed with the next bucket. */ firstb++; /* * Stop if we looked after present time and found * some event we can't execute at now. * Stop if we looked far enough into the future. */ } while (((int)(firstb - lastb)) <= 0); cc->cc_firstevent = last; #ifndef NO_EVENTTIMERS cpu_new_callout(curcpu, last, first); #endif #ifdef CALLOUT_PROFILING avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8; avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8; avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8; #endif mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); /* * swi_sched acquires the thread lock, so we don't want to call it * with cc_lock held; incorrect locking order. */ if (!TAILQ_EMPTY(&cc->cc_expireq)) swi_sched(cc->cc_cookie, 0); } static struct callout_cpu * callout_lock(struct callout *c) { struct callout_cpu *cc; int cpu; for (;;) { cpu = c->c_cpu; #ifdef SMP if (cpu == CPUBLOCK) { while (c->c_cpu == CPUBLOCK) cpu_spinwait(); continue; } #endif cc = CC_CPU(cpu); CC_LOCK(cc); if (cpu == c->c_cpu) break; CC_UNLOCK(cc); } return (cc); } static void callout_cc_add(struct callout *c, struct callout_cpu *cc, sbintime_t sbt, sbintime_t precision, void (*func)(void *), void *arg, int cpu, int flags) { int bucket; CC_LOCK_ASSERT(cc); if (sbt < cc->cc_lastscan) sbt = cc->cc_lastscan; c->c_arg = arg; c->c_iflags |= CALLOUT_PENDING; c->c_iflags &= ~CALLOUT_PROCESSED; c->c_flags |= CALLOUT_ACTIVE; if (flags & C_DIRECT_EXEC) c->c_iflags |= CALLOUT_DIRECT; c->c_func = func; c->c_time = sbt; c->c_precision = precision; bucket = callout_get_bucket(c->c_time); CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x", c, (int)(c->c_precision >> 32), (u_int)(c->c_precision & 0xffffffff)); LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le); if (cc->cc_bucket == bucket) cc_exec_next(cc) = c; #ifndef NO_EVENTTIMERS /* * Inform the eventtimers(4) subsystem there's a new callout * that has been inserted, but only if really required. */ if (SBT_MAX - c->c_time < c->c_precision) c->c_precision = SBT_MAX - c->c_time; sbt = c->c_time + c->c_precision; if (sbt < cc->cc_firstevent) { cc->cc_firstevent = sbt; cpu_new_callout(cpu, sbt, c->c_time); } #endif } static void callout_cc_del(struct callout *c, struct callout_cpu *cc) { if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) == 0) return; c->c_func = NULL; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, #ifdef CALLOUT_PROFILING int *mpcalls, int *lockcalls, int *gcalls, #endif int direct) { struct rm_priotracker tracker; void (*c_func)(void *); void *c_arg; struct lock_class *class; struct lock_object *c_lock; uintptr_t lock_status; int c_iflags; #ifdef SMP struct callout_cpu *new_cc; void (*new_func)(void *); void *new_arg; int flags, new_cpu; sbintime_t new_prec, new_time; #endif #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbintime_t sbt1, sbt2; struct timespec ts2; static sbintime_t maxdt = 2 * SBT_1MS; /* 2 msec */ static timeout_t *lastfunc; #endif KASSERT((c->c_iflags & CALLOUT_PENDING) == CALLOUT_PENDING, ("softclock_call_cc: pend %p %x", c, c->c_iflags)); KASSERT((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE, ("softclock_call_cc: act %p %x", c, c->c_flags)); class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL; lock_status = 0; if (c->c_flags & CALLOUT_SHAREDLOCK) { if (class == &lock_class_rm) lock_status = (uintptr_t)&tracker; else lock_status = 1; } c_lock = c->c_lock; c_func = c->c_func; c_arg = c->c_arg; c_iflags = c->c_iflags; if (c->c_iflags & CALLOUT_LOCAL_ALLOC) c->c_iflags = CALLOUT_LOCAL_ALLOC; else c->c_iflags &= ~CALLOUT_PENDING; cc_exec_curr(cc, direct) = c; cc_exec_cancel(cc, direct) = false; cc_exec_drain(cc, direct) = NULL; CC_UNLOCK(cc); if (c_lock != NULL) { class->lc_lock(c_lock, lock_status); /* * The callout may have been cancelled * while we switched locks. */ if (cc_exec_cancel(cc, direct)) { class->lc_unlock(c_lock); goto skip; } /* The callout cannot be stopped now. */ cc_exec_cancel(cc, direct) = true; if (c_lock == &Giant.lock_object) { #ifdef CALLOUT_PROFILING (*gcalls)++; #endif CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p", c, c_func, c_arg); } else { #ifdef CALLOUT_PROFILING (*lockcalls)++; #endif CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p", c, c_func, c_arg); } } else { #ifdef CALLOUT_PROFILING (*mpcalls)++; #endif CTR3(KTR_CALLOUT, "callout %p func %p arg %p", c, c_func, c_arg); } KTR_STATE3(KTR_SCHED, "callout", cc->cc_ktr_event_name, "running", "func:%p", c_func, "arg:%p", c_arg, "direct:%d", direct); #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbt1 = sbinuptime(); #endif THREAD_NO_SLEEPING(); SDT_PROBE1(callout_execute, , , callout__start, c); c_func(c_arg); SDT_PROBE1(callout_execute, , , callout__end, c); THREAD_SLEEPING_OK(); #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbt2 = sbinuptime(); sbt2 -= sbt1; if (sbt2 > maxdt) { if (lastfunc != c_func || sbt2 > maxdt * 2) { ts2 = sbttots(sbt2); printf( "Expensive timeout(9) function: %p(%p) %jd.%09ld s\n", c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec); } maxdt = sbt2; lastfunc = c_func; } #endif KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle"); CTR1(KTR_CALLOUT, "callout %p finished", c); if ((c_iflags & CALLOUT_RETURNUNLOCKED) == 0) class->lc_unlock(c_lock); skip: CC_LOCK(cc); KASSERT(cc_exec_curr(cc, direct) == c, ("mishandled cc_curr")); cc_exec_curr(cc, direct) = NULL; if (cc_exec_drain(cc, direct)) { void (*drain)(void *); drain = cc_exec_drain(cc, direct); cc_exec_drain(cc, direct) = NULL; CC_UNLOCK(cc); drain(c_arg); CC_LOCK(cc); } if (cc_exec_waiting(cc, direct)) { /* * There is someone waiting for the * callout to complete. * If the callout was scheduled for * migration just cancel it. */ if (cc_cce_migrating(cc, direct)) { cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not * destroyed but that is not easy. */ c->c_iflags &= ~CALLOUT_DFRMIGRATION; } cc_exec_waiting(cc, direct) = false; CC_UNLOCK(cc); wakeup(&cc_exec_waiting(cc, direct)); CC_LOCK(cc); } else if (cc_cce_migrating(cc, direct)) { KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0, ("Migrating legacy callout %p", c)); #ifdef SMP /* * If the callout was scheduled for * migration just perform it now. */ new_cpu = cc_migration_cpu(cc, direct); new_time = cc_migration_time(cc, direct); new_prec = cc_migration_prec(cc, direct); new_func = cc_migration_func(cc, direct); new_arg = cc_migration_arg(cc, direct); cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not destroyed * but that is not easy. * * As first thing, handle deferred callout stops. */ if (!callout_migrating(c)) { CTR3(KTR_CALLOUT, "deferred cancelled %p func %p arg %p", c, new_func, new_arg); callout_cc_del(c, cc); return; } c->c_iflags &= ~CALLOUT_DFRMIGRATION; new_cc = callout_cpu_switch(c, cc, new_cpu); flags = (direct) ? C_DIRECT_EXEC : 0; callout_cc_add(c, new_cc, new_time, new_prec, new_func, new_arg, new_cpu, flags); CC_UNLOCK(new_cc); CC_LOCK(cc); #else panic("migration should not happen"); #endif } /* * If the current callout is locally allocated (from * timeout(9)) then put it on the freelist. * * Note: we need to check the cached copy of c_iflags because * if it was not local, then it's not safe to deref the * callout pointer. */ KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0 || c->c_iflags == CALLOUT_LOCAL_ALLOC, ("corrupted callout")); if (c_iflags & CALLOUT_LOCAL_ALLOC) callout_cc_del(c, cc); } /* * The callout mechanism is based on the work of Adam M. Costello and * George Varghese, published in a technical report entitled "Redesigning * the BSD Callout and Timer Facilities" and modified slightly for inclusion * in FreeBSD by Justin T. Gibbs. The original work on the data structures * used in this implementation was published by G. Varghese and T. Lauck in * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for * the Efficient Implementation of a Timer Facility" in the Proceedings of * the 11th ACM Annual Symposium on Operating Systems Principles, * Austin, Texas Nov 1987. */ /* * Software (low priority) clock interrupt. * Run periodic events from timeout queue. */ void softclock(void *arg) { struct callout_cpu *cc; struct callout *c; #ifdef CALLOUT_PROFILING int depth = 0, gcalls = 0, lockcalls = 0, mpcalls = 0; #endif cc = (struct callout_cpu *)arg; CC_LOCK(cc); while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); softclock_call_cc(c, cc, #ifdef CALLOUT_PROFILING &mpcalls, &lockcalls, &gcalls, #endif 0); #ifdef CALLOUT_PROFILING ++depth; #endif } #ifdef CALLOUT_PROFILING avg_depth += (depth * 1000 - avg_depth) >> 8; avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8; avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8; avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8; #endif CC_UNLOCK(cc); } /* * timeout -- * Execute a function after a specified length of time. * * untimeout -- * Cancel previous timeout function call. * * callout_handle_init -- * Initialize a handle so that using it with untimeout is benign. * * See AT&T BCI Driver Reference Manual for specification. This * implementation differs from that one in that although an * identification value is returned from timeout, the original * arguments to timeout as well as the identifier are used to * identify entries for untimeout. */ struct callout_handle timeout(timeout_t *ftn, void *arg, int to_ticks) { struct callout_cpu *cc; struct callout *new; struct callout_handle handle; cc = CC_CPU(timeout_cpu); CC_LOCK(cc); /* Fill in the next free callout structure. */ new = SLIST_FIRST(&cc->cc_callfree); if (new == NULL) /* XXX Attempt to malloc first */ panic("timeout table full"); SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle); callout_reset(new, to_ticks, ftn, arg); handle.callout = new; CC_UNLOCK(cc); return (handle); } void untimeout(timeout_t *ftn, void *arg, struct callout_handle handle) { struct callout_cpu *cc; /* * Check for a handle that was initialized * by callout_handle_init, but never used * for a real timeout. */ if (handle.callout == NULL) return; cc = callout_lock(handle.callout); if (handle.callout->c_func == ftn && handle.callout->c_arg == arg) callout_stop(handle.callout); CC_UNLOCK(cc); } void callout_handle_init(struct callout_handle *handle) { handle->callout = NULL; } void callout_when(sbintime_t sbt, sbintime_t precision, int flags, sbintime_t *res, sbintime_t *prec_res) { sbintime_t to_sbt, to_pr; if ((flags & (C_ABSOLUTE | C_PRECALC)) != 0) { *res = sbt; *prec_res = precision; return; } if ((flags & C_HARDCLOCK) != 0 && sbt < tick_sbt) sbt = tick_sbt; if ((flags & C_HARDCLOCK) != 0 || #ifdef NO_EVENTTIMERS sbt >= sbt_timethreshold) { to_sbt = getsbinuptime(); /* Add safety belt for the case of hz > 1000. */ to_sbt += tc_tick_sbt - tick_sbt; #else sbt >= sbt_tickthreshold) { /* * Obtain the time of the last hardclock() call on * this CPU directly from the kern_clocksource.c. * This value is per-CPU, but it is equal for all * active ones. */ #ifdef __LP64__ to_sbt = DPCPU_GET(hardclocktime); #else spinlock_enter(); to_sbt = DPCPU_GET(hardclocktime); spinlock_exit(); #endif #endif if (cold && to_sbt == 0) to_sbt = sbinuptime(); if ((flags & C_HARDCLOCK) == 0) to_sbt += tick_sbt; } else to_sbt = sbinuptime(); if (SBT_MAX - to_sbt < sbt) to_sbt = SBT_MAX; else to_sbt += sbt; *res = to_sbt; to_pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp : sbt >> C_PRELGET(flags)); *prec_res = to_pr > precision ? to_pr : precision; } /* * New interface; clients allocate their own callout structures. * * callout_reset() - establish or change a timeout * callout_stop() - disestablish a timeout * callout_init() - initialize a callout structure so that it can * safely be passed to callout_reset() and callout_stop() * * defines three convenience macros: * * callout_active() - returns truth if callout has not been stopped, * drained, or deactivated since the last time the callout was * reset. * callout_pending() - returns truth if callout is still waiting for timeout * callout_deactivate() - marks the callout as having been serviced */ int callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t prec, void (*ftn)(void *), void *arg, int cpu, int flags) { sbintime_t to_sbt, precision; struct callout_cpu *cc; int cancelled, direct; int ignore_cpu=0; cancelled = 0; if (cpu == -1) { ignore_cpu = 1; } else if ((cpu >= MAXCPU) || ((CC_CPU(cpu))->cc_inited == 0)) { /* Invalid CPU spec */ panic("Invalid CPU in callout %d", cpu); } callout_when(sbt, prec, flags, &to_sbt, &precision); /* * This flag used to be added by callout_cc_add, but the * first time you call this we could end up with the * wrong direct flag if we don't do it before we add. */ if (flags & C_DIRECT_EXEC) { direct = 1; } else { direct = 0; } KASSERT(!direct || c->c_lock == NULL, ("%s: direct callout %p has lock", __func__, c)); cc = callout_lock(c); /* * Don't allow migration of pre-allocated callouts lest they * become unbalanced or handle the case where the user does * not care. */ if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) || ignore_cpu) { cpu = c->c_cpu; } if (cc_exec_curr(cc, direct) == c) { /* * We're being asked to reschedule a callout which is * currently in progress. If there is a lock then we * can cancel the callout if it has not really started. */ if (c->c_lock != NULL && !cc_exec_cancel(cc, direct)) cancelled = cc_exec_cancel(cc, direct) = true; if (cc_exec_waiting(cc, direct) || cc_exec_drain(cc, direct)) { /* * Someone has called callout_drain to kill this * callout. Don't reschedule. */ CTR4(KTR_CALLOUT, "%s %p func %p arg %p", cancelled ? "cancelled" : "failed to cancel", c, c->c_func, c->c_arg); CC_UNLOCK(cc); return (cancelled); } #ifdef SMP if (callout_migrating(c)) { /* * This only occurs when a second callout_reset_sbt_on * is made after a previous one moved it into * deferred migration (below). Note we do *not* change * the prev_cpu even though the previous target may * be different. */ cc_migration_cpu(cc, direct) = cpu; cc_migration_time(cc, direct) = to_sbt; cc_migration_prec(cc, direct) = precision; cc_migration_func(cc, direct) = ftn; cc_migration_arg(cc, direct) = arg; cancelled = 1; CC_UNLOCK(cc); return (cancelled); } #endif } if (c->c_iflags & CALLOUT_PENDING) { if ((c->c_iflags & CALLOUT_PROCESSED) == 0) { if (cc_exec_next(cc) == c) cc_exec_next(cc) = LIST_NEXT(c, c_links.le); LIST_REMOVE(c, c_links.le); } else { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); } cancelled = 1; c->c_iflags &= ~ CALLOUT_PENDING; c->c_flags &= ~ CALLOUT_ACTIVE; } #ifdef SMP /* * If the callout must migrate try to perform it immediately. * If the callout is currently running, just defer the migration * to a more appropriate moment. */ if (c->c_cpu != cpu) { if (cc_exec_curr(cc, direct) == c) { /* * Pending will have been removed since we are * actually executing the callout on another * CPU. That callout should be waiting on the * lock the caller holds. If we set both * active/and/pending after we return and the * lock on the executing callout proceeds, it * will then see pending is true and return. * At the return from the actual callout execution * the migration will occur in softclock_call_cc * and this new callout will be placed on the * new CPU via a call to callout_cpu_switch() which * will get the lock on the right CPU followed * by a call callout_cc_add() which will add it there. * (see above in softclock_call_cc()). */ cc_migration_cpu(cc, direct) = cpu; cc_migration_time(cc, direct) = to_sbt; cc_migration_prec(cc, direct) = precision; cc_migration_func(cc, direct) = ftn; cc_migration_arg(cc, direct) = arg; c->c_iflags |= (CALLOUT_DFRMIGRATION | CALLOUT_PENDING); c->c_flags |= CALLOUT_ACTIVE; CTR6(KTR_CALLOUT, "migration of %p func %p arg %p in %d.%08x to %u deferred", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), (u_int)(to_sbt & 0xffffffff), cpu); CC_UNLOCK(cc); return (cancelled); } cc = callout_cpu_switch(c, cc, cpu); } #endif callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags); CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x", cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), (u_int)(to_sbt & 0xffffffff)); CC_UNLOCK(cc); return (cancelled); } /* * Common idioms that can be optimized in the future. */ int callout_schedule_on(struct callout *c, int to_ticks, int cpu) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, cpu); } int callout_schedule(struct callout *c, int to_ticks) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, c->c_cpu); } int _callout_stop_safe(struct callout *c, int flags, void (*drain)(void *)) { struct callout_cpu *cc, *old_cc; struct lock_class *class; int direct, sq_locked, use_lock; int cancelled, not_on_a_list; if ((flags & CS_DRAIN) != 0) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, c->c_lock, "calling %s", __func__); /* * Some old subsystems don't hold Giant while running a callout_stop(), * so just discard this check for the moment. */ if ((flags & CS_DRAIN) == 0 && c->c_lock != NULL) { if (c->c_lock == &Giant.lock_object) use_lock = mtx_owned(&Giant); else { use_lock = 1; class = LOCK_CLASS(c->c_lock); class->lc_assert(c->c_lock, LA_XLOCKED); } } else use_lock = 0; if (c->c_iflags & CALLOUT_DIRECT) { direct = 1; } else { direct = 0; } sq_locked = 0; old_cc = NULL; again: cc = callout_lock(c); if ((c->c_iflags & (CALLOUT_DFRMIGRATION | CALLOUT_PENDING)) == (CALLOUT_DFRMIGRATION | CALLOUT_PENDING) && ((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE)) { /* * Special case where this slipped in while we * were migrating *as* the callout is about to * execute. The caller probably holds the lock * the callout wants. * * Get rid of the migration first. Then set * the flag that tells this code *not* to * try to remove it from any lists (its not * on one yet). When the callout wheel runs, * it will ignore this callout. */ c->c_iflags &= ~CALLOUT_PENDING; c->c_flags &= ~CALLOUT_ACTIVE; not_on_a_list = 1; } else { not_on_a_list = 0; } /* * If the callout was migrating while the callout cpu lock was * dropped, just drop the sleepqueue lock and check the states * again. */ if (sq_locked != 0 && cc != old_cc) { #ifdef SMP CC_UNLOCK(cc); sleepq_release(&cc_exec_waiting(old_cc, direct)); sq_locked = 0; old_cc = NULL; goto again; #else panic("migration should not happen"); #endif } /* * If the callout is running, try to stop it or drain it. */ if (cc_exec_curr(cc, direct) == c) { /* * Succeed we to stop it or not, we must clear the * active flag - this is what API users expect. If we're * draining and the callout is currently executing, first wait * until it finishes. */ if ((flags & CS_DRAIN) == 0) c->c_flags &= ~CALLOUT_ACTIVE; if ((flags & CS_DRAIN) != 0) { /* * The current callout is running (or just * about to run) and blocking is allowed, so * just wait for the current invocation to * finish. */ while (cc_exec_curr(cc, direct) == c) { /* * Use direct calls to sleepqueue interface * instead of cv/msleep in order to avoid * a LOR between cc_lock and sleepqueue * chain spinlocks. This piece of code * emulates a msleep_spin() call actually. * * If we already have the sleepqueue chain * locked, then we can safely block. If we * don't already have it locked, however, * we have to drop the cc_lock to lock * it. This opens several races, so we * restart at the beginning once we have * both locks. If nothing has changed, then * we will end up back here with sq_locked * set. */ if (!sq_locked) { CC_UNLOCK(cc); sleepq_lock( &cc_exec_waiting(cc, direct)); sq_locked = 1; old_cc = cc; goto again; } /* * Migration could be cancelled here, but * as long as it is still not sure when it * will be packed up, just let softclock() * take care of it. */ cc_exec_waiting(cc, direct) = true; DROP_GIANT(); CC_UNLOCK(cc); sleepq_add( &cc_exec_waiting(cc, direct), &cc->cc_lock.lock_object, "codrain", SLEEPQ_SLEEP, 0); sleepq_wait( &cc_exec_waiting(cc, direct), 0); sq_locked = 0; old_cc = NULL; /* Reacquire locks previously released. */ PICKUP_GIANT(); CC_LOCK(cc); } c->c_flags &= ~CALLOUT_ACTIVE; } else if (use_lock && !cc_exec_cancel(cc, direct) && (drain == NULL)) { /* * The current callout is waiting for its * lock which we hold. Cancel the callout * and return. After our caller drops the * lock, the callout will be skipped in * softclock(). This *only* works with a * callout_stop() *not* callout_drain() or * callout_async_drain(). */ cc_exec_cancel(cc, direct) = true; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); KASSERT(!cc_cce_migrating(cc, direct), ("callout wrongly scheduled for migration")); if (callout_migrating(c)) { c->c_iflags &= ~CALLOUT_DFRMIGRATION; #ifdef SMP cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif } CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain locked")); return (1); } else if (callout_migrating(c)) { /* * The callout is currently being serviced * and the "next" callout is scheduled at * its completion with a migration. We remove * the migration flag so it *won't* get rescheduled, * but we can't stop the one thats running so * we return 0. */ c->c_iflags &= ~CALLOUT_DFRMIGRATION; #ifdef SMP /* * We can't call cc_cce_cleanup here since * if we do it will remove .ce_curr and * its still running. This will prevent a * reschedule of the callout when the * execution completes. */ cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p", c, c->c_func, c->c_arg); if (drain) { cc_exec_drain(cc, direct) = drain; } CC_UNLOCK(cc); return ((flags & CS_EXECUTING) != 0); } CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); if (drain) { cc_exec_drain(cc, direct) = drain; } KASSERT(!sq_locked, ("sleepqueue chain still locked")); cancelled = ((flags & CS_EXECUTING) != 0); } else cancelled = 1; if (sq_locked) sleepq_release(&cc_exec_waiting(cc, direct)); if ((c->c_iflags & CALLOUT_PENDING) == 0) { CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); /* * For not scheduled and not executing callout return * negative value. */ if (cc_exec_curr(cc, direct) != c) cancelled = -1; CC_UNLOCK(cc); return (cancelled); } c->c_iflags &= ~CALLOUT_PENDING; c->c_flags &= ~CALLOUT_ACTIVE; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); if (not_on_a_list == 0) { if ((c->c_iflags & CALLOUT_PROCESSED) == 0) { if (cc_exec_next(cc) == c) cc_exec_next(cc) = LIST_NEXT(c, c_links.le); LIST_REMOVE(c, c_links.le); } else { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); } } callout_cc_del(c, cc); CC_UNLOCK(cc); return (cancelled); } void callout_init(struct callout *c, int mpsafe) { bzero(c, sizeof *c); if (mpsafe) { c->c_lock = NULL; c->c_iflags = CALLOUT_RETURNUNLOCKED; } else { c->c_lock = &Giant.lock_object; c->c_iflags = 0; } c->c_cpu = timeout_cpu; } void _callout_init_lock(struct callout *c, struct lock_object *lock, int flags) { bzero(c, sizeof *c); c->c_lock = lock; KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0, ("callout_init_lock: bad flags %d", flags)); KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0, ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock")); KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags & (LC_SPINLOCK | LC_SLEEPABLE)), ("%s: invalid lock class", __func__)); c->c_iflags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK); c->c_cpu = timeout_cpu; } #ifdef APM_FIXUP_CALLTODO /* * Adjust the kernel calltodo timeout list. This routine is used after * an APM resume to recalculate the calltodo timer list values with the * number of hz's we have been sleeping. The next hardclock() will detect * that there are fired timers and run softclock() to execute them. * * Please note, I have not done an exhaustive analysis of what code this * might break. I am motivated to have my select()'s and alarm()'s that * have expired during suspend firing upon resume so that the applications * which set the timer can do the maintanence the timer was for as close * as possible to the originally intended time. Testing this code for a * week showed that resuming from a suspend resulted in 22 to 25 timers * firing, which seemed independent on whether the suspend was 2 hours or * 2 days. Your milage may vary. - Ken Key */ void adjust_timeout_calltodo(struct timeval *time_change) { struct callout *p; unsigned long delta_ticks; /* * How many ticks were we asleep? * (stolen from tvtohz()). */ /* Don't do anything */ if (time_change->tv_sec < 0) return; else if (time_change->tv_sec <= LONG_MAX / 1000000) delta_ticks = howmany(time_change->tv_sec * 1000000 + time_change->tv_usec, tick) + 1; else if (time_change->tv_sec <= LONG_MAX / hz) delta_ticks = time_change->tv_sec * hz + howmany(time_change->tv_usec, tick) + 1; else delta_ticks = LONG_MAX; if (delta_ticks > INT_MAX) delta_ticks = INT_MAX; /* * Now rip through the timer calltodo list looking for timers * to expire. */ /* don't collide with softclock() */ CC_LOCK(cc); for (p = calltodo.c_next; p != NULL; p = p->c_next) { p->c_time -= delta_ticks; /* Break if the timer had more time on it than delta_ticks */ if (p->c_time > 0) break; /* take back the ticks the timer didn't use (p->c_time <= 0) */ delta_ticks = -p->c_time; } CC_UNLOCK(cc); return; } #endif /* APM_FIXUP_CALLTODO */ static int flssbt(sbintime_t sbt) { sbt += (uint64_t)sbt >> 1; if (sizeof(long) >= sizeof(sbintime_t)) return (flsl(sbt)); if (sbt >= SBT_1S) return (flsl(((uint64_t)sbt) >> 32) + 32); return (flsl(sbt)); } /* * Dump immediate statistic snapshot of the scheduled callouts. */ static int sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS) { struct callout *tmp; struct callout_cpu *cc; struct callout_list *sc; sbintime_t maxpr, maxt, medpr, medt, now, spr, st, t; int ct[64], cpr[64], ccpbk[32]; int error, val, i, count, tcum, pcum, maxc, c, medc; #ifdef SMP int cpu; #endif val = 0; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); count = maxc = 0; st = spr = maxt = maxpr = 0; bzero(ccpbk, sizeof(ccpbk)); bzero(ct, sizeof(ct)); bzero(cpr, sizeof(cpr)); now = sbinuptime(); #ifdef SMP CPU_FOREACH(cpu) { cc = CC_CPU(cpu); #else cc = CC_CPU(timeout_cpu); #endif CC_LOCK(cc); for (i = 0; i < callwheelsize; i++) { sc = &cc->cc_callwheel[i]; c = 0; LIST_FOREACH(tmp, sc, c_links.le) { c++; t = tmp->c_time - now; if (t < 0) t = 0; st += t / SBT_1US; spr += tmp->c_precision / SBT_1US; if (t > maxt) maxt = t; if (tmp->c_precision > maxpr) maxpr = tmp->c_precision; ct[flssbt(t)]++; cpr[flssbt(tmp->c_precision)]++; } if (c > maxc) maxc = c; ccpbk[fls(c + c / 2)]++; count += c; } CC_UNLOCK(cc); #ifdef SMP } #endif for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++) tcum += ct[i]; medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++) pcum += cpr[i]; medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; for (i = 0, c = 0; i < 32 && c < count / 2; i++) c += ccpbk[i]; medc = (i >= 2) ? (1 << (i - 2)) : 0; printf("Scheduled callouts statistic snapshot:\n"); printf(" Callouts: %6d Buckets: %6d*%-3d Bucket size: 0.%06ds\n", count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT); printf(" C/Bk: med %5d avg %6d.%06jd max %6d\n", medc, count / callwheelsize / mp_ncpus, (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000, maxc); printf(" Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32, (st / count) / 1000000, (st / count) % 1000000, maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32); printf(" Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32, (spr / count) / 1000000, (spr / count) % 1000000, maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32); printf(" Distribution: \tbuckets\t time\t tcum\t" " prec\t pcum\n"); for (i = 0, tcum = pcum = 0; i < 64; i++) { if (ct[i] == 0 && cpr[i] == 0) continue; t = (i != 0) ? (((sbintime_t)1) << (i - 1)) : 0; tcum += ct[i]; pcum += cpr[i]; printf(" %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n", t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32, i - 1 - (32 - CC_HASH_SHIFT), ct[i], tcum, cpr[i], pcum); } return (error); } SYSCTL_PROC(_kern, OID_AUTO, callout_stat, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_callout_stat, "I", "Dump immediate statistic snapshot of the scheduled callouts"); #ifdef DDB static void _show_callout(struct callout *c) { db_printf("callout %p\n", c); #define C_DB_PRINTF(f, e) db_printf(" %s = " f "\n", #e, c->e); db_printf(" &c_links = %p\n", &(c->c_links)); C_DB_PRINTF("%" PRId64, c_time); C_DB_PRINTF("%" PRId64, c_precision); C_DB_PRINTF("%p", c_arg); C_DB_PRINTF("%p", c_func); C_DB_PRINTF("%p", c_lock); C_DB_PRINTF("%#x", c_flags); C_DB_PRINTF("%#x", c_iflags); C_DB_PRINTF("%d", c_cpu); #undef C_DB_PRINTF } DB_SHOW_COMMAND(callout, db_show_callout) { if (!have_addr) { db_printf("usage: show callout \n"); return; } _show_callout((struct callout *)addr); } #endif /* DDB */ Index: stable/12/sys/kern/sched_4bsd.c =================================================================== --- stable/12/sys/kern/sched_4bsd.c (revision 355609) +++ stable/12/sys/kern/sched_4bsd.c (revision 355610) @@ -1,1768 +1,1768 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #ifdef KDTRACE_HOOKS #include -int dtrace_vtime_active; +int __read_mostly dtrace_vtime_active; dtrace_vtime_switch_func_t dtrace_vtime_switch_func; #endif /* * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in * the range 100-256 Hz (approximately). */ #define ESTCPULIM(e) \ min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \ RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1) #ifdef SMP #define INVERSE_ESTCPU_WEIGHT (8 * smp_cpus) #else #define INVERSE_ESTCPU_WEIGHT 8 /* 1 / (priorities per estcpu level). */ #endif #define NICE_WEIGHT 1 /* Priorities per nice level. */ #define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX))) /* * The schedulable entity that runs a context. * This is an extension to the thread structure and is tailored to * the requirements of this scheduler. * All fields are protected by the scheduler lock. */ struct td_sched { fixpt_t ts_pctcpu; /* %cpu during p_swtime. */ u_int ts_estcpu; /* Estimated cpu utilization. */ int ts_cpticks; /* Ticks of cpu time. */ int ts_slptime; /* Seconds !RUNNING. */ int ts_slice; /* Remaining part of time slice. */ int ts_flags; struct runq *ts_runq; /* runq the thread is currently on */ #ifdef KTR char ts_name[TS_NAME_LEN]; #endif }; /* flags kept in td_flags */ #define TDF_DIDRUN TDF_SCHED0 /* thread actually ran. */ #define TDF_BOUND TDF_SCHED1 /* Bound to one CPU. */ #define TDF_SLICEEND TDF_SCHED2 /* Thread time slice is over. */ /* flags kept in ts_flags */ #define TSF_AFFINITY 0x0001 /* Has a non-"full" CPU set. */ #define SKE_RUNQ_PCPU(ts) \ ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq) #define THREAD_CAN_SCHED(td, cpu) \ CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask) _Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <= sizeof(struct thread0_storage), "increase struct thread0_storage.t0st_sched size"); static struct mtx sched_lock; static int realstathz = 127; /* stathz is sometimes 0 and run off of hz. */ static int sched_tdcnt; /* Total runnable threads in the system. */ static int sched_slice = 12; /* Thread run time before rescheduling. */ static void setup_runqs(void); static void schedcpu(void); static void schedcpu_thread(void); static void sched_priority(struct thread *td, u_char prio); static void sched_setup(void *dummy); static void maybe_resched(struct thread *td); static void updatepri(struct thread *td); static void resetpriority(struct thread *td); static void resetpriority_thread(struct thread *td); #ifdef SMP static int sched_pickcpu(struct thread *td); static int forward_wakeup(int cpunum); static void kick_other_cpu(int pri, int cpuid); #endif static struct kproc_desc sched_kp = { "schedcpu", schedcpu_thread, NULL }; SYSINIT(schedcpu, SI_SUB_LAST, SI_ORDER_FIRST, kproc_start, &sched_kp); SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL); static void sched_initticks(void *dummy); SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL); /* * Global run queue. */ static struct runq runq; #ifdef SMP /* * Per-CPU run queues */ static struct runq runq_pcpu[MAXCPU]; long runq_length[MAXCPU]; static cpuset_t idle_cpus_mask; #endif struct pcpuidlestat { u_int idlecalls; u_int oldidlecalls; }; DPCPU_DEFINE_STATIC(struct pcpuidlestat, idlestat); static void setup_runqs(void) { #ifdef SMP int i; for (i = 0; i < MAXCPU; ++i) runq_init(&runq_pcpu[i]); #endif runq_init(&runq); } static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val, period; period = 1000000 / realstathz; new_val = period * sched_slice; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val <= 0) return (EINVAL); sched_slice = imax(1, (new_val + period / 2) / period); hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); return (0); } SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler"); SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0, "Scheduler name"); SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, sysctl_kern_quantum, "I", "Quantum for timeshare threads in microseconds"); SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "Quantum for timeshare threads in stathz ticks"); #ifdef SMP /* Enable forwarding of wakeups to all other cpus */ static SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL, "Kernel SMP"); static int runq_fuzz = 1; SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, ""); static int forward_wakeup_enabled = 1; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW, &forward_wakeup_enabled, 0, "Forwarding of wakeup to idle CPUs"); static int forward_wakeups_requested = 0; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD, &forward_wakeups_requested, 0, "Requests for Forwarding of wakeup to idle CPUs"); static int forward_wakeups_delivered = 0; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD, &forward_wakeups_delivered, 0, "Completed Forwarding of wakeup to idle CPUs"); static int forward_wakeup_use_mask = 1; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW, &forward_wakeup_use_mask, 0, "Use the mask of idle cpus"); static int forward_wakeup_use_loop = 0; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW, &forward_wakeup_use_loop, 0, "Use a loop to find idle cpus"); #endif #if 0 static int sched_followon = 0; SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW, &sched_followon, 0, "allow threads to share a quantum"); #endif SDT_PROVIDER_DEFINE(sched); SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *", "struct proc *", "uint8_t"); SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *", "struct proc *", "void *"); SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *", "struct proc *", "void *", "int"); SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *", "struct proc *", "uint8_t", "struct thread *"); SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int"); SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *", "struct proc *"); SDT_PROBE_DEFINE(sched, , , on__cpu); SDT_PROBE_DEFINE(sched, , , remain__cpu); SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *", "struct proc *"); static __inline void sched_load_add(void) { sched_tdcnt++; KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt); SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt); } static __inline void sched_load_rem(void) { sched_tdcnt--; KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt); SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt); } /* * Arrange to reschedule if necessary, taking the priorities and * schedulers into account. */ static void maybe_resched(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority < curthread->td_priority) curthread->td_flags |= TDF_NEEDRESCHED; } /* * This function is called when a thread is about to be put on run queue * because it has been made runnable or its priority has been adjusted. It * determines if the new thread should preempt the current thread. If so, * it sets td_owepreempt to request a preemption. */ int maybe_preempt(struct thread *td) { #ifdef PREEMPTION struct thread *ctd; int cpri, pri; /* * The new thread should not preempt the current thread if any of the * following conditions are true: * * - The kernel is in the throes of crashing (panicstr). * - The current thread has a higher (numerically lower) or * equivalent priority. Note that this prevents curthread from * trying to preempt to itself. * - The current thread has an inhibitor set or is in the process of * exiting. In this case, the current thread is about to switch * out anyways, so there's no point in preempting. If we did, * the current thread would not be properly resumed as well, so * just avoid that whole landmine. * - If the new thread's priority is not a realtime priority and * the current thread's priority is not an idle priority and * FULL_PREEMPTION is disabled. * * If all of these conditions are false, but the current thread is in * a nested critical section, then we have to defer the preemption * until we exit the critical section. Otherwise, switch immediately * to the new thread. */ ctd = curthread; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("maybe_preempt: trying to run inhibited thread")); pri = td->td_priority; cpri = ctd->td_priority; if (panicstr != NULL || pri >= cpri /* || dumping */ || TD_IS_INHIBITED(ctd)) return (0); #ifndef FULL_PREEMPTION if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE) return (0); #endif CTR0(KTR_PROC, "maybe_preempt: scheduling preemption"); ctd->td_owepreempt = 1; return (1); #else return (0); #endif } /* * Constants for digital decay and forget: * 90% of (ts_estcpu) usage in 5 * loadav time * 95% of (ts_pctcpu) usage in 60 seconds (load insensitive) * Note that, as ps(1) mentions, this can let percentages * total over 100% (I've seen 137.9% for 3 processes). * * Note that schedclock() updates ts_estcpu and p_cpticks asynchronously. * * We wish to decay away 90% of ts_estcpu in (5 * loadavg) seconds. * That is, the system wants to compute a value of decay such * that the following for loop: * for (i = 0; i < (5 * loadavg); i++) * ts_estcpu *= decay; * will compute * ts_estcpu *= 0.1; * for all values of loadavg: * * Mathematically this loop can be expressed by saying: * decay ** (5 * loadavg) ~= .1 * * The system computes decay as: * decay = (2 * loadavg) / (2 * loadavg + 1) * * We wish to prove that the system's computation of decay * will always fulfill the equation: * decay ** (5 * loadavg) ~= .1 * * If we compute b as: * b = 2 * loadavg * then * decay = b / (b + 1) * * We now need to prove two things: * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) * * Facts: * For x close to zero, exp(x) =~ 1 + x, since * exp(x) = 0! + x**1/1! + x**2/2! + ... . * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. * For x close to zero, ln(1+x) =~ x, since * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). * ln(.1) =~ -2.30 * * Proof of (1): * Solve (factor)**(power) =~ .1 given power (5*loadav): * solving for factor, * ln(factor) =~ (-2.30/5*loadav), or * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED * * Proof of (2): * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): * solving for power, * power*ln(b/(b+1)) =~ -2.30, or * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED * * Actual power values for the implemented algorithm are as follows: * loadav: 1 2 3 4 * power: 5.68 10.32 14.94 19.55 */ /* calculations for digital decay to forget 90% of usage in 5*loadav sec */ #define loadfactor(loadav) (2 * (loadav)) #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE)) /* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_UINT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); /* * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT). * * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used: * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits). * * If you don't want to bother with the faster/more-accurate formula, you * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate * (more general) method of calculating the %age of CPU used by a process. */ #define CCPU_SHIFT 11 /* * Recompute process priorities, every hz ticks. * MP-safe, called without the Giant mutex. */ /* ARGSUSED */ static void schedcpu(void) { fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); struct thread *td; struct proc *p; struct td_sched *ts; int awake; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } FOREACH_THREAD_IN_PROC(p, td) { awake = 0; ts = td_get_sched(td); thread_lock(td); /* * Increment sleep time (if sleeping). We * ignore overflow, as above. */ /* * The td_sched slptimes are not touched in wakeup * because the thread may not HAVE everything in * memory? XXX I think this is out of date. */ if (TD_ON_RUNQ(td)) { awake = 1; td->td_flags &= ~TDF_DIDRUN; } else if (TD_IS_RUNNING(td)) { awake = 1; /* Do not clear TDF_DIDRUN */ } else if (td->td_flags & TDF_DIDRUN) { awake = 1; td->td_flags &= ~TDF_DIDRUN; } /* * ts_pctcpu is only for ps and ttyinfo(). */ ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT; /* * If the td_sched has been idle the entire second, * stop recalculating its priority until * it wakes up. */ if (ts->ts_cpticks != 0) { #if (FSHIFT >= CCPU_SHIFT) ts->ts_pctcpu += (realstathz == 100) ? ((fixpt_t) ts->ts_cpticks) << (FSHIFT - CCPU_SHIFT) : 100 * (((fixpt_t) ts->ts_cpticks) << (FSHIFT - CCPU_SHIFT)) / realstathz; #else ts->ts_pctcpu += ((FSCALE - ccpu) * (ts->ts_cpticks * FSCALE / realstathz)) >> FSHIFT; #endif ts->ts_cpticks = 0; } /* * If there are ANY running threads in this process, * then don't count it as sleeping. * XXX: this is broken. */ if (awake) { if (ts->ts_slptime > 1) { /* * In an ideal world, this should not * happen, because whoever woke us * up from the long sleep should have * unwound the slptime and reset our * priority before we run at the stale * priority. Should KASSERT at some * point when all the cases are fixed. */ updatepri(td); } ts->ts_slptime = 0; } else ts->ts_slptime++; if (ts->ts_slptime > 1) { thread_unlock(td); continue; } ts->ts_estcpu = decay_cpu(loadfac, ts->ts_estcpu); resetpriority(td); resetpriority_thread(td); thread_unlock(td); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } /* * Main loop for a kthread that executes schedcpu once a second. */ static void schedcpu_thread(void) { for (;;) { schedcpu(); pause("-", hz); } } /* * Recalculate the priority of a process after it has slept for a while. * For all load averages >= 1 and max ts_estcpu of 255, sleeping for at * least six times the loadfactor will decay ts_estcpu to zero. */ static void updatepri(struct thread *td) { struct td_sched *ts; fixpt_t loadfac; unsigned int newcpu; ts = td_get_sched(td); loadfac = loadfactor(averunnable.ldavg[0]); if (ts->ts_slptime > 5 * loadfac) ts->ts_estcpu = 0; else { newcpu = ts->ts_estcpu; ts->ts_slptime--; /* was incremented in schedcpu() */ while (newcpu && --ts->ts_slptime) newcpu = decay_cpu(loadfac, newcpu); ts->ts_estcpu = newcpu; } } /* * Compute the priority of a process when running in user mode. * Arrange to reschedule if the resulting priority is better * than that of the current process. */ static void resetpriority(struct thread *td) { u_int newpriority; if (td->td_pri_class != PRI_TIMESHARE) return; newpriority = PUSER + td_get_sched(td)->ts_estcpu / INVERSE_ESTCPU_WEIGHT + NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN); newpriority = min(max(newpriority, PRI_MIN_TIMESHARE), PRI_MAX_TIMESHARE); sched_user_prio(td, newpriority); } /* * Update the thread's priority when the associated process's user * priority changes. */ static void resetpriority_thread(struct thread *td) { /* Only change threads with a time sharing user priority. */ if (td->td_priority < PRI_MIN_TIMESHARE || td->td_priority > PRI_MAX_TIMESHARE) return; /* XXX the whole needresched thing is broken, but not silly. */ maybe_resched(td); sched_prio(td, td->td_user_pri); } /* ARGSUSED */ static void sched_setup(void *dummy) { setup_runqs(); /* Account for thread0. */ sched_load_add(); } /* * This routine determines time constants after stathz and hz are setup. */ static void sched_initticks(void *dummy) { realstathz = stathz ? stathz : hz; sched_slice = realstathz / 10; /* ~100ms */ hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); } /* External interfaces start here */ /* * Very early in the boot some setup of scheduler-specific * parts of proc0 and of some scheduler resources needs to be done. * Called from: * proc0_init() */ void schedinit(void) { /* * Set up the scheduler specific parts of thread0. */ thread0.td_lock = &sched_lock; td_get_sched(&thread0)->ts_slice = sched_slice; mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE); } int sched_runnable(void) { #ifdef SMP return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]); #else return runq_check(&runq); #endif } int sched_rr_interval(void) { /* Convert sched_slice from stathz to hz. */ return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz)); } /* * We adjust the priority of the current process. The priority of a * process gets worse as it accumulates CPU time. The cpu usage * estimator (ts_estcpu) is increased here. resetpriority() will * compute a different priority each time ts_estcpu increases by * INVERSE_ESTCPU_WEIGHT (until PRI_MAX_TIMESHARE is reached). The * cpu usage estimator ramps up quite quickly when the process is * running (linearly), and decays away exponentially, at a rate which * is proportionally slower when the system is busy. The basic * principle is that the system will 90% forget that the process used * a lot of CPU time in 5 * loadav seconds. This causes the system to * favor processes which haven't run much recently, and to round-robin * among other processes. */ void sched_clock(struct thread *td) { struct pcpuidlestat *stat; struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); ts->ts_cpticks++; ts->ts_estcpu = ESTCPULIM(ts->ts_estcpu + 1); if ((ts->ts_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) { resetpriority(td); resetpriority_thread(td); } /* * Force a context switch if the current thread has used up a full * time slice (default is 100ms). */ if (!TD_IS_IDLETHREAD(td) && --ts->ts_slice <= 0) { ts->ts_slice = sched_slice; td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND; } stat = DPCPU_PTR(idlestat); stat->oldidlecalls = stat->idlecalls; stat->idlecalls = 0; } /* * Charge child's scheduling CPU usage to parent. */ void sched_exit(struct proc *p, struct thread *td) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "proc exit", "prio:%d", td->td_priority); PROC_LOCK_ASSERT(p, MA_OWNED); sched_exit_thread(FIRST_THREAD_IN_PROC(p), td); } void sched_exit_thread(struct thread *td, struct thread *child) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "exit", "prio:%d", child->td_priority); thread_lock(td); td_get_sched(td)->ts_estcpu = ESTCPULIM(td_get_sched(td)->ts_estcpu + td_get_sched(child)->ts_estcpu); thread_unlock(td); thread_lock(child); if ((child->td_flags & TDF_NOLOAD) == 0) sched_load_rem(); thread_unlock(child); } void sched_fork(struct thread *td, struct thread *childtd) { sched_fork_thread(td, childtd); } void sched_fork_thread(struct thread *td, struct thread *childtd) { struct td_sched *ts, *tsc; childtd->td_oncpu = NOCPU; childtd->td_lastcpu = NOCPU; childtd->td_lock = &sched_lock; childtd->td_cpuset = cpuset_ref(td->td_cpuset); childtd->td_domain.dr_policy = td->td_cpuset->cs_domain; childtd->td_priority = childtd->td_base_pri; ts = td_get_sched(childtd); bzero(ts, sizeof(*ts)); tsc = td_get_sched(td); ts->ts_estcpu = tsc->ts_estcpu; ts->ts_flags |= (tsc->ts_flags & TSF_AFFINITY); ts->ts_slice = 1; } void sched_nice(struct proc *p, int nice) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); resetpriority(td); resetpriority_thread(td); thread_unlock(td); } } void sched_class(struct thread *td, int class) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_pri_class = class; } /* * Adjust the priority of a thread. */ static void sched_priority(struct thread *td, u_char prio) { KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "priority change", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(curthread)); SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio); if (td != curthread && prio > td->td_priority) { KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread), "lend prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio, curthread); } THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; td->td_priority = prio; if (TD_ON_RUNQ(td) && td->td_rqindex != (prio / RQ_PPQ)) { sched_rem(td); sched_add(td, SRQ_BORING); } } /* * Update a thread's priority when it is lent another thread's * priority. */ void sched_lend_prio(struct thread *td, u_char prio) { td->td_flags |= TDF_BORROWING; sched_priority(td, prio); } /* * Restore a thread's priority when priority propagation is * over. The prio argument is the minimum priority the thread * needs to have to satisfy other possible priority lending * requests. If the thread's regulary priority is less * important than prio the thread will keep a priority boost * of prio. */ void sched_unlend_prio(struct thread *td, u_char prio) { u_char base_pri; if (td->td_base_pri >= PRI_MIN_TIMESHARE && td->td_base_pri <= PRI_MAX_TIMESHARE) base_pri = td->td_user_pri; else base_pri = td->td_base_pri; if (prio >= base_pri) { td->td_flags &= ~TDF_BORROWING; sched_prio(td, base_pri); } else sched_lend_prio(td, prio); } void sched_prio(struct thread *td, u_char prio) { u_char oldprio; /* First, update the base priority. */ td->td_base_pri = prio; /* * If the thread is borrowing another thread's priority, don't ever * lower the priority. */ if (td->td_flags & TDF_BORROWING && td->td_priority < prio) return; /* Change the real priority. */ oldprio = td->td_priority; sched_priority(td, prio); /* * If the thread is on a turnstile, then let the turnstile update * its state. */ if (TD_ON_LOCK(td) && oldprio != prio) turnstile_adjust(td, oldprio); } void sched_user_prio(struct thread *td, u_char prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_base_user_pri = prio; if (td->td_lend_user_pri <= prio) return; td->td_user_pri = prio; } void sched_lend_user_prio(struct thread *td, u_char prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_lend_user_pri = prio; td->td_user_pri = min(prio, td->td_base_user_pri); if (td->td_priority > td->td_user_pri) sched_prio(td, td->td_user_pri); else if (td->td_priority != td->td_user_pri) td->td_flags |= TDF_NEEDRESCHED; } void sched_sleep(struct thread *td, int pri) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_slptick = ticks; td_get_sched(td)->ts_slptime = 0; if (pri != 0 && PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_prio(td, pri); if (TD_IS_SUSPENDED(td) || pri >= PSOCK) td->td_flags |= TDF_CANSWAP; } void sched_switch(struct thread *td, struct thread *newtd, int flags) { struct mtx *tmtx; struct td_sched *ts; struct proc *p; int preempted; tmtx = NULL; ts = td_get_sched(td); p = td->td_proc; THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Switch to the sched lock to fix things up and pick * a new thread. * Block the td_lock in order to avoid breaking the critical path. */ if (td->td_lock != &sched_lock) { mtx_lock_spin(&sched_lock); tmtx = thread_lock_block(td); } if ((td->td_flags & TDF_NOLOAD) == 0) sched_load_rem(); td->td_lastcpu = td->td_oncpu; preempted = (td->td_flags & TDF_SLICEEND) == 0 && (flags & SW_PREEMPT) != 0; td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND); td->td_owepreempt = 0; td->td_oncpu = NOCPU; /* * At the last moment, if this thread is still marked RUNNING, * then put it back on the run queue as it has not been suspended * or stopped or any thing else similar. We never put the idle * threads on the run queue, however. */ if (td->td_flags & TDF_IDLETD) { TD_SET_CAN_RUN(td); #ifdef SMP CPU_CLR(PCPU_GET(cpuid), &idle_cpus_mask); #endif } else { if (TD_IS_RUNNING(td)) { /* Put us back on the run queue. */ sched_add(td, preempted ? SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_OURSELF|SRQ_YIELDING); } } if (newtd) { /* * The thread we are about to run needs to be counted * as if it had been added to the run queue and selected. * It came from: * * A preemption * * An upcall * * A followon */ KASSERT((newtd->td_inhibitors == 0), ("trying to run inhibited thread")); newtd->td_flags |= TDF_DIDRUN; TD_SET_RUNNING(newtd); if ((newtd->td_flags & TDF_NOLOAD) == 0) sched_load_add(); } else { newtd = choosethread(); MPASS(newtd->td_lock == &sched_lock); } #if (KTR_COMPILE & KTR_SCHED) != 0 if (TD_IS_IDLETHREAD(td)) KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle", "prio:%d", td->td_priority); else KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td), "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg, "lockname:\"%s\"", td->td_lockname); #endif if (td != newtd) { #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc); /* I feel sleepy */ lock_profile_release_lock(&sched_lock.lock_object); #ifdef KDTRACE_HOOKS /* * If DTrace has set the active vtime enum to anything * other than INACTIVE (0), then it should have set the * function to call. */ if (dtrace_vtime_active) (*dtrace_vtime_switch_func)(newtd); #endif cpu_switch(td, newtd, tmtx != NULL ? tmtx : td->td_lock); lock_profile_obtain_lock_success(&sched_lock.lock_object, 0, 0, __FILE__, __LINE__); /* * Where am I? What year is it? * We are in the same thread that went to sleep above, * but any amount of time may have passed. All our context * will still be available as will local variables. * PCPU values however may have changed as we may have * changed CPU so don't trust cached values of them. * New threads will go to fork_exit() instead of here * so if you change things here you may need to change * things there too. * * If the thread above was exiting it will never wake * up again here, so either it has saved everything it * needed to, or the thread_wait() or wait() will * need to reap it. */ SDT_PROBE0(sched, , , on__cpu); #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); #endif } else SDT_PROBE0(sched, , , remain__cpu); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); #ifdef SMP if (td->td_flags & TDF_IDLETD) CPU_SET(PCPU_GET(cpuid), &idle_cpus_mask); #endif sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); MPASS(td->td_lock == &sched_lock); } void sched_wakeup(struct thread *td) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); td->td_flags &= ~TDF_CANSWAP; if (ts->ts_slptime > 1) { updatepri(td); resetpriority(td); } td->td_slptick = 0; ts->ts_slptime = 0; ts->ts_slice = sched_slice; sched_add(td, SRQ_BORING); } #ifdef SMP static int forward_wakeup(int cpunum) { struct pcpu *pc; cpuset_t dontuse, map, map2; u_int id, me; int iscpuset; mtx_assert(&sched_lock, MA_OWNED); CTR0(KTR_RUNQ, "forward_wakeup()"); if ((!forward_wakeup_enabled) || (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0)) return (0); if (!smp_started || panicstr) return (0); forward_wakeups_requested++; /* * Check the idle mask we received against what we calculated * before in the old version. */ me = PCPU_GET(cpuid); /* Don't bother if we should be doing it ourself. */ if (CPU_ISSET(me, &idle_cpus_mask) && (cpunum == NOCPU || me == cpunum)) return (0); CPU_SETOF(me, &dontuse); CPU_OR(&dontuse, &stopped_cpus); CPU_OR(&dontuse, &hlt_cpus_mask); CPU_ZERO(&map2); if (forward_wakeup_use_loop) { STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { id = pc->pc_cpuid; if (!CPU_ISSET(id, &dontuse) && pc->pc_curthread == pc->pc_idlethread) { CPU_SET(id, &map2); } } } if (forward_wakeup_use_mask) { map = idle_cpus_mask; CPU_NAND(&map, &dontuse); /* If they are both on, compare and use loop if different. */ if (forward_wakeup_use_loop) { if (CPU_CMP(&map, &map2)) { printf("map != map2, loop method preferred\n"); map = map2; } } } else { map = map2; } /* If we only allow a specific CPU, then mask off all the others. */ if (cpunum != NOCPU) { KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum.")); iscpuset = CPU_ISSET(cpunum, &map); if (iscpuset == 0) CPU_ZERO(&map); else CPU_SETOF(cpunum, &map); } if (!CPU_EMPTY(&map)) { forward_wakeups_delivered++; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { id = pc->pc_cpuid; if (!CPU_ISSET(id, &map)) continue; if (cpu_idle_wakeup(pc->pc_cpuid)) CPU_CLR(id, &map); } if (!CPU_EMPTY(&map)) ipi_selected(map, IPI_AST); return (1); } if (cpunum == NOCPU) printf("forward_wakeup: Idle processor not found\n"); return (0); } static void kick_other_cpu(int pri, int cpuid) { struct pcpu *pcpu; int cpri; pcpu = pcpu_find(cpuid); if (CPU_ISSET(cpuid, &idle_cpus_mask)) { forward_wakeups_delivered++; if (!cpu_idle_wakeup(cpuid)) ipi_cpu(cpuid, IPI_AST); return; } cpri = pcpu->pc_curthread->td_priority; if (pri >= cpri) return; #if defined(IPI_PREEMPTION) && defined(PREEMPTION) #if !defined(FULL_PREEMPTION) if (pri <= PRI_MAX_ITHD) #endif /* ! FULL_PREEMPTION */ { ipi_cpu(cpuid, IPI_PREEMPT); return; } #endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */ pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED; ipi_cpu(cpuid, IPI_AST); return; } #endif /* SMP */ #ifdef SMP static int sched_pickcpu(struct thread *td) { int best, cpu; mtx_assert(&sched_lock, MA_OWNED); if (td->td_lastcpu != NOCPU && THREAD_CAN_SCHED(td, td->td_lastcpu)) best = td->td_lastcpu; else best = NOCPU; CPU_FOREACH(cpu) { if (!THREAD_CAN_SCHED(td, cpu)) continue; if (best == NOCPU) best = cpu; else if (runq_length[cpu] < runq_length[best]) best = cpu; } KASSERT(best != NOCPU, ("no valid CPUs")); return (best); } #endif void sched_add(struct thread *td, int flags) #ifdef SMP { cpuset_t tidlemsk; struct td_sched *ts; u_int cpu, cpuid; int forwarded = 0; int single_cpu = 0; ts = td_get_sched(td); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), ("sched_add: bad thread state")); KASSERT(td->td_flags & TDF_INMEM, ("sched_add: thread swapped out")); KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup", KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, flags & SRQ_PREEMPTED); /* * Now that the thread is moving to the run-queue, set the lock * to the scheduler's lock. */ if (td->td_lock != &sched_lock) { mtx_lock_spin(&sched_lock); thread_lock_set(td, &sched_lock); } TD_SET_RUNQ(td); /* * If SMP is started and the thread is pinned or otherwise limited to * a specific set of CPUs, queue the thread to a per-CPU run queue. * Otherwise, queue the thread to the global run queue. * * If SMP has not yet been started we must use the global run queue * as per-CPU state may not be initialized yet and we may crash if we * try to access the per-CPU run queues. */ if (smp_started && (td->td_pinned != 0 || td->td_flags & TDF_BOUND || ts->ts_flags & TSF_AFFINITY)) { if (td->td_pinned != 0) cpu = td->td_lastcpu; else if (td->td_flags & TDF_BOUND) { /* Find CPU from bound runq. */ KASSERT(SKE_RUNQ_PCPU(ts), ("sched_add: bound td_sched not on cpu runq")); cpu = ts->ts_runq - &runq_pcpu[0]; } else /* Find a valid CPU for our cpuset */ cpu = sched_pickcpu(td); ts->ts_runq = &runq_pcpu[cpu]; single_cpu = 1; CTR3(KTR_RUNQ, "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td, cpu); } else { CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts, td); cpu = NOCPU; ts->ts_runq = &runq; } if ((td->td_flags & TDF_NOLOAD) == 0) sched_load_add(); runq_add(ts->ts_runq, td, flags); if (cpu != NOCPU) runq_length[cpu]++; cpuid = PCPU_GET(cpuid); if (single_cpu && cpu != cpuid) { kick_other_cpu(td->td_priority, cpu); } else { if (!single_cpu) { tidlemsk = idle_cpus_mask; CPU_NAND(&tidlemsk, &hlt_cpus_mask); CPU_CLR(cpuid, &tidlemsk); if (!CPU_ISSET(cpuid, &idle_cpus_mask) && ((flags & SRQ_INTR) == 0) && !CPU_EMPTY(&tidlemsk)) forwarded = forward_wakeup(cpu); } if (!forwarded) { if (!maybe_preempt(td)) maybe_resched(td); } } } #else /* SMP */ { struct td_sched *ts; ts = td_get_sched(td); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), ("sched_add: bad thread state")); KASSERT(td->td_flags & TDF_INMEM, ("sched_add: thread swapped out")); KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup", KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, flags & SRQ_PREEMPTED); /* * Now that the thread is moving to the run-queue, set the lock * to the scheduler's lock. */ if (td->td_lock != &sched_lock) { mtx_lock_spin(&sched_lock); thread_lock_set(td, &sched_lock); } TD_SET_RUNQ(td); CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td); ts->ts_runq = &runq; if ((td->td_flags & TDF_NOLOAD) == 0) sched_load_add(); runq_add(ts->ts_runq, td, flags); if (!maybe_preempt(td)) maybe_resched(td); } #endif /* SMP */ void sched_rem(struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); KASSERT(td->td_flags & TDF_INMEM, ("sched_rem: thread swapped out")); KASSERT(TD_ON_RUNQ(td), ("sched_rem: thread not on run queue")); mtx_assert(&sched_lock, MA_OWNED); KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq rem", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL); if ((td->td_flags & TDF_NOLOAD) == 0) sched_load_rem(); #ifdef SMP if (ts->ts_runq != &runq) runq_length[ts->ts_runq - runq_pcpu]--; #endif runq_remove(ts->ts_runq, td); TD_SET_CAN_RUN(td); } /* * Select threads to run. Note that running threads still consume a * slot. */ struct thread * sched_choose(void) { struct thread *td; struct runq *rq; mtx_assert(&sched_lock, MA_OWNED); #ifdef SMP struct thread *tdcpu; rq = &runq; td = runq_choose_fuzz(&runq, runq_fuzz); tdcpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]); if (td == NULL || (tdcpu != NULL && tdcpu->td_priority < td->td_priority)) { CTR2(KTR_RUNQ, "choosing td %p from pcpu runq %d", tdcpu, PCPU_GET(cpuid)); td = tdcpu; rq = &runq_pcpu[PCPU_GET(cpuid)]; } else { CTR1(KTR_RUNQ, "choosing td_sched %p from main runq", td); } #else rq = &runq; td = runq_choose(&runq); #endif if (td) { #ifdef SMP if (td == tdcpu) runq_length[PCPU_GET(cpuid)]--; #endif runq_remove(rq, td); td->td_flags |= TDF_DIDRUN; KASSERT(td->td_flags & TDF_INMEM, ("sched_choose: thread swapped out")); return (td); } return (PCPU_GET(idlethread)); } void sched_preempt(struct thread *td) { SDT_PROBE2(sched, , , surrender, td, td->td_proc); thread_lock(td); if (td->td_critnest > 1) td->td_owepreempt = 1; else mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, NULL); thread_unlock(td); } void sched_userret_slowpath(struct thread *td) { thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; thread_unlock(td); } void sched_bind(struct thread *td, int cpu) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED); KASSERT(td == curthread, ("sched_bind: can only bind curthread")); ts = td_get_sched(td); td->td_flags |= TDF_BOUND; #ifdef SMP ts->ts_runq = &runq_pcpu[cpu]; if (PCPU_GET(cpuid) == cpu) return; mi_switch(SW_VOL, NULL); #endif } void sched_unbind(struct thread* td) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(td == curthread, ("sched_unbind: can only bind curthread")); td->td_flags &= ~TDF_BOUND; } int sched_is_bound(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); return (td->td_flags & TDF_BOUND); } void sched_relinquish(struct thread *td) { thread_lock(td); mi_switch(SW_VOL | SWT_RELINQUISH, NULL); thread_unlock(td); } int sched_load(void) { return (sched_tdcnt); } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread) + sizeof(struct td_sched)); } fixpt_t sched_pctcpu(struct thread *td) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); return (ts->ts_pctcpu); } #ifdef RACCT /* * Calculates the contribution to the thread cpu usage for the latest * (unfinished) second. */ fixpt_t sched_pctcpu_delta(struct thread *td) { struct td_sched *ts; fixpt_t delta; int realstathz; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); delta = 0; realstathz = stathz ? stathz : hz; if (ts->ts_cpticks != 0) { #if (FSHIFT >= CCPU_SHIFT) delta = (realstathz == 100) ? ((fixpt_t) ts->ts_cpticks) << (FSHIFT - CCPU_SHIFT) : 100 * (((fixpt_t) ts->ts_cpticks) << (FSHIFT - CCPU_SHIFT)) / realstathz; #else delta = ((FSCALE - ccpu) * (ts->ts_cpticks * FSCALE / realstathz)) >> FSHIFT; #endif } return (delta); } #endif u_int sched_estcpu(struct thread *td) { return (td_get_sched(td)->ts_estcpu); } /* * The actual idle process. */ void sched_idletd(void *dummy) { struct pcpuidlestat *stat; THREAD_NO_SLEEPING(); stat = DPCPU_PTR(idlestat); for (;;) { mtx_assert(&Giant, MA_NOTOWNED); while (sched_runnable() == 0) { cpu_idle(stat->idlecalls + stat->oldidlecalls > 64); stat->idlecalls++; } mtx_lock_spin(&sched_lock); mi_switch(SW_VOL | SWT_IDLE, NULL); mtx_unlock_spin(&sched_lock); } } /* * A CPU is entering for the first time or a thread is exiting. */ void sched_throw(struct thread *td) { /* * Correct spinlock nesting. The idle thread context that we are * borrowing was created so that it would start out with a single * spin lock (sched_lock) held in fork_trampoline(). Since we've * explicitly acquired locks in this function, the nesting count * is now 2 rather than 1. Since we are nested, calling * spinlock_exit() will simply adjust the counts without allowing * spin lock using code to interrupt us. */ if (td == NULL) { mtx_lock_spin(&sched_lock); spinlock_exit(); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); } else { lock_profile_release_lock(&sched_lock.lock_object); MPASS(td->td_lock == &sched_lock); td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; } mtx_assert(&sched_lock, MA_OWNED); KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); cpu_throw(td, choosethread()); /* doesn't return */ } void sched_fork_exit(struct thread *td) { /* * Finish setting up thread glue so that it begins execution in a * non-nested critical section with sched_lock held but not recursed. */ td->td_oncpu = PCPU_GET(cpuid); sched_lock.mtx_lock = (uintptr_t)td; lock_profile_obtain_lock_success(&sched_lock.lock_object, 0, 0, __FILE__, __LINE__); THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); SDT_PROBE0(sched, , , on__cpu); } char * sched_tdname(struct thread *td) { #ifdef KTR struct td_sched *ts; ts = td_get_sched(td); if (ts->ts_name[0] == '\0') snprintf(ts->ts_name, sizeof(ts->ts_name), "%s tid %d", td->td_name, td->td_tid); return (ts->ts_name); #else return (td->td_name); #endif } #ifdef KTR void sched_clear_tdname(struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); ts->ts_name[0] = '\0'; } #endif void sched_affinity(struct thread *td) { #ifdef SMP struct td_sched *ts; int cpu; THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Set the TSF_AFFINITY flag if there is at least one CPU this * thread can't run on. */ ts = td_get_sched(td); ts->ts_flags &= ~TSF_AFFINITY; CPU_FOREACH(cpu) { if (!THREAD_CAN_SCHED(td, cpu)) { ts->ts_flags |= TSF_AFFINITY; break; } } /* * If this thread can run on all CPUs, nothing else to do. */ if (!(ts->ts_flags & TSF_AFFINITY)) return; /* Pinned threads and bound threads should be left alone. */ if (td->td_pinned != 0 || td->td_flags & TDF_BOUND) return; switch (td->td_state) { case TDS_RUNQ: /* * If we are on a per-CPU runqueue that is in the set, * then nothing needs to be done. */ if (ts->ts_runq != &runq && THREAD_CAN_SCHED(td, ts->ts_runq - runq_pcpu)) return; /* Put this thread on a valid per-CPU runqueue. */ sched_rem(td); sched_add(td, SRQ_BORING); break; case TDS_RUNNING: /* * See if our current CPU is in the set. If not, force a * context switch. */ if (THREAD_CAN_SCHED(td, td->td_oncpu)) return; td->td_flags |= TDF_NEEDRESCHED; if (td != curthread) ipi_cpu(cpu, IPI_AST); break; default: break; } #endif } Index: stable/12/sys/kern/sched_ule.c =================================================================== --- stable/12/sys/kern/sched_ule.c (revision 355609) +++ stable/12/sys/kern/sched_ule.c (revision 355610) @@ -1,3106 +1,3106 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002-2007, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * This file implements the ULE scheduler. ULE supports independent CPU * run queues and fine grain locking. It has superior interactive * performance under load even on uni-processor systems. * * etymology: * ULE is the last three letters in schedule. It owes its name to a * generic user created for a scheduling system by Paul Mikesell at * Isilon Systems and a general lack of creativity on the part of the author. */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #ifdef KDTRACE_HOOKS #include -int dtrace_vtime_active; +int __read_mostly dtrace_vtime_active; dtrace_vtime_switch_func_t dtrace_vtime_switch_func; #endif #include #include #define KTR_ULE 0 #define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX))) #define TDQ_NAME_LEN (sizeof("sched lock ") + sizeof(__XSTRING(MAXCPU))) #define TDQ_LOADNAME_LEN (sizeof("CPU ") + sizeof(__XSTRING(MAXCPU)) - 1 + sizeof(" load")) /* * Thread scheduler specific section. All fields are protected * by the thread lock. */ struct td_sched { struct runq *ts_runq; /* Run-queue we're queued on. */ short ts_flags; /* TSF_* flags. */ int ts_cpu; /* CPU that we have affinity for. */ int ts_rltick; /* Real last tick, for affinity. */ int ts_slice; /* Ticks of slice remaining. */ u_int ts_slptime; /* Number of ticks we vol. slept */ u_int ts_runtime; /* Number of ticks we were running */ int ts_ltick; /* Last tick that we were running on */ int ts_ftick; /* First tick that we were running on */ int ts_ticks; /* Tick count */ #ifdef KTR char ts_name[TS_NAME_LEN]; #endif }; /* flags kept in ts_flags */ #define TSF_BOUND 0x0001 /* Thread can not migrate. */ #define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */ #define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) #define THREAD_CAN_SCHED(td, cpu) \ CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask) _Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <= sizeof(struct thread0_storage), "increase struct thread0_storage.t0st_sched size"); /* * Priority ranges used for interactive and non-interactive timeshare * threads. The timeshare priorities are split up into four ranges. * The first range handles interactive threads. The last three ranges * (NHALF, x, and NHALF) handle non-interactive threads with the outer * ranges supporting nice values. */ #define PRI_TIMESHARE_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) #define PRI_INTERACT_RANGE ((PRI_TIMESHARE_RANGE - SCHED_PRI_NRESV) / 2) #define PRI_BATCH_RANGE (PRI_TIMESHARE_RANGE - PRI_INTERACT_RANGE) #define PRI_MIN_INTERACT PRI_MIN_TIMESHARE #define PRI_MAX_INTERACT (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE - 1) #define PRI_MIN_BATCH (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE) #define PRI_MAX_BATCH PRI_MAX_TIMESHARE /* * Cpu percentage computation macros and defines. * * SCHED_TICK_SECS: Number of seconds to average the cpu usage across. * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across. * SCHED_TICK_MAX: Maximum number of ticks before scaling back. * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results. * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count. * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks. */ #define SCHED_TICK_SECS 10 #define SCHED_TICK_TARG (hz * SCHED_TICK_SECS) #define SCHED_TICK_MAX (SCHED_TICK_TARG + hz) #define SCHED_TICK_SHIFT 10 #define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT) #define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz)) /* * These macros determine priorities for non-interactive threads. They are * assigned a priority based on their recent cpu utilization as expressed * by the ratio of ticks to the tick total. NHALF priorities at the start * and end of the MIN to MAX timeshare range are only reachable with negative * or positive nice respectively. * * PRI_RANGE: Priority range for utilization dependent priorities. * PRI_NRESV: Number of nice values. * PRI_TICKS: Compute a priority in PRI_RANGE from the ticks count and total. * PRI_NICE: Determines the part of the priority inherited from nice. */ #define SCHED_PRI_NRESV (PRIO_MAX - PRIO_MIN) #define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) #define SCHED_PRI_MIN (PRI_MIN_BATCH + SCHED_PRI_NHALF) #define SCHED_PRI_MAX (PRI_MAX_BATCH - SCHED_PRI_NHALF) #define SCHED_PRI_RANGE (SCHED_PRI_MAX - SCHED_PRI_MIN + 1) #define SCHED_PRI_TICKS(ts) \ (SCHED_TICK_HZ((ts)) / \ (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE)) #define SCHED_PRI_NICE(nice) (nice) /* * These determine the interactivity of a process. Interactivity differs from * cpu utilization in that it expresses the voluntary time slept vs time ran * while cpu utilization includes all time not running. This more accurately * models the intent of the thread. * * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate * before throttling back. * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. * INTERACT_MAX: Maximum interactivity value. Smaller is better. * INTERACT_THRESH: Threshold for placement on the current runq. */ #define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT) #define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT) #define SCHED_INTERACT_MAX (100) #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) #define SCHED_INTERACT_THRESH (30) /* * These parameters determine the slice behavior for batch work. */ #define SCHED_SLICE_DEFAULT_DIVISOR 10 /* ~94 ms, 12 stathz ticks. */ #define SCHED_SLICE_MIN_DIVISOR 6 /* DEFAULT/MIN = ~16 ms. */ /* Flags kept in td_flags. */ #define TDF_SLICEEND TDF_SCHED2 /* Thread time slice is over. */ /* * tickincr: Converts a stathz tick into a hz domain scaled by * the shift factor. Without the shift the error rate * due to rounding would be unacceptably high. * realstathz: stathz is sometimes 0 and run off of hz. * sched_slice: Runtime of each thread before rescheduling. * preempt_thresh: Priority threshold for preemption and remote IPIs. */ -static int sched_interact = SCHED_INTERACT_THRESH; -static int tickincr = 8 << SCHED_TICK_SHIFT; -static int realstathz = 127; /* reset during boot. */ -static int sched_slice = 10; /* reset during boot. */ -static int sched_slice_min = 1; /* reset during boot. */ +static int __read_mostly sched_interact = SCHED_INTERACT_THRESH; +static int __read_mostly tickincr = 8 << SCHED_TICK_SHIFT; +static int __read_mostly realstathz = 127; /* reset during boot. */ +static int __read_mostly sched_slice = 10; /* reset during boot. */ +static int __read_mostly sched_slice_min = 1; /* reset during boot. */ #ifdef PREEMPTION #ifdef FULL_PREEMPTION -static int preempt_thresh = PRI_MAX_IDLE; +static int __read_mostly preempt_thresh = PRI_MAX_IDLE; #else -static int preempt_thresh = PRI_MIN_KERN; +static int __read_mostly preempt_thresh = PRI_MIN_KERN; #endif #else -static int preempt_thresh = 0; +static int __read_mostly preempt_thresh = 0; #endif -static int static_boost = PRI_MIN_BATCH; -static int sched_idlespins = 10000; -static int sched_idlespinthresh = -1; +static int __read_mostly static_boost = PRI_MIN_BATCH; +static int __read_mostly sched_idlespins = 10000; +static int __read_mostly sched_idlespinthresh = -1; /* * tdq - per processor runqs and statistics. All fields are protected by the * tdq_lock. The load and lowpri may be accessed without to avoid excess * locking in sched_pickcpu(); */ struct tdq { /* * Ordered to improve efficiency of cpu_search() and switch(). * tdq_lock is padded to avoid false sharing with tdq_load and * tdq_cpu_idle. */ struct mtx_padalign tdq_lock; /* run queue lock. */ struct cpu_group *tdq_cg; /* Pointer to cpu topology. */ volatile int tdq_load; /* Aggregate load. */ volatile int tdq_cpu_idle; /* cpu_idle() is active. */ int tdq_sysload; /* For loadavg, !ITHD load. */ volatile int tdq_transferable; /* Transferable thread count. */ volatile short tdq_switchcnt; /* Switches this tick. */ volatile short tdq_oldswitchcnt; /* Switches last tick. */ u_char tdq_lowpri; /* Lowest priority thread. */ u_char tdq_ipipending; /* IPI pending. */ u_char tdq_idx; /* Current insert index. */ u_char tdq_ridx; /* Current removal index. */ int tdq_id; /* cpuid. */ struct runq tdq_realtime; /* real-time run queue. */ struct runq tdq_timeshare; /* timeshare run queue. */ struct runq tdq_idle; /* Queue of IDLE threads. */ char tdq_name[TDQ_NAME_LEN]; #ifdef KTR char tdq_loadname[TDQ_LOADNAME_LEN]; #endif } __aligned(64); /* Idle thread states and config. */ #define TDQ_RUNNING 1 #define TDQ_IDLE 2 #ifdef SMP -struct cpu_group *cpu_top; /* CPU topology */ +struct cpu_group __read_mostly *cpu_top; /* CPU topology */ #define SCHED_AFFINITY_DEFAULT (max(1, hz / 1000)) #define SCHED_AFFINITY(ts, t) ((ts)->ts_rltick > ticks - ((t) * affinity)) /* * Run-time tunables. */ static int rebalance = 1; static int balance_interval = 128; /* Default set in sched_initticks(). */ -static int affinity; -static int steal_idle = 1; -static int steal_thresh = 2; -static int always_steal = 0; -static int trysteal_limit = 2; +static int __read_mostly affinity; +static int __read_mostly steal_idle = 1; +static int __read_mostly steal_thresh = 2; +static int __read_mostly always_steal = 0; +static int __read_mostly trysteal_limit = 2; /* * One thread queue per processor. */ -static struct tdq *balance_tdq; +static struct tdq __read_mostly *balance_tdq; static int balance_ticks; DPCPU_DEFINE_STATIC(struct tdq, tdq); DPCPU_DEFINE_STATIC(uint32_t, randomval); #define TDQ_SELF() (DPCPU_PTR(tdq)) #define TDQ_CPU(x) (DPCPU_ID_PTR((x), tdq)) #define TDQ_ID(x) ((x)->tdq_id) #else /* !SMP */ static struct tdq tdq_cpu; #define TDQ_ID(x) (0) #define TDQ_SELF() (&tdq_cpu) #define TDQ_CPU(x) (&tdq_cpu) #endif #define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type)) #define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t))) #define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f)) #define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t))) #define TDQ_LOCKPTR(t) ((struct mtx *)(&(t)->tdq_lock)) static void sched_priority(struct thread *); static void sched_thread_priority(struct thread *, u_char); static int sched_interact_score(struct thread *); static void sched_interact_update(struct thread *); static void sched_interact_fork(struct thread *); static void sched_pctcpu_update(struct td_sched *, int); /* Operations on per processor queues */ static struct thread *tdq_choose(struct tdq *); static void tdq_setup(struct tdq *, int i); static void tdq_load_add(struct tdq *, struct thread *); static void tdq_load_rem(struct tdq *, struct thread *); static __inline void tdq_runq_add(struct tdq *, struct thread *, int); static __inline void tdq_runq_rem(struct tdq *, struct thread *); static inline int sched_shouldpreempt(int, int, int); void tdq_print(int cpu); static void runq_print(struct runq *rq); static void tdq_add(struct tdq *, struct thread *, int); #ifdef SMP static struct thread *tdq_move(struct tdq *, struct tdq *); static int tdq_idled(struct tdq *); static void tdq_notify(struct tdq *, struct thread *); static struct thread *tdq_steal(struct tdq *, int); static struct thread *runq_steal(struct runq *, int); static int sched_pickcpu(struct thread *, int); static void sched_balance(void); static int sched_balance_pair(struct tdq *, struct tdq *); static inline struct tdq *sched_setcpu(struct thread *, int, int); static inline void thread_unblock_switch(struct thread *, struct mtx *); static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int); static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS); static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg, int indent); #endif static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL); static void sched_initticks(void *dummy); SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL); SDT_PROVIDER_DEFINE(sched); SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *", "struct proc *", "uint8_t"); SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *", "struct proc *", "void *"); SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *", "struct proc *", "void *", "int"); SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *", "struct proc *", "uint8_t", "struct thread *"); SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int"); SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *", "struct proc *"); SDT_PROBE_DEFINE(sched, , , on__cpu); SDT_PROBE_DEFINE(sched, , , remain__cpu); SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *", "struct proc *"); /* * Print the threads waiting on a run-queue. */ static void runq_print(struct runq *rq) { struct rqhead *rqh; struct thread *td; int pri; int j; int i; for (i = 0; i < RQB_LEN; i++) { printf("\t\trunq bits %d 0x%zx\n", i, rq->rq_status.rqb_bits[i]); for (j = 0; j < RQB_BPW; j++) if (rq->rq_status.rqb_bits[i] & (1ul << j)) { pri = j + (i << RQB_L2BPW); rqh = &rq->rq_queues[pri]; TAILQ_FOREACH(td, rqh, td_runq) { printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n", td, td->td_name, td->td_priority, td->td_rqindex, pri); } } } } /* * Print the status of a per-cpu thread queue. Should be a ddb show cmd. */ void tdq_print(int cpu) { struct tdq *tdq; tdq = TDQ_CPU(cpu); printf("tdq %d:\n", TDQ_ID(tdq)); printf("\tlock %p\n", TDQ_LOCKPTR(tdq)); printf("\tLock name: %s\n", tdq->tdq_name); printf("\tload: %d\n", tdq->tdq_load); printf("\tswitch cnt: %d\n", tdq->tdq_switchcnt); printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt); printf("\ttimeshare idx: %d\n", tdq->tdq_idx); printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); printf("\tload transferable: %d\n", tdq->tdq_transferable); printf("\tlowest priority: %d\n", tdq->tdq_lowpri); printf("\trealtime runq:\n"); runq_print(&tdq->tdq_realtime); printf("\ttimeshare runq:\n"); runq_print(&tdq->tdq_timeshare); printf("\tidle runq:\n"); runq_print(&tdq->tdq_idle); } static inline int sched_shouldpreempt(int pri, int cpri, int remote) { /* * If the new priority is not better than the current priority there is * nothing to do. */ if (pri >= cpri) return (0); /* * Always preempt idle. */ if (cpri >= PRI_MIN_IDLE) return (1); /* * If preemption is disabled don't preempt others. */ if (preempt_thresh == 0) return (0); /* * Preempt if we exceed the threshold. */ if (pri <= preempt_thresh) return (1); /* * If we're interactive or better and there is non-interactive * or worse running preempt only remote processors. */ if (remote && pri <= PRI_MAX_INTERACT && cpri > PRI_MAX_INTERACT) return (1); return (0); } /* * Add a thread to the actual run-queue. Keeps transferable counts up to * date with what is actually on the run-queue. Selects the correct * queue position for timeshare threads. */ static __inline void tdq_runq_add(struct tdq *tdq, struct thread *td, int flags) { struct td_sched *ts; u_char pri; TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); pri = td->td_priority; ts = td_get_sched(td); TD_SET_RUNQ(td); if (THREAD_CAN_MIGRATE(td)) { tdq->tdq_transferable++; ts->ts_flags |= TSF_XFERABLE; } if (pri < PRI_MIN_BATCH) { ts->ts_runq = &tdq->tdq_realtime; } else if (pri <= PRI_MAX_BATCH) { ts->ts_runq = &tdq->tdq_timeshare; KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH, ("Invalid priority %d on timeshare runq", pri)); /* * This queue contains only priorities between MIN and MAX * realtime. Use the whole queue to represent these values. */ if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) { pri = RQ_NQS * (pri - PRI_MIN_BATCH) / PRI_BATCH_RANGE; pri = (pri + tdq->tdq_idx) % RQ_NQS; /* * This effectively shortens the queue by one so we * can have a one slot difference between idx and * ridx while we wait for threads to drain. */ if (tdq->tdq_ridx != tdq->tdq_idx && pri == tdq->tdq_ridx) pri = (unsigned char)(pri - 1) % RQ_NQS; } else pri = tdq->tdq_ridx; runq_add_pri(ts->ts_runq, td, pri, flags); return; } else ts->ts_runq = &tdq->tdq_idle; runq_add(ts->ts_runq, td, flags); } /* * Remove a thread from a run-queue. This typically happens when a thread * is selected to run. Running threads are not on the queue and the * transferable count does not reflect them. */ static __inline void tdq_runq_rem(struct tdq *tdq, struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT(ts->ts_runq != NULL, ("tdq_runq_remove: thread %p null ts_runq", td)); if (ts->ts_flags & TSF_XFERABLE) { tdq->tdq_transferable--; ts->ts_flags &= ~TSF_XFERABLE; } if (ts->ts_runq == &tdq->tdq_timeshare) { if (tdq->tdq_idx != tdq->tdq_ridx) runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx); else runq_remove_idx(ts->ts_runq, td, NULL); } else runq_remove(ts->ts_runq, td); } /* * Load is maintained for all threads RUNNING and ON_RUNQ. Add the load * for this thread to the referenced thread queue. */ static void tdq_load_add(struct tdq *tdq, struct thread *td) { TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); tdq->tdq_load++; if ((td->td_flags & TDF_NOLOAD) == 0) tdq->tdq_sysload++; KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load); SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load); } /* * Remove the load from a thread that is transitioning to a sleep state or * exiting. */ static void tdq_load_rem(struct tdq *tdq, struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT(tdq->tdq_load != 0, ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq))); tdq->tdq_load--; if ((td->td_flags & TDF_NOLOAD) == 0) tdq->tdq_sysload--; KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load); SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load); } /* * Bound timeshare latency by decreasing slice size as load increases. We * consider the maximum latency as the sum of the threads waiting to run * aside from curthread and target no more than sched_slice latency but * no less than sched_slice_min runtime. */ static inline int tdq_slice(struct tdq *tdq) { int load; /* * It is safe to use sys_load here because this is called from * contexts where timeshare threads are running and so there * cannot be higher priority load in the system. */ load = tdq->tdq_sysload - 1; if (load >= SCHED_SLICE_MIN_DIVISOR) return (sched_slice_min); if (load <= 1) return (sched_slice); return (sched_slice / load); } /* * Set lowpri to its exact value by searching the run-queue and * evaluating curthread. curthread may be passed as an optimization. */ static void tdq_setlowpri(struct tdq *tdq, struct thread *ctd) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); if (ctd == NULL) ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread; td = tdq_choose(tdq); if (td == NULL || td->td_priority > ctd->td_priority) tdq->tdq_lowpri = ctd->td_priority; else tdq->tdq_lowpri = td->td_priority; } #ifdef SMP /* * We need some randomness. Implement a classic Linear Congruential * Generator X_{n+1}=(aX_n+c) mod m. These values are optimized for * m = 2^32, a = 69069 and c = 5. We only return the upper 16 bits * of the random state (in the low bits of our answer) to keep * the maximum randomness. */ static uint32_t sched_random(void) { uint32_t *rndptr; rndptr = DPCPU_PTR(randomval); *rndptr = *rndptr * 69069 + 5; return (*rndptr >> 16); } struct cpu_search { cpuset_t cs_mask; u_int cs_prefer; int cs_pri; /* Min priority for low. */ int cs_limit; /* Max load for low, min load for high. */ int cs_cpu; int cs_load; }; #define CPU_SEARCH_LOWEST 0x1 #define CPU_SEARCH_HIGHEST 0x2 #define CPU_SEARCH_BOTH (CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST) static __always_inline int cpu_search(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high, const int match); int __noinline cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low); int __noinline cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high); int __noinline cpu_search_both(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high); /* * Search the tree of cpu_groups for the lowest or highest loaded cpu * according to the match argument. This routine actually compares the * load on all paths through the tree and finds the least loaded cpu on * the least loaded path, which may differ from the least loaded cpu in * the system. This balances work among caches and buses. * * This inline is instantiated in three forms below using constants for the * match argument. It is reduced to the minimum set for each case. It is * also recursive to the depth of the tree. */ static __always_inline int cpu_search(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high, const int match) { struct cpu_search lgroup; struct cpu_search hgroup; cpuset_t cpumask; struct cpu_group *child; struct tdq *tdq; int cpu, i, hload, lload, load, total, rnd; total = 0; cpumask = cg->cg_mask; if (match & CPU_SEARCH_LOWEST) { lload = INT_MAX; lgroup = *low; } if (match & CPU_SEARCH_HIGHEST) { hload = INT_MIN; hgroup = *high; } /* Iterate through the child CPU groups and then remaining CPUs. */ for (i = cg->cg_children, cpu = mp_maxid; ; ) { if (i == 0) { #ifdef HAVE_INLINE_FFSL cpu = CPU_FFS(&cpumask) - 1; #else while (cpu >= 0 && !CPU_ISSET(cpu, &cpumask)) cpu--; #endif if (cpu < 0) break; child = NULL; } else child = &cg->cg_child[i - 1]; if (match & CPU_SEARCH_LOWEST) lgroup.cs_cpu = -1; if (match & CPU_SEARCH_HIGHEST) hgroup.cs_cpu = -1; if (child) { /* Handle child CPU group. */ CPU_NAND(&cpumask, &child->cg_mask); switch (match) { case CPU_SEARCH_LOWEST: load = cpu_search_lowest(child, &lgroup); break; case CPU_SEARCH_HIGHEST: load = cpu_search_highest(child, &hgroup); break; case CPU_SEARCH_BOTH: load = cpu_search_both(child, &lgroup, &hgroup); break; } } else { /* Handle child CPU. */ CPU_CLR(cpu, &cpumask); tdq = TDQ_CPU(cpu); load = tdq->tdq_load * 256; rnd = sched_random() % 32; if (match & CPU_SEARCH_LOWEST) { if (cpu == low->cs_prefer) load -= 64; /* If that CPU is allowed and get data. */ if (tdq->tdq_lowpri > lgroup.cs_pri && tdq->tdq_load <= lgroup.cs_limit && CPU_ISSET(cpu, &lgroup.cs_mask)) { lgroup.cs_cpu = cpu; lgroup.cs_load = load - rnd; } } if (match & CPU_SEARCH_HIGHEST) if (tdq->tdq_load >= hgroup.cs_limit && tdq->tdq_transferable && CPU_ISSET(cpu, &hgroup.cs_mask)) { hgroup.cs_cpu = cpu; hgroup.cs_load = load - rnd; } } total += load; /* We have info about child item. Compare it. */ if (match & CPU_SEARCH_LOWEST) { if (lgroup.cs_cpu >= 0 && (load < lload || (load == lload && lgroup.cs_load < low->cs_load))) { lload = load; low->cs_cpu = lgroup.cs_cpu; low->cs_load = lgroup.cs_load; } } if (match & CPU_SEARCH_HIGHEST) if (hgroup.cs_cpu >= 0 && (load > hload || (load == hload && hgroup.cs_load > high->cs_load))) { hload = load; high->cs_cpu = hgroup.cs_cpu; high->cs_load = hgroup.cs_load; } if (child) { i--; if (i == 0 && CPU_EMPTY(&cpumask)) break; } #ifndef HAVE_INLINE_FFSL else cpu--; #endif } return (total); } /* * cpu_search instantiations must pass constants to maintain the inline * optimization. */ int cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low) { return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST); } int cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high) { return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST); } int cpu_search_both(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high) { return cpu_search(cg, low, high, CPU_SEARCH_BOTH); } /* * Find the cpu with the least load via the least loaded path that has a * lowpri greater than pri pri. A pri of -1 indicates any priority is * acceptable. */ static inline int sched_lowest(const struct cpu_group *cg, cpuset_t mask, int pri, int maxload, int prefer) { struct cpu_search low; low.cs_cpu = -1; low.cs_prefer = prefer; low.cs_mask = mask; low.cs_pri = pri; low.cs_limit = maxload; cpu_search_lowest(cg, &low); return low.cs_cpu; } /* * Find the cpu with the highest load via the highest loaded path. */ static inline int sched_highest(const struct cpu_group *cg, cpuset_t mask, int minload) { struct cpu_search high; high.cs_cpu = -1; high.cs_mask = mask; high.cs_limit = minload; cpu_search_highest(cg, &high); return high.cs_cpu; } static void sched_balance_group(struct cpu_group *cg) { struct tdq *tdq; cpuset_t hmask, lmask; int high, low, anylow; CPU_FILL(&hmask); for (;;) { high = sched_highest(cg, hmask, 2); /* Stop if there is no more CPU with transferrable threads. */ if (high == -1) break; CPU_CLR(high, &hmask); CPU_COPY(&hmask, &lmask); /* Stop if there is no more CPU left for low. */ if (CPU_EMPTY(&lmask)) break; anylow = 1; tdq = TDQ_CPU(high); nextlow: low = sched_lowest(cg, lmask, -1, tdq->tdq_load - 1, high); /* Stop if we looked well and found no less loaded CPU. */ if (anylow && low == -1) break; /* Go to next high if we found no less loaded CPU. */ if (low == -1) continue; /* Transfer thread from high to low. */ if (sched_balance_pair(tdq, TDQ_CPU(low))) { /* CPU that got thread can no longer be a donor. */ CPU_CLR(low, &hmask); } else { /* * If failed, then there is no threads on high * that can run on this low. Drop low from low * mask and look for different one. */ CPU_CLR(low, &lmask); anylow = 0; goto nextlow; } } } static void sched_balance(void) { struct tdq *tdq; balance_ticks = max(balance_interval / 2, 1) + (sched_random() % balance_interval); tdq = TDQ_SELF(); TDQ_UNLOCK(tdq); sched_balance_group(cpu_top); TDQ_LOCK(tdq); } /* * Lock two thread queues using their address to maintain lock order. */ static void tdq_lock_pair(struct tdq *one, struct tdq *two) { if (one < two) { TDQ_LOCK(one); TDQ_LOCK_FLAGS(two, MTX_DUPOK); } else { TDQ_LOCK(two); TDQ_LOCK_FLAGS(one, MTX_DUPOK); } } /* * Unlock two thread queues. Order is not important here. */ static void tdq_unlock_pair(struct tdq *one, struct tdq *two) { TDQ_UNLOCK(one); TDQ_UNLOCK(two); } /* * Transfer load between two imbalanced thread queues. */ static int sched_balance_pair(struct tdq *high, struct tdq *low) { struct thread *td; int cpu; tdq_lock_pair(high, low); td = NULL; /* * Transfer a thread from high to low. */ if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load && (td = tdq_move(high, low)) != NULL) { /* * In case the target isn't the current cpu notify it of the * new load, possibly sending an IPI to force it to reschedule. */ cpu = TDQ_ID(low); if (cpu != PCPU_GET(cpuid)) tdq_notify(low, td); } tdq_unlock_pair(high, low); return (td != NULL); } /* * Move a thread from one thread queue to another. */ static struct thread * tdq_move(struct tdq *from, struct tdq *to) { struct td_sched *ts; struct thread *td; struct tdq *tdq; int cpu; TDQ_LOCK_ASSERT(from, MA_OWNED); TDQ_LOCK_ASSERT(to, MA_OWNED); tdq = from; cpu = TDQ_ID(to); td = tdq_steal(tdq, cpu); if (td == NULL) return (NULL); ts = td_get_sched(td); /* * Although the run queue is locked the thread may be blocked. Lock * it to clear this and acquire the run-queue lock. */ thread_lock(td); /* Drop recursive lock on from acquired via thread_lock(). */ TDQ_UNLOCK(from); sched_rem(td); ts->ts_cpu = cpu; td->td_lock = TDQ_LOCKPTR(to); tdq_add(to, td, SRQ_YIELDING); return (td); } /* * This tdq has idled. Try to steal a thread from another cpu and switch * to it. */ static int tdq_idled(struct tdq *tdq) { struct cpu_group *cg; struct tdq *steal; cpuset_t mask; int cpu, switchcnt; if (smp_started == 0 || steal_idle == 0 || tdq->tdq_cg == NULL) return (1); CPU_FILL(&mask); CPU_CLR(PCPU_GET(cpuid), &mask); restart: switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; for (cg = tdq->tdq_cg; ; ) { cpu = sched_highest(cg, mask, steal_thresh); /* * We were assigned a thread but not preempted. Returning * 0 here will cause our caller to switch to it. */ if (tdq->tdq_load) return (0); if (cpu == -1) { cg = cg->cg_parent; if (cg == NULL) return (1); continue; } steal = TDQ_CPU(cpu); /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread. * * Testing this ahead of tdq_lock_pair() only catches * this situation about 20% of the time on an 8 core * 16 thread Ryzen 7, but it still helps performance. */ if (steal->tdq_load < steal_thresh || steal->tdq_transferable == 0) goto restart; tdq_lock_pair(tdq, steal); /* * We were assigned a thread while waiting for the locks. * Switch to it now instead of stealing a thread. */ if (tdq->tdq_load) break; /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread, or * we were preempted and the CPU loading info may be out * of date. The latter is rare. In either case restart * the search. */ if (steal->tdq_load < steal_thresh || steal->tdq_transferable == 0 || switchcnt != tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt) { tdq_unlock_pair(tdq, steal); goto restart; } /* * Steal the thread and switch to it. */ if (tdq_move(steal, tdq) != NULL) break; /* * We failed to acquire a thread even though it looked * like one was available. This could be due to affinity * restrictions or for other reasons. Loop again after * removing this CPU from the set. The restart logic * above does not restore this CPU to the set due to the * likelyhood of failing here again. */ CPU_CLR(cpu, &mask); tdq_unlock_pair(tdq, steal); } TDQ_UNLOCK(steal); mi_switch(SW_VOL | SWT_IDLE, NULL); thread_unlock(curthread); return (0); } /* * Notify a remote cpu of new work. Sends an IPI if criteria are met. */ static void tdq_notify(struct tdq *tdq, struct thread *td) { struct thread *ctd; int pri; int cpu; if (tdq->tdq_ipipending) return; cpu = td_get_sched(td)->ts_cpu; pri = td->td_priority; ctd = pcpu_find(cpu)->pc_curthread; if (!sched_shouldpreempt(pri, ctd->td_priority, 1)) return; /* * Make sure that our caller's earlier update to tdq_load is * globally visible before we read tdq_cpu_idle. Idle thread * accesses both of them without locks, and the order is important. */ atomic_thread_fence_seq_cst(); if (TD_IS_IDLETHREAD(ctd)) { /* * If the MD code has an idle wakeup routine try that before * falling back to IPI. */ if (!tdq->tdq_cpu_idle || cpu_idle_wakeup(cpu)) return; } tdq->tdq_ipipending = 1; ipi_cpu(cpu, IPI_PREEMPT); } /* * Steals load from a timeshare queue. Honors the rotating queue head * index. */ static struct thread * runq_steal_from(struct runq *rq, int cpu, u_char start) { struct rqbits *rqb; struct rqhead *rqh; struct thread *td, *first; int bit; int i; rqb = &rq->rq_status; bit = start & (RQB_BPW -1); first = NULL; again: for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) { if (rqb->rqb_bits[i] == 0) continue; if (bit == 0) bit = RQB_FFS(rqb->rqb_bits[i]); for (; bit < RQB_BPW; bit++) { if ((rqb->rqb_bits[i] & (1ul << bit)) == 0) continue; rqh = &rq->rq_queues[bit + (i << RQB_L2BPW)]; TAILQ_FOREACH(td, rqh, td_runq) { if (first && THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, cpu)) return (td); first = td; } } } if (start != 0) { start = 0; goto again; } if (first && THREAD_CAN_MIGRATE(first) && THREAD_CAN_SCHED(first, cpu)) return (first); return (NULL); } /* * Steals load from a standard linear queue. */ static struct thread * runq_steal(struct runq *rq, int cpu) { struct rqhead *rqh; struct rqbits *rqb; struct thread *td; int word; int bit; rqb = &rq->rq_status; for (word = 0; word < RQB_LEN; word++) { if (rqb->rqb_bits[word] == 0) continue; for (bit = 0; bit < RQB_BPW; bit++) { if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) continue; rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; TAILQ_FOREACH(td, rqh, td_runq) if (THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, cpu)) return (td); } } return (NULL); } /* * Attempt to steal a thread in priority order from a thread queue. */ static struct thread * tdq_steal(struct tdq *tdq, int cpu) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL) return (td); if ((td = runq_steal_from(&tdq->tdq_timeshare, cpu, tdq->tdq_ridx)) != NULL) return (td); return (runq_steal(&tdq->tdq_idle, cpu)); } /* * Sets the thread lock and ts_cpu to match the requested cpu. Unlocks the * current lock and returns with the assigned queue locked. */ static inline struct tdq * sched_setcpu(struct thread *td, int cpu, int flags) { struct tdq *tdq; THREAD_LOCK_ASSERT(td, MA_OWNED); tdq = TDQ_CPU(cpu); td_get_sched(td)->ts_cpu = cpu; /* * If the lock matches just return the queue. */ if (td->td_lock == TDQ_LOCKPTR(tdq)) return (tdq); #ifdef notyet /* * If the thread isn't running its lockptr is a * turnstile or a sleepqueue. We can just lock_set without * blocking. */ if (TD_CAN_RUN(td)) { TDQ_LOCK(tdq); thread_lock_set(td, TDQ_LOCKPTR(tdq)); return (tdq); } #endif /* * The hard case, migration, we need to block the thread first to * prevent order reversals with other cpus locks. */ spinlock_enter(); thread_lock_block(td); TDQ_LOCK(tdq); thread_lock_unblock(td, TDQ_LOCKPTR(tdq)); spinlock_exit(); return (tdq); } SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding"); SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity"); SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity"); SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load"); SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu"); SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration"); static int sched_pickcpu(struct thread *td, int flags) { struct cpu_group *cg, *ccg; struct td_sched *ts; struct tdq *tdq; cpuset_t mask; int cpu, pri, self, intr; self = PCPU_GET(cpuid); ts = td_get_sched(td); KASSERT(!CPU_ABSENT(ts->ts_cpu), ("sched_pickcpu: Start scheduler on " "absent CPU %d for thread %s.", ts->ts_cpu, td->td_name)); if (smp_started == 0) return (self); /* * Don't migrate a running thread from sched_switch(). */ if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td)) return (ts->ts_cpu); /* * Prefer to run interrupt threads on the processors that generate * the interrupt. */ if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) && curthread->td_intr_nesting_level) { tdq = TDQ_SELF(); if (tdq->tdq_lowpri >= PRI_MIN_IDLE) { SCHED_STAT_INC(pickcpu_idle_affinity); return (self); } ts->ts_cpu = self; intr = 1; cg = tdq->tdq_cg; goto llc; } else { intr = 0; tdq = TDQ_CPU(ts->ts_cpu); cg = tdq->tdq_cg; } /* * If the thread can run on the last cpu and the affinity has not * expired and it is idle, run it there. */ if (THREAD_CAN_SCHED(td, ts->ts_cpu) && tdq->tdq_lowpri >= PRI_MIN_IDLE && SCHED_AFFINITY(ts, CG_SHARE_L2)) { if (cg->cg_flags & CG_FLAG_THREAD) { /* Check all SMT threads for being idle. */ for (cpu = CPU_FFS(&cg->cg_mask) - 1; ; cpu++) { if (CPU_ISSET(cpu, &cg->cg_mask) && TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) break; if (cpu >= mp_maxid) { SCHED_STAT_INC(pickcpu_idle_affinity); return (ts->ts_cpu); } } } else { SCHED_STAT_INC(pickcpu_idle_affinity); return (ts->ts_cpu); } } llc: /* * Search for the last level cache CPU group in the tree. * Skip SMT, identical groups and caches with expired affinity. * Interrupt threads affinity is explicit and never expires. */ for (ccg = NULL; cg != NULL; cg = cg->cg_parent) { if (cg->cg_flags & CG_FLAG_THREAD) continue; if (cg->cg_children == 1 || cg->cg_count == 1) continue; if (cg->cg_level == CG_SHARE_NONE || (!intr && !SCHED_AFFINITY(ts, cg->cg_level))) continue; ccg = cg; } /* Found LLC shared by all CPUs, so do a global search. */ if (ccg == cpu_top) ccg = NULL; cpu = -1; mask = td->td_cpuset->cs_mask; pri = td->td_priority; /* * Try hard to keep interrupts within found LLC. Search the LLC for * the least loaded CPU we can run now. For NUMA systems it should * be within target domain, and it also reduces scheduling overhead. */ if (ccg != NULL && intr) { cpu = sched_lowest(ccg, mask, pri, INT_MAX, ts->ts_cpu); if (cpu >= 0) SCHED_STAT_INC(pickcpu_intrbind); } else /* Search the LLC for the least loaded idle CPU we can run now. */ if (ccg != NULL) { cpu = sched_lowest(ccg, mask, max(pri, PRI_MAX_TIMESHARE), INT_MAX, ts->ts_cpu); if (cpu >= 0) SCHED_STAT_INC(pickcpu_affinity); } /* Search globally for the least loaded CPU we can run now. */ if (cpu < 0) { cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu); if (cpu >= 0) SCHED_STAT_INC(pickcpu_lowest); } /* Search globally for the least loaded CPU. */ if (cpu < 0) { cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu); if (cpu >= 0) SCHED_STAT_INC(pickcpu_lowest); } KASSERT(cpu >= 0, ("sched_pickcpu: Failed to find a cpu.")); KASSERT(!CPU_ABSENT(cpu), ("sched_pickcpu: Picked absent CPU %d.", cpu)); /* * Compare the lowest loaded cpu to current cpu. */ tdq = TDQ_CPU(cpu); if (THREAD_CAN_SCHED(td, self) && TDQ_SELF()->tdq_lowpri > pri && tdq->tdq_lowpri < PRI_MIN_IDLE && TDQ_SELF()->tdq_load <= tdq->tdq_load + 1) { SCHED_STAT_INC(pickcpu_local); cpu = self; } if (cpu != ts->ts_cpu) SCHED_STAT_INC(pickcpu_migration); return (cpu); } #endif /* * Pick the highest priority task we have and return it. */ static struct thread * tdq_choose(struct tdq *tdq) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); td = runq_choose(&tdq->tdq_realtime); if (td != NULL) return (td); td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx); if (td != NULL) { KASSERT(td->td_priority >= PRI_MIN_BATCH, ("tdq_choose: Invalid priority on timeshare queue %d", td->td_priority)); return (td); } td = runq_choose(&tdq->tdq_idle); if (td != NULL) { KASSERT(td->td_priority >= PRI_MIN_IDLE, ("tdq_choose: Invalid priority on idle queue %d", td->td_priority)); return (td); } return (NULL); } /* * Initialize a thread queue. */ static void tdq_setup(struct tdq *tdq, int id) { if (bootverbose) printf("ULE: setup cpu %d\n", id); runq_init(&tdq->tdq_realtime); runq_init(&tdq->tdq_timeshare); runq_init(&tdq->tdq_idle); tdq->tdq_id = id; snprintf(tdq->tdq_name, sizeof(tdq->tdq_name), "sched lock %d", (int)TDQ_ID(tdq)); mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock", MTX_SPIN | MTX_RECURSE); #ifdef KTR snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname), "CPU %d load", (int)TDQ_ID(tdq)); #endif } #ifdef SMP static void sched_setup_smp(void) { struct tdq *tdq; int i; cpu_top = smp_topo(); CPU_FOREACH(i) { tdq = DPCPU_ID_PTR(i, tdq); tdq_setup(tdq, i); tdq->tdq_cg = smp_topo_find(cpu_top, i); if (tdq->tdq_cg == NULL) panic("Can't find cpu group for %d\n", i); } balance_tdq = TDQ_SELF(); } #endif /* * Setup the thread queues and initialize the topology based on MD * information. */ static void sched_setup(void *dummy) { struct tdq *tdq; #ifdef SMP sched_setup_smp(); #else tdq_setup(TDQ_SELF(), 0); #endif tdq = TDQ_SELF(); /* Add thread0's load since it's running. */ TDQ_LOCK(tdq); thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF()); tdq_load_add(tdq, &thread0); tdq->tdq_lowpri = thread0.td_priority; TDQ_UNLOCK(tdq); } /* * This routine determines time constants after stathz and hz are setup. */ /* ARGSUSED */ static void sched_initticks(void *dummy) { int incr; realstathz = stathz ? stathz : hz; sched_slice = realstathz / SCHED_SLICE_DEFAULT_DIVISOR; sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR; hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); /* * tickincr is shifted out by 10 to avoid rounding errors due to * hz not being evenly divisible by stathz on all platforms. */ incr = (hz << SCHED_TICK_SHIFT) / realstathz; /* * This does not work for values of stathz that are more than * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen. */ if (incr == 0) incr = 1; tickincr = incr; #ifdef SMP /* * Set the default balance interval now that we know * what realstathz is. */ balance_interval = realstathz; balance_ticks = balance_interval; affinity = SCHED_AFFINITY_DEFAULT; #endif if (sched_idlespinthresh < 0) sched_idlespinthresh = 2 * max(10000, 6 * hz) / realstathz; } /* * This is the core of the interactivity algorithm. Determines a score based * on past behavior. It is the ratio of sleep time to run time scaled to * a [0, 100] integer. This is the voluntary sleep time of a process, which * differs from the cpu usage because it does not account for time spent * waiting on a run-queue. Would be prettier if we had floating point. * * When a thread's sleep time is greater than its run time the * calculation is: * * scaling factor * interactivity score = --------------------- * sleep time / run time * * * When a thread's run time is greater than its sleep time the * calculation is: * * scaling factor * interactivity score = --------------------- + scaling factor * run time / sleep time */ static int sched_interact_score(struct thread *td) { struct td_sched *ts; int div; ts = td_get_sched(td); /* * The score is only needed if this is likely to be an interactive * task. Don't go through the expense of computing it if there's * no chance. */ if (sched_interact <= SCHED_INTERACT_HALF && ts->ts_runtime >= ts->ts_slptime) return (SCHED_INTERACT_HALF); if (ts->ts_runtime > ts->ts_slptime) { div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF); return (SCHED_INTERACT_HALF + (SCHED_INTERACT_HALF - (ts->ts_slptime / div))); } if (ts->ts_slptime > ts->ts_runtime) { div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF); return (ts->ts_runtime / div); } /* runtime == slptime */ if (ts->ts_runtime) return (SCHED_INTERACT_HALF); /* * This can happen if slptime and runtime are 0. */ return (0); } /* * Scale the scheduling priority according to the "interactivity" of this * process. */ static void sched_priority(struct thread *td) { int score; int pri; if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE) return; /* * If the score is interactive we place the thread in the realtime * queue with a priority that is less than kernel and interrupt * priorities. These threads are not subject to nice restrictions. * * Scores greater than this are placed on the normal timeshare queue * where the priority is partially decided by the most recent cpu * utilization and the rest is decided by nice value. * * The nice value of the process has a linear effect on the calculated * score. Negative nice values make it easier for a thread to be * considered interactive. */ score = imax(0, sched_interact_score(td) + td->td_proc->p_nice); if (score < sched_interact) { pri = PRI_MIN_INTERACT; pri += ((PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) / sched_interact) * score; KASSERT(pri >= PRI_MIN_INTERACT && pri <= PRI_MAX_INTERACT, ("sched_priority: invalid interactive priority %d score %d", pri, score)); } else { pri = SCHED_PRI_MIN; if (td_get_sched(td)->ts_ticks) pri += min(SCHED_PRI_TICKS(td_get_sched(td)), SCHED_PRI_RANGE - 1); pri += SCHED_PRI_NICE(td->td_proc->p_nice); KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH, ("sched_priority: invalid priority %d: nice %d, " "ticks %d ftick %d ltick %d tick pri %d", pri, td->td_proc->p_nice, td_get_sched(td)->ts_ticks, td_get_sched(td)->ts_ftick, td_get_sched(td)->ts_ltick, SCHED_PRI_TICKS(td_get_sched(td)))); } sched_user_prio(td, pri); return; } /* * This routine enforces a maximum limit on the amount of scheduling history * kept. It is called after either the slptime or runtime is adjusted. This * function is ugly due to integer math. */ static void sched_interact_update(struct thread *td) { struct td_sched *ts; u_int sum; ts = td_get_sched(td); sum = ts->ts_runtime + ts->ts_slptime; if (sum < SCHED_SLP_RUN_MAX) return; /* * This only happens from two places: * 1) We have added an unusual amount of run time from fork_exit. * 2) We have added an unusual amount of sleep time from sched_sleep(). */ if (sum > SCHED_SLP_RUN_MAX * 2) { if (ts->ts_runtime > ts->ts_slptime) { ts->ts_runtime = SCHED_SLP_RUN_MAX; ts->ts_slptime = 1; } else { ts->ts_slptime = SCHED_SLP_RUN_MAX; ts->ts_runtime = 1; } return; } /* * If we have exceeded by more than 1/5th then the algorithm below * will not bring us back into range. Dividing by two here forces * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] */ if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { ts->ts_runtime /= 2; ts->ts_slptime /= 2; return; } ts->ts_runtime = (ts->ts_runtime / 5) * 4; ts->ts_slptime = (ts->ts_slptime / 5) * 4; } /* * Scale back the interactivity history when a child thread is created. The * history is inherited from the parent but the thread may behave totally * differently. For example, a shell spawning a compiler process. We want * to learn that the compiler is behaving badly very quickly. */ static void sched_interact_fork(struct thread *td) { struct td_sched *ts; int ratio; int sum; ts = td_get_sched(td); sum = ts->ts_runtime + ts->ts_slptime; if (sum > SCHED_SLP_RUN_FORK) { ratio = sum / SCHED_SLP_RUN_FORK; ts->ts_runtime /= ratio; ts->ts_slptime /= ratio; } } /* * Called from proc0_init() to setup the scheduler fields. */ void schedinit(void) { struct td_sched *ts0; /* * Set up the scheduler specific parts of thread0. */ ts0 = td_get_sched(&thread0); ts0->ts_ltick = ticks; ts0->ts_ftick = ticks; ts0->ts_slice = 0; ts0->ts_cpu = curcpu; /* set valid CPU number */ } /* * This is only somewhat accurate since given many processes of the same * priority they will switch when their slices run out, which will be * at most sched_slice stathz ticks. */ int sched_rr_interval(void) { /* Convert sched_slice from stathz to hz. */ return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz)); } /* * Update the percent cpu tracking information when it is requested or * the total history exceeds the maximum. We keep a sliding history of * tick counts that slowly decays. This is less precise than the 4BSD * mechanism since it happens with less regular and frequent events. */ static void sched_pctcpu_update(struct td_sched *ts, int run) { int t = ticks; /* * The signed difference may be negative if the thread hasn't run for * over half of the ticks rollover period. */ if ((u_int)(t - ts->ts_ltick) >= SCHED_TICK_TARG) { ts->ts_ticks = 0; ts->ts_ftick = t - SCHED_TICK_TARG; } else if (t - ts->ts_ftick >= SCHED_TICK_MAX) { ts->ts_ticks = (ts->ts_ticks / (ts->ts_ltick - ts->ts_ftick)) * (ts->ts_ltick - (t - SCHED_TICK_TARG)); ts->ts_ftick = t - SCHED_TICK_TARG; } if (run) ts->ts_ticks += (t - ts->ts_ltick) << SCHED_TICK_SHIFT; ts->ts_ltick = t; } /* * Adjust the priority of a thread. Move it to the appropriate run-queue * if necessary. This is the back-end for several priority related * functions. */ static void sched_thread_priority(struct thread *td, u_char prio) { struct td_sched *ts; struct tdq *tdq; int oldpri; KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(curthread)); SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio); if (td != curthread && prio < td->td_priority) { KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread), "lend prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio, curthread); } ts = td_get_sched(td); THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; /* * If the priority has been elevated due to priority * propagation, we may have to move ourselves to a new * queue. This could be optimized to not re-add in some * cases. */ if (TD_ON_RUNQ(td) && prio < td->td_priority) { sched_rem(td); td->td_priority = prio; sched_add(td, SRQ_BORROWING); return; } /* * If the thread is currently running we may have to adjust the lowpri * information so other cpus are aware of our current priority. */ if (TD_IS_RUNNING(td)) { tdq = TDQ_CPU(ts->ts_cpu); oldpri = td->td_priority; td->td_priority = prio; if (prio < tdq->tdq_lowpri) tdq->tdq_lowpri = prio; else if (tdq->tdq_lowpri == oldpri) tdq_setlowpri(tdq, td); return; } td->td_priority = prio; } /* * Update a thread's priority when it is lent another thread's * priority. */ void sched_lend_prio(struct thread *td, u_char prio) { td->td_flags |= TDF_BORROWING; sched_thread_priority(td, prio); } /* * Restore a thread's priority when priority propagation is * over. The prio argument is the minimum priority the thread * needs to have to satisfy other possible priority lending * requests. If the thread's regular priority is less * important than prio, the thread will keep a priority boost * of prio. */ void sched_unlend_prio(struct thread *td, u_char prio) { u_char base_pri; if (td->td_base_pri >= PRI_MIN_TIMESHARE && td->td_base_pri <= PRI_MAX_TIMESHARE) base_pri = td->td_user_pri; else base_pri = td->td_base_pri; if (prio >= base_pri) { td->td_flags &= ~TDF_BORROWING; sched_thread_priority(td, base_pri); } else sched_lend_prio(td, prio); } /* * Standard entry for setting the priority to an absolute value. */ void sched_prio(struct thread *td, u_char prio) { u_char oldprio; /* First, update the base priority. */ td->td_base_pri = prio; /* * If the thread is borrowing another thread's priority, don't * ever lower the priority. */ if (td->td_flags & TDF_BORROWING && td->td_priority < prio) return; /* Change the real priority. */ oldprio = td->td_priority; sched_thread_priority(td, prio); /* * If the thread is on a turnstile, then let the turnstile update * its state. */ if (TD_ON_LOCK(td) && oldprio != prio) turnstile_adjust(td, oldprio); } /* * Set the base user priority, does not effect current running priority. */ void sched_user_prio(struct thread *td, u_char prio) { td->td_base_user_pri = prio; if (td->td_lend_user_pri <= prio) return; td->td_user_pri = prio; } void sched_lend_user_prio(struct thread *td, u_char prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_lend_user_pri = prio; td->td_user_pri = min(prio, td->td_base_user_pri); if (td->td_priority > td->td_user_pri) sched_prio(td, td->td_user_pri); else if (td->td_priority != td->td_user_pri) td->td_flags |= TDF_NEEDRESCHED; } #ifdef SMP /* * This tdq is about to idle. Try to steal a thread from another CPU before * choosing the idle thread. */ static void tdq_trysteal(struct tdq *tdq) { struct cpu_group *cg; struct tdq *steal; cpuset_t mask; int cpu, i; if (smp_started == 0 || trysteal_limit == 0 || tdq->tdq_cg == NULL) return; CPU_FILL(&mask); CPU_CLR(PCPU_GET(cpuid), &mask); /* We don't want to be preempted while we're iterating. */ spinlock_enter(); TDQ_UNLOCK(tdq); for (i = 1, cg = tdq->tdq_cg; ; ) { cpu = sched_highest(cg, mask, steal_thresh); /* * If a thread was added while interrupts were disabled don't * steal one here. */ if (tdq->tdq_load > 0) { TDQ_LOCK(tdq); break; } if (cpu == -1) { i++; cg = cg->cg_parent; if (cg == NULL || i > trysteal_limit) { TDQ_LOCK(tdq); break; } continue; } steal = TDQ_CPU(cpu); /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread. */ if (steal->tdq_load < steal_thresh || steal->tdq_transferable == 0) continue; tdq_lock_pair(tdq, steal); /* * If we get to this point, unconditonally exit the loop * to bound the time spent in the critcal section. * * If a thread was added while interrupts were disabled don't * steal one here. */ if (tdq->tdq_load > 0) { TDQ_UNLOCK(steal); break; } /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread. */ if (steal->tdq_load < steal_thresh || steal->tdq_transferable == 0) { TDQ_UNLOCK(steal); break; } /* * If we fail to acquire one due to affinity restrictions, * bail out and let the idle thread to a more complete search * outside of a critical section. */ if (tdq_move(steal, tdq) == NULL) { TDQ_UNLOCK(steal); break; } TDQ_UNLOCK(steal); break; } spinlock_exit(); } #endif /* * Handle migration from sched_switch(). This happens only for * cpu binding. */ static struct mtx * sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags) { struct tdq *tdn; KASSERT(!CPU_ABSENT(td_get_sched(td)->ts_cpu), ("sched_switch_migrate: " "thread %s queued on absent CPU %d.", td->td_name, td_get_sched(td)->ts_cpu)); tdn = TDQ_CPU(td_get_sched(td)->ts_cpu); #ifdef SMP tdq_load_rem(tdq, td); /* * Do the lock dance required to avoid LOR. We grab an extra * spinlock nesting to prevent preemption while we're * not holding either run-queue lock. */ spinlock_enter(); thread_lock_block(td); /* This releases the lock on tdq. */ /* * Acquire both run-queue locks before placing the thread on the new * run-queue to avoid deadlocks created by placing a thread with a * blocked lock on the run-queue of a remote processor. The deadlock * occurs when a third processor attempts to lock the two queues in * question while the target processor is spinning with its own * run-queue lock held while waiting for the blocked lock to clear. */ tdq_lock_pair(tdn, tdq); tdq_add(tdn, td, flags); tdq_notify(tdn, td); TDQ_UNLOCK(tdn); spinlock_exit(); #endif return (TDQ_LOCKPTR(tdn)); } /* * Variadic version of thread_lock_unblock() that does not assume td_lock * is blocked. */ static inline void thread_unblock_switch(struct thread *td, struct mtx *mtx) { atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock, (uintptr_t)mtx); } /* * Switch threads. This function has to handle threads coming in while * blocked for some reason, running, or idle. It also must deal with * migrating a thread from one queue to another as running threads may * be assigned elsewhere via binding. */ void sched_switch(struct thread *td, struct thread *newtd, int flags) { struct tdq *tdq; struct td_sched *ts; struct mtx *mtx; int srqflag; int cpuid, preempted; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(newtd == NULL, ("sched_switch: Unsupported newtd argument")); cpuid = PCPU_GET(cpuid); tdq = TDQ_SELF(); ts = td_get_sched(td); mtx = td->td_lock; sched_pctcpu_update(ts, 1); ts->ts_rltick = ticks; td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; preempted = (td->td_flags & TDF_SLICEEND) == 0 && (flags & SW_PREEMPT) != 0; td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND); td->td_owepreempt = 0; if (!TD_IS_IDLETHREAD(td)) tdq->tdq_switchcnt++; /* * The lock pointer in an idle thread should never change. Reset it * to CAN_RUN as well. */ if (TD_IS_IDLETHREAD(td)) { MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); TD_SET_CAN_RUN(td); } else if (TD_IS_RUNNING(td)) { MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); srqflag = preempted ? SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_OURSELF|SRQ_YIELDING; #ifdef SMP if (THREAD_CAN_MIGRATE(td) && !THREAD_CAN_SCHED(td, ts->ts_cpu)) ts->ts_cpu = sched_pickcpu(td, 0); #endif if (ts->ts_cpu == cpuid) tdq_runq_add(tdq, td, srqflag); else { KASSERT(THREAD_CAN_MIGRATE(td) || (ts->ts_flags & TSF_BOUND) != 0, ("Thread %p shouldn't migrate", td)); mtx = sched_switch_migrate(tdq, td, srqflag); } } else { /* This thread must be going to sleep. */ TDQ_LOCK(tdq); mtx = thread_lock_block(td); tdq_load_rem(tdq, td); #ifdef SMP if (tdq->tdq_load == 0) tdq_trysteal(tdq); #endif } #if (KTR_COMPILE & KTR_SCHED) != 0 if (TD_IS_IDLETHREAD(td)) KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle", "prio:%d", td->td_priority); else KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td), "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg, "lockname:\"%s\"", td->td_lockname); #endif /* * We enter here with the thread blocked and assigned to the * appropriate cpu run-queue or sleep-queue and with the current * thread-queue locked. */ TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); newtd = choosethread(); /* * Call the MD code to switch contexts if necessary. */ if (td != newtd) { #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc); lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object); TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd; sched_pctcpu_update(td_get_sched(newtd), 0); #ifdef KDTRACE_HOOKS /* * If DTrace has set the active vtime enum to anything * other than INACTIVE (0), then it should have set the * function to call. */ if (dtrace_vtime_active) (*dtrace_vtime_switch_func)(newtd); #endif cpu_switch(td, newtd, mtx); /* * We may return from cpu_switch on a different cpu. However, * we always return with td_lock pointing to the current cpu's * run queue lock. */ cpuid = PCPU_GET(cpuid); tdq = TDQ_SELF(); lock_profile_obtain_lock_success( &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__); SDT_PROBE0(sched, , , on__cpu); #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); #endif } else { thread_unblock_switch(td, mtx); SDT_PROBE0(sched, , , remain__cpu); } KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); /* * Assert that all went well and return. */ TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); td->td_oncpu = cpuid; } /* * Adjust thread priorities as a result of a nice request. */ void sched_nice(struct proc *p, int nice) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); sched_priority(td); sched_prio(td, td->td_base_user_pri); thread_unlock(td); } } /* * Record the sleep time for the interactivity scorer. */ void sched_sleep(struct thread *td, int prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_slptick = ticks; if (TD_IS_SUSPENDED(td) || prio >= PSOCK) td->td_flags |= TDF_CANSWAP; if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE) return; if (static_boost == 1 && prio) sched_prio(td, prio); else if (static_boost && td->td_priority > static_boost) sched_prio(td, static_boost); } /* * Schedule a thread to resume execution and record how long it voluntarily * slept. We also update the pctcpu, interactivity, and priority. */ void sched_wakeup(struct thread *td) { struct td_sched *ts; int slptick; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); td->td_flags &= ~TDF_CANSWAP; /* * If we slept for more than a tick update our interactivity and * priority. */ slptick = td->td_slptick; td->td_slptick = 0; if (slptick && slptick != ticks) { ts->ts_slptime += (ticks - slptick) << SCHED_TICK_SHIFT; sched_interact_update(td); sched_pctcpu_update(ts, 0); } /* * Reset the slice value since we slept and advanced the round-robin. */ ts->ts_slice = 0; sched_add(td, SRQ_BORING); } /* * Penalize the parent for creating a new child and initialize the child's * priority. */ void sched_fork(struct thread *td, struct thread *child) { THREAD_LOCK_ASSERT(td, MA_OWNED); sched_pctcpu_update(td_get_sched(td), 1); sched_fork_thread(td, child); /* * Penalize the parent and child for forking. */ sched_interact_fork(child); sched_priority(child); td_get_sched(td)->ts_runtime += tickincr; sched_interact_update(td); sched_priority(td); } /* * Fork a new thread, may be within the same process. */ void sched_fork_thread(struct thread *td, struct thread *child) { struct td_sched *ts; struct td_sched *ts2; struct tdq *tdq; tdq = TDQ_SELF(); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Initialize child. */ ts = td_get_sched(td); ts2 = td_get_sched(child); child->td_oncpu = NOCPU; child->td_lastcpu = NOCPU; child->td_lock = TDQ_LOCKPTR(tdq); child->td_cpuset = cpuset_ref(td->td_cpuset); child->td_domain.dr_policy = td->td_cpuset->cs_domain; ts2->ts_cpu = ts->ts_cpu; ts2->ts_flags = 0; /* * Grab our parents cpu estimation information. */ ts2->ts_ticks = ts->ts_ticks; ts2->ts_ltick = ts->ts_ltick; ts2->ts_ftick = ts->ts_ftick; /* * Do not inherit any borrowed priority from the parent. */ child->td_priority = child->td_base_pri; /* * And update interactivity score. */ ts2->ts_slptime = ts->ts_slptime; ts2->ts_runtime = ts->ts_runtime; /* Attempt to quickly learn interactivity. */ ts2->ts_slice = tdq_slice(tdq) - sched_slice_min; #ifdef KTR bzero(ts2->ts_name, sizeof(ts2->ts_name)); #endif } /* * Adjust the priority class of a thread. */ void sched_class(struct thread *td, int class) { THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_pri_class == class) return; td->td_pri_class = class; } /* * Return some of the child's priority and interactivity to the parent. */ void sched_exit(struct proc *p, struct thread *child) { struct thread *td; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit", "prio:%d", child->td_priority); PROC_LOCK_ASSERT(p, MA_OWNED); td = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td, child); } /* * Penalize another thread for the time spent on this one. This helps to * worsen the priority and interactivity of processes which schedule batch * jobs such as make. This has little effect on the make process itself but * causes new processes spawned by it to receive worse scores immediately. */ void sched_exit_thread(struct thread *td, struct thread *child) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit", "prio:%d", child->td_priority); /* * Give the child's runtime to the parent without returning the * sleep time as a penalty to the parent. This causes shells that * launch expensive things to mark their children as expensive. */ thread_lock(td); td_get_sched(td)->ts_runtime += td_get_sched(child)->ts_runtime; sched_interact_update(td); sched_priority(td); thread_unlock(td); } void sched_preempt(struct thread *td) { struct tdq *tdq; SDT_PROBE2(sched, , , surrender, td, td->td_proc); thread_lock(td); tdq = TDQ_SELF(); TDQ_LOCK_ASSERT(tdq, MA_OWNED); tdq->tdq_ipipending = 0; if (td->td_priority > tdq->tdq_lowpri) { int flags; flags = SW_INVOL | SW_PREEMPT; if (td->td_critnest > 1) td->td_owepreempt = 1; else if (TD_IS_IDLETHREAD(td)) mi_switch(flags | SWT_REMOTEWAKEIDLE, NULL); else mi_switch(flags | SWT_REMOTEPREEMPT, NULL); } thread_unlock(td); } /* * Fix priorities on return to user-space. Priorities may be elevated due * to static priorities in msleep() or similar. */ void sched_userret_slowpath(struct thread *td) { thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; tdq_setlowpri(TDQ_SELF(), td); thread_unlock(td); } /* * Handle a stathz tick. This is really only relevant for timeshare * threads. */ void sched_clock(struct thread *td) { struct tdq *tdq; struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); tdq = TDQ_SELF(); #ifdef SMP /* * We run the long term load balancer infrequently on the first cpu. */ if (balance_tdq == tdq && smp_started != 0 && rebalance != 0) { if (balance_ticks && --balance_ticks == 0) sched_balance(); } #endif /* * Save the old switch count so we have a record of the last ticks * activity. Initialize the new switch count based on our load. * If there is some activity seed it to reflect that. */ tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt; tdq->tdq_switchcnt = tdq->tdq_load; /* * Advance the insert index once for each tick to ensure that all * threads get a chance to run. */ if (tdq->tdq_idx == tdq->tdq_ridx) { tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS; if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx])) tdq->tdq_ridx = tdq->tdq_idx; } ts = td_get_sched(td); sched_pctcpu_update(ts, 1); if (td->td_pri_class & PRI_FIFO_BIT) return; if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) { /* * We used a tick; charge it to the thread so * that we can compute our interactivity. */ td_get_sched(td)->ts_runtime += tickincr; sched_interact_update(td); sched_priority(td); } /* * Force a context switch if the current thread has used up a full * time slice (default is 100ms). */ if (!TD_IS_IDLETHREAD(td) && ++ts->ts_slice >= tdq_slice(tdq)) { ts->ts_slice = 0; td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND; } } u_int sched_estcpu(struct thread *td __unused) { return (0); } /* * Return whether the current CPU has runnable tasks. Used for in-kernel * cooperative idle threads. */ int sched_runnable(void) { struct tdq *tdq; int load; load = 1; tdq = TDQ_SELF(); if ((curthread->td_flags & TDF_IDLETD) != 0) { if (tdq->tdq_load > 0) goto out; } else if (tdq->tdq_load - 1 > 0) goto out; load = 0; out: return (load); } /* * Choose the highest priority thread to run. The thread is removed from * the run-queue while running however the load remains. For SMP we set * the tdq in the global idle bitmask if it idles here. */ struct thread * sched_choose(void) { struct thread *td; struct tdq *tdq; tdq = TDQ_SELF(); TDQ_LOCK_ASSERT(tdq, MA_OWNED); td = tdq_choose(tdq); if (td) { tdq_runq_rem(tdq, td); tdq->tdq_lowpri = td->td_priority; return (td); } tdq->tdq_lowpri = PRI_MAX_IDLE; return (PCPU_GET(idlethread)); } /* * Set owepreempt if necessary. Preemption never happens directly in ULE, * we always request it once we exit a critical section. */ static inline void sched_setpreempt(struct thread *td) { struct thread *ctd; int cpri; int pri; THREAD_LOCK_ASSERT(curthread, MA_OWNED); ctd = curthread; pri = td->td_priority; cpri = ctd->td_priority; if (pri < cpri) ctd->td_flags |= TDF_NEEDRESCHED; if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd)) return; if (!sched_shouldpreempt(pri, cpri, 0)) return; ctd->td_owepreempt = 1; } /* * Add a thread to a thread queue. Select the appropriate runq and add the * thread to it. This is the internal function called when the tdq is * predetermined. */ void tdq_add(struct tdq *tdq, struct thread *td, int flags) { TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), ("sched_add: bad thread state")); KASSERT(td->td_flags & TDF_INMEM, ("sched_add: thread swapped out")); if (td->td_priority < tdq->tdq_lowpri) tdq->tdq_lowpri = td->td_priority; tdq_runq_add(tdq, td, flags); tdq_load_add(tdq, td); } /* * Select the target thread queue and add a thread to it. Request * preemption or IPI a remote processor if required. */ void sched_add(struct thread *td, int flags) { struct tdq *tdq; #ifdef SMP int cpu; #endif KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup", KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, flags & SRQ_PREEMPTED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Recalculate the priority before we select the target cpu or * run-queue. */ if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_priority(td); #ifdef SMP /* * Pick the destination cpu and if it isn't ours transfer to the * target cpu. */ cpu = sched_pickcpu(td, flags); tdq = sched_setcpu(td, cpu, flags); tdq_add(tdq, td, flags); if (cpu != PCPU_GET(cpuid)) { tdq_notify(tdq, td); return; } #else tdq = TDQ_SELF(); TDQ_LOCK(tdq); /* * Now that the thread is moving to the run-queue, set the lock * to the scheduler's lock. */ thread_lock_set(td, TDQ_LOCKPTR(tdq)); tdq_add(tdq, td, flags); #endif if (!(flags & SRQ_YIELDING)) sched_setpreempt(td); } /* * Remove a thread from a run-queue without running it. This is used * when we're stealing a thread from a remote queue. Otherwise all threads * exit by calling sched_exit_thread() and sched_throw() themselves. */ void sched_rem(struct thread *td) { struct tdq *tdq; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem", "prio:%d", td->td_priority); SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL); tdq = TDQ_CPU(td_get_sched(td)->ts_cpu); TDQ_LOCK_ASSERT(tdq, MA_OWNED); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); KASSERT(TD_ON_RUNQ(td), ("sched_rem: thread not on run queue")); tdq_runq_rem(tdq, td); tdq_load_rem(tdq, td); TD_SET_CAN_RUN(td); if (td->td_priority == tdq->tdq_lowpri) tdq_setlowpri(tdq, NULL); } /* * Fetch cpu utilization information. Updates on demand. */ fixpt_t sched_pctcpu(struct thread *td) { fixpt_t pctcpu; struct td_sched *ts; pctcpu = 0; ts = td_get_sched(td); THREAD_LOCK_ASSERT(td, MA_OWNED); sched_pctcpu_update(ts, TD_IS_RUNNING(td)); if (ts->ts_ticks) { int rtick; /* How many rtick per second ? */ rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz); pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT; } return (pctcpu); } /* * Enforce affinity settings for a thread. Called after adjustments to * cpumask. */ void sched_affinity(struct thread *td) { #ifdef SMP struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); if (THREAD_CAN_SCHED(td, ts->ts_cpu)) return; if (TD_ON_RUNQ(td)) { sched_rem(td); sched_add(td, SRQ_BORING); return; } if (!TD_IS_RUNNING(td)) return; /* * Force a switch before returning to userspace. If the * target thread is not running locally send an ipi to force * the issue. */ td->td_flags |= TDF_NEEDRESCHED; if (td != curthread) ipi_cpu(ts->ts_cpu, IPI_PREEMPT); #endif } /* * Bind a thread to a target cpu. */ void sched_bind(struct thread *td, int cpu) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED); KASSERT(td == curthread, ("sched_bind: can only bind curthread")); ts = td_get_sched(td); if (ts->ts_flags & TSF_BOUND) sched_unbind(td); KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td)); ts->ts_flags |= TSF_BOUND; sched_pin(); if (PCPU_GET(cpuid) == cpu) return; ts->ts_cpu = cpu; /* When we return from mi_switch we'll be on the correct cpu. */ mi_switch(SW_VOL, NULL); } /* * Release a bound thread. */ void sched_unbind(struct thread *td) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(td == curthread, ("sched_unbind: can only bind curthread")); ts = td_get_sched(td); if ((ts->ts_flags & TSF_BOUND) == 0) return; ts->ts_flags &= ~TSF_BOUND; sched_unpin(); } int sched_is_bound(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); return (td_get_sched(td)->ts_flags & TSF_BOUND); } /* * Basic yield call. */ void sched_relinquish(struct thread *td) { thread_lock(td); mi_switch(SW_VOL | SWT_RELINQUISH, NULL); thread_unlock(td); } /* * Return the total system load. */ int sched_load(void) { #ifdef SMP int total; int i; total = 0; CPU_FOREACH(i) total += TDQ_CPU(i)->tdq_sysload; return (total); #else return (TDQ_SELF()->tdq_sysload); #endif } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread) + sizeof(struct td_sched)); } #ifdef SMP #define TDQ_IDLESPIN(tdq) \ ((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0) #else #define TDQ_IDLESPIN(tdq) 1 #endif /* * The actual idle process. */ void sched_idletd(void *dummy) { struct thread *td; struct tdq *tdq; int oldswitchcnt, switchcnt; int i; mtx_assert(&Giant, MA_NOTOWNED); td = curthread; tdq = TDQ_SELF(); THREAD_NO_SLEEPING(); oldswitchcnt = -1; for (;;) { if (tdq->tdq_load) { thread_lock(td); mi_switch(SW_VOL | SWT_IDLE, NULL); thread_unlock(td); } switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; #ifdef SMP if (always_steal || switchcnt != oldswitchcnt) { oldswitchcnt = switchcnt; if (tdq_idled(tdq) == 0) continue; } switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; #else oldswitchcnt = switchcnt; #endif /* * If we're switching very frequently, spin while checking * for load rather than entering a low power state that * may require an IPI. However, don't do any busy * loops while on SMT machines as this simply steals * cycles from cores doing useful work. */ if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) { for (i = 0; i < sched_idlespins; i++) { if (tdq->tdq_load) break; cpu_spinwait(); } } /* If there was context switch during spin, restart it. */ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; if (tdq->tdq_load != 0 || switchcnt != oldswitchcnt) continue; /* Run main MD idle handler. */ tdq->tdq_cpu_idle = 1; /* * Make sure that tdq_cpu_idle update is globally visible * before cpu_idle() read tdq_load. The order is important * to avoid race with tdq_notify. */ atomic_thread_fence_seq_cst(); /* * Checking for again after the fence picks up assigned * threads often enough to make it worthwhile to do so in * order to avoid calling cpu_idle(). */ if (tdq->tdq_load != 0) { tdq->tdq_cpu_idle = 0; continue; } cpu_idle(switchcnt * 4 > sched_idlespinthresh); tdq->tdq_cpu_idle = 0; /* * Account thread-less hardware interrupts and * other wakeup reasons equal to context switches. */ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; if (switchcnt != oldswitchcnt) continue; tdq->tdq_switchcnt++; oldswitchcnt++; } } /* * A CPU is entering for the first time or a thread is exiting. */ void sched_throw(struct thread *td) { struct thread *newtd; struct tdq *tdq; if (td == NULL) { /* Correct spinlock nesting and acquire the correct lock. */ tdq = TDQ_SELF(); TDQ_LOCK(tdq); spinlock_exit(); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); } else { tdq = TDQ_SELF(); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); tdq_load_rem(tdq, td); lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object); td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; } KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); newtd = choosethread(); TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd; cpu_throw(td, newtd); /* doesn't return */ } /* * This is called from fork_exit(). Just acquire the correct locks and * let fork do the rest of the work. */ void sched_fork_exit(struct thread *td) { struct tdq *tdq; int cpuid; /* * Finish setting up thread glue so that it begins execution in a * non-nested critical section with the scheduler lock held. */ cpuid = PCPU_GET(cpuid); tdq = TDQ_SELF(); if (TD_IS_IDLETHREAD(td)) td->td_lock = TDQ_LOCKPTR(tdq); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); td->td_oncpu = cpuid; TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); lock_profile_obtain_lock_success( &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); SDT_PROBE0(sched, , , on__cpu); } /* * Create on first use to catch odd startup conditons. */ char * sched_tdname(struct thread *td) { #ifdef KTR struct td_sched *ts; ts = td_get_sched(td); if (ts->ts_name[0] == '\0') snprintf(ts->ts_name, sizeof(ts->ts_name), "%s tid %d", td->td_name, td->td_tid); return (ts->ts_name); #else return (td->td_name); #endif } #ifdef KTR void sched_clear_tdname(struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); ts->ts_name[0] = '\0'; } #endif #ifdef SMP /* * Build the CPU topology dump string. Is recursively called to collect * the topology tree. */ static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg, int indent) { char cpusetbuf[CPUSETBUFSIZ]; int i, first; sbuf_printf(sb, "%*s\n", indent, "", 1 + indent / 2, cg->cg_level); sbuf_printf(sb, "%*s ", indent, "", cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask)); first = TRUE; for (i = 0; i < MAXCPU; i++) { if (CPU_ISSET(i, &cg->cg_mask)) { if (!first) sbuf_printf(sb, ", "); else first = FALSE; sbuf_printf(sb, "%d", i); } } sbuf_printf(sb, "\n"); if (cg->cg_flags != 0) { sbuf_printf(sb, "%*s ", indent, ""); if ((cg->cg_flags & CG_FLAG_HTT) != 0) sbuf_printf(sb, "HTT group"); if ((cg->cg_flags & CG_FLAG_THREAD) != 0) sbuf_printf(sb, "THREAD group"); if ((cg->cg_flags & CG_FLAG_SMT) != 0) sbuf_printf(sb, "SMT group"); sbuf_printf(sb, "\n"); } if (cg->cg_children > 0) { sbuf_printf(sb, "%*s \n", indent, ""); for (i = 0; i < cg->cg_children; i++) sysctl_kern_sched_topology_spec_internal(sb, &cg->cg_child[i], indent+2); sbuf_printf(sb, "%*s \n", indent, ""); } sbuf_printf(sb, "%*s\n", indent, ""); return (0); } /* * Sysctl handler for retrieving topology dump. It's a wrapper for * the recursive sysctl_kern_smp_topology_spec_internal(). */ static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS) { struct sbuf *topo; int err; KASSERT(cpu_top != NULL, ("cpu_top isn't initialized")); topo = sbuf_new_for_sysctl(NULL, NULL, 512, req); if (topo == NULL) return (ENOMEM); sbuf_printf(topo, "\n"); err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1); sbuf_printf(topo, "\n"); if (err == 0) { err = sbuf_finish(topo); } sbuf_delete(topo); return (err); } #endif static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val, period; period = 1000000 / realstathz; new_val = period * sched_slice; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val <= 0) return (EINVAL); sched_slice = imax(1, (new_val + period / 2) / period); sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR; hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); return (0); } SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler"); SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0, "Scheduler name"); SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, sysctl_kern_quantum, "I", "Quantum for timeshare threads in microseconds"); SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "Quantum for timeshare threads in stathz ticks"); SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, "Interactivity score threshold"); SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh, 0, "Maximal (lowest) priority for preemption"); SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost, 0, "Assign static kernel priorities to sleeping threads"); SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins, 0, "Number of times idle thread will spin waiting for new work"); SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW, &sched_idlespinthresh, 0, "Threshold before we will permit idle thread spinning"); #ifdef SMP SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0, "Number of hz ticks to keep thread affinity for"); SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, "Enables the long-term load balancer"); SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW, &balance_interval, 0, "Average period in stathz ticks to run the long-term balancer"); SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0, "Attempts to steal work from other cores before idling"); SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0, "Minimum load on remote CPU before we'll steal"); SYSCTL_INT(_kern_sched, OID_AUTO, trysteal_limit, CTLFLAG_RW, &trysteal_limit, 0, "Topological distance limit for stealing threads in sched_switch()"); SYSCTL_INT(_kern_sched, OID_AUTO, always_steal, CTLFLAG_RW, &always_steal, 0, "Always run the stealer from the idle thread"); SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A", "XML dump of detected CPU topology"); #endif /* ps compat. All cpu percentages from ULE are weighted. */ static int ccpu = 0; SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); Index: stable/12/sys/kern/vfs_bio.c =================================================================== --- stable/12/sys/kern/vfs_bio.c (revision 355609) +++ stable/12/sys/kern/vfs_bio.c (revision 355610) @@ -1,5474 +1,5474 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1994,1997 John S. Dyson * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_swap.h" static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { .bop_name = "buf_ops_bio", .bop_write = bufwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; struct bufqueue { struct mtx_padalign bq_lock; TAILQ_HEAD(, buf) bq_queue; uint8_t bq_index; uint16_t bq_subqueue; int bq_len; } __aligned(CACHE_LINE_SIZE); #define BQ_LOCKPTR(bq) (&(bq)->bq_lock) #define BQ_LOCK(bq) mtx_lock(BQ_LOCKPTR((bq))) #define BQ_UNLOCK(bq) mtx_unlock(BQ_LOCKPTR((bq))) #define BQ_ASSERT_LOCKED(bq) mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED) struct bufdomain { struct bufqueue bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */ struct bufqueue bd_dirtyq; struct bufqueue *bd_cleanq; struct mtx_padalign bd_run_lock; /* Constants */ long bd_maxbufspace; long bd_hibufspace; long bd_lobufspace; long bd_bufspacethresh; int bd_hifreebuffers; int bd_lofreebuffers; int bd_hidirtybuffers; int bd_lodirtybuffers; int bd_dirtybufthresh; int bd_lim; /* atomics */ int bd_wanted; int __aligned(CACHE_LINE_SIZE) bd_numdirtybuffers; int __aligned(CACHE_LINE_SIZE) bd_running; long __aligned(CACHE_LINE_SIZE) bd_bufspace; int __aligned(CACHE_LINE_SIZE) bd_freebuffers; } __aligned(CACHE_LINE_SIZE); #define BD_LOCKPTR(bd) (&(bd)->bd_cleanq->bq_lock) #define BD_LOCK(bd) mtx_lock(BD_LOCKPTR((bd))) #define BD_UNLOCK(bd) mtx_unlock(BD_LOCKPTR((bd))) #define BD_ASSERT_LOCKED(bd) mtx_assert(BD_LOCKPTR((bd)), MA_OWNED) #define BD_RUN_LOCKPTR(bd) (&(bd)->bd_run_lock) #define BD_RUN_LOCK(bd) mtx_lock(BD_RUN_LOCKPTR((bd))) #define BD_RUN_UNLOCK(bd) mtx_unlock(BD_RUN_LOCKPTR((bd))) #define BD_DOMAIN(bd) (bd - bdomain) static struct buf *buf; /* buffer header pool */ extern struct buf *swbuf; /* Swap buffer header pool. */ -caddr_t unmapped_buf; +caddr_t __read_mostly unmapped_buf; /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ struct proc *bufdaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, int newbsize); static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_clean_pages_dirty_buf(struct buf *bp); static void vfs_setdirty_locked_object(struct buf *bp); static void vfs_vmio_invalidate(struct buf *bp); static void vfs_vmio_truncate(struct buf *bp, int npages); static void vfs_vmio_extend(struct buf *bp, int npages, int size); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int, void (*)(struct buf *)); static int buf_flush(struct vnode *vp, struct bufdomain *, int); static int flushbufqueues(struct vnode *, struct bufdomain *, int, int); static void buf_daemon(void); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); static void bufkva_reclaim(vmem_t *, int); static void bufkva_free(struct buf *); static int buf_import(void *, void **, int, int, int); static void buf_release(void *, void **, int); static void maxbcachebuf_adjust(void); static inline struct bufdomain *bufdomain(struct buf *); static void bq_remove(struct bufqueue *bq, struct buf *bp); static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock); static int buf_recycle(struct bufdomain *, bool kva); static void bq_init(struct bufqueue *bq, int qindex, int cpu, const char *lockname); static void bd_init(struct bufdomain *bd); static int bd_flushall(struct bufdomain *bd); static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS); static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS); static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); long runningbufspace; SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers"); static counter_u64_t bufkvaspace; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, "Kernel virtual memory used for buffers"); static long maxbufspace; SYSCTL_PROC(_vfs, OID_AUTO, maxbufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &maxbufspace, __offsetof(struct bufdomain, bd_maxbufspace), sysctl_bufdomain_long, "L", "Maximum allowed value of bufspace (including metadata)"); static long bufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static long maxbufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static long lobufspace; SYSCTL_PROC(_vfs, OID_AUTO, lobufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lobufspace, __offsetof(struct bufdomain, bd_lobufspace), sysctl_bufdomain_long, "L", "Minimum amount of buffers we want to have"); long hibufspace; SYSCTL_PROC(_vfs, OID_AUTO, hibufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hibufspace, __offsetof(struct bufdomain, bd_hibufspace), sysctl_bufdomain_long, "L", "Maximum allowed value of bufspace (excluding metadata)"); long bufspacethresh; SYSCTL_PROC(_vfs, OID_AUTO, bufspacethresh, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &bufspacethresh, __offsetof(struct bufdomain, bd_bufspacethresh), sysctl_bufdomain_long, "L", "Bufspace consumed before waking the daemon to free some"); static counter_u64_t buffreekvacnt; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, "Number of times we have freed the KVA space from some buffer"); static counter_u64_t bufdefragcnt; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, "Number of times we have had to repeat buffer allocation to defragment"); static long lorunningspace; SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L", "Minimum preferred space used for in-progress I/O"); static long hirunningspace; SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L", "Maximum amount of space to use for in-progress I/O"); int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); int bdwriteskip; SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs, OID_AUTO, numdirtybuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_numdirtybuffers, "I", "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_PROC(_vfs, OID_AUTO, lodirtybuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lodirtybuffers, __offsetof(struct bufdomain, bd_lodirtybuffers), sysctl_bufdomain_int, "I", "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_PROC(_vfs, OID_AUTO, hidirtybuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hidirtybuffers, __offsetof(struct bufdomain, bd_hidirtybuffers), sysctl_bufdomain_int, "I", "When the number of dirty buffers is considered severe"); int dirtybufthresh; SYSCTL_PROC(_vfs, OID_AUTO, dirtybufthresh, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &dirtybufthresh, __offsetof(struct bufdomain, bd_dirtybufthresh), sysctl_bufdomain_int, "I", "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_PROC(_vfs, OID_AUTO, lofreebuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lofreebuffers, __offsetof(struct bufdomain, bd_lofreebuffers), sysctl_bufdomain_int, "I", "Target number of free buffers"); static int hifreebuffers; SYSCTL_PROC(_vfs, OID_AUTO, hifreebuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hifreebuffers, __offsetof(struct bufdomain, bd_hifreebuffers), sysctl_bufdomain_int, "I", "Threshold for clean buffer recycling"); static counter_u64_t getnewbufcalls; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, &getnewbufcalls, "Number of calls to getnewbuf"); static counter_u64_t getnewbufrestarts; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD, &getnewbufrestarts, "Number of times getnewbuf has had to restart a buffer acquisition"); static counter_u64_t mappingrestarts; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD, &mappingrestarts, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); static counter_u64_t numbufallocfails; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, "Number of times buffer allocations failed"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); static counter_u64_t notbufdflushes; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, "Number of dirty buffer flushes done by the bufdaemon helpers"); static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0, "Number of barrier writes"); SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD, &unmapped_buf_allowed, 0, "Permit the use of the unmapped i/o"); int maxbcachebuf = MAXBCACHEBUF; SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0, "Maximum size of a buffer cache block"); /* * This lock synchronizes access to bd_request. */ static struct mtx_padalign __exclusive_cache_line bdlock; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx_padalign __exclusive_cache_line rbreqlock; /* * Lock that protects bdirtywait. */ static struct mtx_padalign __exclusive_cache_line bdirtylock; /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. This may be necessary to push out excess dependencies or * defragment the address space where a simple count of the number of dirty * buffers is insufficient to characterize the demand for flushing them. */ static int bd_speedupreq; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. * Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * Synchronization for bwillwrite() waiters. */ static int bdirtywait; /* * Definitions for the buffer free lists. */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_EMPTY 1 /* empty buffer headers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ #define QUEUE_SENTINEL 4 /* not an queue index, but mark for sentinel */ /* Maximum number of buffer domains. */ #define BUF_DOMAINS 8 struct bufdomainset bdlodirty; /* Domains > lodirty */ struct bufdomainset bdhidirty; /* Domains > hidirty */ /* Configured number of clean queues. */ static int __read_mostly buf_domains; BITSET_DEFINE(bufdomainset, BUF_DOMAINS); struct bufdomain __exclusive_cache_line bdomain[BUF_DOMAINS]; struct bufqueue __exclusive_cache_line bqempty; /* * per-cpu empty buffer cache. */ uma_zone_t buf_zone; /* * Single global constant for BUF_WMESG, to avoid getting multiple references. * buf_wmesg is referred from macros. */ const char *buf_wmesg = BUF_WMESG; static int sysctl_runningspace(SYSCTL_HANDLER_ARGS) { long value; int error; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&rbreqlock); if (arg1 == &hirunningspace) { if (value < lorunningspace) error = EINVAL; else hirunningspace = value; } else { KASSERT(arg1 == &lorunningspace, ("%s: unknown arg1", __func__)); if (value > hirunningspace) error = EINVAL; else lorunningspace = value; } mtx_unlock(&rbreqlock); return (error); } static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS) { int error; int value; int i; value = *(int *)arg1; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); *(int *)arg1 = value; for (i = 0; i < buf_domains; i++) *(int *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) = value / buf_domains; return (error); } static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS) { long value; int error; int i; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); *(long *)arg1 = value; for (i = 0; i < buf_domains; i++) *(long *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) = value / buf_domains; return (error); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int ivalue; int i; lvalue = 0; for (i = 0; i < buf_domains; i++) lvalue += bdomain[i].bd_bufspace; if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) return (sysctl_handle_long(oidp, &lvalue, 0, req)); if (lvalue > INT_MAX) /* On overflow, still write out a long to trigger ENOMEM. */ return (sysctl_handle_long(oidp, &lvalue, 0, req)); ivalue = lvalue; return (sysctl_handle_int(oidp, &ivalue, 0, req)); } #else static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int i; lvalue = 0; for (i = 0; i < buf_domains; i++) lvalue += bdomain[i].bd_bufspace; return (sysctl_handle_long(oidp, &lvalue, 0, req)); } #endif static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS) { int value; int i; value = 0; for (i = 0; i < buf_domains; i++) value += bdomain[i].bd_numdirtybuffers; return (sysctl_handle_int(oidp, &value, 0, req)); } /* * bdirtywakeup: * * Wakeup any bwillwrite() waiters. */ static void bdirtywakeup(void) { mtx_lock(&bdirtylock); if (bdirtywait) { bdirtywait = 0; wakeup(&bdirtywait); } mtx_unlock(&bdirtylock); } /* * bd_clear: * * Clear a domain from the appropriate bitsets when dirtybuffers * is decremented. */ static void bd_clear(struct bufdomain *bd) { mtx_lock(&bdirtylock); if (bd->bd_numdirtybuffers <= bd->bd_lodirtybuffers) BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty); if (bd->bd_numdirtybuffers <= bd->bd_hidirtybuffers) BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty); mtx_unlock(&bdirtylock); } /* * bd_set: * * Set a domain in the appropriate bitsets when dirtybuffers * is incremented. */ static void bd_set(struct bufdomain *bd) { mtx_lock(&bdirtylock); if (bd->bd_numdirtybuffers > bd->bd_lodirtybuffers) BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty); if (bd->bd_numdirtybuffers > bd->bd_hidirtybuffers) BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty); mtx_unlock(&bdirtylock); } /* * bdirtysub: * * Decrement the numdirtybuffers count by one and wakeup any * threads blocked in bwillwrite(). */ static void bdirtysub(struct buf *bp) { struct bufdomain *bd; int num; bd = bufdomain(bp); num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, -1); if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2) bdirtywakeup(); if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers) bd_clear(bd); } /* * bdirtyadd: * * Increment the numdirtybuffers count by one and wakeup the buf * daemon if needed. */ static void bdirtyadd(struct buf *bp) { struct bufdomain *bd; int num; /* * Only do the wakeup once as we cross the boundary. The * buf daemon will keep running until the condition clears. */ bd = bufdomain(bp); num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, 1); if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2) bd_wakeup(); if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers) bd_set(bd); } /* * bufspace_daemon_wakeup: * * Wakeup the daemons responsible for freeing clean bufs. */ static void bufspace_daemon_wakeup(struct bufdomain *bd) { /* * avoid the lock if the daemon is running. */ if (atomic_fetchadd_int(&bd->bd_running, 1) == 0) { BD_RUN_LOCK(bd); atomic_store_int(&bd->bd_running, 1); wakeup(&bd->bd_running); BD_RUN_UNLOCK(bd); } } /* * bufspace_daemon_wait: * * Sleep until the domain falls below a limit or one second passes. */ static void bufspace_daemon_wait(struct bufdomain *bd) { /* * Re-check our limits and sleep. bd_running must be * cleared prior to checking the limits to avoid missed * wakeups. The waker will adjust one of bufspace or * freebuffers prior to checking bd_running. */ BD_RUN_LOCK(bd); atomic_store_int(&bd->bd_running, 0); if (bd->bd_bufspace < bd->bd_bufspacethresh && bd->bd_freebuffers > bd->bd_lofreebuffers) { msleep(&bd->bd_running, BD_RUN_LOCKPTR(bd), PRIBIO|PDROP, "-", hz); } else { /* Avoid spurious wakeups while running. */ atomic_store_int(&bd->bd_running, 1); BD_RUN_UNLOCK(bd); } } /* * bufspace_adjust: * * Adjust the reported bufspace for a KVA managed buffer, possibly * waking any waiters. */ static void bufspace_adjust(struct buf *bp, int bufsize) { struct bufdomain *bd; long space; int diff; KASSERT((bp->b_flags & B_MALLOC) == 0, ("bufspace_adjust: malloc buf %p", bp)); bd = bufdomain(bp); diff = bufsize - bp->b_bufsize; if (diff < 0) { atomic_subtract_long(&bd->bd_bufspace, -diff); } else if (diff > 0) { space = atomic_fetchadd_long(&bd->bd_bufspace, diff); /* Wake up the daemon on the transition. */ if (space < bd->bd_bufspacethresh && space + diff >= bd->bd_bufspacethresh) bufspace_daemon_wakeup(bd); } bp->b_bufsize = bufsize; } /* * bufspace_reserve: * * Reserve bufspace before calling allocbuf(). metadata has a * different space limit than data. */ static int bufspace_reserve(struct bufdomain *bd, int size, bool metadata) { long limit, new; long space; if (metadata) limit = bd->bd_maxbufspace; else limit = bd->bd_hibufspace; space = atomic_fetchadd_long(&bd->bd_bufspace, size); new = space + size; if (new > limit) { atomic_subtract_long(&bd->bd_bufspace, size); return (ENOSPC); } /* Wake up the daemon on the transition. */ if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh) bufspace_daemon_wakeup(bd); return (0); } /* * bufspace_release: * * Release reserved bufspace after bufspace_adjust() has consumed it. */ static void bufspace_release(struct bufdomain *bd, int size) { atomic_subtract_long(&bd->bd_bufspace, size); } /* * bufspace_wait: * * Wait for bufspace, acting as the buf daemon if a locked vnode is * supplied. bd_wanted must be set prior to polling for space. The * operation must be re-tried on return. */ static void bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags, int slpflag, int slptimeo) { struct thread *td; int error, fl, norunbuf; if ((gbflags & GB_NOWAIT_BD) != 0) return; td = curthread; BD_LOCK(bd); while (bd->bd_wanted) { if (vp != NULL && vp->v_type != VCHR && (td->td_pflags & TDP_BUFNEED) == 0) { BD_UNLOCK(bd); /* * getblk() is called with a vnode locked, and * some majority of the dirty buffers may as * well belong to the vnode. Flushing the * buffers there would make a progress that * cannot be achieved by the buf_daemon, that * cannot lock the vnode. */ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | (td->td_pflags & TDP_NORUNNINGBUF); /* * Play bufdaemon. The getnewbuf() function * may be called while the thread owns lock * for another dirty buffer for the same * vnode, which makes it impossible to use * VOP_FSYNC() there, due to the buffer lock * recursion. */ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; fl = buf_flush(vp, bd, flushbufqtarget); td->td_pflags &= norunbuf; BD_LOCK(bd); if (fl != 0) continue; if (bd->bd_wanted == 0) break; } error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd), (PRIBIO + 4) | slpflag, "newbuf", slptimeo); if (error != 0) break; } BD_UNLOCK(bd); } /* * bufspace_daemon: * * buffer space management daemon. Tries to maintain some marginal * amount of free buffer space so that requesting processes neither * block nor work to reclaim buffers. */ static void bufspace_daemon(void *arg) { struct bufdomain *bd; EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread, SHUTDOWN_PRI_LAST + 100); bd = arg; for (;;) { kthread_suspend_check(); /* * Free buffers from the clean queue until we meet our * targets. * * Theory of operation: The buffer cache is most efficient * when some free buffer headers and space are always * available to getnewbuf(). This daemon attempts to prevent * the excessive blocking and synchronization associated * with shortfall. It goes through three phases according * demand: * * 1) The daemon wakes up voluntarily once per-second * during idle periods when the counters are below * the wakeup thresholds (bufspacethresh, lofreebuffers). * * 2) The daemon wakes up as we cross the thresholds * ahead of any potential blocking. This may bounce * slightly according to the rate of consumption and * release. * * 3) The daemon and consumers are starved for working * clean buffers. This is the 'bufspace' sleep below * which will inefficiently trade bufs with bqrelse * until we return to condition 2. */ while (bd->bd_bufspace > bd->bd_lobufspace || bd->bd_freebuffers < bd->bd_hifreebuffers) { if (buf_recycle(bd, false) != 0) { if (bd_flushall(bd)) continue; /* * Speedup dirty if we've run out of clean * buffers. This is possible in particular * because softdep may held many bufs locked * pending writes to other bufs which are * marked for delayed write, exhausting * clean space until they are written. */ bd_speedup(); BD_LOCK(bd); if (bd->bd_wanted) { msleep(&bd->bd_wanted, BD_LOCKPTR(bd), PRIBIO|PDROP, "bufspace", hz/10); } else BD_UNLOCK(bd); } maybe_yield(); } bufspace_daemon_wait(bd); } } /* * bufmallocadjust: * * Adjust the reported bufspace for a malloc managed buffer, possibly * waking any waiters. */ static void bufmallocadjust(struct buf *bp, int bufsize) { int diff; KASSERT((bp->b_flags & B_MALLOC) != 0, ("bufmallocadjust: non-malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) atomic_subtract_long(&bufmallocspace, -diff); else atomic_add_long(&bufmallocspace, diff); bp->b_bufsize = bufsize; } /* * runningwakeup: * * Wake up processes that are waiting on asynchronous writes to fall * below lorunningspace. */ static void runningwakeup(void) { mtx_lock(&rbreqlock); if (runningbufreq) { runningbufreq = 0; wakeup(&runningbufreq); } mtx_unlock(&rbreqlock); } /* * runningbufwakeup: * * Decrement the outstanding write count according. */ void runningbufwakeup(struct buf *bp) { long space, bspace; bspace = bp->b_runningbufspace; if (bspace == 0) return; space = atomic_fetchadd_long(&runningbufspace, -bspace); KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld", space, bspace)); bp->b_runningbufspace = 0; /* * Only acquire the lock and wakeup on the transition from exceeding * the threshold to falling below it. */ if (space < lorunningspace) return; if (space - bspace > lorunningspace) return; runningwakeup(); } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ void waitrunningbufspace(void) { mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { runningbufreq = 1; msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); } mtx_unlock(&rbreqlock); } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. */ static __inline void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { VM_OBJECT_ASSERT_LOCKED(m->object); if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } /* Wake up the buffer daemon if necessary */ static void bd_wakeup(void) { mtx_lock(&bdlock); if (bd_request == 0) { bd_request = 1; wakeup(&bd_request); } mtx_unlock(&bdlock); } /* * Adjust the maxbcachbuf tunable. */ static void maxbcachebuf_adjust(void) { int i; /* * maxbcachebuf must be a power of 2 >= MAXBSIZE. */ i = 2; while (i * 2 <= maxbcachebuf) i *= 2; maxbcachebuf = i; if (maxbcachebuf < MAXBSIZE) maxbcachebuf = MAXBSIZE; if (maxbcachebuf > MAXPHYS) maxbcachebuf = MAXPHYS; if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF) printf("maxbcachebuf=%d\n", maxbcachebuf); } /* * bd_speedup - speedup the buffer cache flushing code */ void bd_speedup(void) { int needwake; mtx_lock(&bdlock); needwake = 0; if (bd_speedupreq == 0 || bd_request == 0) needwake = 1; bd_speedupreq = 1; bd_request = 1; if (needwake) wakeup(&bd_request); mtx_unlock(&bdlock); } #ifndef NSWBUF_MIN #define NSWBUF_MIN 16 #endif #ifdef __i386__ #define TRANSIENT_DENOM 5 #else #define TRANSIENT_DENOM 10 #endif /* * Calculating buffer cache scaling values and reserve space for buffer * headers. This is called during low level kernel initialization and * may be called more then once. We CANNOT write to the memory area * being reserved at this time. */ caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) { int tuned_nbuf; long maxbuf, maxbuf_sz, buf_sz, biotmap_sz; /* * physmem_est is in pages. Convert it to kilobytes (assumes * PAGE_SIZE is >= 1K) */ physmem_est = physmem_est * (PAGE_SIZE / 1024); maxbcachebuf_adjust(); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/10 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / 1024; nbuf = 50; if (physmem_est > 4096) nbuf += min((physmem_est - 4096) / factor, 65536 / factor); if (physmem_est > 65536) nbuf += min((physmem_est - 65536) * 2 / (factor * 5), 32 * 1024 * 1024 / (factor * 5)); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; tuned_nbuf = 1; } else tuned_nbuf = 0; /* XXX Avoid unsigned long overflows later on with maxbufspace. */ maxbuf = (LONG_MAX / 3) / BKVASIZE; if (nbuf > maxbuf) { if (!tuned_nbuf) printf("Warning: nbufs lowered from %d to %ld\n", nbuf, maxbuf); nbuf = maxbuf; } /* * Ideal allocation size for the transient bio submap is 10% * of the maximal space buffer map. This roughly corresponds * to the amount of the buffer mapped for typical UFS load. * * Clip the buffer map to reserve space for the transient * BIOs, if its extent is bigger than 90% (80% on i386) of the * maximum buffer map extent on the platform. * * The fall-back to the maxbuf in case of maxbcache unset, * allows to not trim the buffer KVA for the architectures * with ample KVA space. */ if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) { maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE; buf_sz = (long)nbuf * BKVASIZE; if (buf_sz < maxbuf_sz / TRANSIENT_DENOM * (TRANSIENT_DENOM - 1)) { /* * There is more KVA than memory. Do not * adjust buffer map size, and assign the rest * of maxbuf to transient map. */ biotmap_sz = maxbuf_sz - buf_sz; } else { /* * Buffer map spans all KVA we could afford on * this platform. Give 10% (20% on i386) of * the buffer map to the transient bio map. */ biotmap_sz = buf_sz / TRANSIENT_DENOM; buf_sz -= biotmap_sz; } if (biotmap_sz / INT_MAX > MAXPHYS) bio_transient_maxcnt = INT_MAX; else bio_transient_maxcnt = biotmap_sz / MAXPHYS; /* * Artificially limit to 1024 simultaneous in-flight I/Os * using the transient mapping. */ if (bio_transient_maxcnt > 1024) bio_transient_maxcnt = 1024; if (tuned_nbuf) nbuf = buf_sz / BKVASIZE; } /* * swbufs are used as temporary holders for I/O, such as paging I/O. * We have no less then 16 and no more then 256. */ nswbuf = min(nbuf / 4, 256); TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf); if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; /* * Reserve space for the buffer cache buffers */ swbuf = (void *)v; v = (caddr_t)(swbuf + nswbuf); buf = (void *)v; v = (caddr_t)(buf + nbuf); return(v); } /* Initialize the buffer subsystem. Called before use of any buffers. */ void bufinit(void) { struct buf *bp; int i; KASSERT(maxbcachebuf >= MAXBSIZE, ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf, MAXBSIZE)); bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock"); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); unmapped_buf = (caddr_t)kva_alloc(MAXPHYS); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_NONE; bp->b_domain = -1; bp->b_subqueue = mp_maxid + 1; bp->b_xflags = 0; bp->b_data = bp->b_kvabase = unmapped_buf; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); bq_insert(&bqempty, bp, false); } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by metadata. hibufspace is the nominal maximum * used by most other requests. The differential is required to * ensure that metadata deadlocks don't occur. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. XXX This is less true with vmem. We could use * PAGE_SIZE. */ maxbufspace = (long)nbuf * BKVASIZE; hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10); lobufspace = (hibufspace / 20) * 19; /* 95% */ bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2; /* * Note: The 16 MiB upper limit for hirunningspace was chosen * arbitrarily and may need further tuning. It corresponds to * 128 outstanding write IO requests (if IO size is 128 KiB), * which fits with many RAID controllers' tagged queuing limits. * The lower 1 MiB limit is the historical upper limit for * hirunningspace. */ hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf), 16 * 1024 * 1024), 1024 * 1024); lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf); /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on * average (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occurring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; /* * To support extreme low-memory systems, make sure hidirtybuffers * cannot eat up all available buffer space. This occurs when our * minimum cannot be met. We try to size hidirtybuffers to 3/4 our * buffer space assuming BKVASIZE'd buffers. */ while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * lofreebuffers should be sufficient to avoid stalling waiting on * buf headers under heavy utilization. The bufs in per-cpu caches * are counted as free but will be unavailable to threads executing * on other cpus. * * hifreebuffers is the free target for the bufspace daemon. This * should be set appropriately to limit work per-iteration. */ lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus); hifreebuffers = (3 * lofreebuffers) / 2; numfreebuffers = nbuf; /* Setup the kva and free list allocators. */ vmem_set_reclaim(buffer_arena, bufkva_reclaim); buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf), NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0); /* * Size the clean queue according to the amount of buffer space. * One queue per-256mb up to the max. More queues gives better * concurrency but less accurate LRU. */ buf_domains = MIN(howmany(maxbufspace, 256*1024*1024), BUF_DOMAINS); for (i = 0 ; i < buf_domains; i++) { struct bufdomain *bd; bd = &bdomain[i]; bd_init(bd); bd->bd_freebuffers = nbuf / buf_domains; bd->bd_hifreebuffers = hifreebuffers / buf_domains; bd->bd_lofreebuffers = lofreebuffers / buf_domains; bd->bd_bufspace = 0; bd->bd_maxbufspace = maxbufspace / buf_domains; bd->bd_hibufspace = hibufspace / buf_domains; bd->bd_lobufspace = lobufspace / buf_domains; bd->bd_bufspacethresh = bufspacethresh / buf_domains; bd->bd_numdirtybuffers = 0; bd->bd_hidirtybuffers = hidirtybuffers / buf_domains; bd->bd_lodirtybuffers = lodirtybuffers / buf_domains; bd->bd_dirtybufthresh = dirtybufthresh / buf_domains; /* Don't allow more than 2% of bufs in the per-cpu caches. */ bd->bd_lim = nbuf / buf_domains / 50 / mp_ncpus; } getnewbufcalls = counter_u64_alloc(M_WAITOK); getnewbufrestarts = counter_u64_alloc(M_WAITOK); mappingrestarts = counter_u64_alloc(M_WAITOK); numbufallocfails = counter_u64_alloc(M_WAITOK); notbufdflushes = counter_u64_alloc(M_WAITOK); buffreekvacnt = counter_u64_alloc(M_WAITOK); bufdefragcnt = counter_u64_alloc(M_WAITOK); bufkvaspace = counter_u64_alloc(M_WAITOK); } #ifdef INVARIANTS static inline void vfs_buf_check_mapped(struct buf *bp) { KASSERT(bp->b_kvabase != unmapped_buf, ("mapped buf: b_kvabase was not updated %p", bp)); KASSERT(bp->b_data != unmapped_buf, ("mapped buf: b_data was not updated %p", bp)); KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf + MAXPHYS, ("b_data + b_offset unmapped %p", bp)); } static inline void vfs_buf_check_unmapped(struct buf *bp) { KASSERT(bp->b_data == unmapped_buf, ("unmapped buf: corrupted b_data %p", bp)); } #define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) #define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) #else #define BUF_CHECK_MAPPED(bp) do {} while (0) #define BUF_CHECK_UNMAPPED(bp) do {} while (0) #endif static int isbufbusy(struct buf *bp) { if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) || ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)) return (1); return (0); } /* * Shutdown the system cleanly to prepare for reboot, halt, or power off. */ void bufshutdown(int show_busybufs) { static int first_buf_printf = 1; struct buf *bp; int iter, nbusy, pbusy; #ifndef PREEMPTION int subiter; #endif /* * Sync filesystems for shutdown */ wdog_kern_pat(WD_LASTVAL); sys_sync(curthread, NULL); /* * With soft updates, some buffers that are * written will be remarked as dirty until other * buffers are written. */ for (iter = pbusy = 0; iter < 20; iter++) { nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) if (isbufbusy(bp)) nbusy++; if (nbusy == 0) { if (first_buf_printf) printf("All buffers synced."); break; } if (first_buf_printf) { printf("Syncing disks, buffers remaining... "); first_buf_printf = 0; } printf("%d ", nbusy); if (nbusy < pbusy) iter = 0; pbusy = nbusy; wdog_kern_pat(WD_LASTVAL); sys_sync(curthread, NULL); #ifdef PREEMPTION /* * Spin for a while to allow interrupt threads to run. */ DELAY(50000 * iter); #else /* * Context switch several times to allow interrupt * threads to run. */ for (subiter = 0; subiter < 50 * iter; subiter++) { thread_lock(curthread); mi_switch(SW_VOL, NULL); thread_unlock(curthread); DELAY(1000); } #endif } printf("\n"); /* * Count only busy local buffers to prevent forcing * a fsck if we're just a client of a wedged NFS server */ nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) { if (isbufbusy(bp)) { #if 0 /* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */ if (bp->b_dev == NULL) { TAILQ_REMOVE(&mountlist, bp->b_vp->v_mount, mnt_list); continue; } #endif nbusy++; if (show_busybufs > 0) { printf( "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:", nbusy, bp, bp->b_vp, bp->b_flags, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); BUF_LOCKPRINTINFO(bp); if (show_busybufs > 1) vn_printf(bp->b_vp, "vnode content: "); } } } if (nbusy) { /* * Failed to sync all blocks. Indicate this and don't * unmount filesystems (thus forcing an fsck on reboot). */ printf("Giving up on %d buffers\n", nbusy); DELAY(5000000); /* 5 seconds */ } else { if (!first_buf_printf) printf("Final sync complete\n"); /* * Unmount filesystems */ if (panicstr == NULL) vfs_unmountall(); } swapoff_all(); DELAY(100000); /* wait for console output to finish */ } static void bpmap_qenter(struct buf *bp) { BUF_CHECK_MAPPED(bp); /* * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } static inline struct bufdomain * bufdomain(struct buf *bp) { return (&bdomain[bp->b_domain]); } static struct bufqueue * bufqueue(struct buf *bp) { switch (bp->b_qindex) { case QUEUE_NONE: /* FALLTHROUGH */ case QUEUE_SENTINEL: return (NULL); case QUEUE_EMPTY: return (&bqempty); case QUEUE_DIRTY: return (&bufdomain(bp)->bd_dirtyq); case QUEUE_CLEAN: return (&bufdomain(bp)->bd_subq[bp->b_subqueue]); default: break; } panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex); } /* * Return the locked bufqueue that bp is a member of. */ static struct bufqueue * bufqueue_acquire(struct buf *bp) { struct bufqueue *bq, *nbq; /* * bp can be pushed from a per-cpu queue to the * cleanq while we're waiting on the lock. Retry * if the queues don't match. */ bq = bufqueue(bp); BQ_LOCK(bq); for (;;) { nbq = bufqueue(bp); if (bq == nbq) break; BQ_UNLOCK(bq); BQ_LOCK(nbq); bq = nbq; } return (bq); } /* * binsfree: * * Insert the buffer into the appropriate free list. Requires a * locked buffer on entry and buffer is unlocked before return. */ static void binsfree(struct buf *bp, int qindex) { struct bufdomain *bd; struct bufqueue *bq; KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY, ("binsfree: Invalid qindex %d", qindex)); BUF_ASSERT_XLOCKED(bp); /* * Handle delayed bremfree() processing. */ if (bp->b_flags & B_REMFREE) { if (bp->b_qindex == qindex) { bp->b_flags |= B_REUSE; bp->b_flags &= ~B_REMFREE; BUF_UNLOCK(bp); return; } bq = bufqueue_acquire(bp); bq_remove(bq, bp); BQ_UNLOCK(bq); } bd = bufdomain(bp); if (qindex == QUEUE_CLEAN) { if (bd->bd_lim != 0) bq = &bd->bd_subq[PCPU_GET(cpuid)]; else bq = bd->bd_cleanq; } else bq = &bd->bd_dirtyq; bq_insert(bq, bp, true); } /* * buf_free: * * Free a buffer to the buf zone once it no longer has valid contents. */ static void buf_free(struct buf *bp) { if (bp->b_flags & B_REMFREE) bremfreef(bp); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); bufkva_free(bp); atomic_add_int(&bufdomain(bp)->bd_freebuffers, 1); BUF_UNLOCK(bp); uma_zfree(buf_zone, bp); } /* * buf_import: * * Import bufs into the uma cache from the buf list. The system still * expects a static array of bufs and much of the synchronization * around bufs assumes type stable storage. As a result, UMA is used * only as a per-cpu cache of bufs still maintained on a global list. */ static int buf_import(void *arg, void **store, int cnt, int domain, int flags) { struct buf *bp; int i; BQ_LOCK(&bqempty); for (i = 0; i < cnt; i++) { bp = TAILQ_FIRST(&bqempty.bq_queue); if (bp == NULL) break; bq_remove(&bqempty, bp); store[i] = bp; } BQ_UNLOCK(&bqempty); return (i); } /* * buf_release: * * Release bufs from the uma cache back to the buffer queues. */ static void buf_release(void *arg, void **store, int cnt) { struct bufqueue *bq; struct buf *bp; int i; bq = &bqempty; BQ_LOCK(bq); for (i = 0; i < cnt; i++) { bp = store[i]; /* Inline bq_insert() to batch locking. */ TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); bp->b_flags &= ~(B_AGE | B_REUSE); bq->bq_len++; bp->b_qindex = bq->bq_index; } BQ_UNLOCK(bq); } /* * buf_alloc: * * Allocate an empty buffer header. */ static struct buf * buf_alloc(struct bufdomain *bd) { struct buf *bp; int freebufs; /* * We can only run out of bufs in the buf zone if the average buf * is less than BKVASIZE. In this case the actual wait/block will * come from buf_reycle() failing to flush one of these small bufs. */ bp = NULL; freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1); if (freebufs > 0) bp = uma_zalloc(buf_zone, M_NOWAIT); if (bp == NULL) { atomic_add_int(&bd->bd_freebuffers, 1); bufspace_daemon_wakeup(bd); counter_u64_add(numbufallocfails, 1); return (NULL); } /* * Wake-up the bufspace daemon on transition below threshold. */ if (freebufs == bd->bd_lofreebuffers) bufspace_daemon_wakeup(bd); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) panic("getnewbuf_empty: Locked buf %p on free queue.", bp); KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.", bp, bp->b_vp)); KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0, ("invalid buffer %p flags %#x", bp, bp->b_flags)); KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); KASSERT(bp->b_npages == 0, ("bp: %p still has %d vm pages\n", bp, bp->b_npages)); KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); bp->b_domain = BD_DOMAIN(bd); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; bp->b_vflags = 0; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_bufobj = NULL; bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_fsprivate1 = NULL; bp->b_fsprivate2 = NULL; bp->b_fsprivate3 = NULL; LIST_INIT(&bp->b_dep); return (bp); } /* * buf_recycle: * * Free a buffer from the given bufqueue. kva controls whether the * freed buf must own some kva resources. This is used for * defragmenting. */ static int buf_recycle(struct bufdomain *bd, bool kva) { struct bufqueue *bq; struct buf *bp, *nbp; if (kva) counter_u64_add(bufdefragcnt, 1); nbp = NULL; bq = bd->bd_cleanq; BQ_LOCK(bq); KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd), ("buf_recycle: Locks don't match")); nbp = TAILQ_FIRST(&bq->bq_queue); /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { /* * Calculate next bp (we can only use it if we do not * release the bqlock). */ nbp = TAILQ_NEXT(bp, b_freelist); /* * If we are defragging then we need a buffer with * some kva to reclaim. */ if (kva && bp->b_kvasize == 0) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; /* * Implement a second chance algorithm for frequently * accessed buffers. */ if ((bp->b_flags & B_REUSE) != 0) { TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); bp->b_flags &= ~B_REUSE; BUF_UNLOCK(bp); continue; } /* * Skip buffers with background writes in progress. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { BUF_UNLOCK(bp); continue; } KASSERT(bp->b_qindex == QUEUE_CLEAN, ("buf_recycle: inconsistent queue %d bp %p", bp->b_qindex, bp)); KASSERT(bp->b_domain == BD_DOMAIN(bd), ("getnewbuf: queue domain %d doesn't match request %d", bp->b_domain, (int)BD_DOMAIN(bd))); /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. */ bq_remove(bq, bp); BQ_UNLOCK(bq); /* * Requeue the background write buffer with error and * restart the scan. */ if ((bp->b_vflags & BV_BKGRDERR) != 0) { bqrelse(bp); BQ_LOCK(bq); nbp = TAILQ_FIRST(&bq->bq_queue); continue; } bp->b_flags |= B_INVAL; brelse(bp); return (0); } bd->bd_wanted = 1; BQ_UNLOCK(bq); return (ENOBUFS); } /* * bremfree: * * Mark the buffer for removal from the appropriate free list. * */ void bremfree(struct buf *bp) { CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT((bp->b_flags & B_REMFREE) == 0, ("bremfree: buffer %p already marked for delayed removal.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfree: buffer %p not on a queue.", bp)); BUF_ASSERT_XLOCKED(bp); bp->b_flags |= B_REMFREE; } /* * bremfreef: * * Force an immediate removal from a free list. Used only in nfs when * it abuses the b_freelist pointer. */ void bremfreef(struct buf *bp) { struct bufqueue *bq; bq = bufqueue_acquire(bp); bq_remove(bq, bp); BQ_UNLOCK(bq); } static void bq_init(struct bufqueue *bq, int qindex, int subqueue, const char *lockname) { mtx_init(&bq->bq_lock, lockname, NULL, MTX_DEF); TAILQ_INIT(&bq->bq_queue); bq->bq_len = 0; bq->bq_index = qindex; bq->bq_subqueue = subqueue; } static void bd_init(struct bufdomain *bd) { int i; bd->bd_cleanq = &bd->bd_subq[mp_maxid + 1]; bq_init(bd->bd_cleanq, QUEUE_CLEAN, mp_maxid + 1, "bufq clean lock"); bq_init(&bd->bd_dirtyq, QUEUE_DIRTY, -1, "bufq dirty lock"); for (i = 0; i <= mp_maxid; i++) bq_init(&bd->bd_subq[i], QUEUE_CLEAN, i, "bufq clean subqueue lock"); mtx_init(&bd->bd_run_lock, "bufspace daemon run lock", NULL, MTX_DEF); } /* * bq_remove: * * Removes a buffer from the free list, must be called with the * correct qlock held. */ static void bq_remove(struct bufqueue *bq, struct buf *bp) { CTR3(KTR_BUF, "bq_remove(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_qindex != QUEUE_NONE, ("bq_remove: buffer %p not on a queue.", bp)); KASSERT(bufqueue(bp) == bq, ("bq_remove: Remove buffer %p from wrong queue.", bp)); BQ_ASSERT_LOCKED(bq); if (bp->b_qindex != QUEUE_EMPTY) { BUF_ASSERT_XLOCKED(bp); } KASSERT(bq->bq_len >= 1, ("queue %d underflow", bp->b_qindex)); TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); bq->bq_len--; bp->b_qindex = QUEUE_NONE; bp->b_flags &= ~(B_REMFREE | B_REUSE); } static void bd_flush(struct bufdomain *bd, struct bufqueue *bq) { struct buf *bp; BQ_ASSERT_LOCKED(bq); if (bq != bd->bd_cleanq) { BD_LOCK(bd); while ((bp = TAILQ_FIRST(&bq->bq_queue)) != NULL) { TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); TAILQ_INSERT_TAIL(&bd->bd_cleanq->bq_queue, bp, b_freelist); bp->b_subqueue = bd->bd_cleanq->bq_subqueue; } bd->bd_cleanq->bq_len += bq->bq_len; bq->bq_len = 0; } if (bd->bd_wanted) { bd->bd_wanted = 0; wakeup(&bd->bd_wanted); } if (bq != bd->bd_cleanq) BD_UNLOCK(bd); } static int bd_flushall(struct bufdomain *bd) { struct bufqueue *bq; int flushed; int i; if (bd->bd_lim == 0) return (0); flushed = 0; for (i = 0; i <= mp_maxid; i++) { bq = &bd->bd_subq[i]; if (bq->bq_len == 0) continue; BQ_LOCK(bq); bd_flush(bd, bq); BQ_UNLOCK(bq); flushed++; } return (flushed); } static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock) { struct bufdomain *bd; if (bp->b_qindex != QUEUE_NONE) panic("bq_insert: free buffer %p onto another queue?", bp); bd = bufdomain(bp); if (bp->b_flags & B_AGE) { /* Place this buf directly on the real queue. */ if (bq->bq_index == QUEUE_CLEAN) bq = bd->bd_cleanq; BQ_LOCK(bq); TAILQ_INSERT_HEAD(&bq->bq_queue, bp, b_freelist); } else { BQ_LOCK(bq); TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); } bp->b_flags &= ~(B_AGE | B_REUSE); bq->bq_len++; bp->b_qindex = bq->bq_index; bp->b_subqueue = bq->bq_subqueue; /* * Unlock before we notify so that we don't wakeup a waiter that * fails a trylock on the buf and sleeps again. */ if (unlock) BUF_UNLOCK(bp); if (bp->b_qindex == QUEUE_CLEAN) { /* * Flush the per-cpu queue and notify any waiters. */ if (bd->bd_wanted || (bq != bd->bd_cleanq && bq->bq_len >= bd->bd_lim)) bd_flush(bd, bq); } BQ_UNLOCK(bq); } /* * bufkva_free: * * Free the kva allocation for a buffer. * */ static void bufkva_free(struct buf *bp) { #ifdef INVARIANTS if (bp->b_kvasize == 0) { KASSERT(bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf, ("Leaked KVA space on %p", bp)); } else if (buf_mapped(bp)) BUF_CHECK_MAPPED(bp); else BUF_CHECK_UNMAPPED(bp); #endif if (bp->b_kvasize == 0) return; vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize); counter_u64_add(bufkvaspace, -bp->b_kvasize); counter_u64_add(buffreekvacnt, 1); bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_kvasize = 0; } /* * bufkva_alloc: * * Allocate the buffer KVA and set b_kvasize and b_kvabase. */ static int bufkva_alloc(struct buf *bp, int maxsize, int gbflags) { vm_offset_t addr; int error; KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0, ("Invalid gbflags 0x%x in %s", gbflags, __func__)); bufkva_free(bp); addr = 0; error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr); if (error != 0) { /* * Buffer map is too fragmented. Request the caller * to defragment the map. */ return (error); } bp->b_kvabase = (caddr_t)addr; bp->b_kvasize = maxsize; counter_u64_add(bufkvaspace, bp->b_kvasize); if ((gbflags & GB_UNMAPPED) != 0) { bp->b_data = unmapped_buf; BUF_CHECK_UNMAPPED(bp); } else { bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); } return (0); } /* * bufkva_reclaim: * * Reclaim buffer kva by freeing buffers holding kva. This is a vmem * callback that fires to avoid returning failure. */ static void bufkva_reclaim(vmem_t *vmem, int flags) { bool done; int q; int i; done = false; for (i = 0; i < 5; i++) { for (q = 0; q < buf_domains; q++) if (buf_recycle(&bdomain[q], true) != 0) done = true; if (done) break; } return; } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. We must * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, * the buffer is valid and we do not have to do anything. */ static void breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt, struct ucred * cred, int flags, void (*ckhashfunc)(struct buf *)) { struct buf *rabp; struct thread *td; int i; td = curthread; for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) != 0) { brelse(rabp); continue; } #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, rabp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ td->td_ru.ru_inblock++; rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; if ((flags & GB_CKHASH) != 0) { rabp->b_flags |= B_CKHASH; rabp->b_ckhashcalc = ckhashfunc; } rabp->b_ioflags &= ~BIO_ERROR; rabp->b_iocmd = BIO_READ; if (rabp->b_rcred == NOCRED && cred != NOCRED) rabp->b_rcred = crhold(cred); vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); rabp->b_iooffset = dbtob(rabp->b_blkno); bstrategy(rabp); } } /* * Entry point for bread() and breadn() via #defines in sys/buf.h. * * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything, see * getblk(). Also starts asynchronous I/O on read-ahead blocks. * * Always return a NULL buffer pointer (in bpp) when returning an error. */ int breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags, void (*ckhashfunc)(struct buf *), struct buf **bpp) { struct buf *bp; struct thread *td; int error, readwait, rv; CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); td = curthread; /* * Can only return NULL if GB_LOCK_NOWAIT or GB_SPARSE flags * are specified. */ error = getblkx(vp, blkno, size, 0, 0, flags, &bp); if (error != 0) { *bpp = NULL; return (error); } flags &= ~GB_NOSPARSE; *bpp = bp; /* * If not found in cache, do some I/O */ readwait = 0; if ((bp->b_flags & B_CACHE) == 0) { #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); racct_add_buf(td->td_proc, bp, 0); PROC_UNLOCK(td->td_proc); } #endif /* RACCT */ td->td_ru.ru_inblock++; bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; if ((flags & GB_CKHASH) != 0) { bp->b_flags |= B_CKHASH; bp->b_ckhashcalc = ckhashfunc; } bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); ++readwait; } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. */ breada(vp, rablkno, rabsize, cnt, cred, flags, ckhashfunc); rv = 0; if (readwait) { rv = bufwait(bp); if (rv != 0) { brelse(bp); *bpp = NULL; } } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bufwrite(struct buf *bp) { int oldflags; struct vnode *vp; long space; int vp_md; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) { bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_CACHE; brelse(bp); return (ENXIO); } if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (bp->b_flags & B_BARRIER) atomic_add_long(&barrierwrites, 1); oldflags = bp->b_flags; BUF_ASSERT_HELD(bp); KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), ("FFS background buffer should not get here %p", bp)); vp = bp->b_vp; if (vp) vp_md = vp->v_vflag & VV_MD; else vp_md = 0; /* * Mark the buffer clean. Increment the bufobj write count * before bundirty() call, to prevent other thread from seeing * empty dirty list and zero counter for writes in progress, * falsely indicating that the bufobj is clean. */ bufobj_wref(bp->b_bufobj); bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; vfs_busy_pages(bp, 1); /* * Normal bwrites pipeline writes */ bp->b_runningbufspace = bp->b_bufsize; space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 1); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_oublock++; if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); buf_track(bp, __func__); bstrategy(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); brelse(bp); return (rtval); } else if (space > hirunningspace) { /* * don't allow the async write to saturate the I/O * system. We will not deadlock here because * we are blocking waiting for I/O that is already in-progress * to complete. We do not block here if it is the update * or syncer daemon trying to clean up as that can lead * to deadlock. */ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) waitrunningbufspace(); } return (0); } void bufbdflush(struct bufobj *bo, struct buf *bp) { struct buf *nbp; if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); altbufferflushes++; } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* Don't countdeps with the bo lock held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf *bp) { struct thread *td = curthread; struct vnode *vp; struct bufobj *bo; CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT((bp->b_flags & B_BARRIER) == 0, ("Barrier request in delayed write %p", bp)); BUF_ASSERT_HELD(bp); if (bp->b_flags & B_INVAL) { brelse(bp); return; } /* * If we have too many dirty buffers, don't create any more. * If we are wildly over our limit, then force a complete * cleanup. Otherwise, just keep the situation from getting * out of control. Note that we have to avoid a recursive * disaster and not try to clean up after our own cleanup! */ vp = bp->b_vp; bo = bp->b_bufobj; if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { td->td_pflags |= TDP_INBDFLUSH; BO_BDFLUSH(bo, bp); td->td_pflags &= ~TDP_INBDFLUSH; } else recursiveflushes++; bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } buf_track(bp, __func__); /* * Set the *dirty* buffer range based upon the VM system dirty * pages. * * Mark the buffer pages as clean. We need to do this here to * satisfy the vnode_pager and the pageout daemon, so that it * thinks that the pages have been "cleaned". Note that since * the pages are in a delayed write buffer -- the VFS layer * "will" see that the pages get written out on the next sync, * or perhaps the cluster will be completed. */ vfs_clean_pages_dirty_buf(bp); bqrelse(bp); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ } /* * bdirty: * * Turn buffer into delayed write request. We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bdirty(struct buf *bp) { CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); BUF_ASSERT_HELD(bp); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; reassignbuf(bp); bdirtyadd(bp); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bundirty(struct buf *bp) { CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); BUF_ASSERT_HELD(bp); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); bdirtysub(bp); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf *bp) { bp->b_flags |= B_ASYNC; (void) bwrite(bp); } /* * babarrierwrite: * * Asynchronous barrier write. Start output on a buffer, but do not * wait for it to complete. Place a write barrier after this write so * that this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ void babarrierwrite(struct buf *bp) { bp->b_flags |= B_ASYNC | B_BARRIER; (void) bwrite(bp); } /* * bbarrierwrite: * * Synchronous barrier write. Start output on a buffer and wait for * it to complete. Place a write barrier after this write so that * this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ int bbarrierwrite(struct buf *bp) { bp->b_flags |= B_BARRIER; return (bwrite(bp)); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. */ void bwillwrite(void) { if (buf_dirty_count_severe()) { mtx_lock(&bdirtylock); while (buf_dirty_count_severe()) { bdirtywait = 1; msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), "flswai", 0); } mtx_unlock(&bdirtylock); } } /* * Return true if we have too many dirty buffers. */ int buf_dirty_count_severe(void) { return (!BIT_EMPTY(BUF_DOMAINS, &bdhidirty)); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf *bp) { struct mount *v_mnt; int qindex; /* * Many functions erroneously call brelse with a NULL bp under rare * error conditions. Simply return when called with a NULL bp. */ if (bp == NULL) return; CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0, ("brelse: non-VMIO buffer marked NOREUSE")); if (BUF_LOCKRECURSED(bp)) { /* * Do not process, in particular, do not handle the * B_INVAL/B_RELBUF and do not release to free list. */ BUF_UNLOCK(bp); return; } if (bp->b_flags & B_MANAGED) { bqrelse(bp); return; } if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); bdirty(bp); } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && (bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. All errors except ENXIO (which * means the device is gone) are treated as being * transient. * * XXX Treating EIO as transient is not correct; the * contract with the local storage device drivers is that * they will only return EIO once the I/O is no longer * retriable. Network I/O also respects this through the * guarantees of TCP and/or the internal retries of NFS. * ENOMEM might be transient, but we also have no way of * knowing when its ok to retry/reschedule. In general, * this entire case should be made obsolete through better * error handling/recovery and resource scheduling. * * Do this also for buffers that failed with ENXIO, but have * non-empty dependencies - the soft updates code might need * to access the buffer to untangle them. * * Must clear BIO_ERROR to prevent pages from being scrapped. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { /* * Either a failed read I/O, or we were asked to free or not * cache the buffer, or we failed to write to a device that's * no longer present. */ bp->b_flags |= B_INVAL; if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) bdirtysub(bp); bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_truncate() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_truncate(), even * if B_DELWRI is set. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. */ v_mnt = bp->b_vp != NULL ? bp->b_vp->v_mount : NULL; if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE || (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) && (v_mnt == NULL || (v_mnt->mnt_vfc->vfc_flags & VFCF_NETWORK) == 0 || vn_isdisk(bp->b_vp, NULL) || (bp->b_flags & B_DELWRI) == 0)) { vfs_vmio_invalidate(bp); allocbuf(bp, 0); } if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 || (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) { allocbuf(bp, 0); bp->b_flags &= ~B_NOREUSE; if (bp->b_vp != NULL) brelvp(bp); } /* * If the buffer has junk contents signal it and eventually * clean up B_DELWRI and diassociate the vnode so that gbincore() * doesn't find it. */ if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 || (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0) bp->b_flags |= B_INVAL; if (bp->b_flags & B_INVAL) { if (bp->b_flags & B_DELWRI) bundirty(bp); if (bp->b_vp) brelvp(bp); } buf_track(bp, __func__); /* buffers with no memory */ if (bp->b_bufsize == 0) { buf_free(bp); return; } /* buffers with junk contents */ if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); qindex = QUEUE_CLEAN; bp->b_flags |= B_AGE; /* remaining buffers */ } else if (bp->b_flags & B_DELWRI) qindex = QUEUE_DIRTY; else qindex = QUEUE_CLEAN; if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_RELBUF | B_DIRECT); /* binsfree unlocks bp. */ binsfree(bp, qindex); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. * * XXX we should be able to leave the B_RELBUF hint set on completion. */ void bqrelse(struct buf *bp) { int qindex; CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); qindex = QUEUE_NONE; if (BUF_LOCKRECURSED(bp)) { /* do not release to free list */ BUF_UNLOCK(bp); return; } bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); if (bp->b_flags & B_MANAGED) { if (bp->b_flags & B_REMFREE) bremfreef(bp); goto out; } /* buffers with stale but valid contents */ if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); qindex = QUEUE_DIRTY; } else { if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); if ((bp->b_flags & B_NOREUSE) != 0) { brelse(bp); return; } qindex = QUEUE_CLEAN; } buf_track(bp, __func__); /* binsfree unlocks bp. */ binsfree(bp, qindex); return; out: buf_track(bp, __func__); /* unlock */ BUF_UNLOCK(bp); } /* * Complete I/O to a VMIO backed page. Validate the pages as appropriate, * restore bogus pages. */ static void vfs_vmio_iodone(struct buf *bp) { vm_ooffset_t foff; vm_page_t m; vm_object_t obj; struct vnode *vp __unused; int i, iosize, resid; bool bogus; obj = bp->b_bufobj->bo_object; KASSERT(obj->paging_in_progress >= bp->b_npages, ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)", obj->paging_in_progress, bp->b_npages)); vp = bp->b_vp; KASSERT(vp->v_holdcnt > 0, ("vfs_vmio_iodone: vnode %p has zero hold count", vp)); KASSERT(vp->v_object != NULL, ("vfs_vmio_iodone: vnode %p has no vm_object", vp)); foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_vmio_iodone: bp %p has no buffer offset", bp)); bogus = false; iosize = bp->b_bcount - bp->b_resid; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogus = true; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; } else if ((bp->b_iocmd == BIO_READ) && resid > 0) { /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK, resid)) == 0, ("vfs_vmio_iodone: page %p " "has unexpected dirty bits", m)); vfs_page_set_valid(bp, foff, m); } KASSERT(OFF_TO_IDX(foff) == m->pindex, ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch", (intmax_t)foff, (uintmax_t)m->pindex)); vm_page_sunbusy(m); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } vm_object_pip_wakeupn(obj, bp->b_npages); VM_OBJECT_WUNLOCK(obj); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * Perform page invalidation when a buffer is released. The fully invalid * pages will be reclaimed later in vfs_vmio_truncate(). */ static void vfs_vmio_invalidate(struct buf *bp) { vm_object_t obj; vm_page_t m; int flags, i, resid, poffset, presid; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); /* * Get the base offset and length of the buffer. Note that * in the VMIO case if the buffer block size is not * page-aligned then b_data pointer may not be page-aligned. * But our b_pages[] array *IS* page aligned. * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0; obj = bp->b_bufobj->bo_object; resid = bp->b_bufsize; poffset = bp->b_offset & PAGE_MASK; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) panic("vfs_vmio_invalidate: Unexpected bogus page."); bp->b_pages[i] = NULL; presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); while (vm_page_xbusied(m)) { vm_page_lock(m); VM_OBJECT_WUNLOCK(obj); vm_page_busy_sleep(m, "mbncsh", true); VM_OBJECT_WLOCK(obj); } if (pmap_page_wired_mappings(m) == 0) vm_page_set_invalid(m, poffset, presid); vm_page_release_locked(m, flags); resid -= presid; poffset = 0; } VM_OBJECT_WUNLOCK(obj); bp->b_npages = 0; } /* * Page-granular truncation of an existing VMIO buffer. */ static void vfs_vmio_truncate(struct buf *bp, int desiredpages) { vm_object_t obj; vm_page_t m; int flags, i; if (bp->b_npages == desiredpages) return; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages); } else BUF_CHECK_UNMAPPED(bp); /* * The object lock is needed only if we will attempt to free pages. */ flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0; if ((bp->b_flags & B_DIRECT) != 0) { flags |= VPR_TRYFREE; obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); } else { obj = NULL; } for (i = desiredpages; i < bp->b_npages; i++) { m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); bp->b_pages[i] = NULL; if (obj != NULL) vm_page_release_locked(m, flags); else vm_page_release(m, flags); } if (obj != NULL) VM_OBJECT_WUNLOCK(obj); bp->b_npages = desiredpages; } /* * Byte granular extension of VMIO buffers. */ static void vfs_vmio_extend(struct buf *bp, int desiredpages, int size) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; vm_page_t m; /* * Step 1, bring in the VM pages from the object, allocating * them if necessary. We must clear B_CACHE if these pages * are not valid for the range covered by the buffer. */ obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); if (bp->b_npages < desiredpages) { /* * We must allocate system pages since blocking * here could interfere with paging I/O, no * matter which process we are. * * Only exclusive busy can be tested here. * Blocking on shared busy might lead to * deadlocks once allocbuf() is called after * pages are vfs_busy_pages(). */ (void)vm_page_grab_pages(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages, VM_ALLOC_SYSTEM | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED, &bp->b_pages[bp->b_npages], desiredpages - bp->b_npages); bp->b_npages = desiredpages; } /* * Step 2. We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), not the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; m = bp->b_pages[pi]; vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m); toff += tinc; tinc = PAGE_SIZE; } VM_OBJECT_WUNLOCK(obj); /* * Step 3, fixup the KVA pmap. */ if (buf_mapped(bp)) bpmap_qenter(bp); else BUF_CHECK_UNMAPPED(bp); } /* * Check to see if a block at a particular lbn is available for a clustered * write. */ static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) { struct buf *bpa; int match; match = 0; /* If the buf isn't in core skip it */ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) return (0); /* If the buf is busy we don't want to wait for it */ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) return (0); /* Only cluster with valid clusterable delayed write buffers */ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != (B_DELWRI | B_CLUSTEROK)) goto done; if (bpa->b_bufsize != size) goto done; /* * Check to see if it is in the expected place on disk and that the * block has been mapped. */ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) match = 1; done: BUF_UNLOCK(bpa); return (match); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better then the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf *bp) { struct bufobj *bo; int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int ncl; int nwritten; int size; int maxcl; int gbflags; bo = &vp->v_bufobj; gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0; /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather then at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; BO_RLOCK(bo); for (i = 1; i < maxcl; i++) if (vfs_bio_clcheck(vp, size, lblkno + i, bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) break; for (j = 1; i + j <= maxcl && j <= lblkno; j++) if (vfs_bio_clcheck(vp, size, lblkno - j, bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) break; BO_RUNLOCK(bo); --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, gbflags); return (nwritten); } } bremfree(bp); bp->b_flags |= B_ASYNC; /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) bwrite(bp); return (nwritten); } /* * getnewbuf_kva: * * Allocate KVA for an empty buf header according to gbflags. */ static int getnewbuf_kva(struct buf *bp, int gbflags, int maxsize) { if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) { /* * In order to keep fragmentation sane we only allocate kva * in BKVASIZE chunks. XXX with vmem we can do page size. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize && bufkva_alloc(bp, maxsize, gbflags)) return (ENOSPC); } return (0); } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_arena is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) * * The caller is responsible for releasing the reserved bufspace after * allocbuf() is called. */ static struct buf * getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags) { struct bufdomain *bd; struct buf *bp; bool metadata, reserved; bp = NULL; KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); if (!unmapped_buf_allowed) gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || vp->v_type == VCHR) metadata = true; else metadata = false; if (vp == NULL) bd = &bdomain[0]; else bd = &bdomain[vp->v_bufobj.bo_domain]; counter_u64_add(getnewbufcalls, 1); reserved = false; do { if (reserved == false && bufspace_reserve(bd, maxsize, metadata) != 0) { counter_u64_add(getnewbufrestarts, 1); continue; } reserved = true; if ((bp = buf_alloc(bd)) == NULL) { counter_u64_add(getnewbufrestarts, 1); continue; } if (getnewbuf_kva(bp, gbflags, maxsize) == 0) return (bp); break; } while (buf_recycle(bd, false) == 0); if (reserved) bufspace_release(bd, maxsize); if (bp != NULL) { bp->b_flags |= B_INVAL; brelse(bp); } bufspace_wait(bd, vp, gbflags, slpflag, slptimeo); return (NULL); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); static int buf_flush(struct vnode *vp, struct bufdomain *bd, int target) { int flushed; flushed = flushbufqueues(vp, bd, target, 0); if (flushed == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ if (vp != NULL && target > 2) target /= 2; flushbufqueues(vp, bd, target, 1); } return (flushed); } static void buf_daemon() { struct bufdomain *bd; int speedupreq; int lodirty; int i; /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread, SHUTDOWN_PRI_LAST + 100); /* * Start the buf clean daemons as children threads. */ for (i = 0 ; i < buf_domains; i++) { int error; error = kthread_add((void (*)(void *))bufspace_daemon, &bdomain[i], curproc, NULL, 0, 0, "bufspacedaemon-%d", i); if (error) panic("error %d spawning bufspace daemon", error); } /* * This process is allowed to take the buffer cache to the limit */ curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; mtx_lock(&bdlock); for (;;) { bd_request = 0; mtx_unlock(&bdlock); kthread_suspend_check(); /* * Save speedupreq for this pass and reset to capture new * requests. */ speedupreq = bd_speedupreq; bd_speedupreq = 0; /* * Flush each domain sequentially according to its level and * the speedup request. */ for (i = 0; i < buf_domains; i++) { bd = &bdomain[i]; if (speedupreq) lodirty = bd->bd_numdirtybuffers / 2; else lodirty = bd->bd_lodirtybuffers; while (bd->bd_numdirtybuffers > lodirty) { if (buf_flush(NULL, bd, bd->bd_numdirtybuffers - lodirty) == 0) break; kern_yield(PRI_USER); } } /* * Only clear bd_request if we have reached our low water * mark. The buf_daemon normally waits 1 second and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep for a short period * to avoid endless loops on unlockable buffers. */ mtx_lock(&bdlock); if (!BIT_EMPTY(BUF_DOMAINS, &bdlodirty)) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; /* * Do an extra wakeup in case dirty threshold * changed via sysctl and the explicit transition * out of shortfall was missed. */ bdirtywakeup(); if (runningbufspace <= lorunningspace) runningwakeup(); msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, we * have to sleep and try again. (rare) */ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); } } } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of write them, which NFS is * particularly sensitive to. */ static int flushwithdeps = 0; SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 0, "Number of buffers flushed with dependecies that require rollbacks"); static int flushbufqueues(struct vnode *lvp, struct bufdomain *bd, int target, int flushdeps) { struct bufqueue *bq; struct buf *sentinel; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; int flushed; int error; bool unlock; flushed = 0; bq = &bd->bd_dirtyq; bp = NULL; sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); sentinel->b_qindex = QUEUE_SENTINEL; BQ_LOCK(bq); TAILQ_INSERT_HEAD(&bq->bq_queue, sentinel, b_freelist); BQ_UNLOCK(bq); while (flushed != target) { maybe_yield(); BQ_LOCK(bq); bp = TAILQ_NEXT(sentinel, b_freelist); if (bp != NULL) { TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist); TAILQ_INSERT_AFTER(&bq->bq_queue, bp, sentinel, b_freelist); } else { BQ_UNLOCK(bq); break; } /* * Skip sentinels inserted by other invocations of the * flushbufqueues(), taking care to not reorder them. * * Only flush the buffers that belong to the * vnode locked by the curthread. */ if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && bp->b_vp != lvp)) { BQ_UNLOCK(bq); continue; } error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); BQ_UNLOCK(bq); if (error != 0) continue; /* * BKGRDINPROG can only be set with the buf and bufobj * locks both held. We tolerate a race to clear it here. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || (bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); continue; } if (bp->b_flags & B_INVAL) { bremfreef(bp); brelse(bp); flushed++; continue; } if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { if (flushdeps == 0) { BUF_UNLOCK(bp); continue; } hasdeps = 1; } else hasdeps = 0; /* * We must hold the lock on a vnode before writing * one of its buffers. Otherwise we may confuse, or * in the case of a snapshot vnode, deadlock the * system. * * The lock order here is the reverse of the normal * of vnode followed by buf lock. This is ok because * the NOWAIT will prevent deadlock. */ vp = bp->b_vp; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { BUF_UNLOCK(bp); continue; } if (lvp == NULL) { unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); } else { ASSERT_VOP_LOCKED(vp, "getbuf"); unlock = false; error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 : vn_lock(vp, LK_TRYUPGRADE); } if (error == 0) { CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (curproc == bufdaemonproc) { vfs_bio_awrite(bp); } else { bremfree(bp); bwrite(bp); counter_u64_add(notbufdflushes, 1); } vn_finished_write(mp); if (unlock) VOP_UNLOCK(vp, 0); flushwithdeps += hasdeps; flushed++; /* * Sleeping on runningbufspace while holding * vnode lock leads to deadlock. */ if (curproc == bufdaemonproc && runningbufspace > hirunningspace) waitrunningbufspace(); continue; } vn_finished_write(mp); BUF_UNLOCK(bp); } BQ_LOCK(bq); TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist); BQ_UNLOCK(bq); free(sentinel, M_TEMP); return (flushed); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct bufobj *bo, daddr_t blkno) { struct buf *bp; BO_RLOCK(bo); bp = gbincore(bo, blkno); BO_RUNLOCK(bo); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ static int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; ASSERT_VOP_LOCKED(vp, "inmem"); if (incore(&vp->v_bufobj, blkno)) return 1; if (vp->v_mount == NULL) return 0; obj = vp->v_object; if (obj == NULL) return (0); size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; VM_OBJECT_RLOCK(obj); for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) goto notinmem; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) goto notinmem; } VM_OBJECT_RUNLOCK(obj); return 1; notinmem: VM_OBJECT_RUNLOCK(obj); return (0); } /* * Set the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. The range is limited * to the size of the buffer. * * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. */ static void vfs_clean_pages_dirty_buf(struct buf *bp) { vm_ooffset_t foff, noff, eoff; vm_page_t m; int i; if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0) return; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages_dirty_buf: no buffer offset")); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); vfs_drain_busy_pages(bp); vfs_setdirty_locked_object(bp); for (i = 0; i < bp->b_npages; i++) { noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; m = bp->b_pages[i]; vfs_page_set_validclean(bp, foff, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } static void vfs_setdirty_locked_object(struct buf *bp) { vm_object_t object; int i; object = bp->b_bufobj->bo_object; VM_OBJECT_ASSERT_WLOCKED(object); /* * We qualify the scan for modified pages on whether the * object has been flushed yet. */ if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) { vm_offset_t boffset; vm_offset_t eoffset; /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } } /* * Allocate the KVA mapping for an existing buffer. * If an unmapped buffer is provided but a mapped buffer is requested, take * also care to properly setup mappings between pages and KVA. */ static void bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) { int bsize, maxsize, need_mapping, need_kva; off_t offset; need_mapping = bp->b_data == unmapped_buf && (gbflags & GB_UNMAPPED) == 0; need_kva = bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf && (gbflags & GB_KVAALLOC) != 0; if (!need_mapping && !need_kva) return; BUF_CHECK_UNMAPPED(bp); if (need_mapping && bp->b_kvabase != unmapped_buf) { /* * Buffer is not mapped, but the KVA was already * reserved at the time of the instantiation. Use the * allocated space. */ goto has_addr; } /* * Calculate the amount of the address space we would reserve * if the buffer was mapped. */ bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; maxsize = size + (offset & PAGE_MASK); maxsize = imax(maxsize, bsize); while (bufkva_alloc(bp, maxsize, gbflags) != 0) { if ((gbflags & GB_NOWAIT_BD) != 0) { /* * XXXKIB: defragmentation cannot * succeed, not sure what else to do. */ panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); } counter_u64_add(mappingrestarts, 1); bufspace_wait(bufdomain(bp), bp->b_vp, gbflags, 0, 0); } has_addr: if (need_mapping) { /* b_offset is handled by bpmap_qenter. */ bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); bpmap_qenter(bp); } } struct buf * getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; int error; error = getblkx(vp, blkno, size, slpflag, slptimeo, flags, &bp); if (error != 0) return (NULL); return (bp); } /* * getblkx: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a bwrite() for any B_DELWRI buffer whos * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successful read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ int getblkx(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags, struct buf **bpp) { struct buf *bp; struct bufobj *bo; daddr_t d_blkno; int bsize, error, maxsize, vmio; off_t offset; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); ASSERT_VOP_LOCKED(vp, "getblk"); if (size > maxbcachebuf) panic("getblk: size(%d) > maxbcachebuf(%d)\n", size, maxbcachebuf); if (!unmapped_buf_allowed) flags &= ~(GB_UNMAPPED | GB_KVAALLOC); bo = &vp->v_bufobj; d_blkno = blkno; loop: BO_RLOCK(bo); bp = gbincore(bo, blkno); if (bp != NULL) { int lockflags; /* * Buffer is in-core. If the buffer is not busy nor managed, * it must be on a queue. */ lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; if ((flags & GB_LOCK_NOWAIT) != 0) lockflags |= LK_NOWAIT; error = BUF_TIMELOCK(bp, lockflags, BO_LOCKPTR(bo), "getblk", slpflag, slptimeo); /* * If we slept and got the lock we have to restart in case * the buffer changed identities. */ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ else if (error != 0) return (error); /* If recursed, assume caller knows the rules. */ else if (BUF_LOCKRECURSED(bp)) goto end; /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; if (bp->b_flags & B_MANAGED) MPASS(bp->b_qindex == QUEUE_NONE); else bremfree(bp); /* * check for size inconsistencies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); } else { if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; bwrite(bp); } } goto loop; } } /* * Handle the case of unmapped buffer which should * become mapped, or the buffer for which KVA * reservation is requested. */ bp_unmapped_get_kva(bp, blkno, size, flags); /* * If the size is inconsistent in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE * above while extending the buffer, we cannot allow the * buffer to remain with B_CACHE set after the write * completes or it will represent a corrupt state. To * deal with this we set B_NOCACHE to scrap the buffer * after the write. * * We might be able to do something fancy, like setting * B_CACHE in bwrite() except if B_DELWRI is already set, * so the below call doesn't set B_CACHE, but that gets real * confusing. This is much easier. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); goto loop; } bp->b_flags &= ~B_DONE; } else { /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). */ BO_RUNLOCK(bo); /* * If the user does not want us to create the buffer, bail out * here. */ if (flags & GB_NOCREAT) return (EEXIST); bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; vmio = vp->v_object != NULL; if (vmio) { maxsize = size + (offset & PAGE_MASK); } else { maxsize = size; /* Do not allow non-VMIO notmapped buffers. */ flags &= ~(GB_UNMAPPED | GB_KVAALLOC); } maxsize = imax(maxsize, bsize); if ((flags & GB_NOSPARSE) != 0 && vmio && !vn_isdisk(vp, NULL)) { error = VOP_BMAP(vp, blkno, NULL, &d_blkno, 0, 0); KASSERT(error != EOPNOTSUPP, ("GB_NOSPARSE from fs not supporting bmap, vp %p", vp)); if (error != 0) return (error); if (d_blkno == -1) return (EJUSTRETURN); } bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags); if (bp == NULL) { if (slpflag || slptimeo) return (ETIMEDOUT); /* * XXX This is here until the sleep path is diagnosed * enough to work under very low memory conditions. * * There's an issue on low memory, 4BSD+non-preempt * systems (eg MIPS routers with 32MB RAM) where buffer * exhaustion occurs without sleeping for buffer * reclaimation. This just sticks in a loop and * constantly attempts to allocate a buffer, which * hits exhaustion and tries to wakeup bufdaemon. * This never happens because we never yield. * * The real solution is to identify and fix these cases * so we aren't effectively busy-waiting in a loop * until the reclaimation path has cycles to run. */ kern_yield(PRI_USER); goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. * * Note: this must occur before we associate the buffer * with the vp especially considering limitations in * the splay tree implementation when dealing with duplicate * lblkno's. */ BO_LOCK(bo); if (gbincore(bo, blkno)) { BO_UNLOCK(bo); bp->b_flags |= B_INVAL; bufspace_release(bufdomain(bp), maxsize); brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_lblkno = blkno; bp->b_blkno = d_blkno; bp->b_offset = offset; bgetvp(vp, bp); BO_UNLOCK(bo); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; KASSERT(vp->v_object == bp->b_bufobj->bo_object, ("ARGH! different b_bufobj->bo_object %p %p %p\n", bp, vp->v_object, bp->b_bufobj->bo_object)); } else { bp->b_flags &= ~B_VMIO; KASSERT(bp->b_bufobj->bo_object == NULL, ("ARGH! has b_bufobj->bo_object %p %p\n", bp, bp->b_bufobj->bo_object)); BUF_CHECK_MAPPED(bp); } allocbuf(bp, size); bufspace_release(bufdomain(bp), maxsize); bp->b_flags &= ~B_DONE; } CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); BUF_ASSERT_HELD(bp); end: buf_track(bp, __func__); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); *bpp = bp; return (0); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. */ struct buf * geteblk(int size, int flags) { struct buf *bp; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) { if ((flags & GB_NOWAIT_BD) && (curthread->td_pflags & TDP_BUFNEED) != 0) return (NULL); } allocbuf(bp, size); bufspace_release(bufdomain(bp), maxsize); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ BUF_ASSERT_HELD(bp); return (bp); } /* * Truncate the backing store for a non-vmio buffer. */ static void vfs_nonvmio_truncate(struct buf *bp, int newbsize) { if (bp->b_flags & B_MALLOC) { /* * malloced buffers are not shrunk */ if (newbsize == 0) { bufmallocadjust(bp, 0); free(bp->b_data, M_BIOBUF); bp->b_data = bp->b_kvabase; bp->b_flags &= ~B_MALLOC; } return; } vm_hold_free_pages(bp, newbsize); bufspace_adjust(bp, newbsize); } /* * Extend the backing for a non-VMIO buffer. */ static void vfs_nonvmio_extend(struct buf *bp, int newbsize) { caddr_t origbuf; int origbufsize; /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer * grows. * * There is a potential smp race here that could lead * to bufmallocspace slightly passing the max. It * is probably extremely rare and not worth worrying * over. */ if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 && bufmallocspace < maxbufmallocspace) { bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK); bp->b_flags |= B_MALLOC; bufmallocadjust(bp, newbsize); return; } /* * If the buffer is growing on its other-than-first * allocation then we revert to the page-allocation * scheme. */ origbuf = NULL; origbufsize = 0; if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; bufmallocadjust(bp, 0); bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); if (origbuf != NULL) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } bufspace_adjust(bp, newbsize); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistent data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize; BUF_ASSERT_HELD(bp); if (bp->b_bcount == size) return (1); if (bp->b_kvasize != 0 && bp->b_kvasize < size) panic("allocbuf: buffer too small"); newbsize = roundup2(size, DEV_BSIZE); if ((bp->b_flags & B_VMIO) == 0) { if ((bp->b_flags & B_MALLOC) == 0) newbsize = round_page(newbsize); /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ if (newbsize < bp->b_bufsize) vfs_nonvmio_truncate(bp, newbsize); else if (newbsize > bp->b_bufsize) vfs_nonvmio_extend(bp, newbsize); } else { int desiredpages; desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) vfs_vmio_truncate(bp, desiredpages); /* XXX This looks as if it should be newbsize > b_bufsize */ else if (size > bp->b_bcount) vfs_vmio_extend(bp, desiredpages, size); bufspace_adjust(bp, newbsize); } bp->b_bcount = size; /* requested buffer size. */ return (1); } extern int inflight_transient_maps; static struct bio_queue nondump_bios; void biodone(struct bio *bp) { struct mtx *mtxp; void (*done)(struct bio *); vm_offset_t start, end; biotrack(bp, __func__); /* * Avoid completing I/O when dumping after a panic since that may * result in a deadlock in the filesystem or pager code. Note that * this doesn't affect dumps that were started manually since we aim * to keep the system usable after it has been resumed. */ if (__predict_false(dumping && SCHEDULER_STOPPED())) { TAILQ_INSERT_HEAD(&nondump_bios, bp, bio_queue); return; } if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { bp->bio_flags &= ~BIO_TRANSIENT_MAPPING; bp->bio_flags |= BIO_UNMAPPED; start = trunc_page((vm_offset_t)bp->bio_data); end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); bp->bio_data = unmapped_buf; pmap_qremove(start, atop(end - start)); vmem_free(transient_arena, start, end - start); atomic_add_int(&inflight_transient_maps, -1); } done = bp->bio_done; if (done == NULL) { mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->bio_flags |= BIO_DONE; wakeup(bp); mtx_unlock(mtxp); } else done(bp); } /* * Wait for a BIO to finish. */ int biowait(struct bio *bp, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, mtxp, PRIBIO, wchan, 0); mtx_unlock(mtxp); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) void biotrack_buf(struct bio *bp, const char *location) { buf_track(bp->bio_track_bp, location); } #endif /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(struct buf *bp) { if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occurred, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * bufdone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existence * in the biodone routine. */ void bufdone(struct buf *bp) { struct bufobj *dropobj; void (*biodone)(struct buf *); buf_track(bp, __func__); CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); dropobj = NULL; KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); BUF_ASSERT_HELD(bp); runningbufwakeup(bp); if (bp->b_iocmd == BIO_WRITE) dropobj = bp->b_bufobj; /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); if (dropobj) bufobj_wdrop(dropobj); return; } if (bp->b_flags & B_VMIO) { /* * Set B_CACHE if the op was a normal read and no error * occurred. B_CACHE is set for writes in the b*write() * routines. */ if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) bp->b_flags |= B_CACHE; vfs_vmio_iodone(bp); } if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); if ((bp->b_flags & B_CKHASH) != 0) { KASSERT(bp->b_iocmd == BIO_READ, ("bufdone: b_iocmd %d not BIO_READ", bp->b_iocmd)); KASSERT(buf_mapped(bp), ("bufdone: bp %p not mapped", bp)); (*bp->b_ckhashcalc)(bp); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else bdone(bp); if (dropobj) bufobj_wdrop(dropobj); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistent. */ void vfs_unbusy_pages(struct buf *bp) { int i; vm_object_t obj; vm_page_t m; runningbufwakeup(bp); if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); } vm_page_sunbusy(m); } vm_object_pip_wakeupn(obj, bp->b_npages); VM_OBJECT_WUNLOCK(obj); } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t eoff; /* * Compute the end offset, eoff, such that [off, eoff) does not span a * page boundary and eoff is not greater than the end of the buffer. * The end of the buffer, in this case, is our file EOF, not the * allocation size of the buffer. */ eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > off) vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off); } /* * vfs_page_set_validclean: * * Set the valid bits and clear the dirty bits in a page based on the * supplied offset. The range is restricted to the buffer's size. */ static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t soff, eoff; /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundary or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * Ensure that all buffer pages are not exclusive busied. If any page is * exclusive busy, drain it. */ void vfs_drain_busy_pages(struct buf *bp) { vm_page_t m; int i, last_busied; VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object); last_busied = 0; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (vm_page_xbusied(m)) { for (; last_busied < i; last_busied++) vm_page_sbusy(bp->b_pages[last_busied]); while (vm_page_xbusied(m)) { vm_page_lock(m); VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); vm_page_busy_sleep(m, "vbpage", true); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); } } } for (i = 0; i < last_busied; i++) vm_page_sunbusy(bp->b_pages[i]); } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being exclusive busy. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistent. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistent state * and should be ignored. */ void vfs_busy_pages(struct buf *bp, int clear_modify) { vm_object_t obj; vm_ooffset_t foff; vm_page_t m; int i; bool bogus; if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); VM_OBJECT_WLOCK(obj); vfs_drain_busy_pages(bp); if (bp->b_bufsize != 0) vfs_setdirty_locked_object(bp); bogus = false; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, 1); vm_page_sbusy(m); } /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ if (clear_modify) { pmap_remove_write(m); vfs_page_set_validclean(bp, foff, m); } else if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus = true; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } VM_OBJECT_WUNLOCK(obj); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * vfs_bio_set_valid: * * Set the range within the buffer to valid. The range is * relative to the beginning of the buffer, b_offset. Note that * b_offset itself may be offset from the beginning of the first * page. */ void vfs_bio_set_valid(struct buf *bp, int base, int size) { int i, n; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_valid_range(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } /* * vfs_bio_clrbuf: * * If the specified buffer is a non-VMIO buffer, clear the entire * buffer. If the specified buffer is a VMIO buffer, clear and * validate only the previously invalid portions of the buffer. * This routine essentially fakes an I/O, so we need to clear * BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. */ void vfs_bio_clrbuf(struct buf *bp) { int i, j, mask, sa, ea, slide; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); return; } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { if (bp->b_pages[0] == bogus_page) goto unlock; mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object); if ((bp->b_pages[0]->valid & mask) == mask) goto unlock; if ((bp->b_pages[0]->valid & mask) == 0) { pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize); bp->b_pages[0]->valid |= mask; goto unlock; } } sa = bp->b_offset & PAGE_MASK; slide = 0; for (i = 0; i < bp->b_npages; i++, sa = 0) { slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize); ea = slide & PAGE_MASK; if (ea == 0) ea = PAGE_SIZE; if (bp->b_pages[i] == bogus_page) continue; j = sa / DEV_BSIZE; mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object); if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); else { for (; sa < ea; sa += DEV_BSIZE, j++) { if ((bp->b_pages[i]->valid & (1 << j)) == 0) { pmap_zero_page_area(bp->b_pages[i], sa, DEV_BSIZE); } } } bp->b_pages[i]->valid |= mask; } unlock: VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); bp->b_resid = 0; } void vfs_bio_bzero_buf(struct buf *bp, int base, int size) { vm_page_t m; int i, n; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); bzero(bp->b_data + base, size); } else { BUF_CHECK_UNMAPPED(bp); n = PAGE_SIZE - (base & PAGE_MASK); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; pmap_zero_page_area(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } } } /* * Update buffer flags based on I/O request parameters, optionally releasing the * buffer. If it's VMIO or direct I/O, the buffer pages are released to the VM, * where they may be placed on a page queue (VMIO) or freed immediately (direct * I/O). Otherwise the buffer is released to the cache. */ static void b_io_dismiss(struct buf *bp, int ioflag, bool release) { KASSERT((ioflag & IO_NOREUSE) == 0 || (ioflag & IO_VMIO) != 0, ("buf %p non-VMIO noreuse", bp)); if ((ioflag & IO_DIRECT) != 0) bp->b_flags |= B_DIRECT; if ((ioflag & IO_EXT) != 0) bp->b_xflags |= BX_ALTDATA; if ((ioflag & (IO_VMIO | IO_DIRECT)) != 0 && LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; if ((ioflag & IO_NOREUSE) != 0) bp->b_flags |= B_NOREUSE; if (release) brelse(bp); } else if (release) bqrelse(bp); } void vfs_bio_brelse(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, true); } void vfs_bio_set_flags(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, false); } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; BUF_CHECK_MAPPED(bp); to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { /* * note: must allocate system pages since blocking here * could interfere with paging I/O, no matter which * process we are. */ p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT) | VM_ALLOC_WAITOK); pmap_qenter(pg, &p, 1); bp->b_pages[index] = p; } bp->b_npages = index; } /* Return pages associated with this buf to the vm system */ static void vm_hold_free_pages(struct buf *bp, int newbsize) { vm_offset_t from; vm_page_t p; int index, newnpages; BUF_CHECK_MAPPED(bp); from = round_page((vm_offset_t)bp->b_data + newbsize); newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; if (bp->b_npages > newnpages) pmap_qremove(from, bp->b_npages - newnpages); for (index = newnpages; index < bp->b_npages; index++) { p = bp->b_pages[index]; bp->b_pages[index] = NULL; p->wire_count--; vm_page_free(p); } vm_wire_sub(bp->b_npages - newnpages); bp->b_npages = newnpages; } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. * * Note that even if the caller determines that the address space should * be valid, a race or a smaller-file mapped into a larger space may * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST * check the return value. * * This function only works with pager buffers. */ int vmapbuf(struct buf *bp, int mapbuf) { vm_prot_t prot; int pidx; if (bp->b_bufsize < 0) return (-1); prot = VM_PROT_READ; if (bp->b_iocmd == BIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages, btoc(MAXPHYS))) < 0) return (-1); bp->b_npages = pidx; bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK; if (mapbuf || !unmapped_buf_allowed) { pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx); bp->b_data = bp->b_kvabase + bp->b_offset; } else bp->b_data = unmapped_buf; return(0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. * * This function only works with pager buffers. */ void vunmapbuf(struct buf *bp) { int npages; npages = bp->b_npages; if (buf_mapped(bp)) pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_unhold_pages(bp->b_pages, npages); bp->b_data = unmapped_buf; } void bdone(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(mtxp); } void bwait(struct buf *bp, u_char pri, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->b_flags & B_DONE) == 0) msleep(bp, mtxp, pri, wchan, 0); mtx_unlock(mtxp); } int bufsync(struct bufobj *bo, int waitfor) { return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread)); } void bufstrategy(struct bufobj *bo, struct buf *bp) { int i __unused; struct vnode *vp; vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } /* * Initialize a struct bufobj before use. Memory is assumed zero filled. */ void bufobj_init(struct bufobj *bo, void *private) { static volatile int bufobj_cleanq; bo->bo_domain = atomic_fetchadd_int(&bufobj_cleanq, 1) % buf_domains; rw_init(BO_LOCKPTR(bo), "bufobj interlock"); bo->bo_private = private; TAILQ_INIT(&bo->bo_clean.bv_hd); TAILQ_INIT(&bo->bo_dirty.bv_hd); } void bufobj_wrefl(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); ASSERT_BO_WLOCKED(bo); bo->bo_numoutput++; } void bufobj_wref(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); BO_LOCK(bo); bo->bo_numoutput++; BO_UNLOCK(bo); } void bufobj_wdrop(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); BO_LOCK(bo); KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { bo->bo_flag &= ~BO_WWAIT; wakeup(&bo->bo_numoutput); } BO_UNLOCK(bo); } int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) { int error; KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); ASSERT_BO_WLOCKED(bo); error = 0; while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), slpflag | (PRIBIO + 1), "bo_wwait", timeo); if (error) break; } return (error); } /* * Set bio_data or bio_ma for struct bio from the struct buf. */ void bdata2bio(struct buf *bp, struct bio *bip) { if (!buf_mapped(bp)) { KASSERT(unmapped_buf_allowed, ("unmapped")); bip->bio_ma = bp->b_pages; bip->bio_ma_n = bp->b_npages; bip->bio_data = unmapped_buf; bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bip->bio_flags |= BIO_UNMAPPED; KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / PAGE_SIZE == bp->b_npages, ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, (long long)bip->bio_length, bip->bio_ma_n)); } else { bip->bio_data = bp->b_data; bip->bio_ma = NULL; } } /* * The MIPS pmap code currently doesn't handle aliased pages. * The VIPT caches may not handle page aliasing themselves, leading * to data corruption. * * As such, this code makes a system extremely unhappy if said * system doesn't support unaliasing the above situation in hardware. * Some "recent" systems (eg some mips24k/mips74k cores) don't enable * this feature at build time, so it has to be handled in software. * * Once the MIPS pmap/cache code grows to support this function on * earlier chips, it should be flipped back off. */ #ifdef __mips__ static int buf_pager_relbuf = 1; #else static int buf_pager_relbuf = 0; #endif SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN, &buf_pager_relbuf, 0, "Make buffer pager release buffers after reading"); /* * The buffer pager. It uses buffer reads to validate pages. * * In contrast to the generic local pager from vm/vnode_pager.c, this * pager correctly and easily handles volumes where the underlying * device block size is greater than the machine page size. The * buffer cache transparently extends the requested page run to be * aligned at the block boundary, and does the necessary bogus page * replacements in the addends to avoid obliterating already valid * pages. * * The only non-trivial issue is that the exclusive busy state for * pages, which is assumed by the vm_pager_getpages() interface, is * incompatible with the VMIO buffer cache's desire to share-busy the * pages. This function performs a trivial downgrade of the pages' * state before reading buffers, and a less trivial upgrade from the * shared-busy to excl-busy state after the read. */ int vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, vbg_get_blksize_t get_blksize) { vm_page_t m; vm_object_t object; struct buf *bp; struct mount *mp; daddr_t lbn, lbnp; vm_ooffset_t la, lb, poff, poffe; long bsize; int bo_bs, br_flags, error, i, pgsin, pgsin_a, pgsin_b; bool redo, lpart; object = vp->v_object; mp = vp->v_mount; error = 0; la = IDX_TO_OFF(ma[count - 1]->pindex); if (la >= object->un_pager.vnp.vnp_size) return (VM_PAGER_BAD); /* * Change the meaning of la from where the last requested page starts * to where it ends, because that's the end of the requested region * and the start of the potential read-ahead region. */ la += PAGE_SIZE; lpart = la > object->un_pager.vnp.vnp_size; bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex))); /* * Calculate read-ahead, behind and total pages. */ pgsin = count; lb = IDX_TO_OFF(ma[0]->pindex); pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs)); pgsin += pgsin_b; if (rbehind != NULL) *rbehind = pgsin_b; pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la); if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size) pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size, PAGE_SIZE) - la); pgsin += pgsin_a; if (rahead != NULL) *rahead = pgsin_a; VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, pgsin); br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) ? GB_UNMAPPED : 0; VM_OBJECT_WLOCK(object); again: for (i = 0; i < count; i++) vm_page_busy_downgrade(ma[i]); VM_OBJECT_WUNLOCK(object); lbnp = -1; for (i = 0; i < count; i++) { m = ma[i]; /* * Pages are shared busy and the object lock is not * owned, which together allow for the pages' * invalidation. The racy test for validity avoids * useless creation of the buffer for the most typical * case when invalidation is not used in redo or for * parallel read. The shared->excl upgrade loop at * the end of the function catches the race in a * reliable way (protected by the object lock). */ if (m->valid == VM_PAGE_BITS_ALL) continue; poff = IDX_TO_OFF(m->pindex); poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size); for (; poff < poffe; poff += bsize) { lbn = get_lblkno(vp, poff); if (lbn == lbnp) goto next_page; lbnp = lbn; bsize = get_blksize(vp, lbn); error = bread_gb(vp, lbn, bsize, curthread->td_ucred, br_flags, &bp); if (error != 0) goto end_pages; if (LIST_EMPTY(&bp->b_dep)) { /* * Invalidation clears m->valid, but * may leave B_CACHE flag if the * buffer existed at the invalidation * time. In this case, recycle the * buffer to do real read on next * bread() after redo. * * Otherwise B_RELBUF is not strictly * necessary, enable to reduce buf * cache pressure. */ if (buf_pager_relbuf || m->valid != VM_PAGE_BITS_ALL) bp->b_flags |= B_RELBUF; bp->b_flags &= ~B_NOCACHE; brelse(bp); } else { bqrelse(bp); } } KASSERT(1 /* racy, enable for debugging */ || m->valid == VM_PAGE_BITS_ALL || i == count - 1, ("buf %d %p invalid", i, m)); if (i == count - 1 && lpart) { VM_OBJECT_WLOCK(object); if (m->valid != 0 && m->valid != VM_PAGE_BITS_ALL) vm_page_zero_invalid(m, TRUE); VM_OBJECT_WUNLOCK(object); } next_page:; } end_pages: VM_OBJECT_WLOCK(object); redo = false; for (i = 0; i < count; i++) { vm_page_sunbusy(ma[i]); ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL); /* * Since the pages were only sbusy while neither the * buffer nor the object lock was held by us, or * reallocated while vm_page_grab() slept for busy * relinguish, they could have been invalidated. * Recheck the valid bits and re-read as needed. * * Note that the last page is made fully valid in the * read loop, and partial validity for the page at * index count - 1 could mean that the page was * invalidated or removed, so we must restart for * safety as well. */ if (ma[i]->valid != VM_PAGE_BITS_ALL) redo = true; } if (redo && error == 0) goto again; VM_OBJECT_WUNLOCK(object); return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } #include "opt_ddb.h" #ifdef DDB #include /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; #ifdef FULL_BUF_TRACKING uint32_t i, j; #endif if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("buf at %p\n", bp); db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, " "b_dep = %p\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno, bp->b_dep.lh_first); db_printf("b_kvabase = %p, b_kvasize = %d\n", bp->b_kvabase, bp->b_kvasize); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; if (m != NULL) db_printf("(%p, 0x%lx, 0x%lx)", m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); else db_printf("( ??? )"); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } BUF_LOCKPRINTINFO(bp); #if defined(FULL_BUF_TRACKING) db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt); i = bp->b_io_tcnt % BUF_TRACKING_SIZE; for (j = 1; j <= BUF_TRACKING_SIZE; j++) { if (bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)] == NULL) continue; db_printf(" %2u: %s\n", j, bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]); } #elif defined(BUF_TRACKING) db_printf("b_io_tracking: %s\n", bp->b_io_tracking); #endif db_printf(" "); } DB_SHOW_COMMAND(bufqueues, bufqueues) { struct bufdomain *bd; struct buf *bp; long total; int i, j, cnt; db_printf("bqempty: %d\n", bqempty.bq_len); for (i = 0; i < buf_domains; i++) { bd = &bdomain[i]; db_printf("Buf domain %d\n", i); db_printf("\tfreebufs\t%d\n", bd->bd_freebuffers); db_printf("\tlofreebufs\t%d\n", bd->bd_lofreebuffers); db_printf("\thifreebufs\t%d\n", bd->bd_hifreebuffers); db_printf("\n"); db_printf("\tbufspace\t%ld\n", bd->bd_bufspace); db_printf("\tmaxbufspace\t%ld\n", bd->bd_maxbufspace); db_printf("\thibufspace\t%ld\n", bd->bd_hibufspace); db_printf("\tlobufspace\t%ld\n", bd->bd_lobufspace); db_printf("\tbufspacethresh\t%ld\n", bd->bd_bufspacethresh); db_printf("\n"); db_printf("\tnumdirtybuffers\t%d\n", bd->bd_numdirtybuffers); db_printf("\tlodirtybuffers\t%d\n", bd->bd_lodirtybuffers); db_printf("\thidirtybuffers\t%d\n", bd->bd_hidirtybuffers); db_printf("\tdirtybufthresh\t%d\n", bd->bd_dirtybufthresh); db_printf("\n"); total = 0; TAILQ_FOREACH(bp, &bd->bd_cleanq->bq_queue, b_freelist) total += bp->b_bufsize; db_printf("\tcleanq count\t%d (%ld)\n", bd->bd_cleanq->bq_len, total); total = 0; TAILQ_FOREACH(bp, &bd->bd_dirtyq.bq_queue, b_freelist) total += bp->b_bufsize; db_printf("\tdirtyq count\t%d (%ld)\n", bd->bd_dirtyq.bq_len, total); db_printf("\twakeup\t\t%d\n", bd->bd_wanted); db_printf("\tlim\t\t%d\n", bd->bd_lim); db_printf("\tCPU "); for (j = 0; j <= mp_maxid; j++) db_printf("%d, ", bd->bd_subq[j].bq_len); db_printf("\n"); cnt = 0; total = 0; for (j = 0; j < nbuf; j++) if (buf[j].b_domain == i && BUF_ISLOCKED(&buf[j])) { cnt++; total += buf[j].b_bufsize; } db_printf("\tLocked buffers: %d space %ld\n", cnt, total); cnt = 0; total = 0; for (j = 0; j < nbuf; j++) if (buf[j].b_domain == i) { cnt++; total += buf[j].b_bufsize; } db_printf("\tTotal buffers: %d space %ld\n", cnt, total); } } DB_SHOW_COMMAND(lockedbufs, lockedbufs) { struct buf *bp; int i; for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (BUF_ISLOCKED(bp)) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); if (db_pager_quit) break; } } } DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) { struct vnode *vp; struct buf *bp; if (!have_addr) { db_printf("usage: show vnodebufs \n"); return; } vp = (struct vnode *)addr; db_printf("Clean buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } db_printf("Dirty buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } DB_COMMAND(countfreebufs, db_coundfreebufs) { struct buf *bp; int i, used = 0, nfree = 0; if (have_addr) { db_printf("usage: countfreebufs\n"); return; } for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (bp->b_qindex == QUEUE_EMPTY) nfree++; else used++; } db_printf("Counted %d free, %d used (%d tot)\n", nfree, used, nfree + used); db_printf("numfreebuffers is %d\n", numfreebuffers); } #endif /* DDB */ Index: stable/12 =================================================================== --- stable/12 (revision 355609) +++ stable/12 (revision 355610) Property changes on: stable/12 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r355404