Index: head/sys/cam/cam_periph.c =================================================================== --- head/sys/cam/cam_periph.c (revision 364441) +++ head/sys/cam/cam_periph.c (revision 364442) @@ -1,2182 +1,2182 @@ /*- * Common functions for CAM "type" (peripheral) drivers. * * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997, 1998 Justin T. Gibbs. * Copyright (c) 1997, 1998, 1999, 2000 Kenneth D. Merry. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include +#include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static u_int camperiphnextunit(struct periph_driver *p_drv, u_int newunit, int wired, path_id_t pathid, target_id_t target, lun_id_t lun); static u_int camperiphunit(struct periph_driver *p_drv, path_id_t pathid, target_id_t target, lun_id_t lun); static void camperiphdone(struct cam_periph *periph, union ccb *done_ccb); static void camperiphfree(struct cam_periph *periph); static int camperiphscsistatuserror(union ccb *ccb, union ccb **orig_ccb, cam_flags camflags, u_int32_t sense_flags, int *openings, u_int32_t *relsim_flags, u_int32_t *timeout, u_int32_t *action, const char **action_string); static int camperiphscsisenseerror(union ccb *ccb, union ccb **orig_ccb, cam_flags camflags, u_int32_t sense_flags, int *openings, u_int32_t *relsim_flags, u_int32_t *timeout, u_int32_t *action, const char **action_string); static void cam_periph_devctl_notify(union ccb *ccb); static int nperiph_drivers; static int initialized = 0; struct periph_driver **periph_drivers; static MALLOC_DEFINE(M_CAMPERIPH, "CAM periph", "CAM peripheral buffers"); static int periph_selto_delay = 1000; TUNABLE_INT("kern.cam.periph_selto_delay", &periph_selto_delay); static int periph_noresrc_delay = 500; TUNABLE_INT("kern.cam.periph_noresrc_delay", &periph_noresrc_delay); static int periph_busy_delay = 500; TUNABLE_INT("kern.cam.periph_busy_delay", &periph_busy_delay); static u_int periph_mapmem_thresh = 65536; SYSCTL_UINT(_kern_cam, OID_AUTO, mapmem_thresh, CTLFLAG_RWTUN, &periph_mapmem_thresh, 0, "Threshold for user-space buffer mapping"); void periphdriver_register(void *data) { struct periph_driver *drv = (struct periph_driver *)data; struct periph_driver **newdrivers, **old; int ndrivers; again: ndrivers = nperiph_drivers + 2; newdrivers = malloc(sizeof(*newdrivers) * ndrivers, M_CAMPERIPH, M_WAITOK); xpt_lock_buses(); if (ndrivers != nperiph_drivers + 2) { /* * Lost race against itself; go around. */ xpt_unlock_buses(); free(newdrivers, M_CAMPERIPH); goto again; } if (periph_drivers) bcopy(periph_drivers, newdrivers, sizeof(*newdrivers) * nperiph_drivers); newdrivers[nperiph_drivers] = drv; newdrivers[nperiph_drivers + 1] = NULL; old = periph_drivers; periph_drivers = newdrivers; nperiph_drivers++; xpt_unlock_buses(); if (old) free(old, M_CAMPERIPH); /* If driver marked as early or it is late now, initialize it. */ if (((drv->flags & CAM_PERIPH_DRV_EARLY) != 0 && initialized > 0) || initialized > 1) (*drv->init)(); } int periphdriver_unregister(void *data) { struct periph_driver *drv = (struct periph_driver *)data; int error, n; /* If driver marked as early or it is late now, deinitialize it. */ if (((drv->flags & CAM_PERIPH_DRV_EARLY) != 0 && initialized > 0) || initialized > 1) { if (drv->deinit == NULL) { printf("CAM periph driver '%s' doesn't have deinit.\n", drv->driver_name); return (EOPNOTSUPP); } error = drv->deinit(); if (error != 0) return (error); } xpt_lock_buses(); for (n = 0; n < nperiph_drivers && periph_drivers[n] != drv; n++) ; KASSERT(n < nperiph_drivers, ("Periph driver '%s' was not registered", drv->driver_name)); for (; n + 1 < nperiph_drivers; n++) periph_drivers[n] = periph_drivers[n + 1]; periph_drivers[n + 1] = NULL; nperiph_drivers--; xpt_unlock_buses(); return (0); } void periphdriver_init(int level) { int i, early; initialized = max(initialized, level); for (i = 0; periph_drivers[i] != NULL; i++) { early = (periph_drivers[i]->flags & CAM_PERIPH_DRV_EARLY) ? 1 : 2; if (early == initialized) (*periph_drivers[i]->init)(); } } cam_status cam_periph_alloc(periph_ctor_t *periph_ctor, periph_oninv_t *periph_oninvalidate, periph_dtor_t *periph_dtor, periph_start_t *periph_start, char *name, cam_periph_type type, struct cam_path *path, ac_callback_t *ac_callback, ac_code code, void *arg) { struct periph_driver **p_drv; struct cam_sim *sim; struct cam_periph *periph; struct cam_periph *cur_periph; path_id_t path_id; target_id_t target_id; lun_id_t lun_id; cam_status status; u_int init_level; init_level = 0; /* * Handle Hot-Plug scenarios. If there is already a peripheral * of our type assigned to this path, we are likely waiting for * final close on an old, invalidated, peripheral. If this is * the case, queue up a deferred call to the peripheral's async * handler. If it looks like a mistaken re-allocation, complain. */ if ((periph = cam_periph_find(path, name)) != NULL) { if ((periph->flags & CAM_PERIPH_INVALID) != 0 && (periph->flags & CAM_PERIPH_NEW_DEV_FOUND) == 0) { periph->flags |= CAM_PERIPH_NEW_DEV_FOUND; periph->deferred_callback = ac_callback; periph->deferred_ac = code; return (CAM_REQ_INPROG); } else { printf("cam_periph_alloc: attempt to re-allocate " "valid device %s%d rejected flags %#x " "refcount %d\n", periph->periph_name, periph->unit_number, periph->flags, periph->refcount); } return (CAM_REQ_INVALID); } periph = (struct cam_periph *)malloc(sizeof(*periph), M_CAMPERIPH, M_NOWAIT|M_ZERO); if (periph == NULL) return (CAM_RESRC_UNAVAIL); init_level++; sim = xpt_path_sim(path); path_id = xpt_path_path_id(path); target_id = xpt_path_target_id(path); lun_id = xpt_path_lun_id(path); periph->periph_start = periph_start; periph->periph_dtor = periph_dtor; periph->periph_oninval = periph_oninvalidate; periph->type = type; periph->periph_name = name; periph->scheduled_priority = CAM_PRIORITY_NONE; periph->immediate_priority = CAM_PRIORITY_NONE; periph->refcount = 1; /* Dropped by invalidation. */ periph->sim = sim; SLIST_INIT(&periph->ccb_list); status = xpt_create_path(&path, periph, path_id, target_id, lun_id); if (status != CAM_REQ_CMP) goto failure; periph->path = path; xpt_lock_buses(); for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) { if (strcmp((*p_drv)->driver_name, name) == 0) break; } if (*p_drv == NULL) { printf("cam_periph_alloc: invalid periph name '%s'\n", name); xpt_unlock_buses(); xpt_free_path(periph->path); free(periph, M_CAMPERIPH); return (CAM_REQ_INVALID); } periph->unit_number = camperiphunit(*p_drv, path_id, target_id, lun_id); cur_periph = TAILQ_FIRST(&(*p_drv)->units); while (cur_periph != NULL && cur_periph->unit_number < periph->unit_number) cur_periph = TAILQ_NEXT(cur_periph, unit_links); if (cur_periph != NULL) { KASSERT(cur_periph->unit_number != periph->unit_number, ("duplicate units on periph list")); TAILQ_INSERT_BEFORE(cur_periph, periph, unit_links); } else { TAILQ_INSERT_TAIL(&(*p_drv)->units, periph, unit_links); (*p_drv)->generation++; } xpt_unlock_buses(); init_level++; status = xpt_add_periph(periph); if (status != CAM_REQ_CMP) goto failure; init_level++; CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph created\n")); status = periph_ctor(periph, arg); if (status == CAM_REQ_CMP) init_level++; failure: switch (init_level) { case 4: /* Initialized successfully */ break; case 3: CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph destroyed\n")); xpt_remove_periph(periph); /* FALLTHROUGH */ case 2: xpt_lock_buses(); TAILQ_REMOVE(&(*p_drv)->units, periph, unit_links); xpt_unlock_buses(); xpt_free_path(periph->path); /* FALLTHROUGH */ case 1: free(periph, M_CAMPERIPH); /* FALLTHROUGH */ case 0: /* No cleanup to perform. */ break; default: panic("%s: Unknown init level", __func__); } return(status); } /* * Find a peripheral structure with the specified path, target, lun, * and (optionally) type. If the name is NULL, this function will return * the first peripheral driver that matches the specified path. */ struct cam_periph * cam_periph_find(struct cam_path *path, char *name) { struct periph_driver **p_drv; struct cam_periph *periph; xpt_lock_buses(); for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) { if (name != NULL && (strcmp((*p_drv)->driver_name, name) != 0)) continue; TAILQ_FOREACH(periph, &(*p_drv)->units, unit_links) { if (xpt_path_comp(periph->path, path) == 0) { xpt_unlock_buses(); cam_periph_assert(periph, MA_OWNED); return(periph); } } if (name != NULL) { xpt_unlock_buses(); return(NULL); } } xpt_unlock_buses(); return(NULL); } /* * Find peripheral driver instances attached to the specified path. */ int cam_periph_list(struct cam_path *path, struct sbuf *sb) { struct sbuf local_sb; struct periph_driver **p_drv; struct cam_periph *periph; int count; int sbuf_alloc_len; sbuf_alloc_len = 16; retry: sbuf_new(&local_sb, NULL, sbuf_alloc_len, SBUF_FIXEDLEN); count = 0; xpt_lock_buses(); for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) { TAILQ_FOREACH(periph, &(*p_drv)->units, unit_links) { if (xpt_path_comp(periph->path, path) != 0) continue; if (sbuf_len(&local_sb) != 0) sbuf_cat(&local_sb, ","); sbuf_printf(&local_sb, "%s%d", periph->periph_name, periph->unit_number); if (sbuf_error(&local_sb) == ENOMEM) { sbuf_alloc_len *= 2; xpt_unlock_buses(); sbuf_delete(&local_sb); goto retry; } count++; } } xpt_unlock_buses(); sbuf_finish(&local_sb); if (sbuf_len(sb) != 0) sbuf_cat(sb, ","); sbuf_cat(sb, sbuf_data(&local_sb)); sbuf_delete(&local_sb); return (count); } int cam_periph_acquire(struct cam_periph *periph) { int status; if (periph == NULL) return (EINVAL); status = ENOENT; xpt_lock_buses(); if ((periph->flags & CAM_PERIPH_INVALID) == 0) { periph->refcount++; status = 0; } xpt_unlock_buses(); return (status); } void cam_periph_doacquire(struct cam_periph *periph) { xpt_lock_buses(); KASSERT(periph->refcount >= 1, ("cam_periph_doacquire() with refcount == %d", periph->refcount)); periph->refcount++; xpt_unlock_buses(); } void cam_periph_release_locked_buses(struct cam_periph *periph) { cam_periph_assert(periph, MA_OWNED); KASSERT(periph->refcount >= 1, ("periph->refcount >= 1")); if (--periph->refcount == 0) camperiphfree(periph); } void cam_periph_release_locked(struct cam_periph *periph) { if (periph == NULL) return; xpt_lock_buses(); cam_periph_release_locked_buses(periph); xpt_unlock_buses(); } void cam_periph_release(struct cam_periph *periph) { struct mtx *mtx; if (periph == NULL) return; cam_periph_assert(periph, MA_NOTOWNED); mtx = cam_periph_mtx(periph); mtx_lock(mtx); cam_periph_release_locked(periph); mtx_unlock(mtx); } /* * hold/unhold act as mutual exclusion for sections of the code that * need to sleep and want to make sure that other sections that * will interfere are held off. This only protects exclusive sections * from each other. */ int cam_periph_hold(struct cam_periph *periph, int priority) { int error; /* * Increment the reference count on the peripheral * while we wait for our lock attempt to succeed * to ensure the peripheral doesn't disappear out * from user us while we sleep. */ if (cam_periph_acquire(periph) != 0) return (ENXIO); cam_periph_assert(periph, MA_OWNED); while ((periph->flags & CAM_PERIPH_LOCKED) != 0) { periph->flags |= CAM_PERIPH_LOCK_WANTED; if ((error = cam_periph_sleep(periph, periph, priority, "caplck", 0)) != 0) { cam_periph_release_locked(periph); return (error); } if (periph->flags & CAM_PERIPH_INVALID) { cam_periph_release_locked(periph); return (ENXIO); } } periph->flags |= CAM_PERIPH_LOCKED; return (0); } void cam_periph_unhold(struct cam_periph *periph) { cam_periph_assert(periph, MA_OWNED); periph->flags &= ~CAM_PERIPH_LOCKED; if ((periph->flags & CAM_PERIPH_LOCK_WANTED) != 0) { periph->flags &= ~CAM_PERIPH_LOCK_WANTED; wakeup(periph); } cam_periph_release_locked(periph); } /* * Look for the next unit number that is not currently in use for this * peripheral type starting at "newunit". Also exclude unit numbers that * are reserved by for future "hardwiring" unless we already know that this * is a potential wired device. Only assume that the device is "wired" the * first time through the loop since after that we'll be looking at unit * numbers that did not match a wiring entry. */ static u_int camperiphnextunit(struct periph_driver *p_drv, u_int newunit, int wired, path_id_t pathid, target_id_t target, lun_id_t lun) { struct cam_periph *periph; char *periph_name; int i, val, dunit, r; const char *dname, *strval; periph_name = p_drv->driver_name; for (;;newunit++) { for (periph = TAILQ_FIRST(&p_drv->units); periph != NULL && periph->unit_number != newunit; periph = TAILQ_NEXT(periph, unit_links)) ; if (periph != NULL && periph->unit_number == newunit) { if (wired != 0) { xpt_print(periph->path, "Duplicate Wired " "Device entry!\n"); xpt_print(periph->path, "Second device (%s " "device at scbus%d target %d lun %d) will " "not be wired\n", periph_name, pathid, target, lun); wired = 0; } continue; } if (wired) break; /* * Don't match entries like "da 4" as a wired down * device, but do match entries like "da 4 target 5" * or even "da 4 scbus 1". */ i = 0; dname = periph_name; for (;;) { r = resource_find_dev(&i, dname, &dunit, NULL, NULL); if (r != 0) break; /* if no "target" and no specific scbus, skip */ if (resource_int_value(dname, dunit, "target", &val) && (resource_string_value(dname, dunit, "at",&strval)|| strcmp(strval, "scbus") == 0)) continue; if (newunit == dunit) break; } if (r != 0) break; } return (newunit); } static u_int camperiphunit(struct periph_driver *p_drv, path_id_t pathid, target_id_t target, lun_id_t lun) { u_int unit; int wired, i, val, dunit; const char *dname, *strval; char pathbuf[32], *periph_name; periph_name = p_drv->driver_name; snprintf(pathbuf, sizeof(pathbuf), "scbus%d", pathid); unit = 0; i = 0; dname = periph_name; for (wired = 0; resource_find_dev(&i, dname, &dunit, NULL, NULL) == 0; wired = 0) { if (resource_string_value(dname, dunit, "at", &strval) == 0) { if (strcmp(strval, pathbuf) != 0) continue; wired++; } if (resource_int_value(dname, dunit, "target", &val) == 0) { if (val != target) continue; wired++; } if (resource_int_value(dname, dunit, "lun", &val) == 0) { if (val != lun) continue; wired++; } if (wired != 0) { unit = dunit; break; } } /* * Either start from 0 looking for the next unit or from * the unit number given in the resource config. This way, * if we have wildcard matches, we don't return the same * unit number twice. */ unit = camperiphnextunit(p_drv, unit, wired, pathid, target, lun); return (unit); } void cam_periph_invalidate(struct cam_periph *periph) { cam_periph_assert(periph, MA_OWNED); /* * We only tear down the device the first time a peripheral is * invalidated. */ if ((periph->flags & CAM_PERIPH_INVALID) != 0) return; CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph invalidated\n")); if ((periph->flags & CAM_PERIPH_ANNOUNCED) && !rebooting) { struct sbuf sb; char buffer[160]; sbuf_new(&sb, buffer, 160, SBUF_FIXEDLEN); xpt_denounce_periph_sbuf(periph, &sb); sbuf_finish(&sb); sbuf_putbuf(&sb); } periph->flags |= CAM_PERIPH_INVALID; periph->flags &= ~CAM_PERIPH_NEW_DEV_FOUND; if (periph->periph_oninval != NULL) periph->periph_oninval(periph); cam_periph_release_locked(periph); } static void camperiphfree(struct cam_periph *periph) { struct periph_driver **p_drv; struct periph_driver *drv; cam_periph_assert(periph, MA_OWNED); KASSERT(periph->periph_allocating == 0, ("%s%d: freed while allocating", periph->periph_name, periph->unit_number)); for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) { if (strcmp((*p_drv)->driver_name, periph->periph_name) == 0) break; } if (*p_drv == NULL) { printf("camperiphfree: attempt to free non-existant periph\n"); return; } /* * Cache a pointer to the periph_driver structure. If a * periph_driver is added or removed from the array (see * periphdriver_register()) while we drop the toplogy lock * below, p_drv may change. This doesn't protect against this * particular periph_driver going away. That will require full * reference counting in the periph_driver infrastructure. */ drv = *p_drv; /* * We need to set this flag before dropping the topology lock, to * let anyone who is traversing the list that this peripheral is * about to be freed, and there will be no more reference count * checks. */ periph->flags |= CAM_PERIPH_FREE; /* * The peripheral destructor semantics dictate calling with only the * SIM mutex held. Since it might sleep, it should not be called * with the topology lock held. */ xpt_unlock_buses(); /* * We need to call the peripheral destructor prior to removing the * peripheral from the list. Otherwise, we risk running into a * scenario where the peripheral unit number may get reused * (because it has been removed from the list), but some resources * used by the peripheral are still hanging around. In particular, * the devfs nodes used by some peripherals like the pass(4) driver * aren't fully cleaned up until the destructor is run. If the * unit number is reused before the devfs instance is fully gone, * devfs will panic. */ if (periph->periph_dtor != NULL) periph->periph_dtor(periph); /* * The peripheral list is protected by the topology lock. We have to * remove the periph from the drv list before we call deferred_ac. The * AC_FOUND_DEVICE callback won't create a new periph if it's still there. */ xpt_lock_buses(); TAILQ_REMOVE(&drv->units, periph, unit_links); drv->generation++; xpt_remove_periph(periph); xpt_unlock_buses(); if ((periph->flags & CAM_PERIPH_ANNOUNCED) && !rebooting) xpt_print(periph->path, "Periph destroyed\n"); else CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph destroyed\n")); if (periph->flags & CAM_PERIPH_NEW_DEV_FOUND) { union ccb ccb; void *arg; switch (periph->deferred_ac) { case AC_FOUND_DEVICE: ccb.ccb_h.func_code = XPT_GDEV_TYPE; xpt_setup_ccb(&ccb.ccb_h, periph->path, CAM_PRIORITY_NORMAL); xpt_action(&ccb); arg = &ccb; break; case AC_PATH_REGISTERED: xpt_path_inq(&ccb.cpi, periph->path); arg = &ccb; break; default: arg = NULL; break; } periph->deferred_callback(NULL, periph->deferred_ac, periph->path, arg); } xpt_free_path(periph->path); free(periph, M_CAMPERIPH); xpt_lock_buses(); } /* * Map user virtual pointers into kernel virtual address space, so we can * access the memory. This is now a generic function that centralizes most * of the sanity checks on the data flags, if any. * This also only works for up to MAXPHYS memory. Since we use * buffers to map stuff in and out, we're limited to the buffer size. */ int cam_periph_mapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo, u_int maxmap) { int numbufs, i; u_int8_t **data_ptrs[CAM_PERIPH_MAXMAPS]; u_int32_t lengths[CAM_PERIPH_MAXMAPS]; u_int32_t dirs[CAM_PERIPH_MAXMAPS]; bool misaligned[CAM_PERIPH_MAXMAPS]; bzero(mapinfo, sizeof(*mapinfo)); if (maxmap == 0) maxmap = DFLTPHYS; /* traditional default */ else if (maxmap > MAXPHYS) maxmap = MAXPHYS; /* for safety */ switch(ccb->ccb_h.func_code) { case XPT_DEV_MATCH: if (ccb->cdm.match_buf_len == 0) { printf("cam_periph_mapmem: invalid match buffer " "length 0\n"); return(EINVAL); } if (ccb->cdm.pattern_buf_len > 0) { data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns; lengths[0] = ccb->cdm.pattern_buf_len; dirs[0] = CAM_DIR_OUT; data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches; lengths[1] = ccb->cdm.match_buf_len; dirs[1] = CAM_DIR_IN; numbufs = 2; } else { data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches; lengths[0] = ccb->cdm.match_buf_len; dirs[0] = CAM_DIR_IN; numbufs = 1; } /* * This request will not go to the hardware, no reason * to be so strict. vmapbuf() is able to map up to MAXPHYS. */ maxmap = MAXPHYS; break; case XPT_SCSI_IO: case XPT_CONT_TARGET_IO: if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) return(0); if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR) return (EINVAL); data_ptrs[0] = &ccb->csio.data_ptr; lengths[0] = ccb->csio.dxfer_len; dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; numbufs = 1; break; case XPT_ATA_IO: if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) return(0); if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR) return (EINVAL); data_ptrs[0] = &ccb->ataio.data_ptr; lengths[0] = ccb->ataio.dxfer_len; dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; numbufs = 1; break; case XPT_MMC_IO: if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) return(0); /* Two mappings: one for cmd->data and one for cmd->data->data */ data_ptrs[0] = (unsigned char **)&ccb->mmcio.cmd.data; lengths[0] = sizeof(struct mmc_data *); dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; data_ptrs[1] = (unsigned char **)&ccb->mmcio.cmd.data->data; lengths[1] = ccb->mmcio.cmd.data->len; dirs[1] = ccb->ccb_h.flags & CAM_DIR_MASK; numbufs = 2; break; case XPT_SMP_IO: data_ptrs[0] = &ccb->smpio.smp_request; lengths[0] = ccb->smpio.smp_request_len; dirs[0] = CAM_DIR_OUT; data_ptrs[1] = &ccb->smpio.smp_response; lengths[1] = ccb->smpio.smp_response_len; dirs[1] = CAM_DIR_IN; numbufs = 2; break; case XPT_NVME_IO: case XPT_NVME_ADMIN: if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) return (0); if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR) return (EINVAL); data_ptrs[0] = &ccb->nvmeio.data_ptr; lengths[0] = ccb->nvmeio.dxfer_len; dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; numbufs = 1; break; case XPT_DEV_ADVINFO: if (ccb->cdai.bufsiz == 0) return (0); data_ptrs[0] = (uint8_t **)&ccb->cdai.buf; lengths[0] = ccb->cdai.bufsiz; dirs[0] = CAM_DIR_IN; numbufs = 1; /* * This request will not go to the hardware, no reason * to be so strict. vmapbuf() is able to map up to MAXPHYS. */ maxmap = MAXPHYS; break; default: return(EINVAL); break; /* NOTREACHED */ } /* * Check the transfer length and permissions first, so we don't * have to unmap any previously mapped buffers. */ for (i = 0; i < numbufs; i++) { if (lengths[i] > maxmap) { printf("cam_periph_mapmem: attempt to map %lu bytes, " "which is greater than %lu\n", (long)(lengths[i]), (u_long)maxmap); return (E2BIG); } /* * The userland data pointer passed in may not be page * aligned. vmapbuf() truncates the address to a page * boundary, so if the address isn't page aligned, we'll * need enough space for the given transfer length, plus * whatever extra space is necessary to make it to the page * boundary. */ misaligned[i] = (lengths[i] + (((vm_offset_t)(*data_ptrs[i])) & PAGE_MASK) > MAXPHYS); } /* * This keeps the kernel stack of current thread from getting * swapped. In low-memory situations where the kernel stack might * otherwise get swapped out, this holds it and allows the thread * to make progress and release the kernel mapped pages sooner. * * XXX KDM should I use P_NOSWAP instead? */ PHOLD(curproc); for (i = 0; i < numbufs; i++) { /* Save the user's data address. */ mapinfo->orig[i] = *data_ptrs[i]; /* * For small buffers use malloc+copyin/copyout instead of * mapping to KVA to avoid expensive TLB shootdowns. For * small allocations malloc is backed by UMA, and so much * cheaper on SMP systems. */ if ((lengths[i] <= periph_mapmem_thresh || misaligned[i]) && ccb->ccb_h.func_code != XPT_MMC_IO) { *data_ptrs[i] = malloc(lengths[i], M_CAMPERIPH, M_WAITOK); if (dirs[i] != CAM_DIR_IN) { if (copyin(mapinfo->orig[i], *data_ptrs[i], lengths[i]) != 0) { free(*data_ptrs[i], M_CAMPERIPH); *data_ptrs[i] = mapinfo->orig[i]; goto fail; } } else bzero(*data_ptrs[i], lengths[i]); continue; } /* * Get the buffer. */ mapinfo->bp[i] = uma_zalloc(pbuf_zone, M_WAITOK); /* put our pointer in the data slot */ mapinfo->bp[i]->b_data = *data_ptrs[i]; /* set the transfer length, we know it's < MAXPHYS */ mapinfo->bp[i]->b_bufsize = lengths[i]; /* set the direction */ mapinfo->bp[i]->b_iocmd = (dirs[i] == CAM_DIR_OUT) ? BIO_WRITE : BIO_READ; /* Map the buffer into kernel memory. */ if (vmapbuf(mapinfo->bp[i], 1) < 0) { uma_zfree(pbuf_zone, mapinfo->bp[i]); goto fail; } /* set our pointer to the new mapped area */ *data_ptrs[i] = mapinfo->bp[i]->b_data; } /* * Now that we've gotten this far, change ownership to the kernel * of the buffers so that we don't run afoul of returning to user * space with locks (on the buffer) held. */ for (i = 0; i < numbufs; i++) { if (mapinfo->bp[i]) BUF_KERNPROC(mapinfo->bp[i]); } mapinfo->num_bufs_used = numbufs; return(0); fail: for (i--; i >= 0; i--) { if (mapinfo->bp[i]) { vunmapbuf(mapinfo->bp[i]); uma_zfree(pbuf_zone, mapinfo->bp[i]); } else free(*data_ptrs[i], M_CAMPERIPH); *data_ptrs[i] = mapinfo->orig[i]; } PRELE(curproc); return(EACCES); } /* * Unmap memory segments mapped into kernel virtual address space by * cam_periph_mapmem(). */ void cam_periph_unmapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo) { int numbufs, i; u_int8_t **data_ptrs[CAM_PERIPH_MAXMAPS]; u_int32_t lengths[CAM_PERIPH_MAXMAPS]; u_int32_t dirs[CAM_PERIPH_MAXMAPS]; if (mapinfo->num_bufs_used <= 0) { /* nothing to free and the process wasn't held. */ return; } switch (ccb->ccb_h.func_code) { case XPT_DEV_MATCH: if (ccb->cdm.pattern_buf_len > 0) { data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns; lengths[0] = ccb->cdm.pattern_buf_len; dirs[0] = CAM_DIR_OUT; data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches; lengths[1] = ccb->cdm.match_buf_len; dirs[1] = CAM_DIR_IN; numbufs = 2; } else { data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches; lengths[0] = ccb->cdm.match_buf_len; dirs[0] = CAM_DIR_IN; numbufs = 1; } break; case XPT_SCSI_IO: case XPT_CONT_TARGET_IO: data_ptrs[0] = &ccb->csio.data_ptr; lengths[0] = ccb->csio.dxfer_len; dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; numbufs = 1; break; case XPT_ATA_IO: data_ptrs[0] = &ccb->ataio.data_ptr; lengths[0] = ccb->ataio.dxfer_len; dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; numbufs = 1; break; case XPT_MMC_IO: data_ptrs[0] = (u_int8_t **)&ccb->mmcio.cmd.data; lengths[0] = sizeof(struct mmc_data *); dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; data_ptrs[1] = (u_int8_t **)&ccb->mmcio.cmd.data->data; lengths[1] = ccb->mmcio.cmd.data->len; dirs[1] = ccb->ccb_h.flags & CAM_DIR_MASK; numbufs = 2; break; case XPT_SMP_IO: data_ptrs[0] = &ccb->smpio.smp_request; lengths[0] = ccb->smpio.smp_request_len; dirs[0] = CAM_DIR_OUT; data_ptrs[1] = &ccb->smpio.smp_response; lengths[1] = ccb->smpio.smp_response_len; dirs[1] = CAM_DIR_IN; numbufs = 2; break; case XPT_NVME_IO: case XPT_NVME_ADMIN: data_ptrs[0] = &ccb->nvmeio.data_ptr; lengths[0] = ccb->nvmeio.dxfer_len; dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; numbufs = 1; break; case XPT_DEV_ADVINFO: data_ptrs[0] = (uint8_t **)&ccb->cdai.buf; lengths[0] = ccb->cdai.bufsiz; dirs[0] = CAM_DIR_IN; numbufs = 1; break; default: /* allow ourselves to be swapped once again */ PRELE(curproc); return; break; /* NOTREACHED */ } for (i = 0; i < numbufs; i++) { if (mapinfo->bp[i]) { /* unmap the buffer */ vunmapbuf(mapinfo->bp[i]); /* release the buffer */ uma_zfree(pbuf_zone, mapinfo->bp[i]); } else { if (dirs[i] != CAM_DIR_OUT) { copyout(*data_ptrs[i], mapinfo->orig[i], lengths[i]); } free(*data_ptrs[i], M_CAMPERIPH); } /* Set the user's pointer back to the original value */ *data_ptrs[i] = mapinfo->orig[i]; } /* allow ourselves to be swapped once again */ PRELE(curproc); } int cam_periph_ioctl(struct cam_periph *periph, u_long cmd, caddr_t addr, int (*error_routine)(union ccb *ccb, cam_flags camflags, u_int32_t sense_flags)) { union ccb *ccb; int error; int found; error = found = 0; switch(cmd){ case CAMGETPASSTHRU: ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); xpt_setup_ccb(&ccb->ccb_h, ccb->ccb_h.path, CAM_PRIORITY_NORMAL); ccb->ccb_h.func_code = XPT_GDEVLIST; /* * Basically, the point of this is that we go through * getting the list of devices, until we find a passthrough * device. In the current version of the CAM code, the * only way to determine what type of device we're dealing * with is by its name. */ while (found == 0) { ccb->cgdl.index = 0; ccb->cgdl.status = CAM_GDEVLIST_MORE_DEVS; while (ccb->cgdl.status == CAM_GDEVLIST_MORE_DEVS) { /* we want the next device in the list */ xpt_action(ccb); if (strncmp(ccb->cgdl.periph_name, "pass", 4) == 0){ found = 1; break; } } if ((ccb->cgdl.status == CAM_GDEVLIST_LAST_DEVICE) && (found == 0)) { ccb->cgdl.periph_name[0] = '\0'; ccb->cgdl.unit_number = 0; break; } } /* copy the result back out */ bcopy(ccb, addr, sizeof(union ccb)); /* and release the ccb */ xpt_release_ccb(ccb); break; default: error = ENOTTY; break; } return(error); } static void cam_periph_done_panic(struct cam_periph *periph, union ccb *done_ccb) { panic("%s: already done with ccb %p", __func__, done_ccb); } static void cam_periph_done(struct cam_periph *periph, union ccb *done_ccb) { /* Caller will release the CCB */ xpt_path_assert(done_ccb->ccb_h.path, MA_OWNED); done_ccb->ccb_h.cbfcnp = cam_periph_done_panic; wakeup(&done_ccb->ccb_h.cbfcnp); } static void cam_periph_ccbwait(union ccb *ccb) { if ((ccb->ccb_h.func_code & XPT_FC_QUEUED) != 0) { while (ccb->ccb_h.cbfcnp != cam_periph_done_panic) xpt_path_sleep(ccb->ccb_h.path, &ccb->ccb_h.cbfcnp, PRIBIO, "cbwait", 0); } KASSERT(ccb->ccb_h.pinfo.index == CAM_UNQUEUED_INDEX && (ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_INPROG, ("%s: proceeding with incomplete ccb: ccb=%p, func_code=%#x, " "status=%#x, index=%d", __func__, ccb, ccb->ccb_h.func_code, ccb->ccb_h.status, ccb->ccb_h.pinfo.index)); } /* * Dispatch a CCB and wait for it to complete. If the CCB has set a * callback function (ccb->ccb_h.cbfcnp), it will be overwritten and lost. */ int cam_periph_runccb(union ccb *ccb, int (*error_routine)(union ccb *ccb, cam_flags camflags, u_int32_t sense_flags), cam_flags camflags, u_int32_t sense_flags, struct devstat *ds) { struct bintime *starttime; struct bintime ltime; int error; bool must_poll; uint32_t timeout = 1; starttime = NULL; xpt_path_assert(ccb->ccb_h.path, MA_OWNED); KASSERT((ccb->ccb_h.flags & CAM_UNLOCKED) == 0, ("%s: ccb=%p, func_code=%#x, flags=%#x", __func__, ccb, ccb->ccb_h.func_code, ccb->ccb_h.flags)); /* * If the user has supplied a stats structure, and if we understand * this particular type of ccb, record the transaction start. */ if (ds != NULL && (ccb->ccb_h.func_code == XPT_SCSI_IO || ccb->ccb_h.func_code == XPT_ATA_IO || ccb->ccb_h.func_code == XPT_NVME_IO)) { starttime = <ime; binuptime(starttime); devstat_start_transaction(ds, starttime); } /* * We must poll the I/O while we're dumping. The scheduler is normally * stopped for dumping, except when we call doadump from ddb. While the * scheduler is running in this case, we still need to poll the I/O to * avoid sleeping waiting for the ccb to complete. * * A panic triggered dump stops the scheduler, any callback from the * shutdown_post_sync event will run with the scheduler stopped, but * before we're officially dumping. To avoid hanging in adashutdown * initiated commands (or other similar situations), we have to test for * either SCHEDULER_STOPPED() here as well. * * To avoid locking problems, dumping/polling callers must call * without a periph lock held. */ must_poll = dumping || SCHEDULER_STOPPED(); ccb->ccb_h.cbfcnp = cam_periph_done; /* * If we're polling, then we need to ensure that we have ample resources * in the periph. cam_periph_error can reschedule the ccb by calling * xpt_action and returning ERESTART, so we have to effect the polling * in the do loop below. */ if (must_poll) { timeout = xpt_poll_setup(ccb); } if (timeout == 0) { ccb->ccb_h.status = CAM_RESRC_UNAVAIL; error = EBUSY; } else { xpt_action(ccb); do { if (must_poll) { xpt_pollwait(ccb, timeout); timeout = ccb->ccb_h.timeout * 10; } else { cam_periph_ccbwait(ccb); } if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) error = 0; else if (error_routine != NULL) { ccb->ccb_h.cbfcnp = cam_periph_done; error = (*error_routine)(ccb, camflags, sense_flags); } else error = 0; } while (error == ERESTART); } if ((ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { cam_release_devq(ccb->ccb_h.path, /* relsim_flags */0, /* openings */0, /* timeout */0, /* getcount_only */ FALSE); ccb->ccb_h.status &= ~CAM_DEV_QFRZN; } if (ds != NULL) { uint32_t bytes; devstat_tag_type tag; bool valid = true; if (ccb->ccb_h.func_code == XPT_SCSI_IO) { bytes = ccb->csio.dxfer_len - ccb->csio.resid; tag = (devstat_tag_type)(ccb->csio.tag_action & 0x3); } else if (ccb->ccb_h.func_code == XPT_ATA_IO) { bytes = ccb->ataio.dxfer_len - ccb->ataio.resid; tag = (devstat_tag_type)0; } else if (ccb->ccb_h.func_code == XPT_NVME_IO) { bytes = ccb->nvmeio.dxfer_len; /* NB: resid no possible */ tag = (devstat_tag_type)0; } else { valid = false; } if (valid) devstat_end_transaction(ds, bytes, tag, ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) ? DEVSTAT_NO_DATA : (ccb->ccb_h.flags & CAM_DIR_OUT) ? DEVSTAT_WRITE : DEVSTAT_READ, NULL, starttime); } return(error); } void cam_freeze_devq(struct cam_path *path) { struct ccb_hdr ccb_h; CAM_DEBUG(path, CAM_DEBUG_TRACE, ("cam_freeze_devq\n")); xpt_setup_ccb(&ccb_h, path, /*priority*/1); ccb_h.func_code = XPT_NOOP; ccb_h.flags = CAM_DEV_QFREEZE; xpt_action((union ccb *)&ccb_h); } u_int32_t cam_release_devq(struct cam_path *path, u_int32_t relsim_flags, u_int32_t openings, u_int32_t arg, int getcount_only) { struct ccb_relsim crs; CAM_DEBUG(path, CAM_DEBUG_TRACE, ("cam_release_devq(%u, %u, %u, %d)\n", relsim_flags, openings, arg, getcount_only)); xpt_setup_ccb(&crs.ccb_h, path, CAM_PRIORITY_NORMAL); crs.ccb_h.func_code = XPT_REL_SIMQ; crs.ccb_h.flags = getcount_only ? CAM_DEV_QFREEZE : 0; crs.release_flags = relsim_flags; crs.openings = openings; crs.release_timeout = arg; xpt_action((union ccb *)&crs); return (crs.qfrozen_cnt); } #define saved_ccb_ptr ppriv_ptr0 static void camperiphdone(struct cam_periph *periph, union ccb *done_ccb) { union ccb *saved_ccb; cam_status status; struct scsi_start_stop_unit *scsi_cmd; int error = 0, error_code, sense_key, asc, ascq; scsi_cmd = (struct scsi_start_stop_unit *) &done_ccb->csio.cdb_io.cdb_bytes; status = done_ccb->ccb_h.status; if ((status & CAM_STATUS_MASK) != CAM_REQ_CMP) { if (scsi_extract_sense_ccb(done_ccb, &error_code, &sense_key, &asc, &ascq)) { /* * If the error is "invalid field in CDB", * and the load/eject flag is set, turn the * flag off and try again. This is just in * case the drive in question barfs on the * load eject flag. The CAM code should set * the load/eject flag by default for * removable media. */ if ((scsi_cmd->opcode == START_STOP_UNIT) && ((scsi_cmd->how & SSS_LOEJ) != 0) && (asc == 0x24) && (ascq == 0x00)) { scsi_cmd->how &= ~SSS_LOEJ; if (status & CAM_DEV_QFRZN) { cam_release_devq(done_ccb->ccb_h.path, 0, 0, 0, 0); done_ccb->ccb_h.status &= ~CAM_DEV_QFRZN; } xpt_action(done_ccb); goto out; } } error = cam_periph_error(done_ccb, 0, SF_RETRY_UA | SF_NO_PRINT); if (error == ERESTART) goto out; if (done_ccb->ccb_h.status & CAM_DEV_QFRZN) { cam_release_devq(done_ccb->ccb_h.path, 0, 0, 0, 0); done_ccb->ccb_h.status &= ~CAM_DEV_QFRZN; } } else { /* * If we have successfully taken a device from the not * ready to ready state, re-scan the device and re-get * the inquiry information. Many devices (mostly disks) * don't properly report their inquiry information unless * they are spun up. */ if (scsi_cmd->opcode == START_STOP_UNIT) xpt_async(AC_INQ_CHANGED, done_ccb->ccb_h.path, NULL); } /* If we tried long wait and still failed, remember that. */ if ((periph->flags & CAM_PERIPH_RECOVERY_WAIT) && (done_ccb->csio.cdb_io.cdb_bytes[0] == TEST_UNIT_READY)) { periph->flags &= ~CAM_PERIPH_RECOVERY_WAIT; if (error != 0 && done_ccb->ccb_h.retry_count == 0) periph->flags |= CAM_PERIPH_RECOVERY_WAIT_FAILED; } /* * After recovery action(s) completed, return to the original CCB. * If the recovery CCB has failed, considering its own possible * retries and recovery, assume we are back in state where we have * been originally, but without recovery hopes left. In such case, * after the final attempt below, we cancel any further retries, * blocking by that also any new recovery attempts for this CCB, * and the result will be the final one returned to the CCB owher. */ saved_ccb = (union ccb *)done_ccb->ccb_h.saved_ccb_ptr; bcopy(saved_ccb, done_ccb, sizeof(*done_ccb)); xpt_free_ccb(saved_ccb); if (done_ccb->ccb_h.cbfcnp != camperiphdone) periph->flags &= ~CAM_PERIPH_RECOVERY_INPROG; if (error != 0) done_ccb->ccb_h.retry_count = 0; xpt_action(done_ccb); out: /* Drop freeze taken due to CAM_DEV_QFREEZE flag set. */ cam_release_devq(done_ccb->ccb_h.path, 0, 0, 0, 0); } /* * Generic Async Event handler. Peripheral drivers usually * filter out the events that require personal attention, * and leave the rest to this function. */ void cam_periph_async(struct cam_periph *periph, u_int32_t code, struct cam_path *path, void *arg) { switch (code) { case AC_LOST_DEVICE: cam_periph_invalidate(periph); break; default: break; } } void cam_periph_bus_settle(struct cam_periph *periph, u_int bus_settle) { struct ccb_getdevstats cgds; xpt_setup_ccb(&cgds.ccb_h, periph->path, CAM_PRIORITY_NORMAL); cgds.ccb_h.func_code = XPT_GDEV_STATS; xpt_action((union ccb *)&cgds); cam_periph_freeze_after_event(periph, &cgds.last_reset, bus_settle); } void cam_periph_freeze_after_event(struct cam_periph *periph, struct timeval* event_time, u_int duration_ms) { struct timeval delta; struct timeval duration_tv; if (!timevalisset(event_time)) return; microtime(&delta); timevalsub(&delta, event_time); duration_tv.tv_sec = duration_ms / 1000; duration_tv.tv_usec = (duration_ms % 1000) * 1000; if (timevalcmp(&delta, &duration_tv, <)) { timevalsub(&duration_tv, &delta); duration_ms = duration_tv.tv_sec * 1000; duration_ms += duration_tv.tv_usec / 1000; cam_freeze_devq(periph->path); cam_release_devq(periph->path, RELSIM_RELEASE_AFTER_TIMEOUT, /*reduction*/0, /*timeout*/duration_ms, /*getcount_only*/0); } } static int camperiphscsistatuserror(union ccb *ccb, union ccb **orig_ccb, cam_flags camflags, u_int32_t sense_flags, int *openings, u_int32_t *relsim_flags, u_int32_t *timeout, u_int32_t *action, const char **action_string) { struct cam_periph *periph; int error; switch (ccb->csio.scsi_status) { case SCSI_STATUS_OK: case SCSI_STATUS_COND_MET: case SCSI_STATUS_INTERMED: case SCSI_STATUS_INTERMED_COND_MET: error = 0; break; case SCSI_STATUS_CMD_TERMINATED: case SCSI_STATUS_CHECK_COND: error = camperiphscsisenseerror(ccb, orig_ccb, camflags, sense_flags, openings, relsim_flags, timeout, action, action_string); break; case SCSI_STATUS_QUEUE_FULL: { /* no decrement */ struct ccb_getdevstats cgds; /* * First off, find out what the current * transaction counts are. */ xpt_setup_ccb(&cgds.ccb_h, ccb->ccb_h.path, CAM_PRIORITY_NORMAL); cgds.ccb_h.func_code = XPT_GDEV_STATS; xpt_action((union ccb *)&cgds); /* * If we were the only transaction active, treat * the QUEUE FULL as if it were a BUSY condition. */ if (cgds.dev_active != 0) { int total_openings; /* * Reduce the number of openings to * be 1 less than the amount it took * to get a queue full bounded by the * minimum allowed tag count for this * device. */ total_openings = cgds.dev_active + cgds.dev_openings; *openings = cgds.dev_active; if (*openings < cgds.mintags) *openings = cgds.mintags; if (*openings < total_openings) *relsim_flags = RELSIM_ADJUST_OPENINGS; else { /* * Some devices report queue full for * temporary resource shortages. For * this reason, we allow a minimum * tag count to be entered via a * quirk entry to prevent the queue * count on these devices from falling * to a pessimisticly low value. We * still wait for the next successful * completion, however, before queueing * more transactions to the device. */ *relsim_flags = RELSIM_RELEASE_AFTER_CMDCMPLT; } *timeout = 0; error = ERESTART; *action &= ~SSQ_PRINT_SENSE; break; } /* FALLTHROUGH */ } case SCSI_STATUS_BUSY: /* * Restart the queue after either another * command completes or a 1 second timeout. */ periph = xpt_path_periph(ccb->ccb_h.path); if (periph->flags & CAM_PERIPH_INVALID) { error = EIO; *action_string = "Periph was invalidated"; } else if ((sense_flags & SF_RETRY_BUSY) != 0 || ccb->ccb_h.retry_count > 0) { if ((sense_flags & SF_RETRY_BUSY) == 0) ccb->ccb_h.retry_count--; error = ERESTART; *relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT | RELSIM_RELEASE_AFTER_CMDCMPLT; *timeout = 1000; } else { error = EIO; *action_string = "Retries exhausted"; } break; case SCSI_STATUS_RESERV_CONFLICT: default: error = EIO; break; } return (error); } static int camperiphscsisenseerror(union ccb *ccb, union ccb **orig, cam_flags camflags, u_int32_t sense_flags, int *openings, u_int32_t *relsim_flags, u_int32_t *timeout, u_int32_t *action, const char **action_string) { struct cam_periph *periph; union ccb *orig_ccb = ccb; int error, recoveryccb; #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) if (ccb->ccb_h.func_code == XPT_SCSI_IO && ccb->csio.bio != NULL) biotrack(ccb->csio.bio, __func__); #endif periph = xpt_path_periph(ccb->ccb_h.path); recoveryccb = (ccb->ccb_h.cbfcnp == camperiphdone); if ((periph->flags & CAM_PERIPH_RECOVERY_INPROG) && !recoveryccb) { /* * If error recovery is already in progress, don't attempt * to process this error, but requeue it unconditionally * and attempt to process it once error recovery has * completed. This failed command is probably related to * the error that caused the currently active error recovery * action so our current recovery efforts should also * address this command. Be aware that the error recovery * code assumes that only one recovery action is in progress * on a particular peripheral instance at any given time * (e.g. only one saved CCB for error recovery) so it is * imperitive that we don't violate this assumption. */ error = ERESTART; *action &= ~SSQ_PRINT_SENSE; } else { scsi_sense_action err_action; struct ccb_getdev cgd; /* * Grab the inquiry data for this device. */ xpt_setup_ccb(&cgd.ccb_h, ccb->ccb_h.path, CAM_PRIORITY_NORMAL); cgd.ccb_h.func_code = XPT_GDEV_TYPE; xpt_action((union ccb *)&cgd); err_action = scsi_error_action(&ccb->csio, &cgd.inq_data, sense_flags); error = err_action & SS_ERRMASK; /* * Do not autostart sequential access devices * to avoid unexpected tape loading. */ if ((err_action & SS_MASK) == SS_START && SID_TYPE(&cgd.inq_data) == T_SEQUENTIAL) { *action_string = "Will not autostart a " "sequential access device"; goto sense_error_done; } /* * Avoid recovery recursion if recovery action is the same. */ if ((err_action & SS_MASK) >= SS_START && recoveryccb) { if (((err_action & SS_MASK) == SS_START && ccb->csio.cdb_io.cdb_bytes[0] == START_STOP_UNIT) || ((err_action & SS_MASK) == SS_TUR && (ccb->csio.cdb_io.cdb_bytes[0] == TEST_UNIT_READY))) { err_action = SS_RETRY|SSQ_DECREMENT_COUNT|EIO; *relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT; *timeout = 500; } } /* * If the recovery action will consume a retry, * make sure we actually have retries available. */ if ((err_action & SSQ_DECREMENT_COUNT) != 0) { if (ccb->ccb_h.retry_count > 0 && (periph->flags & CAM_PERIPH_INVALID) == 0) ccb->ccb_h.retry_count--; else { *action_string = "Retries exhausted"; goto sense_error_done; } } if ((err_action & SS_MASK) >= SS_START) { /* * Do common portions of commands that * use recovery CCBs. */ orig_ccb = xpt_alloc_ccb_nowait(); if (orig_ccb == NULL) { *action_string = "Can't allocate recovery CCB"; goto sense_error_done; } /* * Clear freeze flag for original request here, as * this freeze will be dropped as part of ERESTART. */ ccb->ccb_h.status &= ~CAM_DEV_QFRZN; bcopy(ccb, orig_ccb, sizeof(*orig_ccb)); } switch (err_action & SS_MASK) { case SS_NOP: *action_string = "No recovery action needed"; error = 0; break; case SS_RETRY: *action_string = "Retrying command (per sense data)"; error = ERESTART; break; case SS_FAIL: *action_string = "Unretryable error"; break; case SS_START: { int le; /* * Send a start unit command to the device, and * then retry the command. */ *action_string = "Attempting to start unit"; periph->flags |= CAM_PERIPH_RECOVERY_INPROG; /* * Check for removable media and set * load/eject flag appropriately. */ if (SID_IS_REMOVABLE(&cgd.inq_data)) le = TRUE; else le = FALSE; scsi_start_stop(&ccb->csio, /*retries*/1, camperiphdone, MSG_SIMPLE_Q_TAG, /*start*/TRUE, /*load/eject*/le, /*immediate*/FALSE, SSD_FULL_SIZE, /*timeout*/50000); break; } case SS_TUR: { /* * Send a Test Unit Ready to the device. * If the 'many' flag is set, we send 120 * test unit ready commands, one every half * second. Otherwise, we just send one TUR. * We only want to do this if the retry * count has not been exhausted. */ int retries; if ((err_action & SSQ_MANY) != 0 && (periph->flags & CAM_PERIPH_RECOVERY_WAIT_FAILED) == 0) { periph->flags |= CAM_PERIPH_RECOVERY_WAIT; *action_string = "Polling device for readiness"; retries = 120; } else { *action_string = "Testing device for readiness"; retries = 1; } periph->flags |= CAM_PERIPH_RECOVERY_INPROG; scsi_test_unit_ready(&ccb->csio, retries, camperiphdone, MSG_SIMPLE_Q_TAG, SSD_FULL_SIZE, /*timeout*/5000); /* * Accomplish our 500ms delay by deferring * the release of our device queue appropriately. */ *relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT; *timeout = 500; break; } default: panic("Unhandled error action %x", err_action); } if ((err_action & SS_MASK) >= SS_START) { /* * Drop the priority, so that the recovery * CCB is the first to execute. Freeze the queue * after this command is sent so that we can * restore the old csio and have it queued in * the proper order before we release normal * transactions to the device. */ ccb->ccb_h.pinfo.priority--; ccb->ccb_h.flags |= CAM_DEV_QFREEZE; ccb->ccb_h.saved_ccb_ptr = orig_ccb; error = ERESTART; *orig = orig_ccb; } sense_error_done: *action = err_action; } return (error); } /* * Generic error handler. Peripheral drivers usually filter * out the errors that they handle in a unique manner, then * call this function. */ int cam_periph_error(union ccb *ccb, cam_flags camflags, u_int32_t sense_flags) { struct cam_path *newpath; union ccb *orig_ccb, *scan_ccb; struct cam_periph *periph; const char *action_string; cam_status status; int frozen, error, openings, devctl_err; u_int32_t action, relsim_flags, timeout; action = SSQ_PRINT_SENSE; periph = xpt_path_periph(ccb->ccb_h.path); action_string = NULL; status = ccb->ccb_h.status; frozen = (status & CAM_DEV_QFRZN) != 0; status &= CAM_STATUS_MASK; devctl_err = openings = relsim_flags = timeout = 0; orig_ccb = ccb; /* Filter the errors that should be reported via devctl */ switch (ccb->ccb_h.status & CAM_STATUS_MASK) { case CAM_CMD_TIMEOUT: case CAM_REQ_ABORTED: case CAM_REQ_CMP_ERR: case CAM_REQ_TERMIO: case CAM_UNREC_HBA_ERROR: case CAM_DATA_RUN_ERR: case CAM_SCSI_STATUS_ERROR: case CAM_ATA_STATUS_ERROR: case CAM_SMP_STATUS_ERROR: devctl_err++; break; default: break; } switch (status) { case CAM_REQ_CMP: error = 0; action &= ~SSQ_PRINT_SENSE; break; case CAM_SCSI_STATUS_ERROR: error = camperiphscsistatuserror(ccb, &orig_ccb, camflags, sense_flags, &openings, &relsim_flags, &timeout, &action, &action_string); break; case CAM_AUTOSENSE_FAIL: error = EIO; /* we have to kill the command */ break; case CAM_UA_ABORT: case CAM_UA_TERMIO: case CAM_MSG_REJECT_REC: /* XXX Don't know that these are correct */ error = EIO; break; case CAM_SEL_TIMEOUT: if ((camflags & CAM_RETRY_SELTO) != 0) { if (ccb->ccb_h.retry_count > 0 && (periph->flags & CAM_PERIPH_INVALID) == 0) { ccb->ccb_h.retry_count--; error = ERESTART; /* * Wait a bit to give the device * time to recover before we try again. */ relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT; timeout = periph_selto_delay; break; } action_string = "Retries exhausted"; } /* FALLTHROUGH */ case CAM_DEV_NOT_THERE: error = ENXIO; action = SSQ_LOST; break; case CAM_REQ_INVALID: case CAM_PATH_INVALID: case CAM_NO_HBA: case CAM_PROVIDE_FAIL: case CAM_REQ_TOO_BIG: case CAM_LUN_INVALID: case CAM_TID_INVALID: case CAM_FUNC_NOTAVAIL: error = EINVAL; break; case CAM_SCSI_BUS_RESET: case CAM_BDR_SENT: /* * Commands that repeatedly timeout and cause these * kinds of error recovery actions, should return * CAM_CMD_TIMEOUT, which allows us to safely assume * that this command was an innocent bystander to * these events and should be unconditionally * retried. */ case CAM_REQUEUE_REQ: /* Unconditional requeue if device is still there */ if (periph->flags & CAM_PERIPH_INVALID) { action_string = "Periph was invalidated"; error = EIO; } else if (sense_flags & SF_NO_RETRY) { error = EIO; action_string = "Retry was blocked"; } else { error = ERESTART; action &= ~SSQ_PRINT_SENSE; } break; case CAM_RESRC_UNAVAIL: /* Wait a bit for the resource shortage to abate. */ timeout = periph_noresrc_delay; /* FALLTHROUGH */ case CAM_BUSY: if (timeout == 0) { /* Wait a bit for the busy condition to abate. */ timeout = periph_busy_delay; } relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT; /* FALLTHROUGH */ case CAM_ATA_STATUS_ERROR: case CAM_REQ_CMP_ERR: case CAM_CMD_TIMEOUT: case CAM_UNEXP_BUSFREE: case CAM_UNCOR_PARITY: case CAM_DATA_RUN_ERR: default: if (periph->flags & CAM_PERIPH_INVALID) { error = EIO; action_string = "Periph was invalidated"; } else if (ccb->ccb_h.retry_count == 0) { error = EIO; action_string = "Retries exhausted"; } else if (sense_flags & SF_NO_RETRY) { error = EIO; action_string = "Retry was blocked"; } else { ccb->ccb_h.retry_count--; error = ERESTART; } break; } if ((sense_flags & SF_PRINT_ALWAYS) || CAM_DEBUGGED(ccb->ccb_h.path, CAM_DEBUG_INFO)) action |= SSQ_PRINT_SENSE; else if (sense_flags & SF_NO_PRINT) action &= ~SSQ_PRINT_SENSE; if ((action & SSQ_PRINT_SENSE) != 0) cam_error_print(orig_ccb, CAM_ESF_ALL, CAM_EPF_ALL); if (error != 0 && (action & SSQ_PRINT_SENSE) != 0) { if (error != ERESTART) { if (action_string == NULL) action_string = "Unretryable error"; xpt_print(ccb->ccb_h.path, "Error %d, %s\n", error, action_string); } else if (action_string != NULL) xpt_print(ccb->ccb_h.path, "%s\n", action_string); else { xpt_print(ccb->ccb_h.path, "Retrying command, %d more tries remain\n", ccb->ccb_h.retry_count); } } if (devctl_err && (error != 0 || (action & SSQ_PRINT_SENSE) != 0)) cam_periph_devctl_notify(orig_ccb); if ((action & SSQ_LOST) != 0) { lun_id_t lun_id; /* * For a selection timeout, we consider all of the LUNs on * the target to be gone. If the status is CAM_DEV_NOT_THERE, * then we only get rid of the device(s) specified by the * path in the original CCB. */ if (status == CAM_SEL_TIMEOUT) lun_id = CAM_LUN_WILDCARD; else lun_id = xpt_path_lun_id(ccb->ccb_h.path); /* Should we do more if we can't create the path?? */ if (xpt_create_path(&newpath, periph, xpt_path_path_id(ccb->ccb_h.path), xpt_path_target_id(ccb->ccb_h.path), lun_id) == CAM_REQ_CMP) { /* * Let peripheral drivers know that this * device has gone away. */ xpt_async(AC_LOST_DEVICE, newpath, NULL); xpt_free_path(newpath); } } /* Broadcast UNIT ATTENTIONs to all periphs. */ if ((action & SSQ_UA) != 0) xpt_async(AC_UNIT_ATTENTION, orig_ccb->ccb_h.path, orig_ccb); /* Rescan target on "Reported LUNs data has changed" */ if ((action & SSQ_RESCAN) != 0) { if (xpt_create_path(&newpath, NULL, xpt_path_path_id(ccb->ccb_h.path), xpt_path_target_id(ccb->ccb_h.path), CAM_LUN_WILDCARD) == CAM_REQ_CMP) { scan_ccb = xpt_alloc_ccb_nowait(); if (scan_ccb != NULL) { scan_ccb->ccb_h.path = newpath; scan_ccb->ccb_h.func_code = XPT_SCAN_TGT; scan_ccb->crcn.flags = 0; xpt_rescan(scan_ccb); } else { xpt_print(newpath, "Can't allocate CCB to rescan target\n"); xpt_free_path(newpath); } } } /* Attempt a retry */ if (error == ERESTART || error == 0) { if (frozen != 0) ccb->ccb_h.status &= ~CAM_DEV_QFRZN; if (error == ERESTART) xpt_action(ccb); if (frozen != 0) cam_release_devq(ccb->ccb_h.path, relsim_flags, openings, timeout, /*getcount_only*/0); } return (error); } #define CAM_PERIPH_DEVD_MSG_SIZE 256 static void cam_periph_devctl_notify(union ccb *ccb) { struct cam_periph *periph; struct ccb_getdev *cgd; struct sbuf sb; int serr, sk, asc, ascq; char *sbmsg, *type; sbmsg = malloc(CAM_PERIPH_DEVD_MSG_SIZE, M_CAMPERIPH, M_NOWAIT); if (sbmsg == NULL) return; sbuf_new(&sb, sbmsg, CAM_PERIPH_DEVD_MSG_SIZE, SBUF_FIXEDLEN); periph = xpt_path_periph(ccb->ccb_h.path); sbuf_printf(&sb, "device=%s%d ", periph->periph_name, periph->unit_number); sbuf_printf(&sb, "serial=\""); if ((cgd = (struct ccb_getdev *)xpt_alloc_ccb_nowait()) != NULL) { xpt_setup_ccb(&cgd->ccb_h, ccb->ccb_h.path, CAM_PRIORITY_NORMAL); cgd->ccb_h.func_code = XPT_GDEV_TYPE; xpt_action((union ccb *)cgd); if (cgd->ccb_h.status == CAM_REQ_CMP) sbuf_bcat(&sb, cgd->serial_num, cgd->serial_num_len); xpt_free_ccb((union ccb *)cgd); } sbuf_printf(&sb, "\" "); sbuf_printf(&sb, "cam_status=\"0x%x\" ", ccb->ccb_h.status); switch (ccb->ccb_h.status & CAM_STATUS_MASK) { case CAM_CMD_TIMEOUT: sbuf_printf(&sb, "timeout=%d ", ccb->ccb_h.timeout); type = "timeout"; break; case CAM_SCSI_STATUS_ERROR: sbuf_printf(&sb, "scsi_status=%d ", ccb->csio.scsi_status); if (scsi_extract_sense_ccb(ccb, &serr, &sk, &asc, &ascq)) sbuf_printf(&sb, "scsi_sense=\"%02x %02x %02x %02x\" ", serr, sk, asc, ascq); type = "error"; break; case CAM_ATA_STATUS_ERROR: sbuf_printf(&sb, "RES=\""); ata_res_sbuf(&ccb->ataio.res, &sb); sbuf_printf(&sb, "\" "); type = "error"; break; default: type = "error"; break; } if (ccb->ccb_h.func_code == XPT_SCSI_IO) { sbuf_printf(&sb, "CDB=\""); scsi_cdb_sbuf(scsiio_cdb_ptr(&ccb->csio), &sb); sbuf_printf(&sb, "\" "); } else if (ccb->ccb_h.func_code == XPT_ATA_IO) { sbuf_printf(&sb, "ACB=\""); ata_cmd_sbuf(&ccb->ataio.cmd, &sb); sbuf_printf(&sb, "\" "); } if (sbuf_finish(&sb) == 0) devctl_notify("CAM", "periph", type, sbuf_data(&sb)); sbuf_delete(&sb); free(sbmsg, M_CAMPERIPH); } /* * Sysctl to force an invalidation of the drive right now. Can be * called with CTLFLAG_MPSAFE since we take periph lock. */ int cam_periph_invalidate_sysctl(SYSCTL_HANDLER_ARGS) { struct cam_periph *periph; int error, value; periph = arg1; value = 0; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL || value != 1) return (error); cam_periph_lock(periph); cam_periph_invalidate(periph); cam_periph_unlock(periph); return (0); } Index: head/sys/cddl/compat/opensolaris/kern/opensolaris_sysevent.c =================================================================== --- head/sys/cddl/compat/opensolaris/kern/opensolaris_sysevent.c (revision 364441) +++ head/sys/cddl/compat/opensolaris/kern/opensolaris_sysevent.c (revision 364442) @@ -1,338 +1,338 @@ /*- * Copyright (c) 2010 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include -#include +#include #include #include #include #include struct sysevent { nvlist_t *se_nvl; char se_class[128]; char se_subclass[128]; char se_pub[128]; }; sysevent_t * sysevent_alloc(char *class, char *subclass, char *pub, int flag) { struct sysevent *ev; ASSERT(class != NULL); ASSERT(subclass != NULL); ASSERT(pub != NULL); ASSERT(flag == SE_SLEEP); ev = kmem_alloc(sizeof(*ev), KM_SLEEP); ev->se_nvl = NULL; strlcpy(ev->se_class, class, sizeof(ev->se_class)); strlcpy(ev->se_subclass, subclass, sizeof(ev->se_subclass)); strlcpy(ev->se_pub, pub, sizeof(ev->se_pub)); return ((sysevent_t *)ev); } void sysevent_free(sysevent_t *evp) { struct sysevent *ev = (struct sysevent *)evp; ASSERT(evp != NULL); if (ev->se_nvl != NULL) sysevent_free_attr(ev->se_nvl); kmem_free(ev, sizeof(*ev)); } int sysevent_add_attr(sysevent_attr_list_t **ev_attr_list, char *name, sysevent_value_t *se_value, int flag) { nvlist_t *nvl; int error; ASSERT(ev_attr_list != NULL); ASSERT(name != NULL); ASSERT(se_value != NULL); ASSERT(flag == SE_SLEEP); if (strlen(name) >= MAX_ATTR_NAME) return (SE_EINVAL); nvl = *ev_attr_list; if (nvl == NULL) { if (nvlist_alloc(&nvl, NV_UNIQUE_NAME_TYPE, KM_SLEEP) != 0) return (SE_ENOMEM); } error = 0; switch (se_value->value_type) { case SE_DATA_TYPE_UINT64: error = nvlist_add_uint64(nvl, name, se_value->value.sv_uint64); break; case SE_DATA_TYPE_STRING: if (strlen(se_value->value.sv_string) >= MAX_STRING_SZ) error = SE_EINVAL; if (error == 0) { error = nvlist_add_string(nvl, name, se_value->value.sv_string); } break; default: #if 0 printf("%s: type %d is not implemented\n", __func__, se_value->value_type); #endif break; } if (error != 0) { nvlist_free(nvl); return (error); } *ev_attr_list = nvl; return (0); } void sysevent_free_attr(sysevent_attr_list_t *ev_attr_list) { nvlist_free(ev_attr_list); } int sysevent_attach_attributes(sysevent_t *evp, sysevent_attr_list_t *ev_attr_list) { struct sysevent *ev = (struct sysevent *)evp; ASSERT(ev->se_nvl == NULL); ev->se_nvl = ev_attr_list; return (0); } void sysevent_detach_attributes(sysevent_t *evp) { struct sysevent *ev = (struct sysevent *)evp; ASSERT(ev->se_nvl != NULL); ev->se_nvl = NULL; } int log_sysevent(sysevent_t *evp, int flag, sysevent_id_t *eid) { struct sysevent *ev = (struct sysevent *)evp; struct sbuf *sb; const char *type; char typestr[128]; nvpair_t *elem = NULL; ASSERT(evp != NULL); ASSERT(ev->se_nvl != NULL); ASSERT(flag == SE_SLEEP); ASSERT(eid != NULL); sb = sbuf_new_auto(); if (sb == NULL) return (SE_ENOMEM); type = NULL; while ((elem = nvlist_next_nvpair(ev->se_nvl, elem)) != NULL) { switch (nvpair_type(elem)) { case DATA_TYPE_BOOLEAN: { boolean_t value; (void) nvpair_value_boolean_value(elem, &value); sbuf_printf(sb, " %s=%s", nvpair_name(elem), value ? "true" : "false"); break; } case DATA_TYPE_UINT8: { uint8_t value; (void) nvpair_value_uint8(elem, &value); sbuf_printf(sb, " %s=%hhu", nvpair_name(elem), value); break; } case DATA_TYPE_INT32: { int32_t value; (void) nvpair_value_int32(elem, &value); sbuf_printf(sb, " %s=%jd", nvpair_name(elem), (intmax_t)value); break; } case DATA_TYPE_UINT32: { uint32_t value; (void) nvpair_value_uint32(elem, &value); sbuf_printf(sb, " %s=%ju", nvpair_name(elem), (uintmax_t)value); break; } case DATA_TYPE_INT64: { int64_t value; (void) nvpair_value_int64(elem, &value); sbuf_printf(sb, " %s=%jd", nvpair_name(elem), (intmax_t)value); break; } case DATA_TYPE_UINT64: { uint64_t value; (void) nvpair_value_uint64(elem, &value); sbuf_printf(sb, " %s=%ju", nvpair_name(elem), (uintmax_t)value); break; } case DATA_TYPE_STRING: { char *value; (void) nvpair_value_string(elem, &value); sbuf_printf(sb, " %s=%s", nvpair_name(elem), value); if (strcmp(FM_CLASS, nvpair_name(elem)) == 0) type = value; break; } case DATA_TYPE_UINT8_ARRAY: { uint8_t *value; uint_t ii, nelem; (void) nvpair_value_uint8_array(elem, &value, &nelem); sbuf_printf(sb, " %s=", nvpair_name(elem)); for (ii = 0; ii < nelem; ii++) sbuf_printf(sb, "%02hhx", value[ii]); break; } case DATA_TYPE_UINT16_ARRAY: { uint16_t *value; uint_t ii, nelem; (void) nvpair_value_uint16_array(elem, &value, &nelem); sbuf_printf(sb, " %s=", nvpair_name(elem)); for (ii = 0; ii < nelem; ii++) sbuf_printf(sb, "%04hx", value[ii]); break; } case DATA_TYPE_UINT32_ARRAY: { uint32_t *value; uint_t ii, nelem; (void) nvpair_value_uint32_array(elem, &value, &nelem); sbuf_printf(sb, " %s=", nvpair_name(elem)); for (ii = 0; ii < nelem; ii++) sbuf_printf(sb, "%08jx", (uintmax_t)value[ii]); break; } case DATA_TYPE_UINT64_ARRAY: { uint64_t *value; uint_t ii, nelem; (void) nvpair_value_uint64_array(elem, &value, &nelem); sbuf_printf(sb, " %s=", nvpair_name(elem)); for (ii = 0; ii < nelem; ii++) sbuf_printf(sb, "%016jx", (uintmax_t)value[ii]); break; } default: #if 0 printf("%s: type %d is not implemented\n", __func__, nvpair_type(elem)); #endif break; } } if (sbuf_finish(sb) != 0) { sbuf_delete(sb); return (SE_ENOMEM); } if (type == NULL) type = ev->se_subclass; if (strncmp(type, "ESC_ZFS_", 8) == 0) { snprintf(typestr, sizeof(typestr), "misc.fs.zfs.%s", type + 8); type = typestr; } devctl_notify("ZFS", "ZFS", type, sbuf_data(sb)); sbuf_delete(sb); return (0); } int _ddi_log_sysevent(char *vendor, char *class, char *subclass, nvlist_t *attr_list, sysevent_id_t *eidp, int flag) { sysevent_t *ev; int ret; ASSERT(vendor != NULL); ASSERT(class != NULL); ASSERT(subclass != NULL); ASSERT(attr_list != NULL); ASSERT(eidp != NULL); ASSERT(flag == DDI_SLEEP); ev = sysevent_alloc(class, subclass, vendor, SE_SLEEP); ASSERT(ev != NULL); (void)sysevent_attach_attributes(ev, attr_list); ret = log_sysevent(ev, SE_SLEEP, eidp); sysevent_detach_attributes(ev); sysevent_free(ev); return (ret); } Index: head/sys/geom/geom_dev.c =================================================================== --- head/sys/geom/geom_dev.c (revision 364441) +++ head/sys/geom/geom_dev.c (revision 364442) @@ -1,896 +1,896 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include -#include +#include #include #include #include #include #include #include #include #include #include #include #include #include struct g_dev_softc { struct mtx sc_mtx; struct cdev *sc_dev; struct cdev *sc_alias; int sc_open; u_int sc_active; #define SC_A_DESTROY (1 << 31) #define SC_A_OPEN (1 << 30) #define SC_A_ACTIVE (SC_A_OPEN - 1) }; static d_open_t g_dev_open; static d_close_t g_dev_close; static d_strategy_t g_dev_strategy; static d_ioctl_t g_dev_ioctl; static struct cdevsw g_dev_cdevsw = { .d_version = D_VERSION, .d_open = g_dev_open, .d_close = g_dev_close, .d_read = physread, .d_write = physwrite, .d_ioctl = g_dev_ioctl, .d_strategy = g_dev_strategy, .d_name = "g_dev", .d_flags = D_DISK | D_TRACKCLOSE, }; static g_init_t g_dev_init; static g_fini_t g_dev_fini; static g_taste_t g_dev_taste; static g_orphan_t g_dev_orphan; static g_attrchanged_t g_dev_attrchanged; static g_resize_t g_dev_resize; static struct g_class g_dev_class = { .name = "DEV", .version = G_VERSION, .init = g_dev_init, .fini = g_dev_fini, .taste = g_dev_taste, .orphan = g_dev_orphan, .attrchanged = g_dev_attrchanged, .resize = g_dev_resize }; /* * We target 262144 (8 x 32768) sectors by default as this significantly * increases the throughput on commonly used SSD's with a marginal * increase in non-interruptible request latency. */ static uint64_t g_dev_del_max_sectors = 262144; SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, dev, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_DEV stuff"); SYSCTL_QUAD(_kern_geom_dev, OID_AUTO, delete_max_sectors, CTLFLAG_RW, &g_dev_del_max_sectors, 0, "Maximum number of sectors in a single " "delete request sent to the provider. Larger requests are chunked " "so they can be interrupted. (0 = disable chunking)"); static char *dumpdev = NULL; static void g_dev_init(struct g_class *mp) { dumpdev = kern_getenv("dumpdev"); } static void g_dev_fini(struct g_class *mp) { freeenv(dumpdev); dumpdev = NULL; } static int g_dev_setdumpdev(struct cdev *dev, struct diocskerneldump_arg *kda) { struct g_kerneldump kd; struct g_consumer *cp; int error, len; MPASS(dev != NULL && kda != NULL); MPASS(kda->kda_index != KDA_REMOVE); cp = dev->si_drv2; len = sizeof(kd); memset(&kd, 0, len); kd.offset = 0; kd.length = OFF_MAX; error = g_io_getattr("GEOM::kerneldump", cp, &len, &kd); if (error != 0) return (error); error = dumper_insert(&kd.di, devtoname(dev), kda); if (error == 0) dev->si_flags |= SI_DUMPDEV; return (error); } static int init_dumpdev(struct cdev *dev) { struct diocskerneldump_arg kda; struct g_consumer *cp; const char *devprefix = _PATH_DEV, *devname; int error; size_t len; bzero(&kda, sizeof(kda)); kda.kda_index = KDA_APPEND; if (dumpdev == NULL) return (0); len = strlen(devprefix); devname = devtoname(dev); if (strcmp(devname, dumpdev) != 0 && (strncmp(dumpdev, devprefix, len) != 0 || strcmp(devname, dumpdev + len) != 0)) return (0); cp = (struct g_consumer *)dev->si_drv2; error = g_access(cp, 1, 0, 0); if (error != 0) return (error); error = g_dev_setdumpdev(dev, &kda); if (error == 0) { freeenv(dumpdev); dumpdev = NULL; } (void)g_access(cp, -1, 0, 0); return (error); } static void g_dev_destroy(void *arg, int flags __unused) { struct g_consumer *cp; struct g_geom *gp; struct g_dev_softc *sc; char buf[SPECNAMELEN + 6]; g_topology_assert(); cp = arg; gp = cp->geom; sc = cp->private; g_trace(G_T_TOPOLOGY, "g_dev_destroy(%p(%s))", cp, gp->name); snprintf(buf, sizeof(buf), "cdev=%s", gp->name); devctl_notify_f("GEOM", "DEV", "DESTROY", buf, M_WAITOK); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); mtx_destroy(&sc->sc_mtx); g_free(sc); } void g_dev_print(void) { struct g_geom *gp; char const *p = ""; LIST_FOREACH(gp, &g_dev_class.geom, geom) { printf("%s%s", p, gp->name); p = " "; } printf("\n"); } static void g_dev_set_physpath(struct g_consumer *cp) { struct g_dev_softc *sc; char *physpath; int error, physpath_len; if (g_access(cp, 1, 0, 0) != 0) return; sc = cp->private; physpath_len = MAXPATHLEN; physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); g_access(cp, -1, 0, 0); if (error == 0 && strlen(physpath) != 0) { struct cdev *dev, *old_alias_dev; struct cdev **alias_devp; dev = sc->sc_dev; old_alias_dev = sc->sc_alias; alias_devp = (struct cdev **)&sc->sc_alias; make_dev_physpath_alias(MAKEDEV_WAITOK, alias_devp, dev, old_alias_dev, physpath); } else if (sc->sc_alias) { destroy_dev((struct cdev *)sc->sc_alias); sc->sc_alias = NULL; } g_free(physpath); } static void g_dev_set_media(struct g_consumer *cp) { struct g_dev_softc *sc; struct cdev *dev; char buf[SPECNAMELEN + 6]; sc = cp->private; dev = sc->sc_dev; snprintf(buf, sizeof(buf), "cdev=%s", dev->si_name); devctl_notify_f("DEVFS", "CDEV", "MEDIACHANGE", buf, M_WAITOK); devctl_notify_f("GEOM", "DEV", "MEDIACHANGE", buf, M_WAITOK); dev = sc->sc_alias; if (dev != NULL) { snprintf(buf, sizeof(buf), "cdev=%s", dev->si_name); devctl_notify_f("DEVFS", "CDEV", "MEDIACHANGE", buf, M_WAITOK); devctl_notify_f("GEOM", "DEV", "MEDIACHANGE", buf, M_WAITOK); } } static void g_dev_attrchanged(struct g_consumer *cp, const char *attr) { if (strcmp(attr, "GEOM::media") == 0) { g_dev_set_media(cp); return; } if (strcmp(attr, "GEOM::physpath") == 0) { g_dev_set_physpath(cp); return; } } static void g_dev_resize(struct g_consumer *cp) { char buf[SPECNAMELEN + 6]; snprintf(buf, sizeof(buf), "cdev=%s", cp->provider->name); devctl_notify_f("GEOM", "DEV", "SIZECHANGE", buf, M_WAITOK); } struct g_provider * g_dev_getprovider(struct cdev *dev) { struct g_consumer *cp; g_topology_assert(); if (dev == NULL) return (NULL); if (dev->si_devsw != &g_dev_cdevsw) return (NULL); cp = dev->si_drv2; return (cp->provider); } static struct g_geom * g_dev_taste(struct g_class *mp, struct g_provider *pp, int insist __unused) { struct g_geom *gp; struct g_geom_alias *gap; struct g_consumer *cp; struct g_dev_softc *sc; int error; struct cdev *dev, *adev; char buf[SPECNAMELEN + 6]; struct make_dev_args args; g_trace(G_T_TOPOLOGY, "dev_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); gp = g_new_geomf(mp, "%s", pp->name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->sc_mtx, "g_dev", NULL, MTX_DEF); cp = g_new_consumer(gp); cp->private = sc; cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); KASSERT(error == 0, ("g_dev_taste(%s) failed to g_attach, err=%d", pp->name, error)); make_dev_args_init(&args); args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; args.mda_devsw = &g_dev_cdevsw; args.mda_cr = NULL; args.mda_uid = UID_ROOT; args.mda_gid = GID_OPERATOR; args.mda_mode = 0640; args.mda_si_drv1 = sc; args.mda_si_drv2 = cp; error = make_dev_s(&args, &sc->sc_dev, "%s", gp->name); if (error != 0) { printf("%s: make_dev_p() failed (gp->name=%s, error=%d)\n", __func__, gp->name, error); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); mtx_destroy(&sc->sc_mtx); g_free(sc); return (NULL); } dev = sc->sc_dev; dev->si_flags |= SI_UNMAPPED; dev->si_iosize_max = MAXPHYS; error = init_dumpdev(dev); if (error != 0) printf("%s: init_dumpdev() failed (gp->name=%s, error=%d)\n", __func__, gp->name, error); g_dev_attrchanged(cp, "GEOM::physpath"); snprintf(buf, sizeof(buf), "cdev=%s", gp->name); devctl_notify_f("GEOM", "DEV", "CREATE", buf, M_WAITOK); /* * Now add all the aliases for this drive */ LIST_FOREACH(gap, &pp->aliases, ga_next) { error = make_dev_alias_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &adev, dev, "%s", gap->ga_alias); if (error) { printf("%s: make_dev_alias_p() failed (name=%s, error=%d)\n", __func__, gap->ga_alias, error); continue; } snprintf(buf, sizeof(buf), "cdev=%s", gap->ga_alias); devctl_notify_f("GEOM", "DEV", "CREATE", buf, M_WAITOK); } return (gp); } static int g_dev_open(struct cdev *dev, int flags, int fmt, struct thread *td) { struct g_consumer *cp; struct g_dev_softc *sc; int error, r, w, e; cp = dev->si_drv2; g_trace(G_T_ACCESS, "g_dev_open(%s, %d, %d, %p)", cp->geom->name, flags, fmt, td); r = flags & FREAD ? 1 : 0; w = flags & FWRITE ? 1 : 0; #ifdef notyet e = flags & O_EXCL ? 1 : 0; #else e = 0; #endif /* * This happens on attempt to open a device node with O_EXEC. */ if (r + w + e == 0) return (EINVAL); if (w) { /* * When running in very secure mode, do not allow * opens for writing of any disks. */ error = securelevel_ge(td->td_ucred, 2); if (error) return (error); } g_topology_lock(); error = g_access(cp, r, w, e); g_topology_unlock(); if (error == 0) { sc = dev->si_drv1; mtx_lock(&sc->sc_mtx); if (sc->sc_open == 0 && (sc->sc_active & SC_A_ACTIVE) != 0) wakeup(&sc->sc_active); sc->sc_open += r + w + e; if (sc->sc_open == 0) atomic_clear_int(&sc->sc_active, SC_A_OPEN); else atomic_set_int(&sc->sc_active, SC_A_OPEN); mtx_unlock(&sc->sc_mtx); } return (error); } static int g_dev_close(struct cdev *dev, int flags, int fmt, struct thread *td) { struct g_consumer *cp; struct g_dev_softc *sc; int error, r, w, e; cp = dev->si_drv2; g_trace(G_T_ACCESS, "g_dev_close(%s, %d, %d, %p)", cp->geom->name, flags, fmt, td); r = flags & FREAD ? -1 : 0; w = flags & FWRITE ? -1 : 0; #ifdef notyet e = flags & O_EXCL ? -1 : 0; #else e = 0; #endif /* * The vgonel(9) - caused by eg. forced unmount of devfs - calls * VOP_CLOSE(9) on devfs vnode without any FREAD or FWRITE flags, * which would result in zero deltas, which in turn would cause * panic in g_access(9). * * Note that we cannot zero the counters (ie. do "r = cp->acr" * etc) instead, because the consumer might be opened in another * devfs instance. */ if (r + w + e == 0) return (EINVAL); sc = dev->si_drv1; mtx_lock(&sc->sc_mtx); sc->sc_open += r + w + e; if (sc->sc_open == 0) atomic_clear_int(&sc->sc_active, SC_A_OPEN); else atomic_set_int(&sc->sc_active, SC_A_OPEN); while (sc->sc_open == 0 && (sc->sc_active & SC_A_ACTIVE) != 0) msleep(&sc->sc_active, &sc->sc_mtx, 0, "g_dev_close", hz / 10); mtx_unlock(&sc->sc_mtx); g_topology_lock(); error = g_access(cp, r, w, e); g_topology_unlock(); return (error); } static int g_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct g_consumer *cp; struct g_provider *pp; off_t offset, length, chunk, odd; int i, error; #ifdef COMPAT_FREEBSD12 struct diocskerneldump_arg kda_copy; #endif cp = dev->si_drv2; pp = cp->provider; /* If consumer or provider is dying, don't disturb. */ if (cp->flags & G_CF_ORPHAN) return (ENXIO); if (pp->error) return (pp->error); error = 0; KASSERT(cp->acr || cp->acw, ("Consumer with zero access count in g_dev_ioctl")); i = IOCPARM_LEN(cmd); switch (cmd) { case DIOCGSECTORSIZE: *(u_int *)data = pp->sectorsize; if (*(u_int *)data == 0) error = ENOENT; break; case DIOCGMEDIASIZE: *(off_t *)data = pp->mediasize; if (*(off_t *)data == 0) error = ENOENT; break; case DIOCGFWSECTORS: error = g_io_getattr("GEOM::fwsectors", cp, &i, data); if (error == 0 && *(u_int *)data == 0) error = ENOENT; break; case DIOCGFWHEADS: error = g_io_getattr("GEOM::fwheads", cp, &i, data); if (error == 0 && *(u_int *)data == 0) error = ENOENT; break; case DIOCGFRONTSTUFF: error = g_io_getattr("GEOM::frontstuff", cp, &i, data); break; #ifdef COMPAT_FREEBSD11 case DIOCSKERNELDUMP_FREEBSD11: { struct diocskerneldump_arg kda; gone_in(13, "FreeBSD 11.x ABI compat"); bzero(&kda, sizeof(kda)); kda.kda_encryption = KERNELDUMP_ENC_NONE; kda.kda_index = (*(u_int *)data ? 0 : KDA_REMOVE_ALL); if (kda.kda_index == KDA_REMOVE_ALL) error = dumper_remove(devtoname(dev), &kda); else error = g_dev_setdumpdev(dev, &kda); break; } #endif #ifdef COMPAT_FREEBSD12 case DIOCSKERNELDUMP_FREEBSD12: { struct diocskerneldump_arg_freebsd12 *kda12; gone_in(14, "FreeBSD 12.x ABI compat"); kda12 = (void *)data; memcpy(&kda_copy, kda12, sizeof(kda_copy)); kda_copy.kda_index = (kda12->kda12_enable ? 0 : KDA_REMOVE_ALL); explicit_bzero(kda12, sizeof(*kda12)); /* Kludge to pass kda_copy to kda in fallthrough. */ data = (void *)&kda_copy; } /* FALLTHROUGH */ #endif case DIOCSKERNELDUMP: { struct diocskerneldump_arg *kda; uint8_t *encryptedkey; kda = (struct diocskerneldump_arg *)data; if (kda->kda_index == KDA_REMOVE_ALL || kda->kda_index == KDA_REMOVE_DEV || kda->kda_index == KDA_REMOVE) { error = dumper_remove(devtoname(dev), kda); explicit_bzero(kda, sizeof(*kda)); break; } if (kda->kda_encryption != KERNELDUMP_ENC_NONE) { if (kda->kda_encryptedkeysize == 0 || kda->kda_encryptedkeysize > KERNELDUMP_ENCKEY_MAX_SIZE) { explicit_bzero(kda, sizeof(*kda)); return (EINVAL); } encryptedkey = malloc(kda->kda_encryptedkeysize, M_TEMP, M_WAITOK); error = copyin(kda->kda_encryptedkey, encryptedkey, kda->kda_encryptedkeysize); } else { encryptedkey = NULL; } if (error == 0) { kda->kda_encryptedkey = encryptedkey; error = g_dev_setdumpdev(dev, kda); } zfree(encryptedkey, M_TEMP); explicit_bzero(kda, sizeof(*kda)); break; } case DIOCGFLUSH: error = g_io_flush(cp); break; case DIOCGDELETE: offset = ((off_t *)data)[0]; length = ((off_t *)data)[1]; if ((offset % pp->sectorsize) != 0 || (length % pp->sectorsize) != 0 || length <= 0) { printf("%s: offset=%jd length=%jd\n", __func__, offset, length); error = EINVAL; break; } if ((pp->mediasize > 0) && (offset >= pp->mediasize)) { /* * Catch out-of-bounds requests here. The problem is * that due to historical GEOM I/O implementation * peculatities, g_delete_data() would always return * success for requests starting just the next byte * after providers media boundary. Condition check on * non-zero media size, since that condition would * (most likely) cause ENXIO instead. */ error = EIO; break; } while (length > 0) { chunk = length; if (g_dev_del_max_sectors != 0 && chunk > g_dev_del_max_sectors * pp->sectorsize) { chunk = g_dev_del_max_sectors * pp->sectorsize; if (pp->stripesize > 0) { odd = (offset + chunk + pp->stripeoffset) % pp->stripesize; if (chunk > odd) chunk -= odd; } } error = g_delete_data(cp, offset, chunk); length -= chunk; offset += chunk; if (error) break; /* * Since the request size can be large, the service * time can be is likewise. We make this ioctl * interruptible by checking for signals for each bio. */ if (SIGPENDING(td)) break; } break; case DIOCGIDENT: error = g_io_getattr("GEOM::ident", cp, &i, data); break; case DIOCGPROVIDERNAME: strlcpy(data, pp->name, i); break; case DIOCGSTRIPESIZE: *(off_t *)data = pp->stripesize; break; case DIOCGSTRIPEOFFSET: *(off_t *)data = pp->stripeoffset; break; case DIOCGPHYSPATH: error = g_io_getattr("GEOM::physpath", cp, &i, data); if (error == 0 && *(char *)data == '\0') error = ENOENT; break; case DIOCGATTR: { struct diocgattr_arg *arg = (struct diocgattr_arg *)data; if (arg->len > sizeof(arg->value)) { error = EINVAL; break; } error = g_io_getattr(arg->name, cp, &arg->len, &arg->value); break; } case DIOCZONECMD: { struct disk_zone_args *zone_args =(struct disk_zone_args *)data; struct disk_zone_rep_entry *new_entries, *old_entries; struct disk_zone_report *rep; size_t alloc_size; old_entries = NULL; new_entries = NULL; rep = NULL; alloc_size = 0; if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES) { rep = &zone_args->zone_params.report; #define MAXENTRIES (MAXPHYS / sizeof(struct disk_zone_rep_entry)) if (rep->entries_allocated > MAXENTRIES) rep->entries_allocated = MAXENTRIES; alloc_size = rep->entries_allocated * sizeof(struct disk_zone_rep_entry); if (alloc_size != 0) new_entries = g_malloc(alloc_size, M_WAITOK| M_ZERO); old_entries = rep->entries; rep->entries = new_entries; } error = g_io_zonecmd(zone_args, cp); if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES && alloc_size != 0 && error == 0) error = copyout(new_entries, old_entries, alloc_size); if (old_entries != NULL && rep != NULL) rep->entries = old_entries; if (new_entries != NULL) g_free(new_entries); break; } default: if (pp->geom->ioctl != NULL) { error = pp->geom->ioctl(pp, cmd, data, fflag, td); } else { error = ENOIOCTL; } } return (error); } static void g_dev_done(struct bio *bp2) { struct g_consumer *cp; struct g_dev_softc *sc; struct bio *bp; int active; cp = bp2->bio_from; sc = cp->private; bp = bp2->bio_parent; bp->bio_error = bp2->bio_error; bp->bio_completed = bp2->bio_completed; bp->bio_resid = bp->bio_length - bp2->bio_completed; if (bp2->bio_cmd == BIO_ZONE) bcopy(&bp2->bio_zone, &bp->bio_zone, sizeof(bp->bio_zone)); if (bp2->bio_error != 0) { g_trace(G_T_BIO, "g_dev_done(%p) had error %d", bp2, bp2->bio_error); bp->bio_flags |= BIO_ERROR; } else { g_trace(G_T_BIO, "g_dev_done(%p/%p) resid %ld completed %jd", bp2, bp, bp2->bio_resid, (intmax_t)bp2->bio_completed); } g_destroy_bio(bp2); active = atomic_fetchadd_int(&sc->sc_active, -1) - 1; if ((active & SC_A_ACTIVE) == 0) { if ((active & SC_A_OPEN) == 0) wakeup(&sc->sc_active); if (active & SC_A_DESTROY) g_post_event(g_dev_destroy, cp, M_NOWAIT, NULL); } biodone(bp); } static void g_dev_strategy(struct bio *bp) { struct g_consumer *cp; struct bio *bp2; struct cdev *dev; struct g_dev_softc *sc; KASSERT(bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE || bp->bio_cmd == BIO_FLUSH || bp->bio_cmd == BIO_ZONE, ("Wrong bio_cmd bio=%p cmd=%d", bp, bp->bio_cmd)); dev = bp->bio_dev; cp = dev->si_drv2; KASSERT(cp->acr || cp->acw, ("Consumer with zero access count in g_dev_strategy")); biotrack(bp, __func__); #ifdef INVARIANTS if ((bp->bio_offset % cp->provider->sectorsize) != 0 || (bp->bio_bcount % cp->provider->sectorsize) != 0) { bp->bio_resid = bp->bio_bcount; biofinish(bp, NULL, EINVAL); return; } #endif sc = dev->si_drv1; KASSERT(sc->sc_open > 0, ("Closed device in g_dev_strategy")); atomic_add_int(&sc->sc_active, 1); for (;;) { /* * XXX: This is not an ideal solution, but I believe it to * XXX: deadlock safely, all things considered. */ bp2 = g_clone_bio(bp); if (bp2 != NULL) break; pause("gdstrat", hz / 10); } KASSERT(bp2 != NULL, ("XXX: ENOMEM in a bad place")); bp2->bio_done = g_dev_done; g_trace(G_T_BIO, "g_dev_strategy(%p/%p) offset %jd length %jd data %p cmd %d", bp, bp2, (intmax_t)bp->bio_offset, (intmax_t)bp2->bio_length, bp2->bio_data, bp2->bio_cmd); g_io_request(bp2, cp); KASSERT(cp->acr || cp->acw, ("g_dev_strategy raced with g_dev_close and lost")); } /* * g_dev_callback() * * Called by devfs when asynchronous device destruction is completed. * - Mark that we have no attached device any more. * - If there are no outstanding requests, schedule geom destruction. * Otherwise destruction will be scheduled later by g_dev_done(). */ static void g_dev_callback(void *arg) { struct g_consumer *cp; struct g_dev_softc *sc; int active; cp = arg; sc = cp->private; g_trace(G_T_TOPOLOGY, "g_dev_callback(%p(%s))", cp, cp->geom->name); sc->sc_dev = NULL; sc->sc_alias = NULL; active = atomic_fetchadd_int(&sc->sc_active, SC_A_DESTROY); if ((active & SC_A_ACTIVE) == 0) g_post_event(g_dev_destroy, cp, M_WAITOK, NULL); } /* * g_dev_orphan() * * Called from below when the provider orphaned us. * - Clear any dump settings. * - Request asynchronous device destruction to prevent any more requests * from coming in. The provider is already marked with an error, so * anything which comes in the interim will be returned immediately. */ static void g_dev_orphan(struct g_consumer *cp) { struct cdev *dev; struct g_dev_softc *sc; g_topology_assert(); sc = cp->private; dev = sc->sc_dev; g_trace(G_T_TOPOLOGY, "g_dev_orphan(%p(%s))", cp, cp->geom->name); /* Reset any dump-area set on this device */ if (dev->si_flags & SI_DUMPDEV) { struct diocskerneldump_arg kda; bzero(&kda, sizeof(kda)); kda.kda_index = KDA_REMOVE_DEV; (void)dumper_remove(devtoname(dev), &kda); } /* Destroy the struct cdev *so we get no more requests */ delist_dev(dev); destroy_dev_sched_cb(dev, g_dev_callback, cp); } DECLARE_GEOM_CLASS(g_dev_class, g_dev); Index: head/sys/geom/geom_disk.c =================================================================== --- head/sys/geom/geom_disk.c (revision 364441) +++ head/sys/geom/geom_disk.c (revision 364442) @@ -1,1087 +1,1087 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_geom.h" #include #include #include #include #include -#include #include +#include #include #include #include #include #include #include #include #include #include #include #include struct g_disk_softc { struct disk *dp; struct devstat *d_devstat; struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; char led[64]; uint32_t state; struct mtx done_mtx; }; static g_access_t g_disk_access; static g_start_t g_disk_start; static g_ioctl_t g_disk_ioctl; static g_dumpconf_t g_disk_dumpconf; static g_provgone_t g_disk_providergone; static int g_disk_sysctl_flags(SYSCTL_HANDLER_ARGS); static struct g_class g_disk_class = { .name = G_DISK_CLASS_NAME, .version = G_VERSION, .start = g_disk_start, .access = g_disk_access, .ioctl = g_disk_ioctl, .providergone = g_disk_providergone, .dumpconf = g_disk_dumpconf, }; SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, disk, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_DISK stuff"); DECLARE_GEOM_CLASS(g_disk_class, g_disk); static int g_disk_access(struct g_provider *pp, int r, int w, int e) { struct disk *dp; struct g_disk_softc *sc; int error; g_trace(G_T_ACCESS, "g_disk_access(%s, %d, %d, %d)", pp->name, r, w, e); g_topology_assert(); sc = pp->private; if ((dp = sc->dp) == NULL || dp->d_destroyed) { /* * Allow decreasing access count even if disk is not * available anymore. */ if (r <= 0 && w <= 0 && e <= 0) return (0); return (ENXIO); } r += pp->acr; w += pp->acw; e += pp->ace; error = 0; if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) { /* * It would be better to defer this decision to d_open if * it was able to take flags. */ if (w > 0 && (dp->d_flags & DISKFLAG_WRITE_PROTECT) != 0) error = EROFS; if (error == 0 && dp->d_open != NULL) error = dp->d_open(dp); if (bootverbose && error != 0) printf("Opened disk %s -> %d\n", pp->name, error); if (error != 0) return (error); pp->sectorsize = dp->d_sectorsize; if (dp->d_maxsize == 0) { printf("WARNING: Disk drive %s%d has no d_maxsize\n", dp->d_name, dp->d_unit); dp->d_maxsize = DFLTPHYS; } if (dp->d_delmaxsize == 0) { if (bootverbose && dp->d_flags & DISKFLAG_CANDELETE) { printf("WARNING: Disk drive %s%d has no " "d_delmaxsize\n", dp->d_name, dp->d_unit); } dp->d_delmaxsize = dp->d_maxsize; } pp->stripeoffset = dp->d_stripeoffset; pp->stripesize = dp->d_stripesize; dp->d_flags |= DISKFLAG_OPEN; /* * Do not invoke resize event when initial size was zero. * Some disks report its size only after first opening. */ if (pp->mediasize == 0) pp->mediasize = dp->d_mediasize; else g_resize_provider(pp, dp->d_mediasize); } else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) { if (dp->d_close != NULL) { error = dp->d_close(dp); if (error != 0) printf("Closed disk %s -> %d\n", pp->name, error); } sc->state = G_STATE_ACTIVE; if (sc->led[0] != 0) led_set(sc->led, "0"); dp->d_flags &= ~DISKFLAG_OPEN; } return (error); } static void g_disk_kerneldump(struct bio *bp, struct disk *dp) { struct g_kerneldump *gkd; struct g_geom *gp; gkd = (struct g_kerneldump*)bp->bio_data; gp = bp->bio_to->geom; g_trace(G_T_TOPOLOGY, "g_disk_kerneldump(%s, %jd, %jd)", gp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length); if (dp->d_dump == NULL) { g_io_deliver(bp, ENODEV); return; } gkd->di.dumper = dp->d_dump; gkd->di.priv = dp; gkd->di.blocksize = dp->d_sectorsize; gkd->di.maxiosize = dp->d_maxsize; gkd->di.mediaoffset = gkd->offset; if ((gkd->offset + gkd->length) > dp->d_mediasize) gkd->length = dp->d_mediasize - gkd->offset; gkd->di.mediasize = gkd->length; g_io_deliver(bp, 0); } static void g_disk_setstate(struct bio *bp, struct g_disk_softc *sc) { const char *cmd; memcpy(&sc->state, bp->bio_data, sizeof(sc->state)); if (sc->led[0] != 0) { switch (sc->state) { case G_STATE_FAILED: cmd = "1"; break; case G_STATE_REBUILD: cmd = "f5"; break; case G_STATE_RESYNC: cmd = "f1"; break; default: cmd = "0"; break; } led_set(sc->led, cmd); } g_io_deliver(bp, 0); } static void g_disk_done(struct bio *bp) { struct bintime now; struct bio *bp2; struct g_disk_softc *sc; /* See "notes" for why we need a mutex here */ sc = bp->bio_caller1; bp2 = bp->bio_parent; binuptime(&now); mtx_lock(&sc->done_mtx); if (bp2->bio_error == 0) bp2->bio_error = bp->bio_error; bp2->bio_completed += bp->bio_length - bp->bio_resid; switch (bp->bio_cmd) { case BIO_ZONE: bcopy(&bp->bio_zone, &bp2->bio_zone, sizeof(bp->bio_zone)); /*FALLTHROUGH*/ case BIO_READ: case BIO_WRITE: case BIO_DELETE: case BIO_FLUSH: devstat_end_transaction_bio_bt(sc->d_devstat, bp, &now); break; default: break; } bp2->bio_inbed++; if (bp2->bio_children == bp2->bio_inbed) { mtx_unlock(&sc->done_mtx); bp2->bio_resid = bp2->bio_bcount - bp2->bio_completed; g_io_deliver(bp2, bp2->bio_error); } else mtx_unlock(&sc->done_mtx); g_destroy_bio(bp); } static int g_disk_ioctl(struct g_provider *pp, u_long cmd, void * data, int fflag, struct thread *td) { struct disk *dp; struct g_disk_softc *sc; sc = pp->private; dp = sc->dp; KASSERT(dp != NULL && !dp->d_destroyed, ("g_disk_ioctl(%lx) on destroyed disk %s", cmd, pp->name)); if (dp->d_ioctl == NULL) return (ENOIOCTL); return (dp->d_ioctl(dp, cmd, data, fflag, td)); } static off_t g_disk_maxsize(struct disk *dp, struct bio *bp) { if (bp->bio_cmd == BIO_DELETE) return (dp->d_delmaxsize); return (dp->d_maxsize); } static int g_disk_maxsegs(struct disk *dp, struct bio *bp) { return ((g_disk_maxsize(dp, bp) / PAGE_SIZE) + 1); } static void g_disk_advance(struct disk *dp, struct bio *bp, off_t off) { bp->bio_offset += off; bp->bio_length -= off; if ((bp->bio_flags & BIO_VLIST) != 0) { bus_dma_segment_t *seg, *end; seg = (bus_dma_segment_t *)bp->bio_data; end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n; off += bp->bio_ma_offset; while (off >= seg->ds_len) { KASSERT((seg != end), ("vlist request runs off the end")); off -= seg->ds_len; seg++; } bp->bio_ma_offset = off; bp->bio_ma_n = end - seg; bp->bio_data = (void *)seg; } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma += off / PAGE_SIZE; bp->bio_ma_offset += off; bp->bio_ma_offset %= PAGE_SIZE; bp->bio_ma_n -= off / PAGE_SIZE; } else { bp->bio_data += off; } } static void g_disk_seg_limit(bus_dma_segment_t *seg, off_t *poffset, off_t *plength, int *ppages) { uintptr_t seg_page_base; uintptr_t seg_page_end; off_t offset; off_t length; int seg_pages; offset = *poffset; length = *plength; if (length > seg->ds_len - offset) length = seg->ds_len - offset; seg_page_base = trunc_page(seg->ds_addr + offset); seg_page_end = round_page(seg->ds_addr + offset + length); seg_pages = (seg_page_end - seg_page_base) >> PAGE_SHIFT; if (seg_pages > *ppages) { seg_pages = *ppages; length = (seg_page_base + (seg_pages << PAGE_SHIFT)) - (seg->ds_addr + offset); } *poffset = 0; *plength -= length; *ppages -= seg_pages; } static off_t g_disk_vlist_limit(struct disk *dp, struct bio *bp, bus_dma_segment_t **pendseg) { bus_dma_segment_t *seg, *end; off_t residual; off_t offset; int pages; seg = (bus_dma_segment_t *)bp->bio_data; end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n; residual = bp->bio_length; offset = bp->bio_ma_offset; pages = g_disk_maxsegs(dp, bp); while (residual != 0 && pages != 0) { KASSERT((seg != end), ("vlist limit runs off the end")); g_disk_seg_limit(seg, &offset, &residual, &pages); seg++; } if (pendseg != NULL) *pendseg = seg; return (residual); } static bool g_disk_limit(struct disk *dp, struct bio *bp) { bool limited = false; off_t maxsz; maxsz = g_disk_maxsize(dp, bp); /* * XXX: If we have a stripesize we should really use it here. * Care should be taken in the delete case if this is done * as deletes can be very sensitive to size given how they * are processed. */ if (bp->bio_length > maxsz) { bp->bio_length = maxsz; limited = true; } if ((bp->bio_flags & BIO_VLIST) != 0) { bus_dma_segment_t *firstseg, *endseg; off_t residual; firstseg = (bus_dma_segment_t*)bp->bio_data; residual = g_disk_vlist_limit(dp, bp, &endseg); if (residual != 0) { bp->bio_ma_n = endseg - firstseg; bp->bio_length -= residual; limited = true; } } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma_n = howmany(bp->bio_ma_offset + bp->bio_length, PAGE_SIZE); } return (limited); } static void g_disk_start(struct bio *bp) { struct bio *bp2, *bp3; struct disk *dp; struct g_disk_softc *sc; int error; off_t off; biotrack(bp, __func__); sc = bp->bio_to->private; dp = sc->dp; KASSERT(dp != NULL && !dp->d_destroyed, ("g_disk_start(%p) on destroyed disk %s", bp, bp->bio_to->name)); error = EJUSTRETURN; switch(bp->bio_cmd) { case BIO_DELETE: if (!(dp->d_flags & DISKFLAG_CANDELETE)) { error = EOPNOTSUPP; break; } /* fall-through */ case BIO_READ: case BIO_WRITE: KASSERT((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0 || (bp->bio_flags & BIO_UNMAPPED) == 0, ("unmapped bio not supported by disk %s", dp->d_name)); off = 0; bp3 = NULL; bp2 = g_clone_bio(bp); if (bp2 == NULL) { error = ENOMEM; break; } for (;;) { if (g_disk_limit(dp, bp2)) { off += bp2->bio_length; /* * To avoid a race, we need to grab the next bio * before we schedule this one. See "notes". */ bp3 = g_clone_bio(bp); if (bp3 == NULL) bp->bio_error = ENOMEM; } bp2->bio_done = g_disk_done; bp2->bio_caller1 = sc; bp2->bio_pblkno = bp2->bio_offset / dp->d_sectorsize; bp2->bio_bcount = bp2->bio_length; bp2->bio_disk = dp; devstat_start_transaction_bio(dp->d_devstat, bp2); dp->d_strategy(bp2); if (bp3 == NULL) break; bp2 = bp3; bp3 = NULL; g_disk_advance(dp, bp2, off); } break; case BIO_GETATTR: /* Give the driver a chance to override */ if (dp->d_getattr != NULL) { if (bp->bio_disk == NULL) bp->bio_disk = dp; error = dp->d_getattr(bp); if (error != -1) break; error = EJUSTRETURN; } if (g_handleattr_int(bp, "GEOM::candelete", (dp->d_flags & DISKFLAG_CANDELETE) != 0)) break; else if (g_handleattr_int(bp, "GEOM::fwsectors", dp->d_fwsectors)) break; else if (g_handleattr_int(bp, "GEOM::fwheads", dp->d_fwheads)) break; else if (g_handleattr_off_t(bp, "GEOM::frontstuff", 0)) break; else if (g_handleattr_str(bp, "GEOM::ident", dp->d_ident)) break; else if (g_handleattr_str(bp, "GEOM::descr", dp->d_descr)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_vendor", dp->d_hba_vendor)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_device", dp->d_hba_device)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_subvendor", dp->d_hba_subvendor)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_subdevice", dp->d_hba_subdevice)) break; else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump")) g_disk_kerneldump(bp, dp); else if (!strcmp(bp->bio_attribute, "GEOM::setstate")) g_disk_setstate(bp, sc); else if (g_handleattr_uint16_t(bp, "GEOM::rotation_rate", dp->d_rotation_rate)) break; else if (g_handleattr_str(bp, "GEOM::attachment", dp->d_attachment)) break; else error = ENOIOCTL; break; case BIO_FLUSH: g_trace(G_T_BIO, "g_disk_flushcache(%s)", bp->bio_to->name); if (!(dp->d_flags & DISKFLAG_CANFLUSHCACHE)) { error = EOPNOTSUPP; break; } /*FALLTHROUGH*/ case BIO_ZONE: if (bp->bio_cmd == BIO_ZONE) { if (!(dp->d_flags & DISKFLAG_CANZONE)) { error = EOPNOTSUPP; break; } g_trace(G_T_BIO, "g_disk_zone(%s)", bp->bio_to->name); } bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } bp2->bio_done = g_disk_done; bp2->bio_caller1 = sc; bp2->bio_disk = dp; devstat_start_transaction_bio(dp->d_devstat, bp2); dp->d_strategy(bp2); break; case BIO_SPEEDUP: bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } bp2->bio_done = g_disk_done; bp2->bio_caller1 = sc; bp2->bio_disk = dp; dp->d_strategy(bp2); break; default: error = EOPNOTSUPP; break; } if (error != EJUSTRETURN) g_io_deliver(bp, error); return; } static void g_disk_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct bio *bp; struct disk *dp; struct g_disk_softc *sc; char *buf; int res = 0; sc = gp->softc; if (sc == NULL || (dp = sc->dp) == NULL) return; if (indent == NULL) { sbuf_printf(sb, " hd %u", dp->d_fwheads); sbuf_printf(sb, " sc %u", dp->d_fwsectors); return; } if (pp != NULL) { sbuf_printf(sb, "%s%u\n", indent, dp->d_fwheads); sbuf_printf(sb, "%s%u\n", indent, dp->d_fwsectors); /* * "rotationrate" is a little complicated, because the value * returned by the drive might not be the RPM; 0 and 1 are * special cases, and there's also a valid range. */ sbuf_printf(sb, "%s", indent); if (dp->d_rotation_rate == DISK_RR_UNKNOWN) /* Old drives */ sbuf_cat(sb, "unknown"); /* don't report RPM. */ else if (dp->d_rotation_rate == DISK_RR_NON_ROTATING) sbuf_cat(sb, "0"); else if ((dp->d_rotation_rate >= DISK_RR_MIN) && (dp->d_rotation_rate <= DISK_RR_MAX)) sbuf_printf(sb, "%u", dp->d_rotation_rate); else sbuf_cat(sb, "invalid"); sbuf_cat(sb, "\n"); if (dp->d_getattr != NULL) { buf = g_malloc(DISK_IDENT_SIZE, M_WAITOK); bp = g_alloc_bio(); bp->bio_disk = dp; bp->bio_attribute = "GEOM::ident"; bp->bio_length = DISK_IDENT_SIZE; bp->bio_data = buf; res = dp->d_getattr(bp); sbuf_printf(sb, "%s", indent); g_conf_cat_escaped(sb, res == 0 ? buf : dp->d_ident); sbuf_cat(sb, "\n"); bp->bio_attribute = "GEOM::lunid"; bp->bio_length = DISK_IDENT_SIZE; bp->bio_data = buf; if (dp->d_getattr(bp) == 0) { sbuf_printf(sb, "%s", indent); g_conf_cat_escaped(sb, buf); sbuf_cat(sb, "\n"); } bp->bio_attribute = "GEOM::lunname"; bp->bio_length = DISK_IDENT_SIZE; bp->bio_data = buf; if (dp->d_getattr(bp) == 0) { sbuf_printf(sb, "%s", indent); g_conf_cat_escaped(sb, buf); sbuf_cat(sb, "\n"); } g_destroy_bio(bp); g_free(buf); } else { sbuf_printf(sb, "%s", indent); g_conf_cat_escaped(sb, dp->d_ident); sbuf_cat(sb, "\n"); } sbuf_printf(sb, "%s", indent); g_conf_cat_escaped(sb, dp->d_descr); sbuf_cat(sb, "\n"); } } static void g_disk_resize(void *ptr, int flag) { struct disk *dp; struct g_geom *gp; struct g_provider *pp; if (flag == EV_CANCEL) return; g_topology_assert(); dp = ptr; gp = dp->d_geom; if (dp->d_destroyed || gp == NULL) return; LIST_FOREACH(pp, &gp->provider, provider) { if (pp->sectorsize != 0 && pp->sectorsize != dp->d_sectorsize) g_wither_provider(pp, ENXIO); else g_resize_provider(pp, dp->d_mediasize); } } static void g_disk_create(void *arg, int flag) { struct g_geom *gp; struct g_provider *pp; struct disk *dp; struct g_disk_softc *sc; struct disk_alias *dap; char tmpstr[80]; if (flag == EV_CANCEL) return; g_topology_assert(); dp = arg; mtx_pool_lock(mtxpool_sleep, dp); dp->d_init_level = DISK_INIT_START; /* * If the disk has already gone away, we can just stop here and * call the user's callback to tell him we've cleaned things up. */ if (dp->d_goneflag != 0) { mtx_pool_unlock(mtxpool_sleep, dp); if (dp->d_gone != NULL) dp->d_gone(dp); return; } mtx_pool_unlock(mtxpool_sleep, dp); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->done_mtx, "g_disk_done", NULL, MTX_DEF); sc->dp = dp; sc->d_devstat = dp->d_devstat; gp = g_new_geomf(&g_disk_class, "%s%d", dp->d_name, dp->d_unit); gp->softc = sc; pp = g_new_providerf(gp, "%s", gp->name); LIST_FOREACH(dap, &dp->d_aliases, da_next) g_provider_add_alias(pp, "%s%d", dap->da_alias, dp->d_unit); devstat_remove_entry(pp->stat); pp->stat = NULL; dp->d_devstat->id = pp; pp->mediasize = dp->d_mediasize; pp->sectorsize = dp->d_sectorsize; pp->stripeoffset = dp->d_stripeoffset; pp->stripesize = dp->d_stripesize; if ((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0) pp->flags |= G_PF_ACCEPT_UNMAPPED; if ((dp->d_flags & DISKFLAG_DIRECT_COMPLETION) != 0) pp->flags |= G_PF_DIRECT_SEND; pp->flags |= G_PF_DIRECT_RECEIVE; if (bootverbose) printf("GEOM: new disk %s\n", gp->name); sysctl_ctx_init(&sc->sysctl_ctx); snprintf(tmpstr, sizeof(tmpstr), "GEOM disk %s", gp->name); sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kern_geom_disk), OID_AUTO, gp->name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, tmpstr); if (sc->sysctl_tree != NULL) { SYSCTL_ADD_STRING(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree), OID_AUTO, "led", CTLFLAG_RWTUN, sc->led, sizeof(sc->led), "LED name"); SYSCTL_ADD_PROC(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree), OID_AUTO, "flags", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, dp, 0, g_disk_sysctl_flags, "A", "Report disk flags"); } pp->private = sc; dp->d_geom = gp; g_error_provider(pp, 0); mtx_pool_lock(mtxpool_sleep, dp); dp->d_init_level = DISK_INIT_DONE; /* * If the disk has gone away at this stage, start the withering * process for it. */ if (dp->d_goneflag != 0) { mtx_pool_unlock(mtxpool_sleep, dp); g_wither_provider(pp, ENXIO); return; } mtx_pool_unlock(mtxpool_sleep, dp); } /* * We get this callback after all of the consumers have gone away, and just * before the provider is freed. If the disk driver provided a d_gone * callback, let them know that it is okay to free resources -- they won't * be getting any more accesses from GEOM. */ static void g_disk_providergone(struct g_provider *pp) { struct disk *dp; struct g_disk_softc *sc; sc = (struct g_disk_softc *)pp->private; dp = sc->dp; if (dp != NULL && dp->d_gone != NULL) dp->d_gone(dp); if (sc->sysctl_tree != NULL) { sysctl_ctx_free(&sc->sysctl_ctx); sc->sysctl_tree = NULL; } if (sc->led[0] != 0) { led_set(sc->led, "0"); sc->led[0] = 0; } pp->private = NULL; pp->geom->softc = NULL; mtx_destroy(&sc->done_mtx); g_free(sc); } static void g_disk_destroy(void *ptr, int flag) { struct disk *dp; struct g_geom *gp; struct g_disk_softc *sc; struct disk_alias *dap, *daptmp; g_topology_assert(); dp = ptr; gp = dp->d_geom; if (gp != NULL) { sc = gp->softc; if (sc != NULL) sc->dp = NULL; dp->d_geom = NULL; g_wither_geom(gp, ENXIO); } LIST_FOREACH_SAFE(dap, &dp->d_aliases, da_next, daptmp) g_free(dap); g_free(dp); } /* * We only allow printable characters in disk ident, * the rest is converted to 'x'. */ static void g_disk_ident_adjust(char *ident, size_t size) { char *p, tmp[4], newid[DISK_IDENT_SIZE]; newid[0] = '\0'; for (p = ident; *p != '\0'; p++) { if (isprint(*p)) { tmp[0] = *p; tmp[1] = '\0'; } else { snprintf(tmp, sizeof(tmp), "x%02hhx", *(unsigned char *)p); } if (strlcat(newid, tmp, sizeof(newid)) >= sizeof(newid)) break; } bzero(ident, size); strlcpy(ident, newid, size); } struct disk * disk_alloc(void) { struct disk *dp; dp = g_malloc(sizeof(struct disk), M_WAITOK | M_ZERO); LIST_INIT(&dp->d_aliases); return (dp); } void disk_create(struct disk *dp, int version) { if (version != DISK_VERSION) { printf("WARNING: Attempt to add disk %s%d %s", dp->d_name, dp->d_unit, " using incompatible ABI version of disk(9)\n"); printf("WARNING: Ignoring disk %s%d\n", dp->d_name, dp->d_unit); return; } if (dp->d_flags & DISKFLAG_RESERVED) { printf("WARNING: Attempt to add non-MPSAFE disk %s%d\n", dp->d_name, dp->d_unit); printf("WARNING: Ignoring disk %s%d\n", dp->d_name, dp->d_unit); return; } KASSERT(dp->d_strategy != NULL, ("disk_create need d_strategy")); KASSERT(dp->d_name != NULL, ("disk_create need d_name")); KASSERT(*dp->d_name != 0, ("disk_create need d_name")); KASSERT(strlen(dp->d_name) < SPECNAMELEN - 4, ("disk name too long")); if (dp->d_devstat == NULL) dp->d_devstat = devstat_new_entry(dp->d_name, dp->d_unit, dp->d_sectorsize, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); dp->d_geom = NULL; dp->d_init_level = DISK_INIT_NONE; g_disk_ident_adjust(dp->d_ident, sizeof(dp->d_ident)); g_post_event(g_disk_create, dp, M_WAITOK, dp, NULL); } void disk_destroy(struct disk *dp) { disk_gone(dp); dp->d_destroyed = 1; g_cancel_event(dp); if (dp->d_devstat != NULL) devstat_remove_entry(dp->d_devstat); g_post_event(g_disk_destroy, dp, M_WAITOK, NULL); } void disk_add_alias(struct disk *dp, const char *name) { struct disk_alias *dap; dap = (struct disk_alias *)g_malloc( sizeof(struct disk_alias) + strlen(name) + 1, M_WAITOK); strcpy((char *)(dap + 1), name); dap->da_alias = (const char *)(dap + 1); LIST_INSERT_HEAD(&dp->d_aliases, dap, da_next); } void disk_gone(struct disk *dp) { struct g_geom *gp; struct g_provider *pp; mtx_pool_lock(mtxpool_sleep, dp); /* * Second wither call makes no sense, plus we can not access the list * of providers without topology lock after calling wither once. */ if (dp->d_goneflag != 0) { mtx_pool_unlock(mtxpool_sleep, dp); return; } dp->d_goneflag = 1; /* * If we're still in the process of creating this disk (the * g_disk_create() function is still queued, or is in * progress), the init level will not yet be DISK_INIT_DONE. * * If that is the case, g_disk_create() will see d_goneflag * and take care of cleaning things up. * * If the disk has already been created, we default to * withering the provider as usual below. * * If the caller has not set a d_gone() callback, he will * not be any worse off by returning here, because the geom * has not been fully setup in any case. */ if (dp->d_init_level < DISK_INIT_DONE) { mtx_pool_unlock(mtxpool_sleep, dp); return; } mtx_pool_unlock(mtxpool_sleep, dp); gp = dp->d_geom; pp = LIST_FIRST(&gp->provider); if (pp != NULL) { KASSERT(LIST_NEXT(pp, provider) == NULL, ("geom %p has more than one provider", gp)); g_wither_provider(pp, ENXIO); } } void disk_attr_changed(struct disk *dp, const char *attr, int flag) { struct g_geom *gp; struct g_provider *pp; char devnamebuf[128]; gp = dp->d_geom; if (gp != NULL) LIST_FOREACH(pp, &gp->provider, provider) (void)g_attr_changed(pp, attr, flag); snprintf(devnamebuf, sizeof(devnamebuf), "devname=%s%d", dp->d_name, dp->d_unit); devctl_notify("GEOM", "disk", attr, devnamebuf); } void disk_media_changed(struct disk *dp, int flag) { struct g_geom *gp; struct g_provider *pp; gp = dp->d_geom; if (gp != NULL) { pp = LIST_FIRST(&gp->provider); if (pp != NULL) { KASSERT(LIST_NEXT(pp, provider) == NULL, ("geom %p has more than one provider", gp)); g_media_changed(pp, flag); } } } void disk_media_gone(struct disk *dp, int flag) { struct g_geom *gp; struct g_provider *pp; gp = dp->d_geom; if (gp != NULL) { pp = LIST_FIRST(&gp->provider); if (pp != NULL) { KASSERT(LIST_NEXT(pp, provider) == NULL, ("geom %p has more than one provider", gp)); g_media_gone(pp, flag); } } } int disk_resize(struct disk *dp, int flag) { if (dp->d_destroyed || dp->d_geom == NULL) return (0); return (g_post_event(g_disk_resize, dp, flag, NULL)); } static void g_kern_disks(void *p, int flag __unused) { struct sbuf *sb; struct g_geom *gp; char *sp; sb = p; sp = ""; g_topology_assert(); LIST_FOREACH(gp, &g_disk_class.geom, geom) { sbuf_printf(sb, "%s%s", sp, gp->name); sp = " "; } sbuf_finish(sb); } static int g_disk_sysctl_flags(SYSCTL_HANDLER_ARGS) { struct disk *dp; struct sbuf *sb; int error; sb = sbuf_new_auto(); dp = (struct disk *)arg1; sbuf_printf(sb, "%b", dp->d_flags, "\20" "\2OPEN" "\3CANDELETE" "\4CANFLUSHCACHE" "\5UNMAPPEDBIO" "\6DIRECTCOMPLETION" "\10CANZONE" "\11WRITEPROTECT"); sbuf_finish(sb); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return (error); } static int sysctl_disks(SYSCTL_HANDLER_ARGS) { int error; struct sbuf *sb; sb = sbuf_new_auto(); g_waitfor_event(g_kern_disks, sb, M_WAITOK, NULL); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return error; } SYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_disks, "A", "names of available disks"); Index: head/sys/kern/kern_conf.c =================================================================== --- head/sys/kern/kern_conf.c (revision 364441) +++ head/sys/kern/kern_conf.c (revision 364442) @@ -1,1579 +1,1579 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1999-2002 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include -#include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_DEVT, "cdev", "cdev storage"); struct mtx devmtx; static void destroy_devl(struct cdev *dev); static int destroy_dev_sched_cbl(struct cdev *dev, void (*cb)(void *), void *arg); static void destroy_dev_tq(void *ctx, int pending); static int make_dev_credv(int flags, struct cdev **dres, struct cdevsw *devsw, int unit, struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt, va_list ap); static struct cdev_priv_list cdevp_free_list = TAILQ_HEAD_INITIALIZER(cdevp_free_list); static SLIST_HEAD(free_cdevsw, cdevsw) cdevsw_gt_post_list = SLIST_HEAD_INITIALIZER(cdevsw_gt_post_list); void dev_lock(void) { mtx_lock(&devmtx); } /* * Free all the memory collected while the cdev mutex was * locked. Since devmtx is after the system map mutex, free() cannot * be called immediately and is postponed until cdev mutex can be * dropped. */ static void dev_unlock_and_free(void) { struct cdev_priv_list cdp_free; struct free_cdevsw csw_free; struct cdev_priv *cdp; struct cdevsw *csw; dev_lock_assert_locked(); /* * Make the local copy of the list heads while the dev_mtx is * held. Free it later. */ TAILQ_INIT(&cdp_free); TAILQ_CONCAT(&cdp_free, &cdevp_free_list, cdp_list); csw_free = cdevsw_gt_post_list; SLIST_INIT(&cdevsw_gt_post_list); mtx_unlock(&devmtx); while ((cdp = TAILQ_FIRST(&cdp_free)) != NULL) { TAILQ_REMOVE(&cdp_free, cdp, cdp_list); devfs_free(&cdp->cdp_c); } while ((csw = SLIST_FIRST(&csw_free)) != NULL) { SLIST_REMOVE_HEAD(&csw_free, d_postfree_list); free(csw, M_DEVT); } } static void dev_free_devlocked(struct cdev *cdev) { struct cdev_priv *cdp; dev_lock_assert_locked(); cdp = cdev2priv(cdev); KASSERT((cdp->cdp_flags & CDP_UNREF_DTR) == 0, ("destroy_dev() was not called after delist_dev(%p)", cdev)); TAILQ_INSERT_HEAD(&cdevp_free_list, cdp, cdp_list); } static void cdevsw_free_devlocked(struct cdevsw *csw) { dev_lock_assert_locked(); SLIST_INSERT_HEAD(&cdevsw_gt_post_list, csw, d_postfree_list); } void dev_unlock(void) { mtx_unlock(&devmtx); } void dev_ref(struct cdev *dev) { dev_lock_assert_unlocked(); mtx_lock(&devmtx); dev->si_refcount++; mtx_unlock(&devmtx); } void dev_refl(struct cdev *dev) { dev_lock_assert_locked(); dev->si_refcount++; } void dev_rel(struct cdev *dev) { int flag = 0; dev_lock_assert_unlocked(); dev_lock(); dev->si_refcount--; KASSERT(dev->si_refcount >= 0, ("dev_rel(%s) gave negative count", devtoname(dev))); if (dev->si_devsw == NULL && dev->si_refcount == 0) { LIST_REMOVE(dev, si_list); flag = 1; } dev_unlock(); if (flag) devfs_free(dev); } struct cdevsw * dev_refthread(struct cdev *dev, int *ref) { struct cdevsw *csw; struct cdev_priv *cdp; dev_lock_assert_unlocked(); if ((dev->si_flags & SI_ETERNAL) != 0) { *ref = 0; return (dev->si_devsw); } cdp = cdev2priv(dev); mtx_lock(&cdp->cdp_threadlock); csw = dev->si_devsw; if (csw != NULL) { if ((cdp->cdp_flags & CDP_SCHED_DTR) == 0) atomic_add_long(&dev->si_threadcount, 1); else csw = NULL; } mtx_unlock(&cdp->cdp_threadlock); if (csw != NULL) *ref = 1; return (csw); } struct cdevsw * devvn_refthread(struct vnode *vp, struct cdev **devp, int *ref) { struct cdevsw *csw; struct cdev_priv *cdp; struct cdev *dev; dev_lock_assert_unlocked(); if ((vp->v_vflag & VV_ETERNALDEV) != 0) { dev = vp->v_rdev; if (dev == NULL) return (NULL); KASSERT((dev->si_flags & SI_ETERNAL) != 0, ("Not eternal cdev")); *ref = 0; csw = dev->si_devsw; KASSERT(csw != NULL, ("Eternal cdev is destroyed")); *devp = dev; return (csw); } csw = NULL; VI_LOCK(vp); dev = vp->v_rdev; if (dev == NULL) { VI_UNLOCK(vp); return (NULL); } cdp = cdev2priv(dev); mtx_lock(&cdp->cdp_threadlock); if ((cdp->cdp_flags & CDP_SCHED_DTR) == 0) { csw = dev->si_devsw; if (csw != NULL) atomic_add_long(&dev->si_threadcount, 1); } mtx_unlock(&cdp->cdp_threadlock); VI_UNLOCK(vp); if (csw != NULL) { *devp = dev; *ref = 1; } return (csw); } void dev_relthread(struct cdev *dev, int ref) { dev_lock_assert_unlocked(); if (!ref) return; KASSERT(dev->si_threadcount > 0, ("%s threadcount is wrong", dev->si_name)); atomic_subtract_rel_long(&dev->si_threadcount, 1); } int nullop(void) { return (0); } int eopnotsupp(void) { return (EOPNOTSUPP); } static int enxio(void) { return (ENXIO); } static int enodev(void) { return (ENODEV); } /* Define a dead_cdevsw for use when devices leave unexpectedly. */ #define dead_open (d_open_t *)enxio #define dead_close (d_close_t *)enxio #define dead_read (d_read_t *)enxio #define dead_write (d_write_t *)enxio #define dead_ioctl (d_ioctl_t *)enxio #define dead_poll (d_poll_t *)enodev #define dead_mmap (d_mmap_t *)enodev static void dead_strategy(struct bio *bp) { biofinish(bp, NULL, ENXIO); } #define dead_dump (dumper_t *)enxio #define dead_kqfilter (d_kqfilter_t *)enxio #define dead_mmap_single (d_mmap_single_t *)enodev static struct cdevsw dead_cdevsw = { .d_version = D_VERSION, .d_open = dead_open, .d_close = dead_close, .d_read = dead_read, .d_write = dead_write, .d_ioctl = dead_ioctl, .d_poll = dead_poll, .d_mmap = dead_mmap, .d_strategy = dead_strategy, .d_name = "dead", .d_dump = dead_dump, .d_kqfilter = dead_kqfilter, .d_mmap_single = dead_mmap_single }; /* Default methods if driver does not specify method */ #define null_open (d_open_t *)nullop #define null_close (d_close_t *)nullop #define no_read (d_read_t *)enodev #define no_write (d_write_t *)enodev #define no_ioctl (d_ioctl_t *)enodev #define no_mmap (d_mmap_t *)enodev #define no_kqfilter (d_kqfilter_t *)enodev #define no_mmap_single (d_mmap_single_t *)enodev static void no_strategy(struct bio *bp) { biofinish(bp, NULL, ENODEV); } static int no_poll(struct cdev *dev __unused, int events, struct thread *td __unused) { return (poll_no_poll(events)); } #define no_dump (dumper_t *)enodev static int giant_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_open(dev, oflags, devtype, td); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_fdopen(struct cdev *dev, int oflags, struct thread *td, struct file *fp) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_fdopen(dev, oflags, td, fp); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_close(dev, fflag, devtype, td); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static void giant_strategy(struct bio *bp) { struct cdevsw *dsw; struct cdev *dev; int ref; dev = bp->bio_dev; dsw = dev_refthread(dev, &ref); if (dsw == NULL) { biofinish(bp, NULL, ENXIO); return; } mtx_lock(&Giant); dsw->d_gianttrick->d_strategy(bp); mtx_unlock(&Giant); dev_relthread(dev, ref); } static int giant_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_ioctl(dev, cmd, data, fflag, td); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_read(struct cdev *dev, struct uio *uio, int ioflag) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_read(dev, uio, ioflag); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_write(struct cdev *dev, struct uio *uio, int ioflag) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_write(dev, uio, ioflag); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_poll(struct cdev *dev, int events, struct thread *td) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_poll(dev, events, td); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_kqfilter(struct cdev *dev, struct knote *kn) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_kqfilter(dev, kn); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_mmap(dev, offset, paddr, nprot, memattr); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_mmap_single(struct cdev *dev, vm_ooffset_t *offset, vm_size_t size, vm_object_t *object, int nprot) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_mmap_single(dev, offset, size, object, nprot); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static void notify(struct cdev *dev, const char *ev, int flags) { static const char prefix[] = "cdev="; char *data; int namelen, mflags; if (cold) return; mflags = (flags & MAKEDEV_NOWAIT) ? M_NOWAIT : M_WAITOK; namelen = strlen(dev->si_name); data = malloc(namelen + sizeof(prefix), M_TEMP, mflags); if (data == NULL) return; memcpy(data, prefix, sizeof(prefix) - 1); memcpy(data + sizeof(prefix) - 1, dev->si_name, namelen + 1); devctl_notify_f("DEVFS", "CDEV", ev, data, mflags); free(data, M_TEMP); } static void notify_create(struct cdev *dev, int flags) { notify(dev, "CREATE", flags); } static void notify_destroy(struct cdev *dev) { notify(dev, "DESTROY", MAKEDEV_WAITOK); } static struct cdev * newdev(struct make_dev_args *args, struct cdev *si) { struct cdev *si2; struct cdevsw *csw; dev_lock_assert_locked(); csw = args->mda_devsw; si2 = NULL; if (csw->d_flags & D_NEEDMINOR) { /* We may want to return an existing device */ LIST_FOREACH(si2, &csw->d_devs, si_list) { if (dev2unit(si2) == args->mda_unit) { dev_free_devlocked(si); si = si2; break; } } /* * If we're returning an existing device, we should make sure * it isn't already initialized. This would have been caught * in consumers anyways, but it's good to catch such a case * early. We still need to complete initialization of the * device, and we'll use whatever make_dev_args were passed in * to do so. */ KASSERT(si2 == NULL || (si2->si_flags & SI_NAMED) == 0, ("make_dev() by driver %s on pre-existing device (min=%x, name=%s)", args->mda_devsw->d_name, dev2unit(si2), devtoname(si2))); } si->si_drv0 = args->mda_unit; si->si_drv1 = args->mda_si_drv1; si->si_drv2 = args->mda_si_drv2; /* Only push to csw->d_devs if it's not a cloned device. */ if (si2 == NULL) { si->si_devsw = csw; LIST_INSERT_HEAD(&csw->d_devs, si, si_list); } else { KASSERT(si->si_devsw == csw, ("%s: inconsistent devsw between clone_create() and make_dev()", __func__)); } return (si); } static void fini_cdevsw(struct cdevsw *devsw) { struct cdevsw *gt; if (devsw->d_gianttrick != NULL) { gt = devsw->d_gianttrick; memcpy(devsw, gt, sizeof *devsw); cdevsw_free_devlocked(gt); devsw->d_gianttrick = NULL; } devsw->d_flags &= ~D_INIT; } static int prep_cdevsw(struct cdevsw *devsw, int flags) { struct cdevsw *dsw2; dev_lock_assert_locked(); if (devsw->d_flags & D_INIT) return (0); if (devsw->d_flags & D_NEEDGIANT) { dev_unlock(); dsw2 = malloc(sizeof *dsw2, M_DEVT, (flags & MAKEDEV_NOWAIT) ? M_NOWAIT : M_WAITOK); dev_lock(); if (dsw2 == NULL && !(devsw->d_flags & D_INIT)) return (ENOMEM); } else dsw2 = NULL; if (devsw->d_flags & D_INIT) { if (dsw2 != NULL) cdevsw_free_devlocked(dsw2); return (0); } if (devsw->d_version != D_VERSION_04) { printf( "WARNING: Device driver \"%s\" has wrong version %s\n", devsw->d_name == NULL ? "???" : devsw->d_name, "and is disabled. Recompile KLD module."); devsw->d_open = dead_open; devsw->d_close = dead_close; devsw->d_read = dead_read; devsw->d_write = dead_write; devsw->d_ioctl = dead_ioctl; devsw->d_poll = dead_poll; devsw->d_mmap = dead_mmap; devsw->d_mmap_single = dead_mmap_single; devsw->d_strategy = dead_strategy; devsw->d_dump = dead_dump; devsw->d_kqfilter = dead_kqfilter; } if (devsw->d_flags & D_NEEDGIANT) { printf("WARNING: Device \"%s\" is Giant locked and may be " "deleted before FreeBSD 13.0.\n", devsw->d_name == NULL ? "???" : devsw->d_name); if (devsw->d_gianttrick == NULL) { memcpy(dsw2, devsw, sizeof *dsw2); devsw->d_gianttrick = dsw2; dsw2 = NULL; } } #define FIXUP(member, noop, giant) \ do { \ if (devsw->member == NULL) { \ devsw->member = noop; \ } else if (devsw->d_flags & D_NEEDGIANT) \ devsw->member = giant; \ } \ while (0) FIXUP(d_open, null_open, giant_open); FIXUP(d_fdopen, NULL, giant_fdopen); FIXUP(d_close, null_close, giant_close); FIXUP(d_read, no_read, giant_read); FIXUP(d_write, no_write, giant_write); FIXUP(d_ioctl, no_ioctl, giant_ioctl); FIXUP(d_poll, no_poll, giant_poll); FIXUP(d_mmap, no_mmap, giant_mmap); FIXUP(d_strategy, no_strategy, giant_strategy); FIXUP(d_kqfilter, no_kqfilter, giant_kqfilter); FIXUP(d_mmap_single, no_mmap_single, giant_mmap_single); if (devsw->d_dump == NULL) devsw->d_dump = no_dump; LIST_INIT(&devsw->d_devs); devsw->d_flags |= D_INIT; if (dsw2 != NULL) cdevsw_free_devlocked(dsw2); return (0); } static int prep_devname(struct cdev *dev, const char *fmt, va_list ap) { int len; char *from, *q, *s, *to; dev_lock_assert_locked(); len = vsnrprintf(dev->si_name, sizeof(dev->si_name), 32, fmt, ap); if (len > sizeof(dev->si_name) - 1) return (ENAMETOOLONG); /* Strip leading slashes. */ for (from = dev->si_name; *from == '/'; from++) ; for (to = dev->si_name; *from != '\0'; from++, to++) { /* * Spaces and double quotation marks cause * problems for the devctl(4) protocol. * Reject names containing those characters. */ if (isspace(*from) || *from == '"') return (EINVAL); /* Treat multiple sequential slashes as single. */ while (from[0] == '/' && from[1] == '/') from++; /* Trailing slash is considered invalid. */ if (from[0] == '/' && from[1] == '\0') return (EINVAL); *to = *from; } *to = '\0'; if (dev->si_name[0] == '\0') return (EINVAL); /* Disallow "." and ".." components. */ for (s = dev->si_name;;) { for (q = s; *q != '/' && *q != '\0'; q++) ; if (q - s == 1 && s[0] == '.') return (EINVAL); if (q - s == 2 && s[0] == '.' && s[1] == '.') return (EINVAL); if (*q != '/') break; s = q + 1; } if (devfs_dev_exists(dev->si_name) != 0) return (EEXIST); return (0); } void make_dev_args_init_impl(struct make_dev_args *args, size_t sz) { bzero(args, sz); args->mda_size = sz; } static int make_dev_sv(struct make_dev_args *args1, struct cdev **dres, const char *fmt, va_list ap) { struct cdev *dev, *dev_new; struct make_dev_args args; int res; bzero(&args, sizeof(args)); if (sizeof(args) < args1->mda_size) return (EINVAL); bcopy(args1, &args, args1->mda_size); KASSERT((args.mda_flags & MAKEDEV_WAITOK) == 0 || (args.mda_flags & MAKEDEV_NOWAIT) == 0, ("make_dev_sv: both WAITOK and NOWAIT specified")); dev_new = devfs_alloc(args.mda_flags); if (dev_new == NULL) return (ENOMEM); dev_lock(); res = prep_cdevsw(args.mda_devsw, args.mda_flags); if (res != 0) { dev_unlock(); devfs_free(dev_new); return (res); } dev = newdev(&args, dev_new); if ((dev->si_flags & SI_NAMED) == 0) { res = prep_devname(dev, fmt, ap); if (res != 0) { if ((args.mda_flags & MAKEDEV_CHECKNAME) == 0) { panic( "make_dev_sv: bad si_name (error=%d, si_name=%s)", res, dev->si_name); } if (dev == dev_new) { LIST_REMOVE(dev, si_list); dev_unlock(); devfs_free(dev); } else dev_unlock(); return (res); } } if ((args.mda_flags & MAKEDEV_REF) != 0) dev_refl(dev); if ((args.mda_flags & MAKEDEV_ETERNAL) != 0) dev->si_flags |= SI_ETERNAL; KASSERT(!(dev->si_flags & SI_NAMED), ("make_dev() by driver %s on pre-existing device (min=%x, name=%s)", args.mda_devsw->d_name, dev2unit(dev), devtoname(dev))); dev->si_flags |= SI_NAMED; if (args.mda_cr != NULL) dev->si_cred = crhold(args.mda_cr); dev->si_uid = args.mda_uid; dev->si_gid = args.mda_gid; dev->si_mode = args.mda_mode; devfs_create(dev); clean_unrhdrl(devfs_inos); dev_unlock_and_free(); notify_create(dev, args.mda_flags); *dres = dev; return (0); } int make_dev_s(struct make_dev_args *args, struct cdev **dres, const char *fmt, ...) { va_list ap; int res; va_start(ap, fmt); res = make_dev_sv(args, dres, fmt, ap); va_end(ap); return (res); } static int make_dev_credv(int flags, struct cdev **dres, struct cdevsw *devsw, int unit, struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt, va_list ap) { struct make_dev_args args; make_dev_args_init(&args); args.mda_flags = flags; args.mda_devsw = devsw; args.mda_cr = cr; args.mda_uid = uid; args.mda_gid = gid; args.mda_mode = mode; args.mda_unit = unit; return (make_dev_sv(&args, dres, fmt, ap)); } struct cdev * make_dev(struct cdevsw *devsw, int unit, uid_t uid, gid_t gid, int mode, const char *fmt, ...) { struct cdev *dev; va_list ap; int res __unused; va_start(ap, fmt); res = make_dev_credv(0, &dev, devsw, unit, NULL, uid, gid, mode, fmt, ap); va_end(ap); KASSERT(res == 0 && dev != NULL, ("make_dev: failed make_dev_credv (error=%d)", res)); return (dev); } struct cdev * make_dev_cred(struct cdevsw *devsw, int unit, struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt, ...) { struct cdev *dev; va_list ap; int res __unused; va_start(ap, fmt); res = make_dev_credv(0, &dev, devsw, unit, cr, uid, gid, mode, fmt, ap); va_end(ap); KASSERT(res == 0 && dev != NULL, ("make_dev_cred: failed make_dev_credv (error=%d)", res)); return (dev); } struct cdev * make_dev_credf(int flags, struct cdevsw *devsw, int unit, struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt, ...) { struct cdev *dev; va_list ap; int res; va_start(ap, fmt); res = make_dev_credv(flags, &dev, devsw, unit, cr, uid, gid, mode, fmt, ap); va_end(ap); KASSERT(((flags & MAKEDEV_NOWAIT) != 0 && res == ENOMEM) || ((flags & MAKEDEV_CHECKNAME) != 0 && res != ENOMEM) || res == 0, ("make_dev_credf: failed make_dev_credv (error=%d)", res)); return (res == 0 ? dev : NULL); } int make_dev_p(int flags, struct cdev **cdev, struct cdevsw *devsw, struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt, ...) { va_list ap; int res; va_start(ap, fmt); res = make_dev_credv(flags, cdev, devsw, 0, cr, uid, gid, mode, fmt, ap); va_end(ap); KASSERT(((flags & MAKEDEV_NOWAIT) != 0 && res == ENOMEM) || ((flags & MAKEDEV_CHECKNAME) != 0 && res != ENOMEM) || res == 0, ("make_dev_p: failed make_dev_credv (error=%d)", res)); return (res); } static void dev_dependsl(struct cdev *pdev, struct cdev *cdev) { cdev->si_parent = pdev; cdev->si_flags |= SI_CHILD; LIST_INSERT_HEAD(&pdev->si_children, cdev, si_siblings); } void dev_depends(struct cdev *pdev, struct cdev *cdev) { dev_lock(); dev_dependsl(pdev, cdev); dev_unlock(); } static int make_dev_alias_v(int flags, struct cdev **cdev, struct cdev *pdev, const char *fmt, va_list ap) { struct cdev *dev; int error; KASSERT(pdev != NULL, ("make_dev_alias_v: pdev is NULL")); KASSERT((flags & MAKEDEV_WAITOK) == 0 || (flags & MAKEDEV_NOWAIT) == 0, ("make_dev_alias_v: both WAITOK and NOWAIT specified")); KASSERT((flags & ~(MAKEDEV_WAITOK | MAKEDEV_NOWAIT | MAKEDEV_CHECKNAME)) == 0, ("make_dev_alias_v: invalid flags specified (flags=%02x)", flags)); dev = devfs_alloc(flags); if (dev == NULL) return (ENOMEM); dev_lock(); dev->si_flags |= SI_ALIAS; error = prep_devname(dev, fmt, ap); if (error != 0) { if ((flags & MAKEDEV_CHECKNAME) == 0) { panic("make_dev_alias_v: bad si_name " "(error=%d, si_name=%s)", error, dev->si_name); } dev_unlock(); devfs_free(dev); return (error); } dev->si_flags |= SI_NAMED; devfs_create(dev); dev_dependsl(pdev, dev); clean_unrhdrl(devfs_inos); dev_unlock(); notify_create(dev, flags); *cdev = dev; return (0); } struct cdev * make_dev_alias(struct cdev *pdev, const char *fmt, ...) { struct cdev *dev; va_list ap; int res __unused; va_start(ap, fmt); res = make_dev_alias_v(MAKEDEV_WAITOK, &dev, pdev, fmt, ap); va_end(ap); KASSERT(res == 0 && dev != NULL, ("make_dev_alias: failed make_dev_alias_v (error=%d)", res)); return (dev); } int make_dev_alias_p(int flags, struct cdev **cdev, struct cdev *pdev, const char *fmt, ...) { va_list ap; int res; va_start(ap, fmt); res = make_dev_alias_v(flags, cdev, pdev, fmt, ap); va_end(ap); return (res); } int make_dev_physpath_alias(int flags, struct cdev **cdev, struct cdev *pdev, struct cdev *old_alias, const char *physpath) { char *devfspath; int physpath_len; int max_parentpath_len; int parentpath_len; int devfspathbuf_len; int mflags; int ret; *cdev = NULL; devfspath = NULL; physpath_len = strlen(physpath); ret = EINVAL; if (physpath_len == 0) goto out; if (strncmp("id1,", physpath, 4) == 0) { physpath += 4; physpath_len -= 4; if (physpath_len == 0) goto out; } max_parentpath_len = SPECNAMELEN - physpath_len - /*/*/1; parentpath_len = strlen(pdev->si_name); if (max_parentpath_len < parentpath_len) { if (bootverbose) printf("WARNING: Unable to alias %s " "to %s/%s - path too long\n", pdev->si_name, physpath, pdev->si_name); ret = ENAMETOOLONG; goto out; } mflags = (flags & MAKEDEV_NOWAIT) ? M_NOWAIT : M_WAITOK; devfspathbuf_len = physpath_len + /*/*/1 + parentpath_len + /*NUL*/1; devfspath = malloc(devfspathbuf_len, M_DEVBUF, mflags); if (devfspath == NULL) { ret = ENOMEM; goto out; } sprintf(devfspath, "%s/%s", physpath, pdev->si_name); if (old_alias != NULL && strcmp(old_alias->si_name, devfspath) == 0) { /* Retain the existing alias. */ *cdev = old_alias; old_alias = NULL; ret = 0; } else { ret = make_dev_alias_p(flags, cdev, pdev, "%s", devfspath); } out: if (old_alias != NULL) destroy_dev(old_alias); if (devfspath != NULL) free(devfspath, M_DEVBUF); return (ret); } static void destroy_devl(struct cdev *dev) { struct cdevsw *csw; struct cdev_privdata *p; struct cdev_priv *cdp; dev_lock_assert_locked(); KASSERT(dev->si_flags & SI_NAMED, ("WARNING: Driver mistake: destroy_dev on %d\n", dev2unit(dev))); KASSERT((dev->si_flags & SI_ETERNAL) == 0, ("WARNING: Driver mistake: destroy_dev on eternal %d\n", dev2unit(dev))); cdp = cdev2priv(dev); if ((cdp->cdp_flags & CDP_UNREF_DTR) == 0) { /* * Avoid race with dev_rel(), e.g. from the populate * loop. If CDP_UNREF_DTR flag is set, the reference * to be dropped at the end of destroy_devl() was * already taken by delist_dev_locked(). */ dev_refl(dev); devfs_destroy(dev); } /* Remove name marking */ dev->si_flags &= ~SI_NAMED; /* If we are a child, remove us from the parents list */ if (dev->si_flags & SI_CHILD) { LIST_REMOVE(dev, si_siblings); dev->si_flags &= ~SI_CHILD; } /* Kill our children */ while (!LIST_EMPTY(&dev->si_children)) destroy_devl(LIST_FIRST(&dev->si_children)); /* Remove from clone list */ if (dev->si_flags & SI_CLONELIST) { LIST_REMOVE(dev, si_clone); dev->si_flags &= ~SI_CLONELIST; } mtx_lock(&cdp->cdp_threadlock); csw = dev->si_devsw; dev->si_devsw = NULL; /* already NULL for SI_ALIAS */ while (csw != NULL && csw->d_purge != NULL && dev->si_threadcount) { csw->d_purge(dev); mtx_unlock(&cdp->cdp_threadlock); msleep(csw, &devmtx, PRIBIO, "devprg", hz/10); mtx_lock(&cdp->cdp_threadlock); if (dev->si_threadcount) printf("Still %lu threads in %s\n", dev->si_threadcount, devtoname(dev)); } while (dev->si_threadcount != 0) { /* Use unique dummy wait ident */ mtx_unlock(&cdp->cdp_threadlock); msleep(&csw, &devmtx, PRIBIO, "devdrn", hz / 10); mtx_lock(&cdp->cdp_threadlock); } mtx_unlock(&cdp->cdp_threadlock); dev_unlock(); if ((cdp->cdp_flags & CDP_UNREF_DTR) == 0) { /* avoid out of order notify events */ notify_destroy(dev); } mtx_lock(&cdevpriv_mtx); while ((p = LIST_FIRST(&cdp->cdp_fdpriv)) != NULL) { devfs_destroy_cdevpriv(p); mtx_lock(&cdevpriv_mtx); } mtx_unlock(&cdevpriv_mtx); dev_lock(); dev->si_drv1 = 0; dev->si_drv2 = 0; bzero(&dev->__si_u, sizeof(dev->__si_u)); if (!(dev->si_flags & SI_ALIAS)) { /* Remove from cdevsw list */ LIST_REMOVE(dev, si_list); /* If cdevsw has no more struct cdev *'s, clean it */ if (LIST_EMPTY(&csw->d_devs)) { fini_cdevsw(csw); wakeup(&csw->d_devs); } } dev->si_flags &= ~SI_ALIAS; cdp->cdp_flags &= ~CDP_UNREF_DTR; dev->si_refcount--; if (dev->si_refcount > 0) LIST_INSERT_HEAD(&dead_cdevsw.d_devs, dev, si_list); else dev_free_devlocked(dev); } static void delist_dev_locked(struct cdev *dev) { struct cdev_priv *cdp; struct cdev *child; dev_lock_assert_locked(); cdp = cdev2priv(dev); if ((cdp->cdp_flags & CDP_UNREF_DTR) != 0) return; cdp->cdp_flags |= CDP_UNREF_DTR; dev_refl(dev); devfs_destroy(dev); LIST_FOREACH(child, &dev->si_children, si_siblings) delist_dev_locked(child); dev_unlock(); /* ensure the destroy event is queued in order */ notify_destroy(dev); dev_lock(); } /* * This function will delist a character device and its children from * the directory listing and create a destroy event without waiting * for all character device references to go away. At some later point * destroy_dev() must be called to complete the character device * destruction. After calling this function the character device name * can instantly be re-used. */ void delist_dev(struct cdev *dev) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "delist_dev"); dev_lock(); delist_dev_locked(dev); dev_unlock(); } void destroy_dev(struct cdev *dev) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "destroy_dev"); dev_lock(); destroy_devl(dev); dev_unlock_and_free(); } const char * devtoname(struct cdev *dev) { return (dev->si_name); } int dev_stdclone(char *name, char **namep, const char *stem, int *unit) { int u, i; i = strlen(stem); if (bcmp(stem, name, i) != 0) return (0); if (!isdigit(name[i])) return (0); u = 0; if (name[i] == '0' && isdigit(name[i+1])) return (0); while (isdigit(name[i])) { u *= 10; u += name[i++] - '0'; } if (u > 0xffffff) return (0); *unit = u; if (namep) *namep = &name[i]; if (name[i]) return (2); return (1); } /* * Helper functions for cloning device drivers. * * The objective here is to make it unnecessary for the device drivers to * use rman or similar to manage their unit number space. Due to the way * we do "on-demand" devices, using rman or other "private" methods * will be very tricky to lock down properly once we lock down this file. * * Instead we give the drivers these routines which puts the struct cdev *'s * that are to be managed on their own list, and gives the driver the ability * to ask for the first free unit number or a given specified unit number. * * In addition these routines support paired devices (pty, nmdm and similar) * by respecting a number of "flag" bits in the minor number. * */ struct clonedevs { LIST_HEAD(,cdev) head; }; void clone_setup(struct clonedevs **cdp) { *cdp = malloc(sizeof **cdp, M_DEVBUF, M_WAITOK | M_ZERO); LIST_INIT(&(*cdp)->head); } int clone_create(struct clonedevs **cdp, struct cdevsw *csw, int *up, struct cdev **dp, int extra) { struct clonedevs *cd; struct cdev *dev, *ndev, *dl, *de; struct make_dev_args args; int unit, low, u; KASSERT(*cdp != NULL, ("clone_setup() not called in driver \"%s\"", csw->d_name)); KASSERT(!(extra & CLONE_UNITMASK), ("Illegal extra bits (0x%x) in clone_create", extra)); KASSERT(*up <= CLONE_UNITMASK, ("Too high unit (0x%x) in clone_create", *up)); KASSERT(csw->d_flags & D_NEEDMINOR, ("clone_create() on cdevsw without minor numbers")); /* * Search the list for a lot of things in one go: * A preexisting match is returned immediately. * The lowest free unit number if we are passed -1, and the place * in the list where we should insert that new element. * The place to insert a specified unit number, if applicable * the end of the list. */ unit = *up; ndev = devfs_alloc(MAKEDEV_WAITOK); dev_lock(); prep_cdevsw(csw, MAKEDEV_WAITOK); low = extra; de = dl = NULL; cd = *cdp; LIST_FOREACH(dev, &cd->head, si_clone) { KASSERT(dev->si_flags & SI_CLONELIST, ("Dev %p(%s) should be on clonelist", dev, dev->si_name)); u = dev2unit(dev); if (u == (unit | extra)) { *dp = dev; dev_unlock(); devfs_free(ndev); return (0); } if (unit == -1 && u == low) { low++; de = dev; continue; } else if (u < (unit | extra)) { de = dev; continue; } else if (u > (unit | extra)) { dl = dev; break; } } if (unit == -1) unit = low & CLONE_UNITMASK; make_dev_args_init(&args); args.mda_unit = unit | extra; args.mda_devsw = csw; dev = newdev(&args, ndev); if (dev->si_flags & SI_CLONELIST) { printf("dev %p (%s) is on clonelist\n", dev, dev->si_name); printf("unit=%d, low=%d, extra=0x%x\n", unit, low, extra); LIST_FOREACH(dev, &cd->head, si_clone) { printf("\t%p %s\n", dev, dev->si_name); } panic("foo"); } KASSERT(!(dev->si_flags & SI_CLONELIST), ("Dev %p(%s) should not be on clonelist", dev, dev->si_name)); if (dl != NULL) LIST_INSERT_BEFORE(dl, dev, si_clone); else if (de != NULL) LIST_INSERT_AFTER(de, dev, si_clone); else LIST_INSERT_HEAD(&cd->head, dev, si_clone); dev->si_flags |= SI_CLONELIST; *up = unit; dev_unlock_and_free(); return (1); } /* * Kill everything still on the list. The driver should already have * disposed of any softc hung of the struct cdev *'s at this time. */ void clone_cleanup(struct clonedevs **cdp) { struct cdev *dev; struct cdev_priv *cp; struct clonedevs *cd; cd = *cdp; if (cd == NULL) return; dev_lock(); while (!LIST_EMPTY(&cd->head)) { dev = LIST_FIRST(&cd->head); LIST_REMOVE(dev, si_clone); KASSERT(dev->si_flags & SI_CLONELIST, ("Dev %p(%s) should be on clonelist", dev, dev->si_name)); dev->si_flags &= ~SI_CLONELIST; cp = cdev2priv(dev); if (!(cp->cdp_flags & CDP_SCHED_DTR)) { cp->cdp_flags |= CDP_SCHED_DTR; KASSERT(dev->si_flags & SI_NAMED, ("Driver has goofed in cloning underways udev %jx unit %x", (uintmax_t)dev2udev(dev), dev2unit(dev))); destroy_devl(dev); } } dev_unlock_and_free(); free(cd, M_DEVBUF); *cdp = NULL; } static TAILQ_HEAD(, cdev_priv) dev_ddtr = TAILQ_HEAD_INITIALIZER(dev_ddtr); static struct task dev_dtr_task = TASK_INITIALIZER(0, destroy_dev_tq, NULL); static void destroy_dev_tq(void *ctx, int pending) { struct cdev_priv *cp; struct cdev *dev; void (*cb)(void *); void *cb_arg; dev_lock(); while (!TAILQ_EMPTY(&dev_ddtr)) { cp = TAILQ_FIRST(&dev_ddtr); dev = &cp->cdp_c; KASSERT(cp->cdp_flags & CDP_SCHED_DTR, ("cdev %p in dev_destroy_tq without CDP_SCHED_DTR", cp)); TAILQ_REMOVE(&dev_ddtr, cp, cdp_dtr_list); cb = cp->cdp_dtr_cb; cb_arg = cp->cdp_dtr_cb_arg; destroy_devl(dev); dev_unlock_and_free(); dev_rel(dev); if (cb != NULL) cb(cb_arg); dev_lock(); } dev_unlock(); } /* * devmtx shall be locked on entry. devmtx will be unlocked after * function return. */ static int destroy_dev_sched_cbl(struct cdev *dev, void (*cb)(void *), void *arg) { struct cdev_priv *cp; dev_lock_assert_locked(); cp = cdev2priv(dev); if (cp->cdp_flags & CDP_SCHED_DTR) { dev_unlock(); return (0); } dev_refl(dev); cp->cdp_flags |= CDP_SCHED_DTR; cp->cdp_dtr_cb = cb; cp->cdp_dtr_cb_arg = arg; TAILQ_INSERT_TAIL(&dev_ddtr, cp, cdp_dtr_list); dev_unlock(); taskqueue_enqueue(taskqueue_swi_giant, &dev_dtr_task); return (1); } int destroy_dev_sched_cb(struct cdev *dev, void (*cb)(void *), void *arg) { dev_lock(); return (destroy_dev_sched_cbl(dev, cb, arg)); } int destroy_dev_sched(struct cdev *dev) { return (destroy_dev_sched_cb(dev, NULL, NULL)); } void destroy_dev_drain(struct cdevsw *csw) { dev_lock(); while (!LIST_EMPTY(&csw->d_devs)) { msleep(&csw->d_devs, &devmtx, PRIBIO, "devscd", hz/10); } dev_unlock(); } void drain_dev_clone_events(void) { sx_xlock(&clone_drain_lock); sx_xunlock(&clone_drain_lock); } #include "opt_ddb.h" #ifdef DDB #include #include DB_SHOW_COMMAND(cdev, db_show_cdev) { struct cdev_priv *cdp; struct cdev *dev; u_int flags; char buf[512]; if (!have_addr) { TAILQ_FOREACH(cdp, &cdevp_list, cdp_list) { dev = &cdp->cdp_c; db_printf("%s %p\n", dev->si_name, dev); if (db_pager_quit) break; } return; } dev = (struct cdev *)addr; cdp = cdev2priv(dev); db_printf("dev %s ref %d use %ld thr %ld inuse %u fdpriv %p\n", dev->si_name, dev->si_refcount, dev->si_usecount, dev->si_threadcount, cdp->cdp_inuse, cdp->cdp_fdpriv.lh_first); db_printf("devsw %p si_drv0 %d si_drv1 %p si_drv2 %p\n", dev->si_devsw, dev->si_drv0, dev->si_drv1, dev->si_drv2); flags = dev->si_flags; #define SI_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 3, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) buf[0] = '\0'; SI_FLAG(SI_ETERNAL); SI_FLAG(SI_ALIAS); SI_FLAG(SI_NAMED); SI_FLAG(SI_CHILD); SI_FLAG(SI_DUMPDEV); SI_FLAG(SI_CLONELIST); db_printf("si_flags %s\n", buf); flags = cdp->cdp_flags; #define CDP_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 4, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) buf[0] = '\0'; CDP_FLAG(CDP_ACTIVE); CDP_FLAG(CDP_SCHED_DTR); db_printf("cdp_flags %s\n", buf); } #endif Index: head/sys/kern/kern_rctl.c =================================================================== --- head/sys/kern/kern_rctl.c (revision 364441) +++ head/sys/kern/kern_rctl.c (revision 364442) @@ -1,2243 +1,2243 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include -#include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RCTL #ifndef RACCT #error "The RCTL option requires the RACCT option" #endif FEATURE(rctl, "Resource Limits"); #define HRF_DEFAULT 0 #define HRF_DONT_INHERIT 1 #define HRF_DONT_ACCUMULATE 2 #define RCTL_MAX_INBUFSIZE 4 * 1024 #define RCTL_MAX_OUTBUFSIZE 16 * 1024 * 1024 #define RCTL_LOG_BUFSIZE 128 #define RCTL_PCPU_SHIFT (10 * 1000000) static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE; static int rctl_log_rate_limit = 10; static int rctl_devctl_rate_limit = 10; /* * Values below are initialized in rctl_init(). */ static int rctl_throttle_min = -1; static int rctl_throttle_max = -1; static int rctl_throttle_pct = -1; static int rctl_throttle_pct2 = -1; static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Resource Limits"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN, &rctl_maxbufsize, 0, "Maximum output buffer size"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW, &rctl_log_rate_limit, 0, "Maximum number of log messages per second"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN, &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second"); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, &rctl_throttle_min_sysctl, "IU", "Shortest throttling duration, in hz"); TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, &rctl_throttle_max_sysctl, "IU", "Longest throttling duration, in hz"); TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, &rctl_throttle_pct_sysctl, "IU", "Throttling penalty for process consumption, in percent"); TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, &rctl_throttle_pct2_sysctl, "IU", "Throttling penalty for container consumption, in percent"); TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2); /* * 'rctl_rule_link' connects a rule with every racct it's related to. * For example, rule 'user:X:openfiles:deny=N/process' is linked * with uidinfo for user X, and to each process of that user. */ struct rctl_rule_link { LIST_ENTRY(rctl_rule_link) rrl_next; struct rctl_rule *rrl_rule; int rrl_exceeded; }; struct dict { const char *d_name; int d_value; }; static struct dict subjectnames[] = { { "process", RCTL_SUBJECT_TYPE_PROCESS }, { "user", RCTL_SUBJECT_TYPE_USER }, { "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS }, { "jail", RCTL_SUBJECT_TYPE_JAIL }, { NULL, -1 }}; static struct dict resourcenames[] = { { "cputime", RACCT_CPU }, { "datasize", RACCT_DATA }, { "stacksize", RACCT_STACK }, { "coredumpsize", RACCT_CORE }, { "memoryuse", RACCT_RSS }, { "memorylocked", RACCT_MEMLOCK }, { "maxproc", RACCT_NPROC }, { "openfiles", RACCT_NOFILE }, { "vmemoryuse", RACCT_VMEM }, { "pseudoterminals", RACCT_NPTS }, { "swapuse", RACCT_SWAP }, { "nthr", RACCT_NTHR }, { "msgqqueued", RACCT_MSGQQUEUED }, { "msgqsize", RACCT_MSGQSIZE }, { "nmsgq", RACCT_NMSGQ }, { "nsem", RACCT_NSEM }, { "nsemop", RACCT_NSEMOP }, { "nshm", RACCT_NSHM }, { "shmsize", RACCT_SHMSIZE }, { "wallclock", RACCT_WALLCLOCK }, { "pcpu", RACCT_PCTCPU }, { "readbps", RACCT_READBPS }, { "writebps", RACCT_WRITEBPS }, { "readiops", RACCT_READIOPS }, { "writeiops", RACCT_WRITEIOPS }, { NULL, -1 }}; static struct dict actionnames[] = { { "sighup", RCTL_ACTION_SIGHUP }, { "sigint", RCTL_ACTION_SIGINT }, { "sigquit", RCTL_ACTION_SIGQUIT }, { "sigill", RCTL_ACTION_SIGILL }, { "sigtrap", RCTL_ACTION_SIGTRAP }, { "sigabrt", RCTL_ACTION_SIGABRT }, { "sigemt", RCTL_ACTION_SIGEMT }, { "sigfpe", RCTL_ACTION_SIGFPE }, { "sigkill", RCTL_ACTION_SIGKILL }, { "sigbus", RCTL_ACTION_SIGBUS }, { "sigsegv", RCTL_ACTION_SIGSEGV }, { "sigsys", RCTL_ACTION_SIGSYS }, { "sigpipe", RCTL_ACTION_SIGPIPE }, { "sigalrm", RCTL_ACTION_SIGALRM }, { "sigterm", RCTL_ACTION_SIGTERM }, { "sigurg", RCTL_ACTION_SIGURG }, { "sigstop", RCTL_ACTION_SIGSTOP }, { "sigtstp", RCTL_ACTION_SIGTSTP }, { "sigchld", RCTL_ACTION_SIGCHLD }, { "sigttin", RCTL_ACTION_SIGTTIN }, { "sigttou", RCTL_ACTION_SIGTTOU }, { "sigio", RCTL_ACTION_SIGIO }, { "sigxcpu", RCTL_ACTION_SIGXCPU }, { "sigxfsz", RCTL_ACTION_SIGXFSZ }, { "sigvtalrm", RCTL_ACTION_SIGVTALRM }, { "sigprof", RCTL_ACTION_SIGPROF }, { "sigwinch", RCTL_ACTION_SIGWINCH }, { "siginfo", RCTL_ACTION_SIGINFO }, { "sigusr1", RCTL_ACTION_SIGUSR1 }, { "sigusr2", RCTL_ACTION_SIGUSR2 }, { "sigthr", RCTL_ACTION_SIGTHR }, { "deny", RCTL_ACTION_DENY }, { "log", RCTL_ACTION_LOG }, { "devctl", RCTL_ACTION_DEVCTL }, { "throttle", RCTL_ACTION_THROTTLE }, { NULL, -1 }}; static void rctl_init(void); SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL); static uma_zone_t rctl_rule_zone; static uma_zone_t rctl_rule_link_zone; static int rctl_rule_fully_specified(const struct rctl_rule *rule); static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule); static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits"); static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_min; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 1 || val > rctl_throttle_max) return (EINVAL); RACCT_LOCK(); rctl_throttle_min = val; RACCT_UNLOCK(); return (0); } static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_max; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < rctl_throttle_min) return (EINVAL); RACCT_LOCK(); rctl_throttle_max = val; RACCT_UNLOCK(); return (0); } static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_pct; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 0) return (EINVAL); RACCT_LOCK(); rctl_throttle_pct = val; RACCT_UNLOCK(); return (0); } static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_pct2; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 0) return (EINVAL); RACCT_LOCK(); rctl_throttle_pct2 = val; RACCT_UNLOCK(); return (0); } static const char * rctl_subject_type_name(int subject) { int i; for (i = 0; subjectnames[i].d_name != NULL; i++) { if (subjectnames[i].d_value == subject) return (subjectnames[i].d_name); } panic("rctl_subject_type_name: unknown subject type %d", subject); } static const char * rctl_action_name(int action) { int i; for (i = 0; actionnames[i].d_name != NULL; i++) { if (actionnames[i].d_value == action) return (actionnames[i].d_name); } panic("rctl_action_name: unknown action %d", action); } const char * rctl_resource_name(int resource) { int i; for (i = 0; resourcenames[i].d_name != NULL; i++) { if (resourcenames[i].d_value == resource) return (resourcenames[i].d_name); } panic("rctl_resource_name: unknown resource %d", resource); } static struct racct * rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule) { struct ucred *cred = p->p_ucred; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); switch (rule->rr_per) { case RCTL_SUBJECT_TYPE_PROCESS: return (p->p_racct); case RCTL_SUBJECT_TYPE_USER: return (cred->cr_ruidinfo->ui_racct); case RCTL_SUBJECT_TYPE_LOGINCLASS: return (cred->cr_loginclass->lc_racct); case RCTL_SUBJECT_TYPE_JAIL: return (cred->cr_prison->pr_prison_racct->prr_racct); default: panic("%s: unknown per %d", __func__, rule->rr_per); } } /* * Return the amount of resource that can be allocated by 'p' before * hitting 'rule'. */ static int64_t rctl_available_resource(const struct proc *p, const struct rctl_rule *rule) { const struct racct *racct; int64_t available; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); racct = rctl_proc_rule_to_racct(p, rule); available = rule->rr_amount - racct->r_resources[rule->rr_resource]; return (available); } /* * Called every second for proc, uidinfo, loginclass, and jail containers. * If the limit isn't exceeded, it decreases the usage amount to zero. * Otherwise, it decreases it by the value of the limit. This way * resource consumption exceeding the limit "carries over" to the next * period. */ void rctl_throttle_decay(struct racct *racct, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t minavailable; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); minavailable = INT64_MAX; LIST_FOREACH(link, &racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_THROTTLE) continue; if (rule->rr_amount < minavailable) minavailable = rule->rr_amount; } if (racct->r_resources[resource] < minavailable) { racct->r_resources[resource] = 0; } else { /* * Cap utilization counter at ten times the limit. Otherwise, * if we changed the rule lowering the allowed amount, it could * take unreasonably long time for the accumulated resource * usage to drop. */ if (racct->r_resources[resource] > minavailable * 10) racct->r_resources[resource] = minavailable * 10; racct->r_resources[resource] -= minavailable; } } /* * Special version of rctl_get_available() for the %CPU resource. * We slightly cheat here and return less than we normally would. */ int64_t rctl_pcpu_available(const struct proc *p) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t available, minavailable, limit; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); minavailable = INT64_MAX; limit = 0; LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != RACCT_PCTCPU) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; available = rctl_available_resource(p, rule); if (available < minavailable) { minavailable = available; limit = rule->rr_amount; } } /* * Return slightly less than actual value of the available * %cpu resource. This makes %cpu throttling more aggressive * and lets us act sooner than the limits are already exceeded. */ if (limit != 0) { if (limit > 2 * RCTL_PCPU_SHIFT) minavailable -= RCTL_PCPU_SHIFT; else minavailable -= (limit / 2); } return (minavailable); } static uint64_t xadd(uint64_t a, uint64_t b) { uint64_t c; c = a + b; /* * Detect overflow. */ if (c < a || c < b) return (UINT64_MAX); return (c); } static uint64_t xmul(uint64_t a, uint64_t b) { if (b != 0 && a > UINT64_MAX / b) return (UINT64_MAX); return (a * b); } /* * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition * to what it keeps allocated now. Returns non-zero if the allocation should * be denied, 0 otherwise. */ int rctl_enforce(struct proc *p, int resource, uint64_t amount) { static struct timeval log_lasttime, devctl_lasttime; static int log_curtime = 0, devctl_curtime = 0; struct rctl_rule *rule; struct rctl_rule_link *link; struct sbuf sb; char *buf; int64_t available; uint64_t sleep_ms, sleep_ratio; int should_deny = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; available = rctl_available_resource(p, rule); if (available >= (int64_t)amount) { link->rrl_exceeded = 0; continue; } switch (rule->rr_action) { case RCTL_ACTION_DENY: should_deny = 1; continue; case RCTL_ACTION_LOG: /* * If rrl_exceeded != 0, it means we've already * logged a warning for this process. */ if (link->rrl_exceeded != 0) continue; /* * If the process state is not fully initialized yet, * we can't access most of the required fields, e.g. * p->p_comm. This happens when called from fork1(). * Ignore this rule for now; it will be processed just * after fork, when called from racct_proc_fork_done(). */ if (p->p_state != PRS_NORMAL) continue; if (!ppsratecheck(&log_lasttime, &log_curtime, rctl_log_rate_limit)) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); continue; } sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN); rctl_rule_to_sbuf(&sb, rule); sbuf_finish(&sb); printf("rctl: rule \"%s\" matched by pid %d " "(%s), uid %d, jail %s\n", sbuf_data(&sb), p->p_pid, p->p_comm, p->p_ucred->cr_uid, p->p_ucred->cr_prison->pr_prison_racct->prr_name); sbuf_delete(&sb); free(buf, M_RCTL); link->rrl_exceeded = 1; continue; case RCTL_ACTION_DEVCTL: if (link->rrl_exceeded != 0) continue; if (p->p_state != PRS_NORMAL) continue; if (!ppsratecheck(&devctl_lasttime, &devctl_curtime, rctl_devctl_rate_limit)) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); continue; } sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN); sbuf_printf(&sb, "rule="); rctl_rule_to_sbuf(&sb, rule); sbuf_printf(&sb, " pid=%d ruid=%d jail=%s", p->p_pid, p->p_ucred->cr_ruid, p->p_ucred->cr_prison->pr_prison_racct->prr_name); sbuf_finish(&sb); devctl_notify_f("RCTL", "rule", "matched", sbuf_data(&sb), M_NOWAIT); sbuf_delete(&sb); free(buf, M_RCTL); link->rrl_exceeded = 1; continue; case RCTL_ACTION_THROTTLE: if (p->p_state != PRS_NORMAL) continue; /* * Make the process sleep for a fraction of second * proportional to the ratio of process' resource * utilization compared to the limit. The point is * to penalize resource hogs: processes that consume * more of the available resources sleep for longer. * * We're trying to defer division until the very end, * to minimize the rounding effects. The following * calculation could have been written in a clearer * way like this: * * sleep_ms = hz * p->p_racct->r_resources[resource] / * rule->rr_amount; * sleep_ms *= rctl_throttle_pct / 100; * if (sleep_ms < rctl_throttle_min) * sleep_ms = rctl_throttle_min; * */ sleep_ms = xmul(hz, p->p_racct->r_resources[resource]); sleep_ms = xmul(sleep_ms, rctl_throttle_pct) / 100; if (sleep_ms < rctl_throttle_min * rule->rr_amount) sleep_ms = rctl_throttle_min * rule->rr_amount; /* * Multiply that by the ratio of the resource * consumption for the container compared to the limit, * squared. In other words, a process in a container * that is two times over the limit will be throttled * four times as much for hitting the same rule. The * point is to penalize processes more if the container * itself (eg certain UID or jail) is above the limit. */ if (available < 0) sleep_ratio = -available / rule->rr_amount; else sleep_ratio = 0; sleep_ratio = xmul(sleep_ratio, sleep_ratio); sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100; sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio)); /* * Finally the division. */ sleep_ms /= rule->rr_amount; if (sleep_ms > rctl_throttle_max) sleep_ms = rctl_throttle_max; #if 0 printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n", __func__, p->p_pid, p->p_comm, p->p_racct->r_resources[resource], rule->rr_amount, (uintmax_t)sleep_ms, (uintmax_t)sleep_ratio, (intmax_t)available); #endif KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n", __func__, (uintmax_t)sleep_ms, rctl_throttle_min)); racct_proc_throttle(p, sleep_ms); continue; default: if (link->rrl_exceeded != 0) continue; if (p->p_state != PRS_NORMAL) continue; KASSERT(rule->rr_action > 0 && rule->rr_action <= RCTL_ACTION_SIGNAL_MAX, ("rctl_enforce: unknown action %d", rule->rr_action)); /* * We're using the fact that RCTL_ACTION_SIG* values * are equal to their counterparts from sys/signal.h. */ kern_psignal(p, rule->rr_action); link->rrl_exceeded = 1; continue; } } if (should_deny) { /* * Return fake error code; the caller should change it * into one proper for the situation - EFSIZ, ENOMEM etc. */ return (EDOOFUS); } return (0); } uint64_t rctl_get_limit(struct proc *p, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; uint64_t amount = UINT64_MAX; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; if (rule->rr_amount < amount) amount = rule->rr_amount; } return (amount); } uint64_t rctl_get_available(struct proc *p, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t available, minavailable, allocated; minavailable = INT64_MAX; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; available = rctl_available_resource(p, rule); if (available < minavailable) minavailable = available; } /* * XXX: Think about this _hard_. */ allocated = p->p_racct->r_resources[resource]; if (minavailable < INT64_MAX - allocated) minavailable += allocated; if (minavailable < 0) minavailable = 0; return (minavailable); } static int rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter) { ASSERT_RACCT_ENABLED(); if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) { if (rule->rr_subject_type != filter->rr_subject_type) return (0); switch (filter->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: if (filter->rr_subject.rs_proc != NULL && rule->rr_subject.rs_proc != filter->rr_subject.rs_proc) return (0); break; case RCTL_SUBJECT_TYPE_USER: if (filter->rr_subject.rs_uip != NULL && rule->rr_subject.rs_uip != filter->rr_subject.rs_uip) return (0); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (filter->rr_subject.rs_loginclass != NULL && rule->rr_subject.rs_loginclass != filter->rr_subject.rs_loginclass) return (0); break; case RCTL_SUBJECT_TYPE_JAIL: if (filter->rr_subject.rs_prison_racct != NULL && rule->rr_subject.rs_prison_racct != filter->rr_subject.rs_prison_racct) return (0); break; default: panic("rctl_rule_matches: unknown subject type %d", filter->rr_subject_type); } } if (filter->rr_resource != RACCT_UNDEFINED) { if (rule->rr_resource != filter->rr_resource) return (0); } if (filter->rr_action != RCTL_ACTION_UNDEFINED) { if (rule->rr_action != filter->rr_action) return (0); } if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) { if (rule->rr_amount != filter->rr_amount) return (0); } if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) { if (rule->rr_per != filter->rr_per) return (0); } return (1); } static int str2value(const char *str, int *value, struct dict *table) { int i; if (value == NULL) return (EINVAL); for (i = 0; table[i].d_name != NULL; i++) { if (strcasecmp(table[i].d_name, str) == 0) { *value = table[i].d_value; return (0); } } return (EINVAL); } static int str2id(const char *str, id_t *value) { char *end; if (str == NULL) return (EINVAL); *value = strtoul(str, &end, 10); if ((size_t)(end - str) != strlen(str)) return (EINVAL); return (0); } static int str2int64(const char *str, int64_t *value) { char *end; if (str == NULL) return (EINVAL); *value = strtoul(str, &end, 10); if ((size_t)(end - str) != strlen(str)) return (EINVAL); if (*value < 0) return (ERANGE); return (0); } /* * Connect the rule to the racct, increasing refcount for the rule. */ static void rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); rctl_rule_acquire(rule); link = uma_zalloc(rctl_rule_link_zone, M_WAITOK); link->rrl_rule = rule; link->rrl_exceeded = 0; RACCT_LOCK(); LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next); RACCT_UNLOCK(); } static int rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); RACCT_LOCK_ASSERT(); link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT); if (link == NULL) return (ENOMEM); rctl_rule_acquire(rule); link->rrl_rule = rule; link->rrl_exceeded = 0; LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next); return (0); } /* * Remove limits for a rules matching the filter and release * the refcounts for the rules, possibly freeing them. Returns * the number of limit structures removed. */ static int rctl_racct_remove_rules(struct racct *racct, const struct rctl_rule *filter) { struct rctl_rule_link *link, *linktmp; int removed = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) { if (!rctl_rule_matches(link->rrl_rule, filter)) continue; LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); removed++; } return (removed); } static void rctl_rule_acquire_subject(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct != NULL) prison_racct_hold(rule->rr_subject.rs_prison_racct); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip != NULL) uihold(rule->rr_subject.rs_uip); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass != NULL) loginclass_hold(rule->rr_subject.rs_loginclass); break; default: panic("rctl_rule_acquire_subject: unknown subject type %d", rule->rr_subject_type); } } static void rctl_rule_release_subject(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct != NULL) prison_racct_free(rule->rr_subject.rs_prison_racct); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip != NULL) uifree(rule->rr_subject.rs_uip); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass != NULL) loginclass_free(rule->rr_subject.rs_loginclass); break; default: panic("rctl_rule_release_subject: unknown subject type %d", rule->rr_subject_type); } } struct rctl_rule * rctl_rule_alloc(int flags) { struct rctl_rule *rule; ASSERT_RACCT_ENABLED(); rule = uma_zalloc(rctl_rule_zone, flags); if (rule == NULL) return (NULL); rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; rule->rr_subject.rs_proc = NULL; rule->rr_subject.rs_uip = NULL; rule->rr_subject.rs_loginclass = NULL; rule->rr_subject.rs_prison_racct = NULL; rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; rule->rr_resource = RACCT_UNDEFINED; rule->rr_action = RCTL_ACTION_UNDEFINED; rule->rr_amount = RCTL_AMOUNT_UNDEFINED; refcount_init(&rule->rr_refcount, 1); return (rule); } struct rctl_rule * rctl_rule_duplicate(const struct rctl_rule *rule, int flags) { struct rctl_rule *copy; ASSERT_RACCT_ENABLED(); copy = uma_zalloc(rctl_rule_zone, flags); if (copy == NULL) return (NULL); copy->rr_subject_type = rule->rr_subject_type; copy->rr_subject.rs_proc = rule->rr_subject.rs_proc; copy->rr_subject.rs_uip = rule->rr_subject.rs_uip; copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass; copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct; copy->rr_per = rule->rr_per; copy->rr_resource = rule->rr_resource; copy->rr_action = rule->rr_action; copy->rr_amount = rule->rr_amount; refcount_init(©->rr_refcount, 1); rctl_rule_acquire_subject(copy); return (copy); } void rctl_rule_acquire(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); refcount_acquire(&rule->rr_refcount); } static void rctl_rule_free(void *context, int pending) { struct rctl_rule *rule; rule = (struct rctl_rule *)context; ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0")); /* * We don't need locking here; rule is guaranteed to be inaccessible. */ rctl_rule_release_subject(rule); uma_zfree(rctl_rule_zone, rule); } void rctl_rule_release(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); if (refcount_release(&rule->rr_refcount)) { /* * rctl_rule_release() is often called when iterating * over all the uidinfo structures in the system, * holding uihashtbl_lock. Since rctl_rule_free() * might end up calling uifree(), this would lead * to lock recursion. Use taskqueue to avoid this. */ TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule); taskqueue_enqueue(taskqueue_thread, &rule->rr_task); } } static int rctl_rule_fully_specified(const struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: return (0); case RCTL_SUBJECT_TYPE_PROCESS: if (rule->rr_subject.rs_proc == NULL) return (0); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip == NULL) return (0); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass == NULL) return (0); break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct == NULL) return (0); break; default: panic("rctl_rule_fully_specified: unknown subject type %d", rule->rr_subject_type); } if (rule->rr_resource == RACCT_UNDEFINED) return (0); if (rule->rr_action == RCTL_ACTION_UNDEFINED) return (0); if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED) return (0); if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED) return (0); return (1); } static int rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep) { struct rctl_rule *rule; char *subjectstr, *subject_idstr, *resourcestr, *actionstr, *amountstr, *perstr; id_t id; int error = 0; ASSERT_RACCT_ENABLED(); rule = rctl_rule_alloc(M_WAITOK); subjectstr = strsep(&rulestr, ":"); subject_idstr = strsep(&rulestr, ":"); resourcestr = strsep(&rulestr, ":"); actionstr = strsep(&rulestr, "=/"); amountstr = strsep(&rulestr, "/"); perstr = rulestr; if (subjectstr == NULL || subjectstr[0] == '\0') rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; else { error = str2value(subjectstr, &rule->rr_subject_type, subjectnames); if (error != 0) goto out; } if (subject_idstr == NULL || subject_idstr[0] == '\0') { rule->rr_subject.rs_proc = NULL; rule->rr_subject.rs_uip = NULL; rule->rr_subject.rs_loginclass = NULL; rule->rr_subject.rs_prison_racct = NULL; } else { switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: error = EINVAL; goto out; case RCTL_SUBJECT_TYPE_PROCESS: error = str2id(subject_idstr, &id); if (error != 0) goto out; sx_assert(&allproc_lock, SA_LOCKED); rule->rr_subject.rs_proc = pfind(id); if (rule->rr_subject.rs_proc == NULL) { error = ESRCH; goto out; } PROC_UNLOCK(rule->rr_subject.rs_proc); break; case RCTL_SUBJECT_TYPE_USER: error = str2id(subject_idstr, &id); if (error != 0) goto out; rule->rr_subject.rs_uip = uifind(id); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: rule->rr_subject.rs_loginclass = loginclass_find(subject_idstr); if (rule->rr_subject.rs_loginclass == NULL) { error = ENAMETOOLONG; goto out; } break; case RCTL_SUBJECT_TYPE_JAIL: rule->rr_subject.rs_prison_racct = prison_racct_find(subject_idstr); if (rule->rr_subject.rs_prison_racct == NULL) { error = ENAMETOOLONG; goto out; } break; default: panic("rctl_string_to_rule: unknown subject type %d", rule->rr_subject_type); } } if (resourcestr == NULL || resourcestr[0] == '\0') rule->rr_resource = RACCT_UNDEFINED; else { error = str2value(resourcestr, &rule->rr_resource, resourcenames); if (error != 0) goto out; } if (actionstr == NULL || actionstr[0] == '\0') rule->rr_action = RCTL_ACTION_UNDEFINED; else { error = str2value(actionstr, &rule->rr_action, actionnames); if (error != 0) goto out; } if (amountstr == NULL || amountstr[0] == '\0') rule->rr_amount = RCTL_AMOUNT_UNDEFINED; else { error = str2int64(amountstr, &rule->rr_amount); if (error != 0) goto out; if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) { if (rule->rr_amount > INT64_MAX / 1000000) { error = ERANGE; goto out; } rule->rr_amount *= 1000000; } } if (perstr == NULL || perstr[0] == '\0') rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; else { error = str2value(perstr, &rule->rr_per, subjectnames); if (error != 0) goto out; } out: if (error == 0) *rulep = rule; else rctl_rule_release(rule); return (error); } /* * Link a rule with all the subjects it applies to. */ int rctl_rule_add(struct rctl_rule *rule) { struct proc *p; struct ucred *cred; struct uidinfo *uip; struct prison *pr; struct prison_racct *prr; struct loginclass *lc; struct rctl_rule *rule2; int match; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); /* * Some rules just don't make sense, like "deny" rule for an undeniable * resource. The exception are the RSS and %CPU resources - they are * not deniable in the racct sense, but the limit is enforced in * a different way. */ if (rule->rr_action == RCTL_ACTION_DENY && !RACCT_IS_DENIABLE(rule->rr_resource) && rule->rr_resource != RACCT_RSS && rule->rr_resource != RACCT_PCTCPU) { return (EOPNOTSUPP); } if (rule->rr_action == RCTL_ACTION_THROTTLE && !RACCT_IS_DECAYING(rule->rr_resource)) { return (EOPNOTSUPP); } if (rule->rr_action == RCTL_ACTION_THROTTLE && rule->rr_resource == RACCT_PCTCPU) { return (EOPNOTSUPP); } if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS && RACCT_IS_SLOPPY(rule->rr_resource)) { return (EOPNOTSUPP); } /* * Make sure there are no duplicated rules. Also, for the "deny" * rules, remove ones differing only by "amount". */ if (rule->rr_action == RCTL_ACTION_DENY) { rule2 = rctl_rule_duplicate(rule, M_WAITOK); rule2->rr_amount = RCTL_AMOUNT_UNDEFINED; rctl_rule_remove(rule2); rctl_rule_release(rule2); } else rctl_rule_remove(rule); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: p = rule->rr_subject.rs_proc; KASSERT(p != NULL, ("rctl_rule_add: NULL proc")); rctl_racct_add_rule(p->p_racct, rule); /* * In case of per-process rule, we don't have anything more * to do. */ return (0); case RCTL_SUBJECT_TYPE_USER: uip = rule->rr_subject.rs_uip; KASSERT(uip != NULL, ("rctl_rule_add: NULL uip")); rctl_racct_add_rule(uip->ui_racct, rule); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: lc = rule->rr_subject.rs_loginclass; KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass")); rctl_racct_add_rule(lc->lc_racct, rule); break; case RCTL_SUBJECT_TYPE_JAIL: prr = rule->rr_subject.rs_prison_racct; KASSERT(prr != NULL, ("rctl_rule_add: NULL pr")); rctl_racct_add_rule(prr->prr_racct, rule); break; default: panic("rctl_rule_add: unknown subject type %d", rule->rr_subject_type); } /* * Now go through all the processes and add the new rule to the ones * it applies to. */ sx_assert(&allproc_lock, SA_LOCKED); FOREACH_PROC_IN_SYSTEM(p) { cred = p->p_ucred; switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_USER: if (cred->cr_uidinfo == rule->rr_subject.rs_uip || cred->cr_ruidinfo == rule->rr_subject.rs_uip) break; continue; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (cred->cr_loginclass == rule->rr_subject.rs_loginclass) break; continue; case RCTL_SUBJECT_TYPE_JAIL: match = 0; for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) { if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) { match = 1; break; } } if (match) break; continue; default: panic("rctl_rule_add: unknown subject type %d", rule->rr_subject_type); } rctl_racct_add_rule(p->p_racct, rule); } return (0); } static void rctl_rule_pre_callback(void) { RACCT_LOCK(); } static void rctl_rule_post_callback(void) { RACCT_UNLOCK(); } static void rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3) { struct rctl_rule *filter = (struct rctl_rule *)arg2; int found = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); found += rctl_racct_remove_rules(racct, filter); *((int *)arg3) += found; } /* * Remove all rules that match the filter. */ int rctl_rule_remove(struct rctl_rule *filter) { struct proc *p; int found = 0; ASSERT_RACCT_ENABLED(); if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS && filter->rr_subject.rs_proc != NULL) { p = filter->rr_subject.rs_proc; RACCT_LOCK(); found = rctl_racct_remove_rules(p->p_racct, filter); RACCT_UNLOCK(); if (found) return (0); return (ESRCH); } loginclass_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); ui_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); prison_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); sx_assert(&allproc_lock, SA_LOCKED); RACCT_LOCK(); FOREACH_PROC_IN_SYSTEM(p) { found += rctl_racct_remove_rules(p->p_racct, filter); } RACCT_UNLOCK(); if (found) return (0); return (ESRCH); } /* * Appends a rule to the sbuf. */ static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule) { int64_t amount; ASSERT_RACCT_ENABLED(); sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type)); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: if (rule->rr_subject.rs_proc == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%d:", rule->rr_subject.rs_proc->p_pid); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%d:", rule->rr_subject.rs_uip->ui_uid); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%s:", rule->rr_subject.rs_loginclass->lc_name); break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%s:", rule->rr_subject.rs_prison_racct->prr_name); break; default: panic("rctl_rule_to_sbuf: unknown subject type %d", rule->rr_subject_type); } amount = rule->rr_amount; if (amount != RCTL_AMOUNT_UNDEFINED && RACCT_IS_IN_MILLIONS(rule->rr_resource)) amount /= 1000000; sbuf_printf(sb, "%s:%s=%jd", rctl_resource_name(rule->rr_resource), rctl_action_name(rule->rr_action), amount); if (rule->rr_per != rule->rr_subject_type) sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per)); } /* * Routine used by RCTL syscalls to read in input string. */ static int rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen) { char *str; int error; ASSERT_RACCT_ENABLED(); if (inbuflen <= 0) return (EINVAL); if (inbuflen > RCTL_MAX_INBUFSIZE) return (E2BIG); str = malloc(inbuflen + 1, M_RCTL, M_WAITOK); error = copyinstr(inbufp, str, inbuflen, NULL); if (error != 0) { free(str, M_RCTL); return (error); } *inputstr = str; return (0); } /* * Routine used by RCTL syscalls to write out output string. */ static int rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen) { int error; ASSERT_RACCT_ENABLED(); if (outputsbuf == NULL) return (0); sbuf_finish(outputsbuf); if (outbuflen < sbuf_len(outputsbuf) + 1) { sbuf_delete(outputsbuf); return (ERANGE); } error = copyout(sbuf_data(outputsbuf), outbufp, sbuf_len(outputsbuf) + 1); sbuf_delete(outputsbuf); return (error); } static struct sbuf * rctl_racct_to_sbuf(struct racct *racct, int sloppy) { struct sbuf *sb; int64_t amount; int i; ASSERT_RACCT_ENABLED(); sb = sbuf_new_auto(); for (i = 0; i <= RACCT_MAX; i++) { if (sloppy == 0 && RACCT_IS_SLOPPY(i)) continue; RACCT_LOCK(); amount = racct->r_resources[i]; RACCT_UNLOCK(); if (RACCT_IS_IN_MILLIONS(i)) amount /= 1000000; sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount); } sbuf_setpos(sb, sbuf_len(sb) - 1); return (sb); } int sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap) { struct rctl_rule *filter; struct sbuf *outputsbuf = NULL; struct proc *p; struct uidinfo *uip; struct loginclass *lc; struct prison_racct *prr; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_RACCT); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } switch (filter->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: p = filter->rr_subject.rs_proc; if (p == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0); break; case RCTL_SUBJECT_TYPE_USER: uip = filter->rr_subject.rs_uip; if (uip == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: lc = filter->rr_subject.rs_loginclass; if (lc == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1); break; case RCTL_SUBJECT_TYPE_JAIL: prr = filter->rr_subject.rs_prison_racct; if (prr == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1); break; default: error = EINVAL; } out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); if (error != 0) return (error); error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen); return (error); } static void rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3) { struct rctl_rule *filter = (struct rctl_rule *)arg2; struct rctl_rule_link *link; struct sbuf *sb = (struct sbuf *)arg3; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); LIST_FOREACH(link, &racct->r_rule_links, rrl_next) { if (!rctl_rule_matches(link->rrl_rule, filter)) continue; rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } } int sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) { struct sbuf *sb; struct rctl_rule *filter; struct rctl_rule_link *link; struct proc *p; char *inputstr, *buf; size_t bufsize; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_RULES); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { sx_sunlock(&allproc_lock); return (E2BIG); } buf = malloc(bufsize, M_RCTL, M_WAITOK); sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); KASSERT(sb != NULL, ("sbuf_new failed")); FOREACH_PROC_IN_SYSTEM(p) { RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { /* * Non-process rules will be added to the buffer later. * Adding them here would result in duplicated output. */ if (link->rrl_rule->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) continue; if (!rctl_rule_matches(link->rrl_rule, filter)) continue; rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } RACCT_UNLOCK(); } loginclass_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); ui_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); prison_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); if (sbuf_error(sb) == ENOMEM) { error = ERANGE; goto out; } /* * Remove trailing ",". */ if (sbuf_len(sb) > 0) sbuf_setpos(sb, sbuf_len(sb) - 1); error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); free(buf, M_RCTL); return (error); } int sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) { struct sbuf *sb; struct rctl_rule *filter; struct rctl_rule_link *link; char *inputstr, *buf; size_t bufsize; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_LIMITS); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EINVAL); } if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EOPNOTSUPP); } if (filter->rr_subject.rs_proc == NULL) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EINVAL); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (E2BIG); } buf = malloc(bufsize, M_RCTL, M_WAITOK); sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); KASSERT(sb != NULL, ("sbuf_new failed")); RACCT_LOCK(); LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links, rrl_next) { rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } RACCT_UNLOCK(); if (sbuf_error(sb) == ENOMEM) { error = ERANGE; sbuf_delete(sb); goto out; } /* * Remove trailing ",". */ if (sbuf_len(sb) > 0) sbuf_setpos(sb, sbuf_len(sb) - 1); error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); free(buf, M_RCTL); return (error); } int sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) { struct rctl_rule *rule; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_ADD_RULE); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &rule); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } /* * The 'per' part of a rule is optional. */ if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED && rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) rule->rr_per = rule->rr_subject_type; if (!rctl_rule_fully_specified(rule)) { error = EINVAL; goto out; } error = rctl_rule_add(rule); out: rctl_rule_release(rule); sx_sunlock(&allproc_lock); return (error); } int sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) { struct rctl_rule *filter; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_REMOVE_RULE); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } error = rctl_rule_remove(filter); rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (error); } /* * Update RCTL rule list after credential change. */ void rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred) { LIST_HEAD(, rctl_rule_link) newrules; struct rctl_rule_link *link, *newlink; struct uidinfo *newuip; struct loginclass *newlc; struct prison_racct *newprr; int rulecnt, i; if (!racct_enable) return; PROC_LOCK_ASSERT(p, MA_NOTOWNED); newuip = newcred->cr_ruidinfo; newlc = newcred->cr_loginclass; newprr = newcred->cr_prison->pr_prison_racct; LIST_INIT(&newrules); again: /* * First, count the rules that apply to the process with new * credentials. */ rulecnt = 0; RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) rulecnt++; } LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) rulecnt++; LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) rulecnt++; LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) rulecnt++; RACCT_UNLOCK(); /* * Create temporary list. We've dropped the rctl_lock in order * to use M_WAITOK. */ for (i = 0; i < rulecnt; i++) { newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK); newlink->rrl_rule = NULL; newlink->rrl_exceeded = 0; LIST_INSERT_HEAD(&newrules, newlink, rrl_next); } newlink = LIST_FIRST(&newrules); /* * Assign rules to the newly allocated list entries. */ RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } } LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } if (rulecnt == 0) { /* * Free the old rule list. */ while (!LIST_EMPTY(&p->p_racct->r_rule_links)) { link = LIST_FIRST(&p->p_racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } /* * Replace lists and we're done. * * XXX: Is there any way to switch list heads instead * of iterating here? */ while (!LIST_EMPTY(&newrules)) { newlink = LIST_FIRST(&newrules); LIST_REMOVE(newlink, rrl_next); LIST_INSERT_HEAD(&p->p_racct->r_rule_links, newlink, rrl_next); } RACCT_UNLOCK(); return; } goaround: RACCT_UNLOCK(); /* * Rule list changed while we were not holding the rctl_lock. * Free the new list and try again. */ while (!LIST_EMPTY(&newrules)) { newlink = LIST_FIRST(&newrules); LIST_REMOVE(newlink, rrl_next); if (newlink->rrl_rule != NULL) rctl_rule_release(newlink->rrl_rule); uma_zfree(rctl_rule_link_zone, newlink); } goto again; } /* * Assign RCTL rules to the newly created process. */ int rctl_proc_fork(struct proc *parent, struct proc *child) { struct rctl_rule *rule; struct rctl_rule_link *link; int error; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent)); LIST_INIT(&child->p_racct->r_rule_links); /* * Go through limits applicable to the parent and assign them * to the child. Rules with 'process' subject have to be duplicated * in order to make their rr_subject point to the new process. */ LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) { rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT); if (rule == NULL) goto fail; KASSERT(rule->rr_subject.rs_proc == parent, ("rule->rr_subject.rs_proc != parent")); rule->rr_subject.rs_proc = child; error = rctl_racct_add_rule_locked(child->p_racct, rule); rctl_rule_release(rule); if (error != 0) goto fail; } else { error = rctl_racct_add_rule_locked(child->p_racct, link->rrl_rule); if (error != 0) goto fail; } } return (0); fail: while (!LIST_EMPTY(&child->p_racct->r_rule_links)) { link = LIST_FIRST(&child->p_racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } return (EAGAIN); } /* * Release rules attached to the racct. */ void rctl_racct_release(struct racct *racct) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); while (!LIST_EMPTY(&racct->r_rule_links)) { link = LIST_FIRST(&racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } } static void rctl_init(void) { if (!racct_enable) return; rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); rctl_rule_link_zone = uma_zcreate("rctl_rule_link", sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Set default values, making sure not to overwrite the ones * fetched from tunables. Most of those could be set at the * declaration, except for the rctl_throttle_max - we cannot * set it there due to hz not being compile time constant. */ if (rctl_throttle_min < 1) rctl_throttle_min = 1; if (rctl_throttle_max < rctl_throttle_min) rctl_throttle_max = 2 * hz; if (rctl_throttle_pct < 0) rctl_throttle_pct = 100; if (rctl_throttle_pct2 < 0) rctl_throttle_pct2 = 100; } #else /* !RCTL */ int sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap) { return (ENOSYS); } int sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) { return (ENOSYS); } int sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) { return (ENOSYS); } int sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) { return (ENOSYS); } int sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) { return (ENOSYS); } #endif /* !RCTL */ Index: head/sys/kern/kern_sig.c =================================================================== --- head/sys/kern/kern_sig.c (revision 364441) +++ head/sys/kern/kern_sig.c (revision 364442) @@ -1,4164 +1,4164 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include -#include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ONSIG 32 /* NSIG for osig* syscalls. XXX. */ SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE3(proc, , , signal__send, "struct thread *", "struct proc *", "int"); SDT_PROBE_DEFINE2(proc, , , signal__clear, "int", "ksiginfo_t *"); SDT_PROBE_DEFINE3(proc, , , signal__discard, "struct thread *", "struct proc *", "int"); static int coredump(struct thread *); static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi); static int issignal(struct thread *td); static void reschedule_signals(struct proc *p, sigset_t block, int flags); static int sigprop(int sig); static void tdsigwakeup(struct thread *, int, sig_t, int); static int sig_suspend_threads(struct thread *, struct proc *, int); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); static struct thread *sigtd(struct proc *p, int sig, bool fast_sigblock); static void sigqueue_start(void); static uma_zone_t ksiginfo_zone = NULL; struct filterops sig_filtops = { .f_isfd = 0, .f_attach = filt_sigattach, .f_detach = filt_sigdetach, .f_event = filt_signal, }; static int kern_logsigexit = 1; SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "Log processes quitting on abnormal signals to syslog(3)"); static int kern_forcesigexit = 1; SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW, &kern_forcesigexit, 0, "Force trap signal to be handled"); static SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "POSIX real time signal"); static int max_pending_per_proc = 128; SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW, &max_pending_per_proc, 0, "Max pending signals per proc"); static int preallocate_siginfo = 1024; SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RDTUN, &preallocate_siginfo, 0, "Preallocated signal memory size"); static int signal_overflow = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD, &signal_overflow, 0, "Number of signals overflew"); static int signal_alloc_fail = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD, &signal_alloc_fail, 0, "signals failed to be allocated"); static int kern_lognosys = 0; SYSCTL_INT(_kern, OID_AUTO, lognosys, CTLFLAG_RWTUN, &kern_lognosys, 0, "Log invalid syscalls"); __read_frequently bool sigfastblock_fetch_always = false; SYSCTL_BOOL(_kern, OID_AUTO, sigfastblock_fetch_always, CTLFLAG_RWTUN, &sigfastblock_fetch_always, 0, "Fetch sigfastblock word on each syscall entry for proper " "blocking semantic"); SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL); /* * Policy -- Can ucred cr1 send SIGIO to process cr2? * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG * in the right situations. */ #define CANSIGIO(cr1, cr2) \ ((cr1)->cr_uid == 0 || \ (cr1)->cr_ruid == (cr2)->cr_ruid || \ (cr1)->cr_uid == (cr2)->cr_ruid || \ (cr1)->cr_ruid == (cr2)->cr_uid || \ (cr1)->cr_uid == (cr2)->cr_uid) static int sugid_coredump; SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN, &sugid_coredump, 0, "Allow setuid and setgid processes to dump core"); static int capmode_coredump; SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN, &capmode_coredump, 0, "Allow processes in capability mode to dump core"); static int do_coredump = 1; SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, &do_coredump, 0, "Enable/Disable coredumps"); static int set_core_nodump_flag = 0; SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag, 0, "Enable setting the NODUMP flag on coredump files"); static int coredump_devctl = 0; SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl, 0, "Generate a devctl notification when processes coredump"); /* * Signal properties and actions. * The array below categorizes the signals and their default actions * according to the following properties: */ #define SIGPROP_KILL 0x01 /* terminates process by default */ #define SIGPROP_CORE 0x02 /* ditto and coredumps */ #define SIGPROP_STOP 0x04 /* suspend process */ #define SIGPROP_TTYSTOP 0x08 /* ditto, from tty */ #define SIGPROP_IGNORE 0x10 /* ignore by default */ #define SIGPROP_CONT 0x20 /* continue if suspended */ #define SIGPROP_CANTMASK 0x40 /* non-maskable, catchable */ static int sigproptbl[NSIG] = { [SIGHUP] = SIGPROP_KILL, [SIGINT] = SIGPROP_KILL, [SIGQUIT] = SIGPROP_KILL | SIGPROP_CORE, [SIGILL] = SIGPROP_KILL | SIGPROP_CORE, [SIGTRAP] = SIGPROP_KILL | SIGPROP_CORE, [SIGABRT] = SIGPROP_KILL | SIGPROP_CORE, [SIGEMT] = SIGPROP_KILL | SIGPROP_CORE, [SIGFPE] = SIGPROP_KILL | SIGPROP_CORE, [SIGKILL] = SIGPROP_KILL, [SIGBUS] = SIGPROP_KILL | SIGPROP_CORE, [SIGSEGV] = SIGPROP_KILL | SIGPROP_CORE, [SIGSYS] = SIGPROP_KILL | SIGPROP_CORE, [SIGPIPE] = SIGPROP_KILL, [SIGALRM] = SIGPROP_KILL, [SIGTERM] = SIGPROP_KILL, [SIGURG] = SIGPROP_IGNORE, [SIGSTOP] = SIGPROP_STOP, [SIGTSTP] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGCONT] = SIGPROP_IGNORE | SIGPROP_CONT, [SIGCHLD] = SIGPROP_IGNORE, [SIGTTIN] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGTTOU] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGIO] = SIGPROP_IGNORE, [SIGXCPU] = SIGPROP_KILL, [SIGXFSZ] = SIGPROP_KILL, [SIGVTALRM] = SIGPROP_KILL, [SIGPROF] = SIGPROP_KILL, [SIGWINCH] = SIGPROP_IGNORE, [SIGINFO] = SIGPROP_IGNORE, [SIGUSR1] = SIGPROP_KILL, [SIGUSR2] = SIGPROP_KILL, }; sigset_t fastblock_mask; static void sigqueue_start(void) { ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_prealloc(ksiginfo_zone, preallocate_siginfo); p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS); p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1); p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc); SIGFILLSET(fastblock_mask); SIG_CANTMASK(fastblock_mask); } ksiginfo_t * ksiginfo_alloc(int wait) { int flags; flags = M_ZERO; if (! wait) flags |= M_NOWAIT; if (ksiginfo_zone != NULL) return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, flags)); return (NULL); } void ksiginfo_free(ksiginfo_t *ksi) { uma_zfree(ksiginfo_zone, ksi); } static __inline int ksiginfo_tryfree(ksiginfo_t *ksi) { if (!(ksi->ksi_flags & KSI_EXT)) { uma_zfree(ksiginfo_zone, ksi); return (1); } return (0); } void sigqueue_init(sigqueue_t *list, struct proc *p) { SIGEMPTYSET(list->sq_signals); SIGEMPTYSET(list->sq_kill); SIGEMPTYSET(list->sq_ptrace); TAILQ_INIT(&list->sq_list); list->sq_proc = p; list->sq_flags = SQ_INIT; } /* * Get a signal's ksiginfo. * Return: * 0 - signal not found * others - signal number */ static int sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi, *next; int count = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (!SIGISMEMBER(sq->sq_signals, signo)) return (0); if (SIGISMEMBER(sq->sq_ptrace, signo)) { count++; SIGDELSET(sq->sq_ptrace, signo); si->ksi_flags |= KSI_PTRACE; } if (SIGISMEMBER(sq->sq_kill, signo)) { count++; if (count == 1) SIGDELSET(sq->sq_kill, signo); } TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (ksi->ksi_signo == signo) { if (count == 0) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; ksiginfo_copy(ksi, si); if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } if (++count > 1) break; } } if (count <= 1) SIGDELSET(sq->sq_signals, signo); si->ksi_signo = signo; return (signo); } void sigqueue_take(ksiginfo_t *ksi) { struct ksiginfo *kp; struct proc *p; sigqueue_t *sq; if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL) return; p = sq->sq_proc; TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (!(ksi->ksi_flags & KSI_EXT) && p != NULL) p->p_pendingcnt--; for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL; kp = TAILQ_NEXT(kp, ksi_link)) { if (kp->ksi_signo == ksi->ksi_signo) break; } if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo) && !SIGISMEMBER(sq->sq_ptrace, ksi->ksi_signo)) SIGDELSET(sq->sq_signals, ksi->ksi_signo); } static int sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi; int ret = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); /* * SIGKILL/SIGSTOP cannot be caught or masked, so take the fast path * for these signals. */ if (signo == SIGKILL || signo == SIGSTOP || si == NULL) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } /* directly insert the ksi, don't copy it */ if (si->ksi_flags & KSI_INS) { if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, si, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link); si->ksi_sigq = sq; goto out_set_bit; } if (__predict_false(ksiginfo_zone == NULL)) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) { signal_overflow++; ret = EAGAIN; } else if ((ksi = ksiginfo_alloc(0)) == NULL) { signal_alloc_fail++; ret = EAGAIN; } else { if (p != NULL) p->p_pendingcnt++; ksiginfo_copy(si, ksi); ksi->ksi_signo = signo; if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, ksi, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = sq; } if (ret != 0) { if ((si->ksi_flags & KSI_PTRACE) != 0) { SIGADDSET(sq->sq_ptrace, signo); ret = 0; goto out_set_bit; } else if ((si->ksi_flags & KSI_TRAP) != 0 || (si->ksi_flags & KSI_SIGQ) == 0) { SIGADDSET(sq->sq_kill, signo); ret = 0; goto out_set_bit; } return (ret); } out_set_bit: SIGADDSET(sq->sq_signals, signo); return (ret); } void sigqueue_flush(sigqueue_t *sq) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (p != NULL) PROC_LOCK_ASSERT(p, MA_OWNED); while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } SIGEMPTYSET(sq->sq_signals); SIGEMPTYSET(sq->sq_kill); SIGEMPTYSET(sq->sq_ptrace); } static void sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, const sigset_t *set) { sigset_t tmp; struct proc *p1, *p2; ksiginfo_t *ksi, *next; KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited")); KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited")); p1 = src->sq_proc; p2 = dst->sq_proc; /* Move siginfo to target list */ TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&src->sq_list, ksi, ksi_link); if (p1 != NULL) p1->p_pendingcnt--; TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link); ksi->ksi_sigq = dst; if (p2 != NULL) p2->p_pendingcnt++; } } /* Move pending bits to target list */ tmp = src->sq_kill; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_kill, tmp); SIGSETNAND(src->sq_kill, tmp); tmp = src->sq_ptrace; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_ptrace, tmp); SIGSETNAND(src->sq_ptrace, tmp); tmp = src->sq_signals; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_signals, tmp); SIGSETNAND(src->sq_signals, tmp); } #if 0 static void sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_move_set(src, dst, &set); } #endif static void sigqueue_delete_set(sigqueue_t *sq, const sigset_t *set) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi, *next; KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited")); /* Remove siginfo queue */ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } } SIGSETNAND(sq->sq_kill, *set); SIGSETNAND(sq->sq_ptrace, *set); SIGSETNAND(sq->sq_signals, *set); } void sigqueue_delete(sigqueue_t *sq, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set(sq, &set); } /* Remove a set of signals for a process */ static void sigqueue_delete_set_proc(struct proc *p, const sigset_t *set) { sigqueue_t worklist; struct thread *td0; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_init(&worklist, NULL); sigqueue_move_set(&p->p_sigqueue, &worklist, set); FOREACH_THREAD_IN_PROC(p, td0) sigqueue_move_set(&td0->td_sigqueue, &worklist, set); sigqueue_flush(&worklist); } void sigqueue_delete_proc(struct proc *p, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set_proc(p, &set); } static void sigqueue_delete_stopmask_proc(struct proc *p) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, SIGSTOP); SIGADDSET(set, SIGTSTP); SIGADDSET(set, SIGTTIN); SIGADDSET(set, SIGTTOU); sigqueue_delete_set_proc(p, &set); } /* * Determine signal that should be delivered to thread td, the current * thread, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). */ int cursig(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } /* * Arrange for ast() to handle unmasked pending signals on return to user * mode. This must be called whenever a signal is added to td_sigqueue or * unmasked in td_sigmask. */ void signotify(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); if (SIGPENDING(td)) { thread_lock(td); td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING; thread_unlock(td); } } /* * Returns 1 (true) if altstack is configured for the thread, and the * passed stack bottom address falls into the altstack range. Handles * the 43 compat special case where the alt stack size is zero. */ int sigonstack(size_t sp) { struct thread *td; td = curthread; if ((td->td_pflags & TDP_ALTSTACK) == 0) return (0); #if defined(COMPAT_43) if (SV_PROC_FLAG(td->td_proc, SV_AOUT) && td->td_sigstk.ss_size == 0) return ((td->td_sigstk.ss_flags & SS_ONSTACK) != 0); #endif return (sp >= (size_t)td->td_sigstk.ss_sp && sp < td->td_sigstk.ss_size + (size_t)td->td_sigstk.ss_sp); } static __inline int sigprop(int sig) { if (sig > 0 && sig < nitems(sigproptbl)) return (sigproptbl[sig]); return (0); } int sig_ffs(sigset_t *set) { int i; for (i = 0; i < _SIG_WORDS; i++) if (set->__bits[i]) return (ffs(set->__bits[i]) + (i * 32)); return (0); } static bool sigact_flag_test(const struct sigaction *act, int flag) { /* * SA_SIGINFO is reset when signal disposition is set to * ignore or default. Other flags are kept according to user * settings. */ return ((act->sa_flags & flag) != 0 && (flag != SA_SIGINFO || ((__sighandler_t *)act->sa_sigaction != SIG_IGN && (__sighandler_t *)act->sa_sigaction != SIG_DFL))); } /* * kern_sigaction * sigaction * freebsd4_sigaction * osigaction */ int kern_sigaction(struct thread *td, int sig, const struct sigaction *act, struct sigaction *oact, int flags) { struct sigacts *ps; struct proc *p = td->td_proc; if (!_SIG_VALID(sig)) return (EINVAL); if (act != NULL && act->sa_handler != SIG_DFL && act->sa_handler != SIG_IGN && (act->sa_flags & ~(SA_ONSTACK | SA_RESTART | SA_RESETHAND | SA_NOCLDSTOP | SA_NODEFER | SA_NOCLDWAIT | SA_SIGINFO)) != 0) return (EINVAL); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if (oact) { memset(oact, 0, sizeof(*oact)); oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (SIGISMEMBER(ps->ps_sigonstack, sig)) oact->sa_flags |= SA_ONSTACK; if (!SIGISMEMBER(ps->ps_sigintr, sig)) oact->sa_flags |= SA_RESTART; if (SIGISMEMBER(ps->ps_sigreset, sig)) oact->sa_flags |= SA_RESETHAND; if (SIGISMEMBER(ps->ps_signodefer, sig)) oact->sa_flags |= SA_NODEFER; if (SIGISMEMBER(ps->ps_siginfo, sig)) { oact->sa_flags |= SA_SIGINFO; oact->sa_sigaction = (__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)]; } else oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)]; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP) oact->sa_flags |= SA_NOCLDSTOP; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT) oact->sa_flags |= SA_NOCLDWAIT; } if (act) { if ((sig == SIGKILL || sig == SIGSTOP) && act->sa_handler != SIG_DFL) { mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (EINVAL); } /* * Change setting atomically. */ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); if (sigact_flag_test(act, SA_SIGINFO)) { ps->ps_sigact[_SIG_IDX(sig)] = (__sighandler_t *)act->sa_sigaction; SIGADDSET(ps->ps_siginfo, sig); } else { ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; SIGDELSET(ps->ps_siginfo, sig); } if (!sigact_flag_test(act, SA_RESTART)) SIGADDSET(ps->ps_sigintr, sig); else SIGDELSET(ps->ps_sigintr, sig); if (sigact_flag_test(act, SA_ONSTACK)) SIGADDSET(ps->ps_sigonstack, sig); else SIGDELSET(ps->ps_sigonstack, sig); if (sigact_flag_test(act, SA_RESETHAND)) SIGADDSET(ps->ps_sigreset, sig); else SIGDELSET(ps->ps_sigreset, sig); if (sigact_flag_test(act, SA_NODEFER)) SIGADDSET(ps->ps_signodefer, sig); else SIGDELSET(ps->ps_signodefer, sig); if (sig == SIGCHLD) { if (act->sa_flags & SA_NOCLDSTOP) ps->ps_flag |= PS_NOCLDSTOP; else ps->ps_flag &= ~PS_NOCLDSTOP; if (act->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented * by reparenting the dying child to PID 1 (and * trust it to reap the zombie), PID 1 itself * is forbidden to set SA_NOCLDWAIT. */ if (p->p_pid == 1) ps->ps_flag &= ~PS_NOCLDWAIT; else ps->ps_flag |= PS_NOCLDWAIT; } else ps->ps_flag &= ~PS_NOCLDWAIT; if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_flag |= PS_CLDSIGIGN; else ps->ps_flag &= ~PS_CLDSIGIGN; } /* * Set bit in ps_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in ps_sigignore, as we * have to restart the process. */ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || (sigprop(sig) & SIGPROP_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { /* never to be seen again */ sigqueue_delete_proc(p, sig); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); SIGDELSET(ps->ps_sigcatch, sig); } else { SIGDELSET(ps->ps_sigignore, sig); if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) SIGDELSET(ps->ps_sigcatch, sig); else SIGADDSET(ps->ps_sigcatch, sig); } #ifdef COMPAT_FREEBSD4 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_FREEBSD4) == 0) SIGDELSET(ps->ps_freebsd4, sig); else SIGADDSET(ps->ps_freebsd4, sig); #endif #ifdef COMPAT_43 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_OSIGSET) == 0) SIGDELSET(ps->ps_osigset, sig); else SIGADDSET(ps->ps_osigset, sig); #endif } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int sys_sigaction(struct thread *td, struct sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, 0); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #ifdef COMPAT_FREEBSD4 #ifndef _SYS_SYSPROTO_H_ struct freebsd4_sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int freebsd4_sigaction(struct thread *td, struct freebsd4_sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #endif /* COMAPT_FREEBSD4 */ #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigaction_args { int signum; struct osigaction *nsa; struct osigaction *osa; }; #endif int osigaction(struct thread *td, struct osigaction_args *uap) { struct osigaction sa; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsa != NULL) ? &nsa : NULL; osap = (uap->osa != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsa, &sa, sizeof(sa)); if (error) return (error); nsap->sa_handler = sa.sa_handler; nsap->sa_flags = sa.sa_flags; OSIG2SIG(sa.sa_mask, nsap->sa_mask); } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { sa.sa_handler = osap->sa_handler; sa.sa_flags = osap->sa_flags; SIG2OSIG(osap->sa_mask, sa.sa_mask); error = copyout(&sa, uap->osa, sizeof(sa)); } return (error); } #if !defined(__i386__) /* Avoid replicating the same stub everywhere */ int osigreturn(struct thread *td, struct osigreturn_args *uap) { return (nosys(td, (struct nosys_args *)uap)); } #endif #endif /* COMPAT_43 */ /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(struct proc *p) { int i; struct sigacts *ps; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); for (i = 1; i <= NSIG; i++) { if (sigprop(i) & SIGPROP_IGNORE && i != SIGCONT) { SIGADDSET(ps->ps_sigignore, i); } } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); } /* * Reset specified signal to the default disposition. */ static void sigdflt(struct sigacts *ps, int sig) { mtx_assert(&ps->ps_mtx, MA_OWNED); SIGDELSET(ps->ps_sigcatch, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0 && sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; SIGDELSET(ps->ps_siginfo, sig); } /* * Reset signals for an exec of the specified process. */ void execsigs(struct proc *p) { sigset_t osigignore; struct sigacts *ps; int sig; struct thread *td; /* * Reset caught signals. Held signals remain held * through td_sigmask (unless they were caught, * and are now ignored by default). */ PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sig_drop_caught(p); /* * As CloudABI processes cannot modify signal handlers, fully * reset all signals to their default behavior. Do ignore * SIGPIPE, as it would otherwise be impossible to recover from * writes to broken pipes and sockets. */ if (SV_PROC_ABI(p) == SV_ABI_CLOUDABI) { osigignore = ps->ps_sigignore; while (SIGNOTEMPTY(osigignore)) { sig = sig_ffs(&osigignore); SIGDELSET(osigignore, sig); if (sig != SIGPIPE) sigdflt(ps, sig); } SIGADDSET(ps->ps_sigignore, SIGPIPE); } /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ td = curthread; MPASS(td->td_proc == p); td->td_sigstk.ss_flags = SS_DISABLE; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_sp = 0; td->td_pflags &= ~TDP_ALTSTACK; /* * Reset no zombies if child dies flag as Solaris does. */ ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN); if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; mtx_unlock(&ps->ps_mtx); } /* * kern_sigprocmask() * * Manipulate signal mask. */ int kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset, int flags) { sigset_t new_block, oset1; struct proc *p; int error; p = td->td_proc; if ((flags & SIGPROCMASK_PROC_LOCKED) != 0) PROC_LOCK_ASSERT(p, MA_OWNED); else PROC_LOCK(p); mtx_assert(&p->p_sigacts->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0 ? MA_OWNED : MA_NOTOWNED); if (oset != NULL) *oset = td->td_sigmask; error = 0; if (set != NULL) { switch (how) { case SIG_BLOCK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; SIGSETOR(td->td_sigmask, *set); new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); break; case SIG_UNBLOCK: SIGSETNAND(td->td_sigmask, *set); signotify(td); goto out; case SIG_SETMASK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; if (flags & SIGPROCMASK_OLD) SIGSETLO(td->td_sigmask, *set); else td->td_sigmask = *set; new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); signotify(td); break; default: error = EINVAL; goto out; } /* * The new_block set contains signals that were not previously * blocked, but are blocked now. * * In case we block any signal that was not previously blocked * for td, and process has the signal pending, try to schedule * signal delivery to some thread that does not block the * signal, possibly waking it up. */ if (p->p_numthreads != 1) reschedule_signals(p, new_block, flags); } out: if (!(flags & SIGPROCMASK_PROC_LOCKED)) PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigprocmask_args { int how; const sigset_t *set; sigset_t *oset; }; #endif int sys_sigprocmask(struct thread *td, struct sigprocmask_args *uap) { sigset_t set, oset; sigset_t *setp, *osetp; int error; setp = (uap->set != NULL) ? &set : NULL; osetp = (uap->oset != NULL) ? &oset : NULL; if (setp) { error = copyin(uap->set, setp, sizeof(set)); if (error) return (error); } error = kern_sigprocmask(td, uap->how, setp, osetp, 0); if (osetp && !error) { error = copyout(osetp, uap->oset, sizeof(oset)); } return (error); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigprocmask_args { int how; osigset_t mask; }; #endif int osigprocmask(struct thread *td, struct osigprocmask_args *uap) { sigset_t set, oset; int error; OSIG2SIG(uap->mask, set); error = kern_sigprocmask(td, uap->how, &set, &oset, 1); SIG2OSIG(oset, td->td_retval[0]); return (error); } #endif /* COMPAT_43 */ int sys_sigwait(struct thread *td, struct sigwait_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) { td->td_retval[0] = error; return (0); } error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) { if (error == EINTR && td->td_proc->p_osrel < P_OSREL_SIGWAIT) error = ERESTART; if (error == ERESTART) return (error); td->td_retval[0] = error; return (0); } error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo)); td->td_retval[0] = error; return (0); } int sys_sigtimedwait(struct thread *td, struct sigtimedwait_args *uap) { struct timespec ts; struct timespec *timeout; sigset_t set; ksiginfo_t ksi; int error; if (uap->timeout) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); timeout = &ts; } else timeout = NULL; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, timeout); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } int sys_sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } static void proc_td_siginfo_capture(struct thread *td, siginfo_t *si) { struct thread *thr; FOREACH_THREAD_IN_PROC(td->td_proc, thr) { if (thr == td) thr->td_si = *si; else thr->td_si.si_signo = 0; } } int kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi, struct timespec *timeout) { struct sigacts *ps; sigset_t saved_mask, new_block; struct proc *p; int error, sig, timo, timevalid = 0; struct timespec rts, ets, ts; struct timeval tv; bool traced; p = td->td_proc; error = 0; ets.tv_sec = 0; ets.tv_nsec = 0; traced = false; if (timeout != NULL) { if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) { timevalid = 1; getnanouptime(&rts); timespecadd(&rts, timeout, &ets); } } ksiginfo_init(ksi); /* Some signals can not be waited for. */ SIG_CANTMASK(waitset); ps = p->p_sigacts; PROC_LOCK(p); saved_mask = td->td_sigmask; SIGSETNAND(td->td_sigmask, waitset); for (;;) { mtx_lock(&ps->ps_mtx); sig = cursig(td); mtx_unlock(&ps->ps_mtx); KASSERT(sig >= 0, ("sig %d", sig)); if (sig != 0 && SIGISMEMBER(waitset, sig)) { if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 || sigqueue_get(&p->p_sigqueue, sig, ksi) != 0) { error = 0; break; } } if (error != 0) break; /* * POSIX says this must be checked after looking for pending * signals. */ if (timeout != NULL) { if (!timevalid) { error = EINVAL; break; } getnanouptime(&rts); if (timespeccmp(&rts, &ets, >=)) { error = EAGAIN; break; } timespecsub(&ets, &rts, &ts); TIMESPEC_TO_TIMEVAL(&tv, &ts); timo = tvtohz(&tv); } else { timo = 0; } if (traced) { error = EINTR; break; } error = msleep(ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", timo); if (timeout != NULL) { if (error == ERESTART) { /* Timeout can not be restarted. */ error = EINTR; } else if (error == EAGAIN) { /* We will calculate timeout by ourself. */ error = 0; } } /* * If PTRACE_SCE or PTRACE_SCX were set after * userspace entered the syscall, return spurious * EINTR after wait was done. Only do this as last * resort after rechecking for possible queued signals * and expired timeouts. */ if (error == 0 && (p->p_ptevents & PTRACE_SYSCALL) != 0) traced = true; } new_block = saved_mask; SIGSETNAND(new_block, td->td_sigmask); td->td_sigmask = saved_mask; /* * Fewer signals can be delivered to us, reschedule signal * notification. */ if (p->p_numthreads != 1) reschedule_signals(p, new_block, 0); if (error == 0) { SDT_PROBE2(proc, , , signal__clear, sig, ksi); if (ksi->ksi_code == SI_TIMER) itimer_accept(p, ksi->ksi_timerid, ksi); #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) { sig_t action; mtx_lock(&ps->ps_mtx); action = ps->ps_sigact[_SIG_IDX(sig)]; mtx_unlock(&ps->ps_mtx); ktrpsig(sig, action, &td->td_sigmask, ksi->ksi_code); } #endif if (sig == SIGKILL) { proc_td_siginfo_capture(td, &ksi->ksi_info); sigexit(td, sig); } } PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigpending_args { sigset_t *set; }; #endif int sys_sigpending(struct thread *td, struct sigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); return (copyout(&pending, uap->set, sizeof(sigset_t))); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigpending_args { int dummy; }; #endif int osigpending(struct thread *td, struct osigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); SIG2OSIG(pending, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) /* * Generalized interface signal handler, 4.3-compatible. */ #ifndef _SYS_SYSPROTO_H_ struct osigvec_args { int signum; struct sigvec *nsv; struct sigvec *osv; }; #endif /* ARGSUSED */ int osigvec(struct thread *td, struct osigvec_args *uap) { struct sigvec vec; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsv != NULL) ? &nsa : NULL; osap = (uap->osv != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsv, &vec, sizeof(vec)); if (error) return (error); nsap->sa_handler = vec.sv_handler; OSIG2SIG(vec.sv_mask, nsap->sa_mask); nsap->sa_flags = vec.sv_flags; nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { vec.sv_handler = osap->sa_handler; SIG2OSIG(osap->sa_mask, vec.sv_mask); vec.sv_flags = osap->sa_flags; vec.sv_flags &= ~SA_NOCLDWAIT; vec.sv_flags ^= SA_RESTART; error = copyout(&vec, uap->osv, sizeof(vec)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct osigblock_args { int mask; }; #endif int osigblock(struct thread *td, struct osigblock_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_BLOCK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #ifndef _SYS_SYSPROTO_H_ struct osigsetmask_args { int mask; }; #endif int osigsetmask(struct thread *td, struct osigsetmask_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_SETMASK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ /* * Suspend calling thread until signal, providing mask to be set in the * meantime. */ #ifndef _SYS_SYSPROTO_H_ struct sigsuspend_args { const sigset_t *sigmask; }; #endif /* ARGSUSED */ int sys_sigsuspend(struct thread *td, struct sigsuspend_args *uap) { sigset_t mask; int error; error = copyin(uap->sigmask, &mask, sizeof(mask)); if (error) return (error); return (kern_sigsuspend(td, mask)); } int kern_sigsuspend(struct thread *td, sigset_t mask) { struct proc *p = td->td_proc; int has_sig, sig; /* * When returning from sigsuspend, we want * the old mask to be restored after the * signal handler has finished. Thus, we * save it here and mark the sigacts structure * to indicate this. */ PROC_LOCK(p); kern_sigprocmask(td, SIG_SETMASK, &mask, &td->td_oldsigmask, SIGPROCMASK_PROC_LOCKED); td->td_pflags |= TDP_OLDMASK; /* * Process signals now. Otherwise, we can get spurious wakeup * due to signal entered process queue, but delivered to other * thread. But sigsuspend should return only on signal * delivery. */ (p->p_sysent->sv_set_syscall_retval)(td, EINTR); for (has_sig = 0; !has_sig;) { while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0) /* void */; thread_suspend_check(0); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { KASSERT(sig >= 0, ("sig %d", sig)); has_sig += postsig(sig); } mtx_unlock(&p->p_sigacts->ps_mtx); /* * If PTRACE_SCE or PTRACE_SCX were set after * userspace entered the syscall, return spurious * EINTR. */ if ((p->p_ptevents & PTRACE_SYSCALL) != 0) has_sig += 1; } PROC_UNLOCK(p); td->td_errno = EINTR; td->td_pflags |= TDP_NERRNO; return (EJUSTRETURN); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ /* * Compatibility sigsuspend call for old binaries. Note nonstandard calling * convention: libc stub passes mask, not pointer, to save a copyin. */ #ifndef _SYS_SYSPROTO_H_ struct osigsuspend_args { osigset_t mask; }; #endif /* ARGSUSED */ int osigsuspend(struct thread *td, struct osigsuspend_args *uap) { sigset_t mask; OSIG2SIG(uap->mask, mask); return (kern_sigsuspend(td, mask)); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osigstack_args { struct sigstack *nss; struct sigstack *oss; }; #endif /* ARGSUSED */ int osigstack(struct thread *td, struct osigstack_args *uap) { struct sigstack nss, oss; int error = 0; if (uap->nss != NULL) { error = copyin(uap->nss, &nss, sizeof(nss)); if (error) return (error); } oss.ss_sp = td->td_sigstk.ss_sp; oss.ss_onstack = sigonstack(cpu_getstack(td)); if (uap->nss != NULL) { td->td_sigstk.ss_sp = nss.ss_sp; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK; td->td_pflags |= TDP_ALTSTACK; } if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(oss)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigaltstack_args { stack_t *ss; stack_t *oss; }; #endif /* ARGSUSED */ int sys_sigaltstack(struct thread *td, struct sigaltstack_args *uap) { stack_t ss, oss; int error; if (uap->ss != NULL) { error = copyin(uap->ss, &ss, sizeof(ss)); if (error) return (error); } error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL, (uap->oss != NULL) ? &oss : NULL); if (error) return (error); if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(stack_t)); return (error); } int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss) { struct proc *p = td->td_proc; int oonstack; oonstack = sigonstack(cpu_getstack(td)); if (oss != NULL) { *oss = td->td_sigstk; oss->ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; } if (ss != NULL) { if (oonstack) return (EPERM); if ((ss->ss_flags & ~SS_DISABLE) != 0) return (EINVAL); if (!(ss->ss_flags & SS_DISABLE)) { if (ss->ss_size < p->p_sysent->sv_minsigstksz) return (ENOMEM); td->td_sigstk = *ss; td->td_pflags |= TDP_ALTSTACK; } else { td->td_pflags &= ~TDP_ALTSTACK; } } return (0); } struct killpg1_ctx { struct thread *td; ksiginfo_t *ksi; int sig; bool sent; bool found; int ret; }; static void killpg1_sendsig(struct proc *p, bool notself, struct killpg1_ctx *arg) { int err; if (p->p_pid <= 1 || (p->p_flag & P_SYSTEM) != 0 || (notself && p == arg->td->td_proc) || p->p_state == PRS_NEW) return; PROC_LOCK(p); err = p_cansignal(arg->td, p, arg->sig); if (err == 0 && arg->sig != 0) pksignal(p, arg->sig, arg->ksi); PROC_UNLOCK(p); if (err != ESRCH) arg->found = true; if (err == 0) arg->sent = true; else if (arg->ret == 0 && err != ESRCH && err != EPERM) arg->ret = err; } /* * Common code for kill process group/broadcast kill. * cp is calling process. */ static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi) { struct proc *p; struct pgrp *pgrp; struct killpg1_ctx arg; arg.td = td; arg.ksi = ksi; arg.sig = sig; arg.sent = false; arg.found = false; arg.ret = 0; if (all) { /* * broadcast */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { killpg1_sendsig(p, true, &arg); } sx_sunlock(&allproc_lock); } else { sx_slock(&proctree_lock); if (pgid == 0) { /* * zero pgid means send to my process group. */ pgrp = td->td_proc->p_pgrp; PGRP_LOCK(pgrp); } else { pgrp = pgfind(pgid); if (pgrp == NULL) { sx_sunlock(&proctree_lock); return (ESRCH); } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { killpg1_sendsig(p, false, &arg); } PGRP_UNLOCK(pgrp); } MPASS(arg.ret != 0 || arg.found || !arg.sent); if (arg.ret == 0 && !arg.sent) arg.ret = arg.found ? EPERM : ESRCH; return (arg.ret); } #ifndef _SYS_SYSPROTO_H_ struct kill_args { int pid; int signum; }; #endif /* ARGSUSED */ int sys_kill(struct thread *td, struct kill_args *uap) { return (kern_kill(td, uap->pid, uap->signum)); } int kern_kill(struct thread *td, pid_t pid, int signum) { ksiginfo_t ksi; struct proc *p; int error; /* * A process in capability mode can send signals only to himself. * The main rationale behind this is that abort(3) is implemented as * kill(getpid(), SIGABRT). */ if (IN_CAPABILITY_MODE(td) && pid != td->td_proc->p_pid) return (ECAPMODE); AUDIT_ARG_SIGNUM(signum); AUDIT_ARG_PID(pid); if ((u_int)signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; if (pid > 0) { /* kill single process */ if ((p = pfind_any(pid)) == NULL) return (ESRCH); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, signum); if (error == 0 && signum) pksignal(p, signum, &ksi); PROC_UNLOCK(p); return (error); } switch (pid) { case -1: /* broadcast signal */ return (killpg1(td, signum, 0, 1, &ksi)); case 0: /* signal own process group */ return (killpg1(td, signum, 0, 0, &ksi)); default: /* negative explicit process group */ return (killpg1(td, signum, -pid, 0, &ksi)); } /* NOTREACHED */ } int sys_pdkill(struct thread *td, struct pdkill_args *uap) { struct proc *p; int error; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_FD(uap->fd); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); error = procdesc_find(td, uap->fd, &cap_pdkill_rights, &p); if (error) return (error); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) kern_psignal(p, uap->signum); PROC_UNLOCK(p); return (error); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct okillpg_args { int pgid; int signum; }; #endif /* ARGSUSED */ int okillpg(struct thread *td, struct okillpg_args *uap) { ksiginfo_t ksi; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_PID(uap->pgid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; return (killpg1(td, uap->signum, uap->pgid, 0, &ksi)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigqueue_args { pid_t pid; int signum; /* union sigval */ void *value; }; #endif int sys_sigqueue(struct thread *td, struct sigqueue_args *uap) { union sigval sv; sv.sival_ptr = uap->value; return (kern_sigqueue(td, uap->pid, uap->signum, &sv)); } int kern_sigqueue(struct thread *td, pid_t pid, int signum, union sigval *value) { ksiginfo_t ksi; struct proc *p; int error; if ((u_int)signum > _SIG_MAXSIG) return (EINVAL); /* * Specification says sigqueue can only send signal to * single process. */ if (pid <= 0) return (EINVAL); if ((p = pfind_any(pid)) == NULL) return (ESRCH); error = p_cansignal(td, p, signum); if (error == 0 && signum != 0) { ksiginfo_init(&ksi); ksi.ksi_flags = KSI_SIGQ; ksi.ksi_signo = signum; ksi.ksi_code = SI_QUEUE; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; ksi.ksi_value = *value; error = pksignal(p, ksi.ksi_signo, &ksi); } PROC_UNLOCK(p); return (error); } /* * Send a signal to a process group. */ void gsignal(int pgid, int sig, ksiginfo_t *ksi) { struct pgrp *pgrp; if (pgid != 0) { sx_slock(&proctree_lock); pgrp = pgfind(pgid); sx_sunlock(&proctree_lock); if (pgrp != NULL) { pgsignal(pgrp, sig, 0, ksi); PGRP_UNLOCK(pgrp); } } } /* * Send a signal to a process group. If checktty is 1, * limit to members which have a controlling terminal. */ void pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi) { struct proc *p; if (pgrp) { PGRP_LOCK_ASSERT(pgrp, MA_OWNED); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && (checkctty == 0 || p->p_flag & P_CONTROLT)) pksignal(p, sig, ksi); PROC_UNLOCK(p); } } } /* * Recalculate the signal mask and reset the signal disposition after * usermode frame for delivery is formed. Should be called after * mach-specific routine, because sysent->sv_sendsig() needs correct * ps_siginfo and signal mask. */ static void postsig_done(int sig, struct thread *td, struct sigacts *ps) { sigset_t mask; mtx_assert(&ps->ps_mtx, MA_OWNED); td->td_ru.ru_nsignals++; mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(mask, sig); kern_sigprocmask(td, SIG_BLOCK, &mask, NULL, SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED); if (SIGISMEMBER(ps->ps_sigreset, sig)) sigdflt(ps, sig); } /* * Send a signal caused by a trap to the current thread. If it will be * caught immediately, deliver it with correct code. Otherwise, post it * normally. */ void trapsignal(struct thread *td, ksiginfo_t *ksi) { struct sigacts *ps; struct proc *p; sigset_t sigmask; int code, sig; p = td->td_proc; sig = ksi->ksi_signo; code = ksi->ksi_code; KASSERT(_SIG_VALID(sig), ("invalid signal")); sigfastblock_fetch(td); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sigmask = td->td_sigmask; if (td->td_sigblock_val != 0) SIGSETOR(sigmask, fastblock_mask); if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(sigmask, sig)) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_PSIG)) ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], &td->td_sigmask, code); #endif (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); postsig_done(sig, td, ps); mtx_unlock(&ps->ps_mtx); } else { /* * Avoid a possible infinite loop if the thread * masking the signal or process is ignoring the * signal. */ if (kern_forcesigexit && (SIGISMEMBER(sigmask, sig) || ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) { SIGDELSET(td->td_sigmask, sig); SIGDELSET(ps->ps_sigcatch, sig); SIGDELSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; td->td_pflags &= ~TDP_SIGFASTBLOCK; td->td_sigblock_val = 0; } mtx_unlock(&ps->ps_mtx); p->p_sig = sig; /* XXX to verify code */ tdsendsignal(p, td, sig, ksi); } PROC_UNLOCK(p); } static struct thread * sigtd(struct proc *p, int sig, bool fast_sigblock) { struct thread *td, *signal_td; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS(!fast_sigblock || p == curproc); /* * Check if current thread can handle the signal without * switching context to another thread. */ if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig) && (!fast_sigblock || curthread->td_sigblock_val == 0)) return (curthread); signal_td = NULL; FOREACH_THREAD_IN_PROC(p, td) { if (!SIGISMEMBER(td->td_sigmask, sig) && (!fast_sigblock || td != curthread || td->td_sigblock_val == 0)) { signal_td = td; break; } } if (signal_td == NULL) signal_td = FIRST_THREAD_IN_PROC(p); return (signal_td); } /* * Send the signal to the process. If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it. * o SIGCONT restarts stopped processes (or puts them back to sleep) * regardless of the signal action (eg, blocked or ignored). * * Other ignored signals are discarded immediately. * * NB: This function may be entered from the debugger via the "kill" DDB * command. There is little that can be done to mitigate the possibly messy * side effects of this unwise possibility. */ void kern_psignal(struct proc *p, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(p, NULL, sig, &ksi); } int pksignal(struct proc *p, int sig, ksiginfo_t *ksi) { return (tdsendsignal(p, NULL, sig, ksi)); } /* Utility function for finding a thread to send signal event to. */ int sigev_findtd(struct proc *p ,struct sigevent *sigev, struct thread **ttd) { struct thread *td; if (sigev->sigev_notify == SIGEV_THREAD_ID) { td = tdfind(sigev->sigev_notify_thread_id, p->p_pid); if (td == NULL) return (ESRCH); *ttd = td; } else { *ttd = NULL; PROC_LOCK(p); } return (0); } void tdsignal(struct thread *td, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(td->td_proc, td, sig, &ksi); } void tdksignal(struct thread *td, int sig, ksiginfo_t *ksi) { (void) tdsendsignal(td->td_proc, td, sig, ksi); } int tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) { sig_t action; sigqueue_t *sigqueue; int prop; struct sigacts *ps; int intrval; int ret = 0; int wakeup_swapper; MPASS(td == NULL || p == td->td_proc); PROC_LOCK_ASSERT(p, MA_OWNED); if (!_SIG_VALID(sig)) panic("%s(): invalid signal %d", __func__, sig); KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("%s: ksi on queue", __func__)); /* * IEEE Std 1003.1-2001: return success when killing a zombie. */ if (p->p_state == PRS_ZOMBIE) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } ps = p->p_sigacts; KNOTE_LOCKED(p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); if (td == NULL) { td = sigtd(p, sig, false); sigqueue = &p->p_sigqueue; } else sigqueue = &td->td_sigqueue; SDT_PROBE3(proc, , , signal__send, td, p, sig); /* * If the signal is being ignored, * then we forget about it immediately. * (Note: we don't set SIGCONT in ps_sigignore, * and if it is set to SIG_IGN, * action will be SIG_DFL here.) */ mtx_lock(&ps->ps_mtx); if (SIGISMEMBER(ps->ps_sigignore, sig)) { SDT_PROBE3(proc, , , signal__discard, td, p, sig); mtx_unlock(&ps->ps_mtx); if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } if (SIGISMEMBER(td->td_sigmask, sig)) action = SIG_HOLD; else if (SIGISMEMBER(ps->ps_sigcatch, sig)) action = SIG_CATCH; else action = SIG_DFL; if (SIGISMEMBER(ps->ps_sigintr, sig)) intrval = EINTR; else intrval = ERESTART; mtx_unlock(&ps->ps_mtx); if (prop & SIGPROP_CONT) sigqueue_delete_stopmask_proc(p); else if (prop & SIGPROP_STOP) { /* * If sending a tty stop signal to a member of an orphaned * process group, discard the signal here if the action * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ if ((prop & SIGPROP_TTYSTOP) && (p->p_pgrp->pg_jobc == 0) && (action == SIG_DFL)) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } sigqueue_delete_proc(p, SIGCONT); if (p->p_flag & P_CONTINUED) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); } } ret = sigqueue_add(sigqueue, sig, ksi); if (ret != 0) return (ret); signotify(td); /* * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. */ if (action == SIG_HOLD && !((prop & SIGPROP_CONT) && (p->p_flag & P_STOPPED_SIG))) return (ret); wakeup_swapper = 0; /* * Some signals have a process-wide effect and a per-thread * component. Most processing occurs when the process next * tries to cross the user boundary, however there are some * times when processing needs to be done immediately, such as * waking up threads so that they can cross the user boundary. * We try to do the per-process part here. */ if (P_SHOULDSTOP(p)) { KASSERT(!(p->p_flag & P_WEXIT), ("signal to stopped but exiting process")); if (sig == SIGKILL) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * SIGKILL sets process running. * It will die elsewhere. * All threads must be restarted. */ p->p_flag &= ~P_STOPPED_SIG; goto runfast; } if (prop & SIGPROP_CONT) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * If SIGCONT is default (or ignored), we continue the * process but don't leave the signal in sigqueue as * it has no further action. If SIGCONT is held, we * continue the process and leave the signal in * sigqueue. If the process catches SIGCONT, let it * handle the signal itself. If it isn't waiting on * an event, it goes back to run state. * Otherwise, process goes back to sleep state. */ p->p_flag &= ~P_STOPPED_SIG; PROC_SLOCK(p); if (p->p_numthreads == p->p_suspcount) { PROC_SUNLOCK(p); p->p_flag |= P_CONTINUED; p->p_xsig = SIGCONT; PROC_LOCK(p->p_pptr); childproc_continued(p); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } if (action == SIG_DFL) { thread_unsuspend(p); PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); goto out; } if (action == SIG_CATCH) { /* * The process wants to catch it so it needs * to run at least one thread, but which one? */ PROC_SUNLOCK(p); goto runfast; } /* * The signal is not ignored or caught. */ thread_unsuspend(p); PROC_SUNLOCK(p); goto out; } if (prop & SIGPROP_STOP) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * Already stopped, don't need to stop again * (If we did the shell could get confused). * Just make sure the signal STOP bit set. */ p->p_flag |= P_STOPPED_SIG; sigqueue_delete(sigqueue, sig); goto out; } /* * All other kinds of signals: * If a thread is sleeping interruptibly, simulate a * wakeup so that when it is continued it will be made * runnable and can look at the signal. However, don't make * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check(). */ PROC_SLOCK(p); thread_lock(td); if (TD_CAN_ABORT(td)) wakeup_swapper = sleepq_abort(td, intrval); else thread_unlock(td); PROC_SUNLOCK(p); goto out; /* * Mutexes are short lived. Threads waiting on them will * hit thread_suspend_check() soon. */ } else if (p->p_state == PRS_NORMAL) { if (p->p_flag & P_TRACED || action == SIG_CATCH) { tdsigwakeup(td, sig, action, intrval); goto out; } MPASS(action == SIG_DFL); if (prop & SIGPROP_STOP) { if (p->p_flag & (P_PPWAIT|P_WEXIT)) goto out; p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); wakeup_swapper = sig_suspend_threads(td, p, 1); if (p->p_numthreads == p->p_suspcount) { /* * only thread sending signal to another * process can reach here, if thread is sending * signal to its process, because thread does * not suspend itself here, p_numthreads * should never be equal to p_suspcount. */ thread_stopped(p); PROC_SUNLOCK(p); sigqueue_delete_proc(p, p->p_xsig); } else PROC_SUNLOCK(p); goto out; } } else { /* Not in "NORMAL" state. discard the signal. */ sigqueue_delete(sigqueue, sig); goto out; } /* * The process is not stopped so we need to apply the signal to all the * running threads. */ runfast: tdsigwakeup(td, sig, action, intrval); PROC_SLOCK(p); thread_unsuspend(p); PROC_SUNLOCK(p); out: /* If we jump here, proc slock should not be owned. */ PROC_SLOCK_ASSERT(p, MA_NOTOWNED); if (wakeup_swapper) kick_proc0(); return (ret); } /* * The force of a signal has been directed against a single * thread. We need to see what we can do about knocking it * out of any sleep it may be in etc. */ static void tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval) { struct proc *p = td->td_proc; int prop, wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); prop = sigprop(sig); PROC_SLOCK(p); thread_lock(td); /* * Bring the priority of a thread up if we want it to get * killed in this lifetime. Be careful to avoid bumping the * priority of the idle thread, since we still allow to signal * kernel processes. */ if (action == SIG_DFL && (prop & SIGPROP_KILL) != 0 && td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); if (TD_ON_SLEEPQ(td)) { /* * If thread is sleeping uninterruptibly * we can't interrupt the sleep... the signal will * be noticed when the process returns through * trap() or syscall(). */ if ((td->td_flags & TDF_SINTR) == 0) goto out; /* * If SIGCONT is default (or ignored) and process is * asleep, we are finished; the process should not * be awakened. */ if ((prop & SIGPROP_CONT) && action == SIG_DFL) { thread_unlock(td); PROC_SUNLOCK(p); sigqueue_delete(&p->p_sigqueue, sig); /* * It may be on either list in this state. * Remove from both for now. */ sigqueue_delete(&td->td_sigqueue, sig); return; } /* * Don't awaken a sleeping thread for SIGSTOP if the * STOP signal is deferred. */ if ((prop & SIGPROP_STOP) != 0 && (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) goto out; /* * Give low priority threads a better chance to run. */ if (td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); wakeup_swapper = sleepq_abort(td, intrval); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); return; } /* * Other states do nothing with the signal immediately, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon. */ #ifdef SMP if (TD_IS_RUNNING(td) && td != curthread) forward_signal(td); #endif out: PROC_SUNLOCK(p); thread_unlock(td); } static int sig_suspend_threads(struct thread *td, struct proc *p, int sending) { struct thread *td2; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); MPASS(sending || td == curthread); wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { thread_lock(td2); td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_SBDRY) { /* * Once a thread is asleep with * TDF_SBDRY and without TDF_SERESTART * or TDF_SEINTR set, it should never * become suspended due to this check. */ KASSERT(!TD_IS_SUSPENDED(td2), ("thread with deferred stops suspended")); if (TD_SBDRY_INTR(td2)) { wakeup_swapper |= sleepq_abort(td2, TD_SBDRY_ERRNO(td2)); continue; } } else if (!TD_IS_SUSPENDED(td2)) thread_suspend_one(td2); } else if (!TD_IS_SUSPENDED(td2)) { if (sending || td != td2) td2->td_flags |= TDF_ASTPENDING; #ifdef SMP if (TD_IS_RUNNING(td2) && td2 != td) forward_signal(td2); #endif } thread_unlock(td2); } return (wakeup_swapper); } /* * Stop the process for an event deemed interesting to the debugger. If si is * non-NULL, this is a signal exchange; the new signal requested by the * debugger will be returned for handling. If si is NULL, this is some other * type of interesting event. The debugger may request a signal be delivered in * that case as well, however it will be deferred until it can be handled. */ int ptracestop(struct thread *td, int sig, ksiginfo_t *si) { struct proc *p = td->td_proc; struct thread *td2; ksiginfo_t ksi; PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!(p->p_flag & P_WEXIT), ("Stopping exiting process")); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Stopping for traced signal"); td->td_xsig = sig; if (si == NULL || (si->ksi_flags & KSI_PTRACE) == 0) { td->td_dbgflags |= TDB_XSIG; CTR4(KTR_PTRACE, "ptracestop: tid %d (pid %d) flags %#x sig %d", td->td_tid, p->p_pid, td->td_dbgflags, sig); PROC_SLOCK(p); while ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_XSIG)) { if (P_KILLED(p)) { /* * Ensure that, if we've been PT_KILLed, the * exit status reflects that. Another thread * may also be in ptracestop(), having just * received the SIGKILL, but this thread was * unsuspended first. */ td->td_dbgflags &= ~TDB_XSIG; td->td_xsig = SIGKILL; p->p_ptevents = 0; break; } if (p->p_flag & P_SINGLE_EXIT && !(td->td_dbgflags & TDB_EXIT)) { /* * Ignore ptrace stops except for thread exit * events when the process exits. */ td->td_dbgflags &= ~TDB_XSIG; PROC_SUNLOCK(p); return (0); } /* * Make wait(2) work. Ensure that right after the * attach, the thread which was decided to become the * leader of attach gets reported to the waiter. * Otherwise, just avoid overwriting another thread's * assignment to p_xthread. If another thread has * already set p_xthread, the current thread will get * a chance to report itself upon the next iteration. */ if ((td->td_dbgflags & TDB_FSTP) != 0 || ((p->p_flag2 & P2_PTRACE_FSTP) == 0 && p->p_xthread == NULL)) { p->p_xsig = sig; p->p_xthread = td; /* * If we are on sleepqueue already, * let sleepqueue code decide if it * needs to go sleep after attach. */ if (td->td_wchan == NULL) td->td_dbgflags &= ~TDB_FSTP; p->p_flag2 &= ~P2_PTRACE_FSTP; p->p_flag |= P_STOPPED_SIG | P_STOPPED_TRACE; sig_suspend_threads(td, p, 0); } if ((td->td_dbgflags & TDB_STOPATFORK) != 0) { td->td_dbgflags &= ~TDB_STOPATFORK; } stopme: thread_suspend_switch(td, p); if (p->p_xthread == td) p->p_xthread = NULL; if (!(p->p_flag & P_TRACED)) break; if (td->td_dbgflags & TDB_SUSPEND) { if (p->p_flag & P_SINGLE_EXIT) break; goto stopme; } } PROC_SUNLOCK(p); } if (si != NULL && sig == td->td_xsig) { /* Parent wants us to take the original signal unchanged. */ si->ksi_flags |= KSI_HEAD; if (sigqueue_add(&td->td_sigqueue, sig, si) != 0) si->ksi_signo = 0; } else if (td->td_xsig != 0) { /* * If parent wants us to take a new signal, then it will leave * it in td->td_xsig; otherwise we just look for signals again. */ ksiginfo_init(&ksi); ksi.ksi_signo = td->td_xsig; ksi.ksi_flags |= KSI_PTRACE; td2 = sigtd(p, td->td_xsig, false); tdsendsignal(p, td2, td->td_xsig, &ksi); if (td != td2) return (0); } return (td->td_xsig); } static void reschedule_signals(struct proc *p, sigset_t block, int flags) { struct sigacts *ps; struct thread *td; int sig; bool fastblk, pslocked; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; pslocked = (flags & SIGPROCMASK_PS_LOCKED) != 0; mtx_assert(&ps->ps_mtx, pslocked ? MA_OWNED : MA_NOTOWNED); if (SIGISEMPTY(p->p_siglist)) return; SIGSETAND(block, p->p_siglist); fastblk = (flags & SIGPROCMASK_FASTBLK) != 0; while ((sig = sig_ffs(&block)) != 0) { SIGDELSET(block, sig); td = sigtd(p, sig, fastblk); /* * If sigtd() selected us despite sigfastblock is * blocking, do not activate AST or wake us, to avoid * loop in AST handler. */ if (fastblk && td == curthread) continue; signotify(td); if (!pslocked) mtx_lock(&ps->ps_mtx); if (p->p_flag & P_TRACED || (SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig))) { tdsigwakeup(td, sig, SIG_CATCH, (SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR : ERESTART)); } if (!pslocked) mtx_unlock(&ps->ps_mtx); } } void tdsigcleanup(struct thread *td) { struct proc *p; sigset_t unblocked; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_flush(&td->td_sigqueue); if (p->p_numthreads == 1) return; /* * Since we cannot handle signals, notify signal post code * about this by filling the sigmask. * * Also, if needed, wake up thread(s) that do not block the * same signals as the exiting thread, since the thread might * have been selected for delivery and woken up. */ SIGFILLSET(unblocked); SIGSETNAND(unblocked, td->td_sigmask); SIGFILLSET(td->td_sigmask); reschedule_signals(p, unblocked, 0); } static int sigdeferstop_curr_flags(int cflags) { MPASS((cflags & (TDF_SEINTR | TDF_SERESTART)) == 0 || (cflags & TDF_SBDRY) != 0); return (cflags & (TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)); } /* * Defer the delivery of SIGSTOP for the current thread, according to * the requested mode. Returns previous flags, which must be restored * by sigallowstop(). * * TDF_SBDRY, TDF_SEINTR, and TDF_SERESTART flags are only set and * cleared by the current thread, which allow the lock-less read-only * accesses below. */ int sigdeferstop_impl(int mode) { struct thread *td; int cflags, nflags; td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); switch (mode) { case SIGDEFERSTOP_NOP: nflags = cflags; break; case SIGDEFERSTOP_OFF: nflags = 0; break; case SIGDEFERSTOP_SILENT: nflags = (cflags | TDF_SBDRY) & ~(TDF_SEINTR | TDF_SERESTART); break; case SIGDEFERSTOP_EINTR: nflags = (cflags | TDF_SBDRY | TDF_SEINTR) & ~TDF_SERESTART; break; case SIGDEFERSTOP_ERESTART: nflags = (cflags | TDF_SBDRY | TDF_SERESTART) & ~TDF_SEINTR; break; default: panic("sigdeferstop: invalid mode %x", mode); break; } if (cflags == nflags) return (SIGDEFERSTOP_VAL_NCHG); thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | nflags; thread_unlock(td); return (cflags); } /* * Restores the STOP handling mode, typically permitting the delivery * of SIGSTOP for the current thread. This does not immediately * suspend if a stop was posted. Instead, the thread will suspend * either via ast() or a subsequent interruptible sleep. */ void sigallowstop_impl(int prev) { struct thread *td; int cflags; KASSERT(prev != SIGDEFERSTOP_VAL_NCHG, ("failed sigallowstop")); KASSERT((prev & ~(TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)) == 0, ("sigallowstop: incorrect previous mode %x", prev)); td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); if (cflags != prev) { thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | prev; thread_unlock(td); } } /* * If the current process has received a signal (should be caught or cause * termination, should interrupt current syscall), return the signal number. * Stop signals with default action are processed immediately, then cleared; * they aren't returned. This is checked after each entry to the system for * a syscall or trap (though this can usually be done without calling issignal * by checking the pending signal masks in cursig.) The normal call * sequence is * * while (sig = cursig(curthread)) * postsig(sig); */ static int issignal(struct thread *td) { struct proc *p; struct sigacts *ps; struct sigqueue *queue; sigset_t sigpending; ksiginfo_t ksi; int prop, sig; p = td->td_proc; ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); for (;;) { sigpending = td->td_sigqueue.sq_signals; SIGSETOR(sigpending, p->p_sigqueue.sq_signals); SIGSETNAND(sigpending, td->td_sigmask); if ((p->p_flag & P_PPWAIT) != 0 || (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) SIG_STOPSIGMASK(sigpending); if (SIGISEMPTY(sigpending)) /* no signal to send */ return (0); /* * Do fast sigblock if requested by usermode. Since * we do know that there was a signal pending at this * point, set the FAST_SIGBLOCK_PEND as indicator for * usermode to perform a dummy call to * FAST_SIGBLOCK_UNBLOCK, which causes immediate * delivery of postponed pending signal. */ if ((td->td_pflags & TDP_SIGFASTBLOCK) != 0) { if (td->td_sigblock_val != 0) SIGSETNAND(sigpending, fastblock_mask); if (SIGISEMPTY(sigpending)) { td->td_pflags |= TDP_SIGFASTPENDING; return (0); } } if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED && (p->p_flag2 & P2_PTRACE_FSTP) != 0 && SIGISMEMBER(sigpending, SIGSTOP)) { /* * If debugger just attached, always consume * SIGSTOP from ptrace(PT_ATTACH) first, to * execute the debugger attach ritual in * order. */ sig = SIGSTOP; td->td_dbgflags |= TDB_FSTP; } else { sig = sig_ffs(&sigpending); } /* * We should see pending but ignored signals * only if P_TRACED was on when they were posted. */ if (SIGISMEMBER(ps->ps_sigignore, sig) && (p->p_flag & P_TRACED) == 0) { sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); continue; } if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED) { /* * If traced, always stop. * Remove old signal from queue before the stop. * XXX shrug off debugger, it causes siginfo to * be thrown away. */ queue = &td->td_sigqueue; ksiginfo_init(&ksi); if (sigqueue_get(queue, sig, &ksi) == 0) { queue = &p->p_sigqueue; sigqueue_get(queue, sig, &ksi); } td->td_si = ksi.ksi_info; mtx_unlock(&ps->ps_mtx); sig = ptracestop(td, sig, &ksi); mtx_lock(&ps->ps_mtx); td->td_si.si_signo = 0; /* * Keep looking if the debugger discarded or * replaced the signal. */ if (sig == 0) continue; /* * If the signal became masked, re-queue it. */ if (SIGISMEMBER(td->td_sigmask, sig)) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(&p->p_sigqueue, sig, &ksi); continue; } /* * If the traced bit got turned off, requeue * the signal and go back up to the top to * rescan signals. This ensures that p_sig* * and p_sigact are consistent. */ if ((p->p_flag & P_TRACED) == 0) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(queue, sig, &ksi); continue; } } prop = sigprop(sig); /* * Decide whether the signal should be returned. * Return the signal's number, or fall through * to clear it from the pending mask. */ switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) { case (intptr_t)SIG_DFL: /* * Don't take default actions on system processes. */ if (p->p_pid <= 1) { #ifdef DIAGNOSTIC /* * Are you sure you want to ignore SIGSEGV * in init? XXX */ printf("Process (pid %lu) got signal %d\n", (u_long)p->p_pid, sig); #endif break; /* == ignore */ } /* * If there is a pending stop signal to process with * default action, stop here, then clear the signal. * Traced or exiting processes should ignore stops. * Additionally, a member of an orphaned process group * should ignore tty stops. */ if (prop & SIGPROP_STOP) { if (p->p_flag & (P_TRACED | P_WEXIT | P_SINGLE_EXIT) || (p->p_pgrp->pg_jobc == 0 && prop & SIGPROP_TTYSTOP)) break; /* == ignore */ if (TD_SBDRY_INTR(td)) { KASSERT((td->td_flags & TDF_SBDRY) != 0, ("lost TDF_SBDRY")); return (-1); } mtx_unlock(&ps->ps_mtx); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Catching SIGSTOP"); sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); sig_suspend_threads(td, p, 0); thread_suspend_switch(td, p); PROC_SUNLOCK(p); mtx_lock(&ps->ps_mtx); goto next; } else if (prop & SIGPROP_IGNORE) { /* * Except for SIGCONT, shouldn't get here. * Default action is to ignore; drop it. */ break; /* == ignore */ } else return (sig); /*NOTREACHED*/ case (intptr_t)SIG_IGN: /* * Masking above should prevent us ever trying * to take action on an ignored signal other * than SIGCONT, unless process is traced. */ if ((prop & SIGPROP_CONT) == 0 && (p->p_flag & P_TRACED) == 0) printf("issignal\n"); break; /* == ignore */ default: /* * This signal has an action, let * postsig() process it. */ return (sig); } sigqueue_delete(&td->td_sigqueue, sig); /* take the signal! */ sigqueue_delete(&p->p_sigqueue, sig); next:; } /* NOTREACHED */ } void thread_stopped(struct proc *p) { int n; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); n = p->p_suspcount; if (p == curproc) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { PROC_SUNLOCK(p); p->p_flag &= ~P_WAITED; PROC_LOCK(p->p_pptr); childproc_stopped(p, (p->p_flag & P_TRACED) ? CLD_TRAPPED : CLD_STOPPED); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } } /* * Take the action for the specified signal * from the current set of pending signals. */ int postsig(int sig) { struct thread *td; struct proc *p; struct sigacts *ps; sig_t action; ksiginfo_t ksi; sigset_t returnmask; KASSERT(sig != 0, ("postsig")); td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); ksiginfo_init(&ksi); if (sigqueue_get(&td->td_sigqueue, sig, &ksi) == 0 && sigqueue_get(&p->p_sigqueue, sig, &ksi) == 0) return (0); ksi.ksi_signo = sig; if (ksi.ksi_code == SI_TIMER) itimer_accept(p, ksi.ksi_timerid, &ksi); action = ps->ps_sigact[_SIG_IDX(sig)]; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ? &td->td_oldsigmask : &td->td_sigmask, ksi.ksi_code); #endif if (action == SIG_DFL) { /* * Default action, where the default is to kill * the process. (Other cases were ignored above.) */ mtx_unlock(&ps->ps_mtx); proc_td_siginfo_capture(td, &ksi.ksi_info); sigexit(td, sig); /* NOTREACHED */ } else { /* * If we get here, the signal must be caught. */ KASSERT(action != SIG_IGN, ("postsig action %p", action)); KASSERT(!SIGISMEMBER(td->td_sigmask, sig), ("postsig action: blocked sig %d", sig)); /* * Set the new mask value and also defer further * occurrences of this signal. * * Special case: user has done a sigsuspend. Here the * current mask is not of interest, but rather the * mask from before the sigsuspend is what we want * restored after the signal processing is completed. */ if (td->td_pflags & TDP_OLDMASK) { returnmask = td->td_oldsigmask; td->td_pflags &= ~TDP_OLDMASK; } else returnmask = td->td_sigmask; if (p->p_sig == sig) { p->p_sig = 0; } (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); postsig_done(sig, td, ps); } return (1); } void proc_wkilled(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_WKILLED) == 0) { p->p_flag |= P_WKILLED; /* * Notify swapper that there is a process to swap in. * The notification is racy, at worst it would take 10 * seconds for the swapper process to notice. */ if ((p->p_flag & (P_INMEM | P_SWAPPINGIN)) == 0) wakeup(&proc0); } } /* * Kill the current process for stated reason. */ void killproc(struct proc *p, const char *why) { PROC_LOCK_ASSERT(p, MA_OWNED); CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); log(LOG_ERR, "pid %d (%s), jid %d, uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_ucred->cr_prison->pr_id, p->p_ucred->cr_uid, why); proc_wkilled(p); kern_psignal(p, SIGKILL); } /* * Force the current process to exit with the specified signal, dumping core * if appropriate. We bypass the normal tests for masked and caught signals, * allowing unrecoverable failures to terminate the process without changing * signal state. Mark the accounting record with the signal termination. * If dumping core, save the signal number for the debugger. Calls exit and * does not return. */ void sigexit(struct thread *td, int sig) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_acflag |= AXSIG; /* * We must be single-threading to generate a core dump. This * ensures that the registers in the core file are up-to-date. * Also, the ELF dump handler assumes that the thread list doesn't * change out from under it. * * XXX If another thread attempts to single-thread before us * (e.g. via fork()), we won't get a dump at all. */ if ((sigprop(sig) & SIGPROP_CORE) && thread_single(p, SINGLE_NO_EXIT) == 0) { p->p_sig = sig; /* * Log signals which would cause core dumps * (Log as LOG_INFO to appease those who don't want * these messages.) * XXX : Todo, as well as euid, write out ruid too * Note that coredump() drops proc lock. */ if (coredump(td) == 0) sig |= WCOREFLAG; if (kern_logsigexit) log(LOG_INFO, "pid %d (%s), jid %d, uid %d: exited on " "signal %d%s\n", p->p_pid, p->p_comm, p->p_ucred->cr_prison->pr_id, td->td_ucred->cr_uid, sig &~ WCOREFLAG, sig & WCOREFLAG ? " (core dumped)" : ""); } else PROC_UNLOCK(p); exit1(td, 0, sig); /* NOTREACHED */ } /* * Send queued SIGCHLD to parent when child process's state * is changed. */ static void sigparent(struct proc *p, int reason, int status) { PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); if (p->p_ksi != NULL) { p->p_ksi->ksi_signo = SIGCHLD; p->p_ksi->ksi_code = reason; p->p_ksi->ksi_status = status; p->p_ksi->ksi_pid = p->p_pid; p->p_ksi->ksi_uid = p->p_ucred->cr_ruid; if (KSI_ONQ(p->p_ksi)) return; } pksignal(p->p_pptr, SIGCHLD, p->p_ksi); } static void childproc_jobstate(struct proc *p, int reason, int sig) { struct sigacts *ps; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); /* * Wake up parent sleeping in kern_wait(), also send * SIGCHLD to parent, but SIGCHLD does not guarantee * that parent will awake, because parent may masked * the signal. */ p->p_pptr->p_flag |= P_STATCHILD; wakeup(p->p_pptr); ps = p->p_pptr->p_sigacts; mtx_lock(&ps->ps_mtx); if ((ps->ps_flag & PS_NOCLDSTOP) == 0) { mtx_unlock(&ps->ps_mtx); sigparent(p, reason, sig); } else mtx_unlock(&ps->ps_mtx); } void childproc_stopped(struct proc *p, int reason) { childproc_jobstate(p, reason, p->p_xsig); } void childproc_continued(struct proc *p) { childproc_jobstate(p, CLD_CONTINUED, SIGCONT); } void childproc_exited(struct proc *p) { int reason, status; if (WCOREDUMP(p->p_xsig)) { reason = CLD_DUMPED; status = WTERMSIG(p->p_xsig); } else if (WIFSIGNALED(p->p_xsig)) { reason = CLD_KILLED; status = WTERMSIG(p->p_xsig); } else { reason = CLD_EXITED; status = p->p_xexit; } /* * XXX avoid calling wakeup(p->p_pptr), the work is * done in exit1(). */ sigparent(p, reason, status); } #define MAX_NUM_CORE_FILES 100000 #ifndef NUM_CORE_FILES #define NUM_CORE_FILES 5 #endif CTASSERT(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES); static int num_cores = NUM_CORE_FILES; static int sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS) { int error; int new_val; new_val = num_cores; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val > MAX_NUM_CORE_FILES) new_val = MAX_NUM_CORE_FILES; if (new_val < 0) new_val = 0; num_cores = new_val; return (0); } SYSCTL_PROC(_debug, OID_AUTO, ncores, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, sizeof(int), sysctl_debug_num_cores_check, "I", "Maximum number of generated process corefiles while using index format"); #define GZIP_SUFFIX ".gz" #define ZSTD_SUFFIX ".zst" int compress_user_cores = 0; static int sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS) { int error, val; val = compress_user_cores; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val != 0 && !compressor_avail(val)) return (EINVAL); compress_user_cores = val; return (error); } SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, 0, sizeof(int), sysctl_compress_user_cores, "I", "Enable compression of user corefiles (" __XSTRING(COMPRESS_GZIP) " = gzip, " __XSTRING(COMPRESS_ZSTD) " = zstd)"); int compress_user_cores_level = 6; SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN, &compress_user_cores_level, 0, "Corefile compression level"); /* * Protect the access to corefilename[] by allproc_lock. */ #define corefilename_lock allproc_lock static char corefilename[MAXPATHLEN] = {"%N.core"}; TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename)); static int sysctl_kern_corefile(SYSCTL_HANDLER_ARGS) { int error; sx_xlock(&corefilename_lock); error = sysctl_handle_string(oidp, corefilename, sizeof(corefilename), req); sx_xunlock(&corefilename_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A", "Process corefile name format string"); static void vnode_close_locked(struct thread *td, struct vnode *vp) { VOP_UNLOCK(vp); vn_close(vp, FWRITE, td->td_ucred, td); } /* * If the core format has a %I in it, then we need to check * for existing corefiles before defining a name. * To do this we iterate over 0..ncores to find a * non-existing core file name to use. If all core files are * already used we choose the oldest one. */ static int corefile_open_last(struct thread *td, char *name, int indexpos, int indexlen, int ncores, struct vnode **vpp) { struct vnode *oldvp, *nextvp, *vp; struct vattr vattr; struct nameidata nd; int error, i, flags, oflags, cmode; char ch; struct timespec lasttime; nextvp = oldvp = NULL; cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0); for (i = 0; i < ncores; i++) { flags = O_CREAT | FWRITE | O_NOFOLLOW; ch = name[indexpos + indexlen]; (void)snprintf(name + indexpos, indexlen + 1, "%.*u", indexlen, i); name[indexpos + indexlen] = ch; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error != 0) break; vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if ((flags & O_CREAT) == O_CREAT) { nextvp = vp; break; } error = VOP_GETATTR(vp, &vattr, td->td_ucred); if (error != 0) { vnode_close_locked(td, vp); break; } if (oldvp == NULL || lasttime.tv_sec > vattr.va_mtime.tv_sec || (lasttime.tv_sec == vattr.va_mtime.tv_sec && lasttime.tv_nsec >= vattr.va_mtime.tv_nsec)) { if (oldvp != NULL) vn_close(oldvp, FWRITE, td->td_ucred, td); oldvp = vp; VOP_UNLOCK(oldvp); lasttime = vattr.va_mtime; } else { vnode_close_locked(td, vp); } } if (oldvp != NULL) { if (nextvp == NULL) { if ((td->td_proc->p_flag & P_SUGID) != 0) { error = EFAULT; vn_close(oldvp, FWRITE, td->td_ucred, td); } else { nextvp = oldvp; error = vn_lock(nextvp, LK_EXCLUSIVE); if (error != 0) { vn_close(nextvp, FWRITE, td->td_ucred, td); nextvp = NULL; } } } else { vn_close(oldvp, FWRITE, td->td_ucred, td); } } if (error != 0) { if (nextvp != NULL) vnode_close_locked(td, oldvp); } else { *vpp = nextvp; } return (error); } /* * corefile_open(comm, uid, pid, td, compress, vpp, namep) * Expand the name described in corefilename, using name, uid, and pid * and open/create core file. * corefilename is a printf-like string, with three format specifiers: * %N name of process ("name") * %P process id (pid) * %U user id (uid) * For example, "%N.core" is the default; they can be disabled completely * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". * This is controlled by the sysctl variable kern.corefile (see above). */ static int corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td, int compress, int signum, struct vnode **vpp, char **namep) { struct sbuf sb; struct nameidata nd; const char *format; char *hostname, *name; int cmode, error, flags, i, indexpos, indexlen, oflags, ncores; hostname = NULL; format = corefilename; name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO); indexlen = 0; indexpos = -1; ncores = num_cores; (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN); sx_slock(&corefilename_lock); for (i = 0; format[i] != '\0'; i++) { switch (format[i]) { case '%': /* Format character */ i++; switch (format[i]) { case '%': sbuf_putc(&sb, '%'); break; case 'H': /* hostname */ if (hostname == NULL) { hostname = malloc(MAXHOSTNAMELEN, M_TEMP, M_WAITOK); } getcredhostname(td->td_ucred, hostname, MAXHOSTNAMELEN); sbuf_printf(&sb, "%s", hostname); break; case 'I': /* autoincrementing index */ if (indexpos != -1) { sbuf_printf(&sb, "%%I"); break; } indexpos = sbuf_len(&sb); sbuf_printf(&sb, "%u", ncores - 1); indexlen = sbuf_len(&sb) - indexpos; break; case 'N': /* process name */ sbuf_printf(&sb, "%s", comm); break; case 'P': /* process id */ sbuf_printf(&sb, "%u", pid); break; case 'S': /* signal number */ sbuf_printf(&sb, "%i", signum); break; case 'U': /* user id */ sbuf_printf(&sb, "%u", uid); break; default: log(LOG_ERR, "Unknown format character %c in " "corename `%s'\n", format[i], format); break; } break; default: sbuf_putc(&sb, format[i]); break; } } sx_sunlock(&corefilename_lock); free(hostname, M_TEMP); if (compress == COMPRESS_GZIP) sbuf_printf(&sb, GZIP_SUFFIX); else if (compress == COMPRESS_ZSTD) sbuf_printf(&sb, ZSTD_SUFFIX); if (sbuf_error(&sb) != 0) { log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too " "long\n", (long)pid, comm, (u_long)uid); sbuf_delete(&sb); free(name, M_TEMP); return (ENOMEM); } sbuf_finish(&sb); sbuf_delete(&sb); if (indexpos != -1) { error = corefile_open_last(td, name, indexpos, indexlen, ncores, vpp); if (error != 0) { log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s' failed " "on initial open test, error = %d\n", pid, comm, uid, name, error); } } else { cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0); flags = O_CREAT | FWRITE | O_NOFOLLOW; if ((td->td_proc->p_flag & P_SUGID) != 0) flags |= O_EXCL; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error == 0) { *vpp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); } } if (error != 0) { #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } *namep = name; return (0); } /* * Dump a process' core. The main routine does some * policy checking, and creates the name of the coredump; * then it passes on a vnode and a size limit to the process-specific * coredump routine if there is one; if there _is not_ one, it returns * ENOSYS; otherwise it returns the error from the process-specific routine. */ static int coredump(struct thread *td) { struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; struct vnode *vp; struct flock lf; struct vattr vattr; size_t fullpathsize; int error, error1, locked; char *name; /* name of corefile */ void *rl_cookie; off_t limit; char *fullpath, *freepath = NULL; struct sbuf *sb; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td); if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) || (p->p_flag2 & P2_NOTRACE) != 0) { PROC_UNLOCK(p); return (EFAULT); } /* * Note that the bulk of limit checking is done after * the corefile is created. The exception is if the limit * for corefiles is 0, in which case we don't bother * creating the corefile at all. This layout means that * a corefile is truncated instead of not being created, * if it is larger than the limit. */ limit = (off_t)lim_cur(td, RLIMIT_CORE); if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) { PROC_UNLOCK(p); return (EFBIG); } PROC_UNLOCK(p); error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td, compress_user_cores, p->p_sig, &vp, &name); if (error != 0) return (error); /* * Don't dump to non-regular files or files with links. * Do not dump into system files. Effective user must own the corefile. */ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 || vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0 || vattr.va_uid != cred->cr_uid) { VOP_UNLOCK(vp); error = EFAULT; goto out; } VOP_UNLOCK(vp); /* Postpone other writers, including core dumps of other processes. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_WRLCK; locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0); VATTR_NULL(&vattr); vattr.va_size = 0; if (set_core_nodump_flag) vattr.va_flags = UF_NODUMP; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); PROC_LOCK(p); p->p_acflag |= ACORE; PROC_UNLOCK(p); if (p->p_sysent->sv_coredump != NULL) { error = p->p_sysent->sv_coredump(td, vp, limit, 0); } else { error = ENOSYS; } if (locked) { lf.l_type = F_UNLCK; VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); } vn_rangelock_unlock(vp, rl_cookie); /* * Notify the userland helper that a process triggered a core dump. * This allows the helper to run an automated debugging session. */ if (error != 0 || coredump_devctl == 0) goto out; sb = sbuf_new_auto(); if (vn_fullpath_global(td, p->p_textvp, &fullpath, &freepath) != 0) goto out2; sbuf_printf(sb, "comm=\""); devctl_safe_quote_sb(sb, fullpath); free(freepath, M_TEMP); sbuf_printf(sb, "\" core=\""); /* * We can't lookup core file vp directly. When we're replacing a core, and * other random times, we flush the name cache, so it will fail. Instead, * if the path of the core is relative, add the current dir in front if it. */ if (name[0] != '/') { fullpathsize = MAXPATHLEN; freepath = malloc(fullpathsize, M_TEMP, M_WAITOK); if (vn_getcwd(td, freepath, &fullpath, &fullpathsize) != 0) { free(freepath, M_TEMP); goto out2; } devctl_safe_quote_sb(sb, fullpath); free(freepath, M_TEMP); sbuf_putc(sb, '/'); } devctl_safe_quote_sb(sb, name); sbuf_printf(sb, "\""); if (sbuf_finish(sb) == 0) devctl_notify("kernel", "signal", "coredump", sbuf_data(sb)); out2: sbuf_delete(sb); out: error1 = vn_close(vp, FWRITE, cred, td); if (error == 0) error = error1; #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } /* * Nonexistent system call-- signal process (may want to handle it). Flag * error in case process won't see signal immediately (blocked or ignored). */ #ifndef _SYS_SYSPROTO_H_ struct nosys_args { int dummy; }; #endif /* ARGSUSED */ int nosys(struct thread *td, struct nosys_args *args) { struct proc *p; p = td->td_proc; PROC_LOCK(p); tdsignal(td, SIGSYS); PROC_UNLOCK(p); if (kern_lognosys == 1 || kern_lognosys == 3) { uprintf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } if (kern_lognosys == 2 || kern_lognosys == 3) { printf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } return (ENOSYS); } /* * Send a SIGIO or SIGURG signal to a process or process group using stored * credentials rather than those of the current process. */ void pgsigio(struct sigio **sigiop, int sig, int checkctty) { ksiginfo_t ksi; struct sigio *sigio; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } if (sigio->sio_pgid > 0) { PROC_LOCK(sigio->sio_proc); if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred)) kern_psignal(sigio->sio_proc, sig); PROC_UNLOCK(sigio->sio_proc); } else if (sigio->sio_pgid < 0) { struct proc *p; PGRP_LOCK(sigio->sio_pgrp); LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && CANSIGIO(sigio->sio_ucred, p->p_ucred) && (checkctty == 0 || (p->p_flag & P_CONTROLT))) kern_psignal(p, sig); PROC_UNLOCK(p); } PGRP_UNLOCK(sigio->sio_pgrp); } SIGIO_UNLOCK(); } static int filt_sigattach(struct knote *kn) { struct proc *p = curproc; kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ knlist_add(p->p_klist, kn, 0); return (0); } static void filt_sigdetach(struct knote *kn) { struct proc *p = kn->kn_ptr.p_proc; knlist_remove(p->p_klist, kn, 0); } /* * signal knotes are shared with proc knotes, so we apply a mask to * the hint in order to differentiate them from process hints. This * could be avoided by using a signal-specific knote list, but probably * isn't worth the trouble. */ static int filt_signal(struct knote *kn, long hint) { if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; if (kn->kn_id == hint) kn->kn_data++; } return (kn->kn_data != 0); } struct sigacts * sigacts_alloc(void) { struct sigacts *ps; ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO); refcount_init(&ps->ps_refcnt, 1); mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF); return (ps); } void sigacts_free(struct sigacts *ps) { if (refcount_release(&ps->ps_refcnt) == 0) return; mtx_destroy(&ps->ps_mtx); free(ps, M_SUBPROC); } struct sigacts * sigacts_hold(struct sigacts *ps) { refcount_acquire(&ps->ps_refcnt); return (ps); } void sigacts_copy(struct sigacts *dest, struct sigacts *src) { KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest")); mtx_lock(&src->ps_mtx); bcopy(src, dest, offsetof(struct sigacts, ps_refcnt)); mtx_unlock(&src->ps_mtx); } int sigacts_shared(struct sigacts *ps) { return (ps->ps_refcnt > 1); } void sig_drop_caught(struct proc *p) { int sig; struct sigacts *ps; ps = p->p_sigacts; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&ps->ps_mtx, MA_OWNED); while (SIGNOTEMPTY(ps->ps_sigcatch)) { sig = sig_ffs(&ps->ps_sigcatch); sigdflt(ps, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0) sigqueue_delete_proc(p, sig); } } static void sigfastblock_failed(struct thread *td, bool sendsig, bool write) { ksiginfo_t ksi; /* * Prevent further fetches and SIGSEGVs, allowing thread to * issue syscalls despite corruption. */ sigfastblock_clear(td); if (!sendsig) return; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGSEGV; ksi.ksi_code = write ? SEGV_ACCERR : SEGV_MAPERR; ksi.ksi_addr = td->td_sigblock_ptr; trapsignal(td, &ksi); } static bool sigfastblock_fetch_sig(struct thread *td, bool sendsig, uint32_t *valp) { uint32_t res; if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) return (true); if (fueword32((void *)td->td_sigblock_ptr, &res) == -1) { sigfastblock_failed(td, sendsig, false); return (false); } *valp = res; td->td_sigblock_val = res & ~SIGFASTBLOCK_FLAGS; return (true); } static void sigfastblock_resched(struct thread *td, bool resched) { struct proc *p; if (resched) { p = td->td_proc; PROC_LOCK(p); reschedule_signals(p, td->td_sigmask, 0); PROC_UNLOCK(p); } thread_lock(td); td->td_flags |= TDF_ASTPENDING | TDF_NEEDSIGCHK; thread_unlock(td); } int sys_sigfastblock(struct thread *td, struct sigfastblock_args *uap) { struct proc *p; int error, res; uint32_t oldval; error = 0; p = td->td_proc; switch (uap->cmd) { case SIGFASTBLOCK_SETPTR: if ((td->td_pflags & TDP_SIGFASTBLOCK) != 0) { error = EBUSY; break; } if (((uintptr_t)(uap->ptr) & (sizeof(uint32_t) - 1)) != 0) { error = EINVAL; break; } td->td_pflags |= TDP_SIGFASTBLOCK; td->td_sigblock_ptr = uap->ptr; break; case SIGFASTBLOCK_UNBLOCK: if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) { error = EINVAL; break; } for (;;) { res = casueword32(td->td_sigblock_ptr, SIGFASTBLOCK_PEND, &oldval, 0); if (res == -1) { error = EFAULT; sigfastblock_failed(td, false, true); break; } if (res == 0) break; MPASS(res == 1); if (oldval != SIGFASTBLOCK_PEND) { error = EBUSY; break; } error = thread_check_susp(td, false); if (error != 0) break; } if (error != 0) break; /* * td_sigblock_val is cleared there, but not on a * syscall exit. The end effect is that a single * interruptible sleep, while user sigblock word is * set, might return EINTR or ERESTART to usermode * without delivering signal. All further sleeps, * until userspace clears the word and does * sigfastblock(UNBLOCK), observe current word and no * longer get interrupted. It is slight * non-conformance, with alternative to have read the * sigblock word on each syscall entry. */ td->td_sigblock_val = 0; /* * Rely on normal ast mechanism to deliver pending * signals to current thread. But notify others about * fake unblock. */ sigfastblock_resched(td, error == 0 && p->p_numthreads != 1); break; case SIGFASTBLOCK_UNSETPTR: if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) { error = EINVAL; break; } if (!sigfastblock_fetch_sig(td, false, &oldval)) { error = EFAULT; break; } if (oldval != 0 && oldval != SIGFASTBLOCK_PEND) { error = EBUSY; break; } sigfastblock_clear(td); break; default: error = EINVAL; break; } return (error); } void sigfastblock_clear(struct thread *td) { bool resched; if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) return; td->td_sigblock_val = 0; resched = (td->td_pflags & TDP_SIGFASTPENDING) != 0 || SIGPENDING(td); td->td_pflags &= ~(TDP_SIGFASTBLOCK | TDP_SIGFASTPENDING); sigfastblock_resched(td, resched); } void sigfastblock_fetch(struct thread *td) { uint32_t val; (void)sigfastblock_fetch_sig(td, true, &val); } static void sigfastblock_setpend1(struct thread *td) { int res; uint32_t oldval; if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) return; res = fueword32((void *)td->td_sigblock_ptr, &oldval); if (res == -1) { sigfastblock_failed(td, true, false); return; } for (;;) { res = casueword32(td->td_sigblock_ptr, oldval, &oldval, oldval | SIGFASTBLOCK_PEND); if (res == -1) { sigfastblock_failed(td, true, true); return; } if (res == 0) { td->td_sigblock_val = oldval & ~SIGFASTBLOCK_FLAGS; td->td_pflags &= ~TDP_SIGFASTPENDING; break; } MPASS(res == 1); if (thread_check_susp(td, false) != 0) break; } } void sigfastblock_setpend(struct thread *td, bool resched) { struct proc *p; sigfastblock_setpend1(td); if (resched) { p = td->td_proc; PROC_LOCK(p); reschedule_signals(p, fastblock_mask, SIGPROCMASK_FASTBLK); PROC_UNLOCK(p); } } Index: head/sys/kern/vfs_mount.c =================================================================== --- head/sys/kern/vfs_mount.c (revision 364441) +++ head/sys/kern/vfs_mount.c (revision 364442) @@ -1,2503 +1,2503 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include -#include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define VFS_MOUNTARG_SIZE_MAX (1024 * 64) static int vfs_domount(struct thread *td, const char *fstype, char *fspath, uint64_t fsflags, struct vfsoptlist **optlist); static void free_mntarg(struct mntarg *ma); static int usermount = 0; SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "Unprivileged users may mount and unmount file systems"); static bool default_autoro = false; SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0, "Retry failed r/w mount as r/o if no explicit ro/rw option is specified"); MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure"); static uma_zone_t mount_zone; /* List of mounted filesystems. */ struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* For any iteration/modification of mountlist */ struct mtx mountlist_mtx; MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF); EVENTHANDLER_LIST_DEFINE(vfs_mounted); EVENTHANDLER_LIST_DEFINE(vfs_unmounted); static void mount_devctl_event(const char *type, struct mount *mp, bool donew); /* * Global opts, taken by all filesystems */ static const char *global_opts[] = { "errmsg", "fstype", "fspath", "ro", "rw", "nosuid", "noexec", NULL }; static int mount_init(void *mem, int size, int flags) { struct mount *mp; mp = (struct mount *)mem; mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF); mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF); lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0); mp->mnt_thread_in_ops_pcpu = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO); mp->mnt_ref_pcpu = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO); mp->mnt_lockref_pcpu = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO); mp->mnt_writeopcount_pcpu = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO); mp->mnt_ref = 0; mp->mnt_vfs_ops = 1; mp->mnt_rootvnode = NULL; return (0); } static void mount_fini(void *mem, int size) { struct mount *mp; mp = (struct mount *)mem; uma_zfree_pcpu(pcpu_zone_int, mp->mnt_writeopcount_pcpu); uma_zfree_pcpu(pcpu_zone_int, mp->mnt_lockref_pcpu); uma_zfree_pcpu(pcpu_zone_int, mp->mnt_ref_pcpu); uma_zfree_pcpu(pcpu_zone_int, mp->mnt_thread_in_ops_pcpu); lockdestroy(&mp->mnt_explock); mtx_destroy(&mp->mnt_listmtx); mtx_destroy(&mp->mnt_mtx); } static void vfs_mount_init(void *dummy __unused) { mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL, NULL, mount_init, mount_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE); } SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL); /* * --------------------------------------------------------------------- * Functions for building and sanitizing the mount options */ /* Remove one mount option. */ static void vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt) { TAILQ_REMOVE(opts, opt, link); free(opt->name, M_MOUNT); if (opt->value != NULL) free(opt->value, M_MOUNT); free(opt, M_MOUNT); } /* Release all resources related to the mount options. */ void vfs_freeopts(struct vfsoptlist *opts) { struct vfsopt *opt; while (!TAILQ_EMPTY(opts)) { opt = TAILQ_FIRST(opts); vfs_freeopt(opts, opt); } free(opts, M_MOUNT); } void vfs_deleteopt(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt, *temp; if (opts == NULL) return; TAILQ_FOREACH_SAFE(opt, opts, link, temp) { if (strcmp(opt->name, name) == 0) vfs_freeopt(opts, opt); } } static int vfs_isopt_ro(const char *opt) { if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 || strcmp(opt, "norw") == 0) return (1); return (0); } static int vfs_isopt_rw(const char *opt) { if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0) return (1); return (0); } /* * Check if options are equal (with or without the "no" prefix). */ static int vfs_equalopts(const char *opt1, const char *opt2) { char *p; /* "opt" vs. "opt" or "noopt" vs. "noopt" */ if (strcmp(opt1, opt2) == 0) return (1); /* "noopt" vs. "opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "opt" vs. "noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); while ((p = strchr(opt1, '.')) != NULL && !strncmp(opt1, opt2, ++p - opt1)) { opt2 += p - opt1; opt1 = p; /* "foo.noopt" vs. "foo.opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "foo.opt" vs. "foo.noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); } /* "ro" / "rdonly" / "norw" / "rw" / "noro" */ if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) && (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2))) return (1); return (0); } /* * If a mount option is specified several times, * (with or without the "no" prefix) only keep * the last occurrence of it. */ static void vfs_sanitizeopts(struct vfsoptlist *opts) { struct vfsopt *opt, *opt2, *tmp; TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) { opt2 = TAILQ_PREV(opt, vfsoptlist, link); while (opt2 != NULL) { if (vfs_equalopts(opt->name, opt2->name)) { tmp = TAILQ_PREV(opt2, vfsoptlist, link); vfs_freeopt(opts, opt2); opt2 = tmp; } else { opt2 = TAILQ_PREV(opt2, vfsoptlist, link); } } } } /* * Build a linked list of mount options from a struct uio. */ int vfs_buildopts(struct uio *auio, struct vfsoptlist **options) { struct vfsoptlist *opts; struct vfsopt *opt; size_t memused, namelen, optlen; unsigned int i, iovcnt; int error; opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); memused = 0; iovcnt = auio->uio_iovcnt; for (i = 0; i < iovcnt; i += 2) { namelen = auio->uio_iov[i].iov_len; optlen = auio->uio_iov[i + 1].iov_len; memused += sizeof(struct vfsopt) + optlen + namelen; /* * Avoid consuming too much memory, and attempts to overflow * memused. */ if (memused > VFS_MOUNTARG_SIZE_MAX || optlen > VFS_MOUNTARG_SIZE_MAX || namelen > VFS_MOUNTARG_SIZE_MAX) { error = EINVAL; goto bad; } opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); opt->name = malloc(namelen, M_MOUNT, M_WAITOK); opt->value = NULL; opt->len = 0; opt->pos = i / 2; opt->seen = 0; /* * Do this early, so jumps to "bad" will free the current * option. */ TAILQ_INSERT_TAIL(opts, opt, link); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); } else { error = copyin(auio->uio_iov[i].iov_base, opt->name, namelen); if (error) goto bad; } /* Ensure names are null-terminated strings. */ if (namelen == 0 || opt->name[namelen - 1] != '\0') { error = EINVAL; goto bad; } if (optlen != 0) { opt->len = optlen; opt->value = malloc(optlen, M_MOUNT, M_WAITOK); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i + 1].iov_base, opt->value, optlen); } else { error = copyin(auio->uio_iov[i + 1].iov_base, opt->value, optlen); if (error) goto bad; } } } vfs_sanitizeopts(opts); *options = opts; return (0); bad: vfs_freeopts(opts); return (error); } /* * Merge the old mount options with the new ones passed * in the MNT_UPDATE case. * * XXX: This function will keep a "nofoo" option in the new * options. E.g, if the option's canonical name is "foo", * "nofoo" ends up in the mount point's active options. */ static void vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts) { struct vfsopt *opt, *new; TAILQ_FOREACH(opt, oldopts, link) { new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); new->name = strdup(opt->name, M_MOUNT); if (opt->len != 0) { new->value = malloc(opt->len, M_MOUNT, M_WAITOK); bcopy(opt->value, new->value, opt->len); } else new->value = NULL; new->len = opt->len; new->seen = opt->seen; TAILQ_INSERT_HEAD(toopts, new, link); } vfs_sanitizeopts(toopts); } /* * Mount a filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct nmount_args { struct iovec *iovp; unsigned int iovcnt; int flags; }; #endif int sys_nmount(struct thread *td, struct nmount_args *uap) { struct uio *auio; int error; u_int iovcnt; uint64_t flags; /* * Mount flags are now 64-bits. On 32-bit archtectures only * 32-bits are passed in, but from here on everything handles * 64-bit flags correctly. */ flags = uap->flags; AUDIT_ARG_FFLAGS(flags); CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__, uap->iovp, uap->iovcnt, flags); /* * Filter out MNT_ROOTFS. We do not want clients of nmount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set by the kernel when mounting its * root file system. */ flags &= ~MNT_ROOTFS; iovcnt = uap->iovcnt; /* * Check that we have an even number of iovec's * and that we have at least two options. */ if ((iovcnt & 1) || (iovcnt < 4)) { CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__, uap->iovcnt); return (EINVAL); } error = copyinuio(uap->iovp, iovcnt, &auio); if (error) { CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno", __func__, error); return (error); } error = vfs_donmount(td, flags, auio); free(auio, M_IOV); return (error); } /* * --------------------------------------------------------------------- * Various utility functions */ void vfs_ref(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp)) { vfs_mp_count_add_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); } void vfs_rel(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp)) { vfs_mp_count_sub_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); } /* * Allocate and initialize the mount point struct. */ struct mount * vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, struct ucred *cred) { struct mount *mp; mp = uma_zalloc(mount_zone, M_WAITOK); bzero(&mp->mnt_startzero, __rangeof(struct mount, mnt_startzero, mnt_endzero)); TAILQ_INIT(&mp->mnt_nvnodelist); mp->mnt_nvnodelistsize = 0; TAILQ_INIT(&mp->mnt_lazyvnodelist); mp->mnt_lazyvnodelistsize = 0; if (mp->mnt_ref != 0 || mp->mnt_lockref != 0 || mp->mnt_writeopcount != 0) panic("%s: non-zero counters on new mp %p\n", __func__, mp); if (mp->mnt_vfs_ops != 1) panic("%s: vfs_ops should be 1 but %d found\n", __func__, mp->mnt_vfs_ops); (void) vfs_busy(mp, MBF_NOWAIT); atomic_add_acq_int(&vfsp->vfc_refcount, 1); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_gen++; strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_vnodecovered = vp; mp->mnt_cred = crdup(cred); mp->mnt_stat.f_owner = cred->cr_uid; strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); mp->mnt_iosize_max = DFLTPHYS; #ifdef MAC mac_mount_init(mp); mac_mount_create(cred, mp); #endif arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0); TAILQ_INIT(&mp->mnt_uppers); return (mp); } /* * Destroy the mount struct previously allocated by vfs_mount_alloc(). */ void vfs_mount_destroy(struct mount *mp) { if (mp->mnt_vfs_ops == 0) panic("%s: entered with zero vfs_ops\n", __func__); vfs_assert_mount_counters(mp); MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_REFEXPIRE; if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } while (mp->mnt_ref) msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0); KASSERT(mp->mnt_ref == 0, ("%s: invalid refcount in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); if (mp->mnt_writeopcount != 0) panic("vfs_mount_destroy: nonzero writeopcount"); if (mp->mnt_secondary_writes != 0) panic("vfs_mount_destroy: nonzero secondary_writes"); atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1); if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) { struct vnode *vp; TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) vn_printf(vp, "dangling vnode "); panic("unmount: dangling vnode"); } KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers")); if (mp->mnt_nvnodelistsize != 0) panic("vfs_mount_destroy: nonzero nvnodelistsize"); if (mp->mnt_lazyvnodelistsize != 0) panic("vfs_mount_destroy: nonzero lazyvnodelistsize"); if (mp->mnt_lockref != 0) panic("vfs_mount_destroy: nonzero lock refcount"); MNT_IUNLOCK(mp); if (mp->mnt_vfs_ops != 1) panic("%s: vfs_ops should be 1 but %d found\n", __func__, mp->mnt_vfs_ops); if (mp->mnt_rootvnode != NULL) panic("%s: mount point still has a root vnode %p\n", __func__, mp->mnt_rootvnode); if (mp->mnt_vnodecovered != NULL) vrele(mp->mnt_vnodecovered); #ifdef MAC mac_mount_destroy(mp); #endif if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); crfree(mp->mnt_cred); uma_zfree(mount_zone, mp); } static bool vfs_should_downgrade_to_ro_mount(uint64_t fsflags, int error) { /* This is an upgrade of an exisiting mount. */ if ((fsflags & MNT_UPDATE) != 0) return (false); /* This is already an R/O mount. */ if ((fsflags & MNT_RDONLY) != 0) return (false); switch (error) { case ENODEV: /* generic, geom, ... */ case EACCES: /* cam/scsi, ... */ case EROFS: /* md, mmcsd, ... */ /* * These errors can be returned by the storage layer to signal * that the media is read-only. No harm in the R/O mount * attempt if the error was returned for some other reason. */ return (true); default: return (false); } } int vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions) { struct vfsoptlist *optlist; struct vfsopt *opt, *tmp_opt; char *fstype, *fspath, *errmsg; int error, fstypelen, fspathlen, errmsg_len, errmsg_pos; bool autoro; errmsg = fspath = NULL; errmsg_len = fspathlen = 0; errmsg_pos = -1; autoro = default_autoro; error = vfs_buildopts(fsoptions, &optlist); if (error) return (error); if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0) errmsg_pos = vfs_getopt_pos(optlist, "errmsg"); /* * We need these two options before the others, * and they are mandatory for any filesystem. * Ensure they are NUL terminated as well. */ fstypelen = 0; error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); if (error || fstypelen <= 0 || fstype[fstypelen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fstype", errmsg_len); goto bail; } fspathlen = 0; error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); if (error || fspathlen <= 0 || fspath[fspathlen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fspath", errmsg_len); goto bail; } /* * We need to see if we have the "update" option * before we call vfs_domount(), since vfs_domount() has special * logic based on MNT_UPDATE. This is very important * when we want to update the root filesystem. */ TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) { int do_freeopt = 0; if (strcmp(opt->name, "update") == 0) { fsflags |= MNT_UPDATE; do_freeopt = 1; } else if (strcmp(opt->name, "async") == 0) fsflags |= MNT_ASYNC; else if (strcmp(opt->name, "force") == 0) { fsflags |= MNT_FORCE; do_freeopt = 1; } else if (strcmp(opt->name, "reload") == 0) { fsflags |= MNT_RELOAD; do_freeopt = 1; } else if (strcmp(opt->name, "multilabel") == 0) fsflags |= MNT_MULTILABEL; else if (strcmp(opt->name, "noasync") == 0) fsflags &= ~MNT_ASYNC; else if (strcmp(opt->name, "noatime") == 0) fsflags |= MNT_NOATIME; else if (strcmp(opt->name, "atime") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoatime", M_MOUNT); } else if (strcmp(opt->name, "noclusterr") == 0) fsflags |= MNT_NOCLUSTERR; else if (strcmp(opt->name, "clusterr") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterr", M_MOUNT); } else if (strcmp(opt->name, "noclusterw") == 0) fsflags |= MNT_NOCLUSTERW; else if (strcmp(opt->name, "clusterw") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterw", M_MOUNT); } else if (strcmp(opt->name, "noexec") == 0) fsflags |= MNT_NOEXEC; else if (strcmp(opt->name, "exec") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoexec", M_MOUNT); } else if (strcmp(opt->name, "nosuid") == 0) fsflags |= MNT_NOSUID; else if (strcmp(opt->name, "suid") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosuid", M_MOUNT); } else if (strcmp(opt->name, "nosymfollow") == 0) fsflags |= MNT_NOSYMFOLLOW; else if (strcmp(opt->name, "symfollow") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosymfollow", M_MOUNT); } else if (strcmp(opt->name, "noro") == 0) { fsflags &= ~MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "rw") == 0) { fsflags &= ~MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "ro") == 0) { fsflags |= MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "rdonly") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("ro", M_MOUNT); fsflags |= MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "autoro") == 0) { do_freeopt = 1; autoro = true; } else if (strcmp(opt->name, "suiddir") == 0) fsflags |= MNT_SUIDDIR; else if (strcmp(opt->name, "sync") == 0) fsflags |= MNT_SYNCHRONOUS; else if (strcmp(opt->name, "union") == 0) fsflags |= MNT_UNION; else if (strcmp(opt->name, "automounted") == 0) { fsflags |= MNT_AUTOMOUNTED; do_freeopt = 1; } else if (strcmp(opt->name, "nocover") == 0) { fsflags |= MNT_NOCOVER; do_freeopt = 1; } else if (strcmp(opt->name, "cover") == 0) { fsflags &= ~MNT_NOCOVER; do_freeopt = 1; } else if (strcmp(opt->name, "emptydir") == 0) { fsflags |= MNT_EMPTYDIR; do_freeopt = 1; } else if (strcmp(opt->name, "noemptydir") == 0) { fsflags &= ~MNT_EMPTYDIR; do_freeopt = 1; } if (do_freeopt) vfs_freeopt(optlist, opt); } /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) { error = ENAMETOOLONG; goto bail; } error = vfs_domount(td, fstype, fspath, fsflags, &optlist); /* * See if we can mount in the read-only mode if the error code suggests * that it could be possible and the mount options allow for that. * Never try it if "[no]{ro|rw}" has been explicitly requested and not * overridden by "autoro". */ if (autoro && vfs_should_downgrade_to_ro_mount(fsflags, error)) { printf("%s: R/W mount failed, possibly R/O media," " trying R/O mount\n", __func__); fsflags |= MNT_RDONLY; error = vfs_domount(td, fstype, fspath, fsflags, &optlist); } bail: /* copyout the errmsg */ if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt) && errmsg_len > 0 && errmsg != NULL) { if (fsoptions->uio_segflg == UIO_SYSSPACE) { bcopy(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } else { copyout(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } } if (optlist != NULL) vfs_freeopts(optlist); return (error); } /* * Old mount API. */ #ifndef _SYS_SYSPROTO_H_ struct mount_args { char *type; char *path; int flags; caddr_t data; }; #endif /* ARGSUSED */ int sys_mount(struct thread *td, struct mount_args *uap) { char *fstype; struct vfsconf *vfsp = NULL; struct mntarg *ma = NULL; uint64_t flags; int error; /* * Mount flags are now 64-bits. On 32-bit architectures only * 32-bits are passed in, but from here on everything handles * 64-bit flags correctly. */ flags = uap->flags; AUDIT_ARG_FFLAGS(flags); /* * Filter out MNT_ROOTFS. We do not want clients of mount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set by the kernel when mounting its * root file system. */ flags &= ~MNT_ROOTFS; fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL); if (error) { free(fstype, M_TEMP); return (error); } AUDIT_ARG_TEXT(fstype); vfsp = vfs_byname_kld(fstype, td, &error); free(fstype, M_TEMP); if (vfsp == NULL) return (ENOENT); if (((vfsp->vfc_flags & VFCF_SBDRY) != 0 && vfsp->vfc_vfsops_sd->vfs_cmount == NULL) || ((vfsp->vfc_flags & VFCF_SBDRY) == 0 && vfsp->vfc_vfsops->vfs_cmount == NULL)) return (EOPNOTSUPP); ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN); ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN); ma = mount_argb(ma, flags & MNT_RDONLY, "noro"); ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid"); ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec"); if ((vfsp->vfc_flags & VFCF_SBDRY) != 0) return (vfsp->vfc_vfsops_sd->vfs_cmount(ma, uap->data, flags)); return (vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags)); } /* * vfs_domount_first(): first file system mount (not update) */ static int vfs_domount_first( struct thread *td, /* Calling thread. */ struct vfsconf *vfsp, /* File system type. */ char *fspath, /* Mount path. */ struct vnode *vp, /* Vnode to be covered. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct vattr va; struct mount *mp; struct vnode *newdp, *rootvp; int error, error1; ASSERT_VOP_ELOCKED(vp, __func__); KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here")); if ((fsflags & MNT_EMPTYDIR) != 0) { error = vfs_emptydir(vp); if (error != 0) { vput(vp); return (error); } } /* * If the jail of the calling thread lacks permission for this type of * file system, deny immediately. */ if (jailed(td->td_ucred) && !prison_allow(td->td_ucred, vfsp->vfc_prison_flag)) { vput(vp); return (EPERM); } /* * If the user is not root, ensure that they own the directory * onto which we are attempting to mount. */ error = VOP_GETATTR(vp, &va, td->td_ucred); if (error == 0 && va.va_uid != td->td_ucred->cr_uid) error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN); if (error == 0) error = vinvalbuf(vp, V_SAVE, 0, 0); if (error == 0 && vp->v_type != VDIR) error = ENOTDIR; if (error == 0) { VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL) vp->v_iflag |= VI_MOUNT; else error = EBUSY; VI_UNLOCK(vp); } if (error != 0) { vput(vp); return (error); } vn_seqc_write_begin(vp); VOP_UNLOCK(vp); /* Allocate and initialize the filesystem. */ mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred); /* XXXMAC: pass to vfs_mount_alloc? */ mp->mnt_optnew = *optlist; /* Set the mount level flags. */ mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY)); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error1 = 0; if ((error = VFS_MOUNT(mp)) != 0 || (error1 = VFS_STATFS(mp, &mp->mnt_stat)) != 0 || (error1 = VFS_ROOT(mp, LK_EXCLUSIVE, &newdp)) != 0) { if (error1 != 0) { error = error1; rootvp = vfs_cache_root_clear(mp); if (rootvp != NULL) vrele(rootvp); if ((error1 = VFS_UNMOUNT(mp, 0)) != 0) printf("VFS_UNMOUNT returned %d\n", error1); } vfs_unbusy(mp); mp->mnt_vnodecovered = NULL; vfs_mount_destroy(mp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vn_seqc_write_end(vp); vrele(vp); return (error); } vn_seqc_write_begin(newdp); VOP_UNLOCK(newdp); if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; *optlist = NULL; /* * Prevent external consumers of mount options from reading mnt_optnew. */ mp->mnt_optnew = NULL; MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); cache_purge(vp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vp->v_mountedhere = mp; /* Place the new filesystem at the end of the mount list. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_event_signal(NULL, VQ_MOUNT, 0); vn_lock(newdp, LK_EXCLUSIVE | LK_RETRY); VOP_UNLOCK(vp); EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td); VOP_UNLOCK(newdp); mount_devctl_event("MOUNT", mp, false); mountcheckdirs(vp, newdp); vn_seqc_write_end(vp); vn_seqc_write_end(newdp); vrele(newdp); if ((mp->mnt_flag & MNT_RDONLY) == 0) vfs_allocate_syncvnode(mp); vfs_op_exit(mp); vfs_unbusy(mp); return (0); } /* * vfs_domount_update(): update of mounted file system */ static int vfs_domount_update( struct thread *td, /* Calling thread. */ struct vnode *vp, /* Mount point vnode. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct export_args export; struct o2export_args o2export; struct vnode *rootvp; void *bufp; struct mount *mp; int error, export_error, i, len; uint64_t flag; gid_t *grps; ASSERT_VOP_ELOCKED(vp, __func__); KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here")); mp = vp->v_mount; if ((vp->v_vflag & VV_ROOT) == 0) { if (vfs_copyopt(*optlist, "export", &export, sizeof(export)) == 0) error = EXDEV; else error = EINVAL; vput(vp); return (error); } /* * We only allow the filesystem to be reloaded if it * is currently mounted read-only. */ flag = mp->mnt_flag; if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) { vput(vp); return (EOPNOTSUPP); /* Needs translation */ } /* * Only privileged root, or (if MNT_USER is set) the user that * did the original mount is permitted to update it. */ error = vfs_suser(mp, td); if (error != 0) { vput(vp); return (error); } if (vfs_busy(mp, MBF_NOWAIT)) { vput(vp); return (EBUSY); } VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); vfs_unbusy(mp); vput(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); VOP_UNLOCK(vp); vfs_op_enter(mp); vn_seqc_write_begin(vp); rootvp = NULL; MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { MNT_IUNLOCK(mp); error = EBUSY; goto end; } mp->mnt_flag &= ~MNT_UPDATEMASK; mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY); if ((mp->mnt_flag & MNT_ASYNC) == 0) mp->mnt_kern_flag &= ~MNTK_ASYNC; rootvp = vfs_cache_root_clear(mp); MNT_IUNLOCK(mp); mp->mnt_optnew = *optlist; vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp); export_error = 0; /* Process the export option. */ if (error == 0 && vfs_getopt(mp->mnt_optnew, "export", &bufp, &len) == 0) { /* Assume that there is only 1 ABI for each length. */ switch (len) { case (sizeof(struct oexport_args)): bzero(&o2export, sizeof(o2export)); /* FALLTHROUGH */ case (sizeof(o2export)): bcopy(bufp, &o2export, len); export.ex_flags = (uint64_t)o2export.ex_flags; export.ex_root = o2export.ex_root; export.ex_uid = o2export.ex_anon.cr_uid; export.ex_groups = NULL; export.ex_ngroups = o2export.ex_anon.cr_ngroups; if (export.ex_ngroups > 0) { if (export.ex_ngroups <= XU_NGROUPS) { export.ex_groups = malloc( export.ex_ngroups * sizeof(gid_t), M_TEMP, M_WAITOK); for (i = 0; i < export.ex_ngroups; i++) export.ex_groups[i] = o2export.ex_anon.cr_groups[i]; } else export_error = EINVAL; } else if (export.ex_ngroups < 0) export_error = EINVAL; export.ex_addr = o2export.ex_addr; export.ex_addrlen = o2export.ex_addrlen; export.ex_mask = o2export.ex_mask; export.ex_masklen = o2export.ex_masklen; export.ex_indexfile = o2export.ex_indexfile; export.ex_numsecflavors = o2export.ex_numsecflavors; if (export.ex_numsecflavors < MAXSECFLAVORS) { for (i = 0; i < export.ex_numsecflavors; i++) export.ex_secflavors[i] = o2export.ex_secflavors[i]; } else export_error = EINVAL; if (export_error == 0) export_error = vfs_export(mp, &export); free(export.ex_groups, M_TEMP); break; case (sizeof(export)): bcopy(bufp, &export, len); grps = NULL; if (export.ex_ngroups > 0) { if (export.ex_ngroups <= NGROUPS_MAX) { grps = malloc(export.ex_ngroups * sizeof(gid_t), M_TEMP, M_WAITOK); export_error = copyin(export.ex_groups, grps, export.ex_ngroups * sizeof(gid_t)); if (export_error == 0) export.ex_groups = grps; } else export_error = EINVAL; } else if (export.ex_ngroups == 0) export.ex_groups = NULL; else export_error = EINVAL; if (export_error == 0) export_error = vfs_export(mp, &export); free(grps, M_TEMP); break; default: export_error = EINVAL; break; } } MNT_ILOCK(mp); if (error == 0) { mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); } else { /* * If we fail, restore old mount flags. MNT_QUOTA is special, * because it is not part of MNT_UPDATEMASK, but it could have * changed in the meantime if quotactl(2) was called. * All in all we want current value of MNT_QUOTA, not the old * one. */ mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); } if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); if (error != 0) goto end; mount_devctl_event("REMOUNT", mp, true); if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; *optlist = NULL; (void)VFS_STATFS(mp, &mp->mnt_stat); /* * Prevent external consumers of mount options from reading * mnt_optnew. */ mp->mnt_optnew = NULL; if ((mp->mnt_flag & MNT_RDONLY) == 0) vfs_allocate_syncvnode(mp); else vfs_deallocate_syncvnode(mp); end: vfs_op_exit(mp); if (rootvp != NULL) { vn_seqc_write_end(rootvp); vrele(rootvp); } vn_seqc_write_end(vp); vfs_unbusy(mp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vrele(vp); return (error != 0 ? error : export_error); } /* * vfs_domount(): actually attempt a filesystem mount. */ static int vfs_domount( struct thread *td, /* Calling thread. */ const char *fstype, /* Filesystem type. */ char *fspath, /* Mount path. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct vfsconf *vfsp; struct nameidata nd; struct vnode *vp; char *pathbuf; int error; /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) return (ENAMETOOLONG); if (jailed(td->td_ucred) || usermount == 0) { if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0) return (error); } /* * Do not allow NFS export or MNT_SUIDDIR by unprivileged users. */ if (fsflags & MNT_EXPORTED) { error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED); if (error) return (error); } if (fsflags & MNT_SUIDDIR) { error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR); if (error) return (error); } /* * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users. */ if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) { if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0) fsflags |= MNT_NOSUID | MNT_USER; } /* Load KLDs before we lock the covered vnode to avoid reversals. */ vfsp = NULL; if ((fsflags & MNT_UPDATE) == 0) { /* Don't try to load KLDs if we're mounting the root. */ if (fsflags & MNT_ROOTFS) vfsp = vfs_byname(fstype); else vfsp = vfs_byname_kld(fstype, td, &error); if (vfsp == NULL) return (ENODEV); } /* * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, fspath, td); error = namei(&nd); if (error != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if ((fsflags & MNT_UPDATE) == 0) { if ((vp->v_vflag & VV_ROOT) != 0 && (fsflags & MNT_NOCOVER) != 0) { vput(vp); return (EBUSY); } pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); strcpy(pathbuf, fspath); error = vn_path_to_global_path(td, vp, pathbuf, MNAMELEN); if (error == 0) { error = vfs_domount_first(td, vfsp, pathbuf, vp, fsflags, optlist); } free(pathbuf, M_TEMP); } else error = vfs_domount_update(td, vp, fsflags, optlist); return (error); } /* * Unmount a filesystem. * * Note: unmount takes a path to the vnode mounted on as argument, not * special file (as before). */ #ifndef _SYS_SYSPROTO_H_ struct unmount_args { char *path; int flags; }; #endif /* ARGSUSED */ int sys_unmount(struct thread *td, struct unmount_args *uap) { return (kern_unmount(td, uap->path, uap->flags)); } int kern_unmount(struct thread *td, const char *path, int flags) { struct nameidata nd; struct mount *mp; char *pathbuf; int error, id0, id1; AUDIT_ARG_VALUE(flags); if (jailed(td->td_ucred) || usermount == 0) { error = priv_check(td, PRIV_VFS_UNMOUNT); if (error) return (error); } pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(path, pathbuf, MNAMELEN, NULL); if (error) { free(pathbuf, M_TEMP); return (error); } if (flags & MNT_BYFSID) { AUDIT_ARG_TEXT(pathbuf); /* Decode the filesystem ID. */ if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) { free(pathbuf, M_TEMP); return (EINVAL); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == id0 && mp->mnt_stat.f_fsid.val[1] == id1) { vfs_ref(mp); break; } } mtx_unlock(&mountlist_mtx); } else { /* * Try to find global path for path argument. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, pathbuf, td); if (namei(&nd) == 0) { NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_path_to_global_path(td, nd.ni_vp, pathbuf, MNAMELEN); if (error == 0) vput(nd.ni_vp); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) { vfs_ref(mp); break; } } mtx_unlock(&mountlist_mtx); } free(pathbuf, M_TEMP); if (mp == NULL) { /* * Previously we returned ENOENT for a nonexistent path and * EINVAL for a non-mountpoint. We cannot tell these apart * now, so in the !MNT_BYFSID case return the more likely * EINVAL for compatibility. */ return ((flags & MNT_BYFSID) ? ENOENT : EINVAL); } /* * Don't allow unmounting the root filesystem. */ if (mp->mnt_flag & MNT_ROOTFS) { vfs_rel(mp); return (EINVAL); } error = dounmount(mp, flags, td); return (error); } /* * Return error if any of the vnodes, ignoring the root vnode * and the syncer vnode, have non-zero usecount. * * This function is purely advisory - it can return false positives * and negatives. */ static int vfs_check_usecounts(struct mount *mp) { struct vnode *vp, *mvp; MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if ((vp->v_vflag & VV_ROOT) == 0 && vp->v_type != VNON && vp->v_usecount != 0) { VI_UNLOCK(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (EBUSY); } VI_UNLOCK(vp); } return (0); } static void dounmount_cleanup(struct mount *mp, struct vnode *coveredvp, int mntkflags) { mtx_assert(MNT_MTX(mp), MA_OWNED); mp->mnt_kern_flag &= ~mntkflags; if ((mp->mnt_kern_flag & MNTK_MWAIT) != 0) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); if (coveredvp != NULL) { VOP_UNLOCK(coveredvp); vdrop(coveredvp); } vn_finished_write(mp); } /* * There are various reference counters associated with the mount point. * Normally it is permitted to modify them without taking the mnt ilock, * but this behavior can be temporarily disabled if stable value is needed * or callers are expected to block (e.g. to not allow new users during * forced unmount). */ void vfs_op_enter(struct mount *mp) { int cpu; MNT_ILOCK(mp); mp->mnt_vfs_ops++; if (mp->mnt_vfs_ops > 1) { MNT_IUNLOCK(mp); return; } vfs_op_barrier_wait(mp); CPU_FOREACH(cpu) { mp->mnt_ref += zpcpu_replace_cpu(mp->mnt_ref_pcpu, 0, cpu); mp->mnt_lockref += zpcpu_replace_cpu(mp->mnt_lockref_pcpu, 0, cpu); mp->mnt_writeopcount += zpcpu_replace_cpu(mp->mnt_writeopcount_pcpu, 0, cpu); } if (mp->mnt_ref <= 0 || mp->mnt_lockref < 0 || mp->mnt_writeopcount < 0) panic("%s: invalid count(s) on mp %p: ref %d lockref %d writeopcount %d\n", __func__, mp, mp->mnt_ref, mp->mnt_lockref, mp->mnt_writeopcount); MNT_IUNLOCK(mp); vfs_assert_mount_counters(mp); } void vfs_op_exit_locked(struct mount *mp) { mtx_assert(MNT_MTX(mp), MA_OWNED); if (mp->mnt_vfs_ops <= 0) panic("%s: invalid vfs_ops count %d for mp %p\n", __func__, mp->mnt_vfs_ops, mp); mp->mnt_vfs_ops--; } void vfs_op_exit(struct mount *mp) { MNT_ILOCK(mp); vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); } struct vfs_op_barrier_ipi { struct mount *mp; struct smp_rendezvous_cpus_retry_arg srcra; }; static void vfs_op_action_func(void *arg) { struct vfs_op_barrier_ipi *vfsopipi; struct mount *mp; vfsopipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra); mp = vfsopipi->mp; if (!vfs_op_thread_entered(mp)) smp_rendezvous_cpus_done(arg); } static void vfs_op_wait_func(void *arg, int cpu) { struct vfs_op_barrier_ipi *vfsopipi; struct mount *mp; int *in_op; vfsopipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra); mp = vfsopipi->mp; in_op = zpcpu_get_cpu(mp->mnt_thread_in_ops_pcpu, cpu); while (atomic_load_int(in_op)) cpu_spinwait(); } void vfs_op_barrier_wait(struct mount *mp) { struct vfs_op_barrier_ipi vfsopipi; vfsopipi.mp = mp; smp_rendezvous_cpus_retry(all_cpus, smp_no_rendezvous_barrier, vfs_op_action_func, smp_no_rendezvous_barrier, vfs_op_wait_func, &vfsopipi.srcra); } #ifdef DIAGNOSTIC void vfs_assert_mount_counters(struct mount *mp) { int cpu; if (mp->mnt_vfs_ops == 0) return; CPU_FOREACH(cpu) { if (*zpcpu_get_cpu(mp->mnt_ref_pcpu, cpu) != 0 || *zpcpu_get_cpu(mp->mnt_lockref_pcpu, cpu) != 0 || *zpcpu_get_cpu(mp->mnt_writeopcount_pcpu, cpu) != 0) vfs_dump_mount_counters(mp); } } void vfs_dump_mount_counters(struct mount *mp) { int cpu, *count; int ref, lockref, writeopcount; printf("%s: mp %p vfs_ops %d\n", __func__, mp, mp->mnt_vfs_ops); printf(" ref : "); ref = mp->mnt_ref; CPU_FOREACH(cpu) { count = zpcpu_get_cpu(mp->mnt_ref_pcpu, cpu); printf("%d ", *count); ref += *count; } printf("\n"); printf(" lockref : "); lockref = mp->mnt_lockref; CPU_FOREACH(cpu) { count = zpcpu_get_cpu(mp->mnt_lockref_pcpu, cpu); printf("%d ", *count); lockref += *count; } printf("\n"); printf("writeopcount: "); writeopcount = mp->mnt_writeopcount; CPU_FOREACH(cpu) { count = zpcpu_get_cpu(mp->mnt_writeopcount_pcpu, cpu); printf("%d ", *count); writeopcount += *count; } printf("\n"); printf("counter struct total\n"); printf("ref %-5d %-5d\n", mp->mnt_ref, ref); printf("lockref %-5d %-5d\n", mp->mnt_lockref, lockref); printf("writeopcount %-5d %-5d\n", mp->mnt_writeopcount, writeopcount); panic("invalid counts on struct mount"); } #endif int vfs_mount_fetch_counter(struct mount *mp, enum mount_counter which) { int *base, *pcpu; int cpu, sum; switch (which) { case MNT_COUNT_REF: base = &mp->mnt_ref; pcpu = mp->mnt_ref_pcpu; break; case MNT_COUNT_LOCKREF: base = &mp->mnt_lockref; pcpu = mp->mnt_lockref_pcpu; break; case MNT_COUNT_WRITEOPCOUNT: base = &mp->mnt_writeopcount; pcpu = mp->mnt_writeopcount_pcpu; break; } sum = *base; CPU_FOREACH(cpu) { sum += *zpcpu_get_cpu(pcpu, cpu); } return (sum); } /* * Do the actual filesystem unmount. */ int dounmount(struct mount *mp, int flags, struct thread *td) { struct vnode *coveredvp, *rootvp; int error; uint64_t async_flag; int mnt_gen_r; if ((coveredvp = mp->mnt_vnodecovered) != NULL) { mnt_gen_r = mp->mnt_gen; VI_LOCK(coveredvp); vholdl(coveredvp); vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY); /* * Check for mp being unmounted while waiting for the * covered vnode lock. */ if (coveredvp->v_mountedhere != mp || coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) { VOP_UNLOCK(coveredvp); vdrop(coveredvp); vfs_rel(mp); return (EBUSY); } } /* * Only privileged root, or (if MNT_USER is set) the user that did the * original mount is permitted to unmount this filesystem. */ error = vfs_suser(mp, td); if (error != 0) { if (coveredvp != NULL) { VOP_UNLOCK(coveredvp); vdrop(coveredvp); } vfs_rel(mp); return (error); } vfs_op_enter(mp); vn_start_write(NULL, &mp, V_WAIT | V_MNTREF); MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 || (mp->mnt_flag & MNT_UPDATE) != 0 || !TAILQ_EMPTY(&mp->mnt_uppers)) { dounmount_cleanup(mp, coveredvp, 0); return (EBUSY); } mp->mnt_kern_flag |= MNTK_UNMOUNT; rootvp = vfs_cache_root_clear(mp); if (coveredvp != NULL) vn_seqc_write_begin(coveredvp); if (flags & MNT_NONBUSY) { MNT_IUNLOCK(mp); error = vfs_check_usecounts(mp); MNT_ILOCK(mp); if (error != 0) { vn_seqc_write_end(coveredvp); dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT); if (rootvp != NULL) { vn_seqc_write_end(rootvp); vrele(rootvp); } return (error); } } /* Allow filesystems to detect that a forced unmount is in progress. */ if (flags & MNT_FORCE) { mp->mnt_kern_flag |= MNTK_UNMOUNTF; MNT_IUNLOCK(mp); /* * Must be done after setting MNTK_UNMOUNTF and before * waiting for mnt_lockref to become 0. */ VFS_PURGE(mp); MNT_ILOCK(mp); } error = 0; if (mp->mnt_lockref) { mp->mnt_kern_flag |= MNTK_DRAINING; error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS, "mount drain", 0); } MNT_IUNLOCK(mp); KASSERT(mp->mnt_lockref == 0, ("%s: invalid lock refcount in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); KASSERT(error == 0, ("%s: invalid return value for msleep in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); /* * We want to keep the vnode around so that we can vn_seqc_write_end * after we are done with unmount. Downgrade our reference to a mere * hold count so that we don't interefere with anything. */ if (rootvp != NULL) { vhold(rootvp); vrele(rootvp); } if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); vfs_periodic(mp, MNT_WAIT); MNT_ILOCK(mp); async_flag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); cache_purgevfs(mp, false); /* remove cache entries for this file sys */ vfs_deallocate_syncvnode(mp); error = VFS_UNMOUNT(mp, flags); vn_finished_write(mp); /* * If we failed to flush the dirty blocks for this mount point, * undo all the cdir/rdir and rootvnode changes we made above. * Unless we failed to do so because the device is reporting that * it doesn't exist anymore. */ if (error && error != ENXIO) { MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0) { MNT_IUNLOCK(mp); vfs_allocate_syncvnode(mp); MNT_ILOCK(mp); } mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); mp->mnt_flag |= async_flag; if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); if (coveredvp) { vn_seqc_write_end(coveredvp); VOP_UNLOCK(coveredvp); vdrop(coveredvp); } if (rootvp != NULL) { vn_seqc_write_end(rootvp); vdrop(rootvp); } return (error); } mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td); if (coveredvp != NULL) { coveredvp->v_mountedhere = NULL; vn_seqc_write_end(coveredvp); VOP_UNLOCK(coveredvp); vdrop(coveredvp); } mount_devctl_event("UNMOUNT", mp, false); if (rootvp != NULL) { vn_seqc_write_end(rootvp); vdrop(rootvp); } vfs_event_signal(NULL, VQ_UNMOUNT, 0); if (rootvnode != NULL && mp == rootvnode->v_mount) { vrele(rootvnode); rootvnode = NULL; } if (mp == rootdevmp) rootdevmp = NULL; vfs_mount_destroy(mp); return (0); } /* * Report errors during filesystem mounting. */ void vfs_mount_error(struct mount *mp, const char *fmt, ...) { struct vfsoptlist *moptlist = mp->mnt_optnew; va_list ap; int error, len; char *errmsg; error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } void vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...) { va_list ap; int error, len; char *errmsg; error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } /* * --------------------------------------------------------------------- * Functions for querying mount options/arguments from filesystems. */ /* * Check that no unknown options are given */ int vfs_filteropt(struct vfsoptlist *opts, const char **legal) { struct vfsopt *opt; char errmsg[255]; const char **t, *p, *q; int ret = 0; TAILQ_FOREACH(opt, opts, link) { p = opt->name; q = NULL; if (p[0] == 'n' && p[1] == 'o') q = p + 2; for(t = global_opts; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; for(t = legal; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; snprintf(errmsg, sizeof(errmsg), "mount option <%s> is unknown", p); ret = EINVAL; } if (ret != 0) { TAILQ_FOREACH(opt, opts, link) { if (strcmp(opt->name, "errmsg") == 0) { strncpy((char *)opt->value, errmsg, opt->len); break; } } if (opt == NULL) printf("%s\n", errmsg); } return (ret); } /* * Get a mount option by its name. * * Return 0 if the option was found, ENOENT otherwise. * If len is non-NULL it will be filled with the length * of the option. If buf is non-NULL, it will be filled * with the address of the option. */ int vfs_getopt(struct vfsoptlist *opts, const char *name, void **buf, int *len) { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (len != NULL) *len = opt->len; if (buf != NULL) *buf = opt->value; return (0); } } return (ENOENT); } int vfs_getopt_pos(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt; if (opts == NULL) return (-1); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; return (opt->pos); } } return (-1); } int vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value) { char *opt_value, *vtp; quad_t iv; int error, opt_len; error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len); if (error != 0) return (error); if (opt_len == 0 || opt_value == NULL) return (EINVAL); if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0') return (EINVAL); iv = strtoq(opt_value, &vtp, 0); if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0')) return (EINVAL); if (iv < 0) return (EINVAL); switch (vtp[0]) { case 't': case 'T': iv *= 1024; /* FALLTHROUGH */ case 'g': case 'G': iv *= 1024; /* FALLTHROUGH */ case 'm': case 'M': iv *= 1024; /* FALLTHROUGH */ case 'k': case 'K': iv *= 1024; case '\0': break; default: return (EINVAL); } *value = iv; return (0); } char * vfs_getopts(struct vfsoptlist *opts, const char *name, int *error) { struct vfsopt *opt; *error = 0; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->len == 0 || ((char *)opt->value)[opt->len - 1] != '\0') { *error = EINVAL; return (NULL); } return (opt->value); } *error = ENOENT; return (NULL); } int vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w, uint64_t val) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (w != NULL) *w |= val; return (1); } } if (w != NULL) *w &= ~val; return (0); } int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...) { va_list ap; struct vfsopt *opt; int ret; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->len == 0 || opt->value == NULL) return (0); if (((char *)opt->value)[opt->len - 1] != '\0') return (0); va_start(ap, fmt); ret = vsscanf(opt->value, fmt, ap); va_end(ap); return (ret); } return (0); } int vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = len; else { if (opt->len != len) return (EINVAL); bcopy(value, opt->value, len); } return (0); } return (ENOENT); } int vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = len; else { if (opt->len < len) return (EINVAL); opt->len = len; bcopy(value, opt->value, len); } return (0); } return (ENOENT); } int vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = strlen(value) + 1; else if (strlcpy(opt->value, value, opt->len) >= opt->len) return (EINVAL); return (0); } return (ENOENT); } /* * Find and copy a mount option. * * The size of the buffer has to be specified * in len, if it is not the same length as the * mount option, EINVAL is returned. * Returns ENOENT if the option is not found. */ int vfs_copyopt(struct vfsoptlist *opts, const char *name, void *dest, int len) { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (len != opt->len) return (EINVAL); bcopy(opt->value, dest, opt->len); return (0); } } return (ENOENT); } int __vfs_statfs(struct mount *mp, struct statfs *sbp) { /* * Filesystems only fill in part of the structure for updates, we * have to read the entirety first to get all content. */ if (sbp != &mp->mnt_stat) memcpy(sbp, &mp->mnt_stat, sizeof(*sbp)); /* * Set these in case the underlying filesystem fails to do so. */ sbp->f_version = STATFS_VERSION; sbp->f_namemax = NAME_MAX; sbp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; return (mp->mnt_op->vfs_statfs(mp, sbp)); } void vfs_mountedfrom(struct mount *mp, const char *from) { bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname); strlcpy(mp->mnt_stat.f_mntfromname, from, sizeof mp->mnt_stat.f_mntfromname); } /* * --------------------------------------------------------------------- * This is the api for building mount args and mounting filesystems from * inside the kernel. * * The API works by accumulation of individual args. First error is * latched. * * XXX: should be documented in new manpage kernel_mount(9) */ /* A memory allocation which must be freed when we are done */ struct mntaarg { SLIST_ENTRY(mntaarg) next; }; /* The header for the mount arguments */ struct mntarg { struct iovec *v; int len; int error; SLIST_HEAD(, mntaarg) list; }; /* * Add a boolean argument. * * flag is the boolean value. * name must start with "no". */ struct mntarg * mount_argb(struct mntarg *ma, int flag, const char *name) { KASSERT(name[0] == 'n' && name[1] == 'o', ("mount_argb(...,%s): name must start with 'no'", name)); return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0)); } /* * Add an argument printf style */ struct mntarg * mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...) { va_list ap; struct mntaarg *maa; struct sbuf *sb; int len; if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); len = sbuf_len(sb) + 1; maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); bcopy(sbuf_data(sb), maa + 1, len); sbuf_delete(sb); ma->v[ma->len].iov_base = maa + 1; ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Add an argument which is a userland string. */ struct mntarg * mount_argsu(struct mntarg *ma, const char *name, const void *val, int len) { struct mntaarg *maa; char *tbuf; if (val == NULL) return (ma); if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); tbuf = (void *)(maa + 1); ma->error = copyinstr(val, tbuf, len, NULL); return (mount_arg(ma, name, tbuf, -1)); } /* * Plain argument. * * If length is -1, treat value as a C string. */ struct mntarg * mount_arg(struct mntarg *ma, const char *name, const void *val, int len) { if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; ma->v[ma->len].iov_base = (void *)(uintptr_t)val; if (len < 0) ma->v[ma->len].iov_len = strlen(val) + 1; else ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Free a mntarg structure */ static void free_mntarg(struct mntarg *ma) { struct mntaarg *maa; while (!SLIST_EMPTY(&ma->list)) { maa = SLIST_FIRST(&ma->list); SLIST_REMOVE_HEAD(&ma->list, next); free(maa, M_MOUNT); } free(ma->v, M_MOUNT); free(ma, M_MOUNT); } /* * Mount a filesystem */ int kernel_mount(struct mntarg *ma, uint64_t flags) { struct uio auio; int error; KASSERT(ma != NULL, ("kernel_mount NULL ma")); KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v")); KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len)); auio.uio_iov = ma->v; auio.uio_iovcnt = ma->len; auio.uio_segflg = UIO_SYSSPACE; error = ma->error; if (!error) error = vfs_donmount(curthread, flags, &auio); free_mntarg(ma); return (error); } /* * A printflike function to mount a filesystem. */ int kernel_vmount(int flags, ...) { struct mntarg *ma = NULL; va_list ap; const char *cp; const void *vp; int error; va_start(ap, flags); for (;;) { cp = va_arg(ap, const char *); if (cp == NULL) break; vp = va_arg(ap, const void *); ma = mount_arg(ma, cp, vp, (vp != NULL ? -1 : 0)); } va_end(ap); error = kernel_mount(ma, flags); return (error); } /* Map from mount options to printable formats. */ static struct mntoptnames optnames[] = { MNTOPT_NAMES }; static void mount_devctl_event_mntopt(struct sbuf *sb, const char *what, struct vfsoptlist *opts) { struct vfsopt *opt; if (opts == NULL || TAILQ_EMPTY(opts)) return; sbuf_printf(sb, " %s=\"", what); TAILQ_FOREACH(opt, opts, link) { if (opt->name[0] == '\0' || (opt->len > 0 && *(char *)opt->value == '\0')) continue; devctl_safe_quote_sb(sb, opt->name); if (opt->len > 0) { sbuf_putc(sb, '='); devctl_safe_quote_sb(sb, opt->value); } sbuf_putc(sb, ';'); } sbuf_putc(sb, '"'); } #define DEVCTL_LEN 1024 static void mount_devctl_event(const char *type, struct mount *mp, bool donew) { const uint8_t *cp; struct mntoptnames *fp; struct sbuf sb; struct statfs *sfp = &mp->mnt_stat; char *buf; buf = malloc(DEVCTL_LEN, M_MOUNT, M_NOWAIT); if (buf == NULL) return; sbuf_new(&sb, buf, DEVCTL_LEN, SBUF_FIXEDLEN); sbuf_cpy(&sb, "mount-point=\""); devctl_safe_quote_sb(&sb, sfp->f_mntonname); sbuf_cat(&sb, "\" mount-dev=\""); devctl_safe_quote_sb(&sb, sfp->f_mntfromname); sbuf_cat(&sb, "\" mount-type=\""); devctl_safe_quote_sb(&sb, sfp->f_fstypename); sbuf_cat(&sb, "\" fsid=0x"); cp = (const uint8_t *)&sfp->f_fsid.val[0]; for (int i = 0; i < sizeof(sfp->f_fsid); i++) sbuf_printf(&sb, "%02x", cp[i]); sbuf_printf(&sb, " owner=%u flags=\"", sfp->f_owner); for (fp = optnames; fp->o_opt != 0; fp++) { if ((mp->mnt_flag & fp->o_opt) != 0) { sbuf_cat(&sb, fp->o_name); sbuf_putc(&sb, ';'); } } sbuf_putc(&sb, '"'); mount_devctl_event_mntopt(&sb, "opt", mp->mnt_opt); if (donew) mount_devctl_event_mntopt(&sb, "optnew", mp->mnt_optnew); sbuf_finish(&sb); if (sbuf_error(&sb) == 0) devctl_notify("VFS", "FS", type, sbuf_data(&sb)); sbuf_delete(&sb); free(buf, M_MOUNT); } Index: head/sys/net/if_ethersubr.c =================================================================== --- head/sys/net/if_ethersubr.c (revision 364441) +++ head/sys/net/if_ethersubr.c (revision 364442) @@ -1,1466 +1,1466 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_netgraph.h" #include "opt_mbuf_profiling.h" #include "opt_rss.h" #include #include -#include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #endif #ifdef INET6 #include #endif #include #include #ifdef CTASSERT CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2); CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN); #endif VNET_DEFINE(pfil_head_t, link_pfil_head); /* Packet filter hooks */ /* netgraph node hooks for ng_ether(4) */ void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m); int (*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_attach_p)(struct ifnet *ifp); void (*ng_ether_detach_p)(struct ifnet *ifp); void (*vlan_input_p)(struct ifnet *, struct mbuf *); /* if_bridge(4) support */ void (*bridge_dn_p)(struct mbuf *, struct ifnet *); /* if_lagg(4) support */ struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *); static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static int ether_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); #ifdef VIMAGE static void ether_reassign(struct ifnet *, struct vnet *, char *); #endif static int ether_requestencap(struct ifnet *, struct if_encap_req *); #define senderr(e) do { error = (e); goto bad;} while (0) static void update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst) { int csum_flags = 0; if (src->m_pkthdr.csum_flags & CSUM_IP) csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); if (src->m_pkthdr.csum_flags & CSUM_SCTP) csum_flags |= CSUM_SCTP_VALID; dst->m_pkthdr.csum_flags |= csum_flags; if (csum_flags & CSUM_DATA_VALID) dst->m_pkthdr.csum_data = 0xffff; } /* * Handle link-layer encapsulation requests. */ static int ether_requestencap(struct ifnet *ifp, struct if_encap_req *req) { struct ether_header *eh; struct arphdr *ah; uint16_t etype; const u_char *lladdr; if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < ETHER_HDR_LEN) return (ENOMEM); eh = (struct ether_header *)req->buf; lladdr = req->lladdr; req->lladdr_off = 0; switch (req->family) { case AF_INET: etype = htons(ETHERTYPE_IP); break; case AF_INET6: etype = htons(ETHERTYPE_IPV6); break; case AF_ARP: ah = (struct arphdr *)req->hdata; ah->ar_hrd = htons(ARPHRD_ETHER); switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: etype = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: etype = htons(ETHERTYPE_ARP); break; } if (req->flags & IFENCAP_FLAG_BROADCAST) lladdr = ifp->if_broadcastaddr; break; default: return (EAFNOSUPPORT); } memcpy(&eh->ether_type, &etype, sizeof(eh->ether_type)); memcpy(eh->ether_dhost, lladdr, ETHER_ADDR_LEN); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); req->bufsize = sizeof(struct ether_header); return (0); } static int ether_resolve_addr(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro, u_char *phdr, uint32_t *pflags, struct llentry **plle) { struct ether_header *eh; uint32_t lleflags = 0; int error = 0; #if defined(INET) || defined(INET6) uint16_t etype; #endif if (plle) *plle = NULL; eh = (struct ether_header *)phdr; switch (dst->sa_family) { #ifdef INET case AF_INET: if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, plle); else { if (m->m_flags & M_BCAST) memcpy(eh->ether_dhost, ifp->if_broadcastaddr, ETHER_ADDR_LEN); else { const struct in_addr *a; a = &(((const struct sockaddr_in *)dst)->sin_addr); ETHER_MAP_IP_MULTICAST(a, eh->ether_dhost); } etype = htons(ETHERTYPE_IP); memcpy(&eh->ether_type, &etype, sizeof(etype)); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); } break; #endif #ifdef INET6 case AF_INET6: if ((m->m_flags & M_MCAST) == 0) error = nd6_resolve(ifp, 0, m, dst, phdr, &lleflags, plle); else { const struct in6_addr *a6; a6 = &(((const struct sockaddr_in6 *)dst)->sin6_addr); ETHER_MAP_IPV6_MULTICAST(a6, eh->ether_dhost); etype = htons(ETHERTYPE_IPV6); memcpy(&eh->ether_type, &etype, sizeof(etype)); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); } break; #endif default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); if (m != NULL) m_freem(m); return (EAFNOSUPPORT); } if (error == EHOSTDOWN) { if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0) error = EHOSTUNREACH; } if (error != 0) return (error); *pflags = RT_MAY_LOOP; if (lleflags & LLE_IFADDR) *pflags |= RT_L2_ME; return (0); } /* * Ethernet output routine. * Encapsulate a packet of type family for the local net. * Use trailer local net encapsulation if enough data in first * packet leaves a multiple of 512 bytes of data in remainder. */ int ether_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { int error = 0; char linkhdr[ETHER_HDR_LEN], *phdr; struct ether_header *eh; struct pf_mtag *t; int loop_copy = 1; int hlen; /* link layer header length */ uint32_t pflags; struct llentry *lle = NULL; int addref = 0; phdr = NULL; pflags = 0; if (ro != NULL) { /* XXX BPF uses ro_prepend */ if (ro->ro_prepend != NULL) { phdr = ro->ro_prepend; hlen = ro->ro_plen; } else if (!(m->m_flags & (M_BCAST | M_MCAST))) { if ((ro->ro_flags & RT_LLE_CACHE) != 0) { lle = ro->ro_lle; if (lle != NULL && (lle->la_flags & LLE_VALID) == 0) { LLE_FREE(lle); lle = NULL; /* redundant */ ro->ro_lle = NULL; } if (lle == NULL) { /* if we lookup, keep cache */ addref = 1; } else /* * Notify LLE code that * the entry was used * by datapath. */ llentry_mark_used(lle); } if (lle != NULL) { phdr = lle->r_linkdata; hlen = lle->r_hdrlen; pflags = lle->r_flags; } } } #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) senderr(error); #endif M_PROFILE(m); if (ifp->if_flags & IFF_MONITOR) senderr(ENETDOWN); if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) senderr(ENETDOWN); if (phdr == NULL) { /* No prepend data supplied. Try to calculate ourselves. */ phdr = linkhdr; hlen = ETHER_HDR_LEN; error = ether_resolve_addr(ifp, m, dst, ro, phdr, &pflags, addref ? &lle : NULL); if (addref && lle != NULL) ro->ro_lle = lle; if (error != 0) return (error == EWOULDBLOCK ? 0 : error); } if ((pflags & RT_L2_ME) != 0) { update_mbuf_csumflags(m, m); return (if_simloop(ifp, m, dst->sa_family, 0)); } loop_copy = pflags & RT_MAY_LOOP; /* * Add local net header. If no space in first mbuf, * allocate another. * * Note that we do prepend regardless of RT_HAS_HEADER flag. * This is done because BPF code shifts m_data pointer * to the end of ethernet header prior to calling if_output(). */ M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); if ((pflags & RT_HAS_HEADER) == 0) { eh = mtod(m, struct ether_header *); memcpy(eh, phdr, hlen); } /* * If a simplex interface, and the packet is being sent to our * Ethernet address or a broadcast address, loopback a copy. * XXX To make a simplex device behave exactly like a duplex * device, we should copy in the case of sending to our own * ethernet address (thus letting the original actually appear * on the wire). However, we don't do that here for security * reasons and compatibility with the original behavior. */ if ((m->m_flags & M_BCAST) && loop_copy && (ifp->if_flags & IFF_SIMPLEX) && ((t = pf_find_mtag(m)) == NULL || !t->routed)) { struct mbuf *n; /* * Because if_simloop() modifies the packet, we need a * writable copy through m_dup() instead of a readonly * one as m_copy[m] would give us. The alternative would * be to modify if_simloop() to handle the readonly mbuf, * but performancewise it is mostly equivalent (trading * extra data copying vs. extra locking). * * XXX This is a local workaround. A number of less * often used kernel parts suffer from the same bug. * See PR kern/105943 for a proposed general solution. */ if ((n = m_dup(m, M_NOWAIT)) != NULL) { update_mbuf_csumflags(m, n); (void)if_simloop(ifp, n, dst->sa_family, hlen); } else if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); } /* * Bridges require special output handling. */ if (ifp->if_bridge) { BRIDGE_OUTPUT(ifp, m, error); return (error); } #if defined(INET) || defined(INET6) if (ifp->if_carp && (error = (*carp_output_p)(ifp, m, dst))) goto bad; #endif /* Handle ng_ether(4) processing, if any */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_output_p != NULL, ("ng_ether_output_p is NULL")); if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) { bad: if (m != NULL) m_freem(m); return (error); } if (m == NULL) return (0); } /* Continue with link-layer output */ return ether_output_frame(ifp, m); } static bool ether_set_pcp(struct mbuf **mp, struct ifnet *ifp, uint8_t pcp) { struct ether_header *eh; eh = mtod(*mp, struct ether_header *); if (ntohs(eh->ether_type) == ETHERTYPE_VLAN || ether_8021q_frame(mp, ifp, ifp, 0, pcp)) return (true); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (false); } /* * Ethernet link layer output routine to send a raw frame to the device. * * This assumes that the 14 byte Ethernet header is present and contiguous * in the first mbuf (if BRIDGE'ing). */ int ether_output_frame(struct ifnet *ifp, struct mbuf *m) { uint8_t pcp; pcp = ifp->if_pcp; if (pcp != IFNET_PCP_NONE && ifp->if_type != IFT_L2VLAN && !ether_set_pcp(&m, ifp, pcp)) return (0); if (PFIL_HOOKED_OUT(V_link_pfil_head)) switch (pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_OUT, NULL)) { case PFIL_DROPPED: return (EACCES); case PFIL_CONSUMED: return (0); } #ifdef EXPERIMENTAL #if defined(INET6) && defined(INET) /* draft-ietf-6man-ipv6only-flag */ /* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) { struct ether_header *eh; eh = mtod(m, struct ether_header *); switch (ntohs(eh->ether_type)) { case ETHERTYPE_IP: case ETHERTYPE_ARP: case ETHERTYPE_REVARP: m_freem(m); return (EAFNOSUPPORT); /* NOTREACHED */ break; }; } #endif #endif /* * Queue message on interface, update output statistics if * successful, and start output if interface not yet active. */ return ((ifp->if_transmit)(ifp, m)); } /* * Process a received Ethernet packet; the packet is in the * mbuf chain m with the ethernet header at the front. */ static void ether_input_internal(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; u_short etype; if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return; } #ifdef DIAGNOSTIC if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n"); m_freem(m); return; } #endif if (m->m_len < ETHER_HDR_LEN) { /* XXX maybe should pullup? */ if_printf(ifp, "discard frame w/o leading ethernet " "header (len %u pkt len %u)\n", m->m_len, m->m_pkthdr.len); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); random_harvest_queue_ether(m, sizeof(*m)); #ifdef EXPERIMENTAL #if defined(INET6) && defined(INET) /* draft-ietf-6man-ipv6only-flag */ /* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) { switch (etype) { case ETHERTYPE_IP: case ETHERTYPE_ARP: case ETHERTYPE_REVARP: m_freem(m); return; /* NOTREACHED */ break; }; } #endif #endif CURVNET_SET_QUIET(ifp->if_vnet); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } #ifdef MAC /* * Tag the mbuf with an appropriate MAC label before any other * consumers can get to it. */ mac_ifnet_create_mbuf(ifp, m); #endif /* * Give bpf a chance at the packet. */ ETHER_BPF_MTAP(ifp, m); /* * If the CRC is still on the packet, trim it off. We do this once * and once only in case we are re-entered. Nothing else on the * Ethernet receive path expects to see the FCS. */ if (m->m_flags & M_HASFCS) { m_adj(m, -ETHER_CRC_LEN); m->m_flags &= ~M_HASFCS; } if (!(ifp->if_capenable & IFCAP_HWSTATS)) if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); /* Allow monitor mode to claim this frame, after stats are updated. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); CURVNET_RESTORE(); return; } /* Handle input from a lagg(4) port */ if (ifp->if_type == IFT_IEEE8023ADLAG) { KASSERT(lagg_input_p != NULL, ("%s: if_lagg not loaded!", __func__)); m = (*lagg_input_p)(ifp, m); if (m != NULL) ifp = m->m_pkthdr.rcvif; else { CURVNET_RESTORE(); return; } } /* * If the hardware did not process an 802.1Q tag, do this now, * to allow 802.1P priority frames to be passed to the main input * path correctly. * TODO: Deal with Q-in-Q frames, but not arbitrary nesting levels. */ if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_VLAN) { struct ether_vlan_header *evl; if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { #ifdef DIAGNOSTIC if_printf(ifp, "cannot pullup VLAN header\n"); #endif if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); CURVNET_RESTORE(); return; } evl = mtod(m, struct ether_vlan_header *); m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag); m->m_flags |= M_VLANTAG; bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); eh = mtod(m, struct ether_header *); } M_SETFIB(m, ifp->if_fib); /* Allow ng_ether(4) to claim this frame. */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_input_p != NULL, ("%s: ng_ether_input_p is NULL", __func__)); m->m_flags &= ~M_PROMISC; (*ng_ether_input_p)(ifp, &m); if (m == NULL) { CURVNET_RESTORE(); return; } eh = mtod(m, struct ether_header *); } /* * Allow if_bridge(4) to claim this frame. * The BRIDGE_INPUT() macro will update ifp if the bridge changed it * and the frame should be delivered locally. */ if (ifp->if_bridge != NULL) { m->m_flags &= ~M_PROMISC; BRIDGE_INPUT(ifp, m); if (m == NULL) { CURVNET_RESTORE(); return; } eh = mtod(m, struct ether_header *); } #if defined(INET) || defined(INET6) /* * Clear M_PROMISC on frame so that carp(4) will see it when the * mbuf flows up to Layer 3. * FreeBSD's implementation of carp(4) uses the inprotosw * to dispatch IPPROTO_CARP. carp(4) also allocates its own * Ethernet addresses of the form 00:00:5e:00:01:xx, which * is outside the scope of the M_PROMISC test below. * TODO: Maintain a hash table of ethernet addresses other than * ether_dhost which may be active on this ifp. */ if (ifp->if_carp && (*carp_forus_p)(ifp, eh->ether_dhost)) { m->m_flags &= ~M_PROMISC; } else #endif { /* * If the frame received was not for our MAC address, set the * M_PROMISC flag on the mbuf chain. The frame may need to * be seen by the rest of the Ethernet input path in case of * re-entry (e.g. bridge, vlan, netgraph) but should not be * seen by upper protocol layers. */ if (!ETHER_IS_MULTICAST(eh->ether_dhost) && bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0) m->m_flags |= M_PROMISC; } ether_demux(ifp, m); CURVNET_RESTORE(); } /* * Ethernet input dispatch; by default, direct dispatch here regardless of * global configuration. However, if RSS is enabled, hook up RSS affinity * so that when deferred or hybrid dispatch is enabled, we can redistribute * load based on RSS. * * XXXRW: Would be nice if the ifnet passed up a flag indicating whether or * not it had already done work distribution via multi-queue. Then we could * direct dispatch in the event load balancing was already complete and * handle the case of interfaces with different capabilities better. * * XXXRW: Sort of want an M_DISTRIBUTED flag to avoid multiple distributions * at multiple layers? * * XXXRW: For now, enable all this only if RSS is compiled in, although it * works fine without RSS. Need to characterise the performance overhead * of the detour through the netisr code in the event the result is always * direct dispatch. */ static void ether_nh_input(struct mbuf *m) { M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.rcvif != NULL, ("%s: NULL interface pointer", __func__)); ether_input_internal(m->m_pkthdr.rcvif, m); } static struct netisr_handler ether_nh = { .nh_name = "ether", .nh_handler = ether_nh_input, .nh_proto = NETISR_ETHER, #ifdef RSS .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_DIRECT, .nh_m2cpuid = rss_m2cpuid, #else .nh_policy = NETISR_POLICY_SOURCE, .nh_dispatch = NETISR_DISPATCH_DIRECT, #endif }; static void ether_init(__unused void *arg) { netisr_register(ðer_nh); } SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL); static void vnet_ether_init(__unused void *arg) { struct pfil_head_args args; args.pa_version = PFIL_VERSION; args.pa_flags = PFIL_IN | PFIL_OUT; args.pa_type = PFIL_TYPE_ETHERNET; args.pa_headname = PFIL_ETHER_NAME; V_link_pfil_head = pfil_head_register(&args); #ifdef VIMAGE netisr_register_vnet(ðer_nh); #endif } VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_init, NULL); #ifdef VIMAGE static void vnet_ether_pfil_destroy(__unused void *arg) { pfil_head_unregister(V_link_pfil_head); } VNET_SYSUNINIT(vnet_ether_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY, vnet_ether_pfil_destroy, NULL); static void vnet_ether_destroy(__unused void *arg) { netisr_unregister_vnet(ðer_nh); } VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_destroy, NULL); #endif static void ether_input(struct ifnet *ifp, struct mbuf *m) { struct epoch_tracker et; struct mbuf *mn; bool needs_epoch; needs_epoch = !(ifp->if_flags & IFF_KNOWSEPOCH); /* * The drivers are allowed to pass in a chain of packets linked with * m_nextpkt. We split them up into separate packets here and pass * them up. This allows the drivers to amortize the receive lock. */ CURVNET_SET_QUIET(ifp->if_vnet); if (__predict_false(needs_epoch)) NET_EPOCH_ENTER(et); while (m) { mn = m->m_nextpkt; m->m_nextpkt = NULL; /* * We will rely on rcvif being set properly in the deferred * context, so assert it is correct here. */ MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch m %p " "rcvif %p ifp %p", __func__, m, m->m_pkthdr.rcvif, ifp)); netisr_dispatch(NETISR_ETHER, m); m = mn; } if (__predict_false(needs_epoch)) NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } /* * Upper layer processing for a received Ethernet packet. */ void ether_demux(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; int i, isr; u_short ether_type; NET_EPOCH_ASSERT(); KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__)); /* Do not grab PROMISC frames in case we are re-entered. */ if (PFIL_HOOKED_IN(V_link_pfil_head) && !(m->m_flags & M_PROMISC)) { i = pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_IN, NULL); if (i != 0 || m == NULL) return; } eh = mtod(m, struct ether_header *); ether_type = ntohs(eh->ether_type); /* * If this frame has a VLAN tag other than 0, call vlan_input() * if its module is loaded. Otherwise, drop. */ if ((m->m_flags & M_VLANTAG) && EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) { if (ifp->if_vlantrunk == NULL) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; } KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!", __func__)); /* Clear before possibly re-entering ether_input(). */ m->m_flags &= ~M_PROMISC; (*vlan_input_p)(ifp, m); return; } /* * Pass promiscuously received frames to the upper layer if the user * requested this by setting IFF_PPROMISC. Otherwise, drop them. */ if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) { m_freem(m); return; } /* * Reset layer specific mbuf flags to avoid confusing upper layers. * Strip off Ethernet header. */ m->m_flags &= ~M_VLANTAG; m_clrprotoflags(m); m_adj(m, ETHER_HDR_LEN); /* * Dispatch frame to upper layer. */ switch (ether_type) { #ifdef INET case ETHERTYPE_IP: isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); return; } isr = NETISR_ARP; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif default: goto discard; } netisr_dispatch(isr, m); return; discard: /* * Packet is to be discarded. If netgraph is present, * hand the packet to it for last chance processing; * otherwise dispose of it. */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_input_orphan_p != NULL, ("ng_ether_input_orphan_p is NULL")); /* * Put back the ethernet header so netgraph has a * consistent view of inbound packets. */ M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT); (*ng_ether_input_orphan_p)(ifp, m); return; } m_freem(m); } /* * Convert Ethernet address to printable (loggable) representation. * This routine is for compatibility; it's better to just use * * printf("%6D", , ":"); * * since there's no static buffer involved. */ char * ether_sprintf(const u_char *ap) { static char etherbuf[18]; snprintf(etherbuf, sizeof (etherbuf), "%6D", ap, ":"); return (etherbuf); } /* * Perform common duties while attaching to interface list */ void ether_ifattach(struct ifnet *ifp, const u_int8_t *lla) { int i; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifp->if_addrlen = ETHER_ADDR_LEN; ifp->if_hdrlen = ETHER_HDR_LEN; ifp->if_mtu = ETHERMTU; if_attach(ifp); ifp->if_output = ether_output; ifp->if_input = ether_input; ifp->if_resolvemulti = ether_resolvemulti; ifp->if_requestencap = ether_requestencap; #ifdef VIMAGE ifp->if_reassign = ether_reassign; #endif if (ifp->if_baudrate == 0) ifp->if_baudrate = IF_Mbps(10); /* just a default */ ifp->if_broadcastaddr = etherbroadcastaddr; ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_ETHER; sdl->sdl_alen = ifp->if_addrlen; bcopy(lla, LLADDR(sdl), ifp->if_addrlen); if (ifp->if_hw_addr != NULL) bcopy(lla, ifp->if_hw_addr, ifp->if_addrlen); bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN); if (ng_ether_attach_p != NULL) (*ng_ether_attach_p)(ifp); /* Announce Ethernet MAC address if non-zero. */ for (i = 0; i < ifp->if_addrlen; i++) if (lla[i] != 0) break; if (i != ifp->if_addrlen) if_printf(ifp, "Ethernet address: %6D\n", lla, ":"); uuid_ether_add(LLADDR(sdl)); /* Add necessary bits are setup; announce it now. */ EVENTHANDLER_INVOKE(ether_ifattach_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("ETHERNET", ifp->if_xname, "IFATTACH", NULL); } /* * Perform common duties while detaching an Ethernet interface */ void ether_ifdetach(struct ifnet *ifp) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)(ifp->if_addr->ifa_addr); uuid_ether_del(LLADDR(sdl)); if (ifp->if_l2com != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); } bpfdetach(ifp); if_detach(ifp); } #ifdef VIMAGE void ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused) { if (ifp->if_l2com != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); } if (ng_ether_attach_p != NULL) { CURVNET_SET_QUIET(new_vnet); (*ng_ether_attach_p)(ifp); CURVNET_RESTORE(); } } #endif SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Ethernet"); #if 0 /* * This is for reference. We have a table-driven version * of the little-endian crc32 generator, which is faster * than the double-loop. */ uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { size_t i; uint32_t crc; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = (crc ^ data) & 1; crc >>= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_LE); } } return (crc); } #else uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { static const uint32_t crctab[] = { 0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, 0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c }; size_t i; uint32_t crc; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { crc ^= buf[i]; crc = (crc >> 4) ^ crctab[crc & 0xf]; crc = (crc >> 4) ^ crctab[crc & 0xf]; } return (crc); } #endif uint32_t ether_crc32_be(const uint8_t *buf, size_t len) { size_t i; uint32_t crc, carry; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = ((crc & 0x80000000) ? 1 : 0) ^ (data & 0x01); crc <<= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_BE) | carry; } } return (crc); } int ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; switch (command) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0], ETHER_ADDR_LEN); break; case SIOCSIFMTU: /* * Set the interface MTU. */ if (ifr->ifr_mtu > ETHERMTU) { error = EINVAL; } else { ifp->if_mtu = ifr->ifr_mtu; } break; case SIOCSLANPCP: error = priv_check(curthread, PRIV_NET_SETLANPCP); if (error != 0) break; if (ifr->ifr_lan_pcp > 7 && ifr->ifr_lan_pcp != IFNET_PCP_NONE) { error = EINVAL; } else { ifp->if_pcp = ifr->ifr_lan_pcp; /* broadcast event about PCP change */ EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_PCP); } break; case SIOCGLANPCP: ifr->ifr_lan_pcp = ifp->if_pcp; break; default: error = EINVAL; /* XXX netbsd has ENOTTY??? */ break; } return (error); } static int ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if (!ETHER_IS_MULTICAST(e_addr)) return EADDRNOTAVAIL; *llsa = NULL; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * An IP6 address of 0 means listen to all * of the Ethernet multicast address used for IP6. * (This is used for multicast routers.) */ ifp->if_flags |= IFF_ALLMULTI; *llsa = NULL; return 0; } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif default: /* * Well, the text isn't quite right, but it's the name * that counts... */ return EAFNOSUPPORT; } } static moduledata_t ether_mod = { .name = "ether", }; void ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen) { struct ether_vlan_header vlan; struct mbuf mv, mb; KASSERT((m->m_flags & M_VLANTAG) != 0, ("%s: vlan information not present", __func__)); KASSERT(m->m_len >= sizeof(struct ether_header), ("%s: mbuf not large enough for header", __func__)); bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header)); vlan.evl_proto = vlan.evl_encap_proto; vlan.evl_encap_proto = htons(ETHERTYPE_VLAN); vlan.evl_tag = htons(m->m_pkthdr.ether_vtag); m->m_len -= sizeof(struct ether_header); m->m_data += sizeof(struct ether_header); /* * If a data link has been supplied by the caller, then we will need to * re-create a stack allocated mbuf chain with the following structure: * * (1) mbuf #1 will contain the supplied data link * (2) mbuf #2 will contain the vlan header * (3) mbuf #3 will contain the original mbuf's packet data * * Otherwise, submit the packet and vlan header via bpf_mtap2(). */ if (data != NULL) { mv.m_next = m; mv.m_data = (caddr_t)&vlan; mv.m_len = sizeof(vlan); mb.m_next = &mv; mb.m_data = data; mb.m_len = dlen; bpf_mtap(bp, &mb); } else bpf_mtap2(bp, &vlan, sizeof(vlan), m); m->m_len += sizeof(struct ether_header); m->m_data -= sizeof(struct ether_header); } struct mbuf * ether_vlanencap(struct mbuf *m, uint16_t tag) { struct ether_vlan_header *evl; M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT); if (m == NULL) return (NULL); /* M_PREPEND takes care of m_len, m_pkthdr.len for us */ if (m->m_len < sizeof(*evl)) { m = m_pullup(m, sizeof(*evl)); if (m == NULL) return (NULL); } /* * Transform the Ethernet header into an Ethernet header * with 802.1Q encapsulation. */ evl = mtod(m, struct ether_vlan_header *); bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN, (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN); evl->evl_encap_proto = htons(ETHERTYPE_VLAN); evl->evl_tag = htons(tag); return (m); } static SYSCTL_NODE(_net_link, IFT_L2VLAN, vlan, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IEEE 802.1Q VLAN"); static SYSCTL_NODE(_net_link_vlan, PF_LINK, link, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "for consistency"); VNET_DEFINE_STATIC(int, soft_pad); #define V_soft_pad VNET(soft_pad) SYSCTL_INT(_net_link_vlan, OID_AUTO, soft_pad, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(soft_pad), 0, "pad short frames before tagging"); /* * For now, make preserving PCP via an mbuf tag optional, as it increases * per-packet memory allocations and frees. In the future, it would be * preferable to reuse ether_vtag for this, or similar. */ int vlan_mtag_pcp = 0; SYSCTL_INT(_net_link_vlan, OID_AUTO, mtag_pcp, CTLFLAG_RW, &vlan_mtag_pcp, 0, "Retain VLAN PCP information as packets are passed up the stack"); bool ether_8021q_frame(struct mbuf **mp, struct ifnet *ife, struct ifnet *p, uint16_t vid, uint8_t pcp) { struct m_tag *mtag; int n; uint16_t tag; static const char pad[8]; /* just zeros */ /* * Pad the frame to the minimum size allowed if told to. * This option is in accord with IEEE Std 802.1Q, 2003 Ed., * paragraph C.4.4.3.b. It can help to work around buggy * bridges that violate paragraph C.4.4.3.a from the same * document, i.e., fail to pad short frames after untagging. * E.g., a tagged frame 66 bytes long (incl. FCS) is OK, but * untagging it will produce a 62-byte frame, which is a runt * and requires padding. There are VLAN-enabled network * devices that just discard such runts instead or mishandle * them somehow. */ if (V_soft_pad && p->if_type == IFT_ETHER) { for (n = ETHERMIN + ETHER_HDR_LEN - (*mp)->m_pkthdr.len; n > 0; n -= sizeof(pad)) { if (!m_append(*mp, min(n, sizeof(pad)), pad)) break; } if (n > 0) { m_freem(*mp); *mp = NULL; if_printf(ife, "cannot pad short frame"); return (false); } } /* * If underlying interface can do VLAN tag insertion itself, * just pass the packet along. However, we need some way to * tell the interface where the packet came from so that it * knows how to find the VLAN tag to use, so we attach a * packet tag that holds it. */ if (vlan_mtag_pcp && (mtag = m_tag_locate(*mp, MTAG_8021Q, MTAG_8021Q_PCP_OUT, NULL)) != NULL) tag = EVL_MAKETAG(vid, *(uint8_t *)(mtag + 1), 0); else tag = EVL_MAKETAG(vid, pcp, 0); if (p->if_capenable & IFCAP_VLAN_HWTAGGING) { (*mp)->m_pkthdr.ether_vtag = tag; (*mp)->m_flags |= M_VLANTAG; } else { *mp = ether_vlanencap(*mp, tag); if (*mp == NULL) { if_printf(ife, "unable to prepend 802.1Q header"); return (false); } } return (true); } /* * Allocate an address from the FreeBSD Foundation OUI. This uses a * cryptographic hash function on the containing jail's name, UUID and the * interface name to attempt to provide a unique but stable address. * Pseudo-interfaces which require a MAC address should use this function to * allocate non-locally-administered addresses. */ void ether_gen_addr(struct ifnet *ifp, struct ether_addr *hwaddr) { SHA1_CTX ctx; char *buf; char uuid[HOSTUUIDLEN + 1]; uint64_t addr; int i, sz; char digest[SHA1_RESULTLEN]; char jailname[MAXHOSTNAMELEN]; getcredhostuuid(curthread->td_ucred, uuid, sizeof(uuid)); /* If each (vnet) jail would also have a unique hostuuid this would not * be necessary. */ getjailname(curthread->td_ucred, jailname, sizeof(jailname)); sz = asprintf(&buf, M_TEMP, "%s-%s-%s", uuid, if_name(ifp), jailname); if (sz < 0) { /* Fall back to a random mac address. */ arc4rand(hwaddr, sizeof(*hwaddr), 0); hwaddr->octet[0] = 0x02; return; } SHA1Init(&ctx); SHA1Update(&ctx, buf, sz); SHA1Final(digest, &ctx); free(buf, M_TEMP); addr = ((digest[0] << 16) | (digest[1] << 8) | digest[2]) & OUI_FREEBSD_GENERATED_MASK; addr = OUI_FREEBSD(addr); for (i = 0; i < ETHER_ADDR_LEN; ++i) { hwaddr->octet[i] = addr >> ((ETHER_ADDR_LEN - i - 1) * 8) & 0xFF; } } DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(ether, 1); Index: head/sys/netinet/ip_carp.c =================================================================== --- head/sys/netinet/ip_carp.c (revision 364441) +++ head/sys/netinet/ip_carp.c (revision 364442) @@ -1,2317 +1,2317 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002 Michael Shalayeff. * Copyright (c) 2003 Ryan McBride. * Copyright (c) 2011 Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_bpf.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include -#include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #endif #ifdef INET #include #include #endif #ifdef INET6 #include #include #include #include #include #include #endif #include static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses"); struct carp_softc { struct ifnet *sc_carpdev; /* Pointer to parent ifnet. */ struct ifaddr **sc_ifas; /* Our ifaddrs. */ struct sockaddr_dl sc_addr; /* Our link level address. */ struct callout sc_ad_tmo; /* Advertising timeout. */ #ifdef INET struct callout sc_md_tmo; /* Master down timeout. */ #endif #ifdef INET6 struct callout sc_md6_tmo; /* XXX: Master down timeout. */ #endif struct mtx sc_mtx; int sc_vhid; int sc_advskew; int sc_advbase; int sc_naddrs; int sc_naddrs6; int sc_ifasiz; enum { INIT = 0, BACKUP, MASTER } sc_state; int sc_suppress; int sc_sendad_errors; #define CARP_SENDAD_MAX_ERRORS 3 int sc_sendad_success; #define CARP_SENDAD_MIN_SUCCESS 3 int sc_init_counter; uint64_t sc_counter; /* authentication */ #define CARP_HMAC_PAD 64 unsigned char sc_key[CARP_KEY_LEN]; unsigned char sc_pad[CARP_HMAC_PAD]; SHA1_CTX sc_sha1; TAILQ_ENTRY(carp_softc) sc_list; /* On the carp_if list. */ LIST_ENTRY(carp_softc) sc_next; /* On the global list. */ }; struct carp_if { #ifdef INET int cif_naddrs; #endif #ifdef INET6 int cif_naddrs6; #endif TAILQ_HEAD(, carp_softc) cif_vrs; #ifdef INET struct ip_moptions cif_imo; #endif #ifdef INET6 struct ip6_moptions cif_im6o; #endif struct ifnet *cif_ifp; struct mtx cif_mtx; uint32_t cif_flags; #define CIF_PROMISC 0x00000001 }; #define CARP_INET 0 #define CARP_INET6 1 static int proto_reg[] = {-1, -1}; /* * Brief design of carp(4). * * Any carp-capable ifnet may have a list of carp softcs hanging off * its ifp->if_carp pointer. Each softc represents one unique virtual * host id, or vhid. The softc has a back pointer to the ifnet. All * softcs are joined in a global list, which has quite limited use. * * Any interface address that takes part in CARP negotiation has a * pointer to the softc of its vhid, ifa->ifa_carp. That could be either * AF_INET or AF_INET6 address. * * Although, one can get the softc's backpointer to ifnet and traverse * through its ifp->if_addrhead queue to find all interface addresses * involved in CARP, we keep a growable array of ifaddr pointers. This * allows us to avoid grabbing the IF_ADDR_LOCK() in many traversals that * do calls into the network stack, thus avoiding LORs. * * Locking: * * Each softc has a lock sc_mtx. It is used to synchronise carp_input_c(), * callout-driven events and ioctl()s. * * To traverse the list of softcs on an ifnet we use CIF_LOCK() or carp_sx. * To traverse the global list we use the mutex carp_mtx. * * Known issues with locking: * * - Sending ad, we put the pointer to the softc in an mtag, and no reference * counting is done on the softc. * - On module unload we may race (?) with packet processing thread * dereferencing our function pointers. */ /* Accept incoming CARP packets. */ VNET_DEFINE_STATIC(int, carp_allow) = 1; #define V_carp_allow VNET(carp_allow) /* Set DSCP in outgoing CARP packets. */ VNET_DEFINE_STATIC(int, carp_dscp) = 56; #define V_carp_dscp VNET(carp_dscp) /* Preempt slower nodes. */ VNET_DEFINE_STATIC(int, carp_preempt) = 0; #define V_carp_preempt VNET(carp_preempt) /* Log level. */ VNET_DEFINE_STATIC(int, carp_log) = 1; #define V_carp_log VNET(carp_log) /* Global advskew demotion. */ VNET_DEFINE_STATIC(int, carp_demotion) = 0; #define V_carp_demotion VNET(carp_demotion) /* Send error demotion factor. */ VNET_DEFINE_STATIC(int, carp_senderr_adj) = CARP_MAXSKEW; #define V_carp_senderr_adj VNET(carp_senderr_adj) /* Iface down demotion factor. */ VNET_DEFINE_STATIC(int, carp_ifdown_adj) = CARP_MAXSKEW; #define V_carp_ifdown_adj VNET(carp_ifdown_adj) static int carp_allow_sysctl(SYSCTL_HANDLER_ARGS); static int carp_dscp_sysctl(SYSCTL_HANDLER_ARGS); static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "CARP"); SYSCTL_PROC(_net_inet_carp, OID_AUTO, allow, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, carp_allow_sysctl, "I", "Accept incoming CARP packets"); SYSCTL_PROC(_net_inet_carp, OID_AUTO, dscp, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, carp_dscp_sysctl, "I", "DSCP value for carp packets"); SYSCTL_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_preempt), 0, "High-priority backup preemption mode"); SYSCTL_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_log), 0, "CARP log level"); SYSCTL_PROC(_net_inet_carp, OID_AUTO, demotion, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, carp_demote_adj_sysctl, "I", "Adjust demotion factor (skew of advskew)"); SYSCTL_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_senderr_adj), 0, "Send error demotion factor adjustment"); SYSCTL_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_ifdown_adj), 0, "Interface down demotion factor adjustment"); VNET_PCPUSTAT_DEFINE(struct carpstats, carpstats); VNET_PCPUSTAT_SYSINIT(carpstats); VNET_PCPUSTAT_SYSUNINIT(carpstats); #define CARPSTATS_ADD(name, val) \ counter_u64_add(VNET(carpstats)[offsetof(struct carpstats, name) / \ sizeof(uint64_t)], (val)) #define CARPSTATS_INC(name) CARPSTATS_ADD(name, 1) SYSCTL_VNET_PCPUSTAT(_net_inet_carp, OID_AUTO, stats, struct carpstats, carpstats, "CARP statistics (struct carpstats, netinet/ip_carp.h)"); #define CARP_LOCK_INIT(sc) mtx_init(&(sc)->sc_mtx, "carp_softc", \ NULL, MTX_DEF) #define CARP_LOCK_DESTROY(sc) mtx_destroy(&(sc)->sc_mtx) #define CARP_LOCK_ASSERT(sc) mtx_assert(&(sc)->sc_mtx, MA_OWNED) #define CARP_LOCK(sc) mtx_lock(&(sc)->sc_mtx) #define CARP_UNLOCK(sc) mtx_unlock(&(sc)->sc_mtx) #define CIF_LOCK_INIT(cif) mtx_init(&(cif)->cif_mtx, "carp_if", \ NULL, MTX_DEF) #define CIF_LOCK_DESTROY(cif) mtx_destroy(&(cif)->cif_mtx) #define CIF_LOCK_ASSERT(cif) mtx_assert(&(cif)->cif_mtx, MA_OWNED) #define CIF_LOCK(cif) mtx_lock(&(cif)->cif_mtx) #define CIF_UNLOCK(cif) mtx_unlock(&(cif)->cif_mtx) #define CIF_FREE(cif) do { \ CIF_LOCK(cif); \ if (TAILQ_EMPTY(&(cif)->cif_vrs)) \ carp_free_if(cif); \ else \ CIF_UNLOCK(cif); \ } while (0) #define CARP_LOG(...) do { \ if (V_carp_log > 0) \ log(LOG_INFO, "carp: " __VA_ARGS__); \ } while (0) #define CARP_DEBUG(...) do { \ if (V_carp_log > 1) \ log(LOG_DEBUG, __VA_ARGS__); \ } while (0) #define IFNET_FOREACH_IFA(ifp, ifa) \ CK_STAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link) \ if ((ifa)->ifa_carp != NULL) #define CARP_FOREACH_IFA(sc, ifa) \ CARP_LOCK_ASSERT(sc); \ for (int _i = 0; \ _i < (sc)->sc_naddrs + (sc)->sc_naddrs6 && \ ((ifa) = sc->sc_ifas[_i]) != NULL; \ ++_i) #define IFNET_FOREACH_CARP(ifp, sc) \ KASSERT(mtx_owned(&ifp->if_carp->cif_mtx) || \ sx_xlocked(&carp_sx), ("cif_vrs not locked")); \ TAILQ_FOREACH((sc), &(ifp)->if_carp->cif_vrs, sc_list) #define DEMOTE_ADVSKEW(sc) \ (((sc)->sc_advskew + V_carp_demotion > CARP_MAXSKEW) ? \ CARP_MAXSKEW : ((sc)->sc_advskew + V_carp_demotion)) static void carp_input_c(struct mbuf *, struct carp_header *, sa_family_t); static struct carp_softc *carp_alloc(struct ifnet *); static void carp_destroy(struct carp_softc *); static struct carp_if *carp_alloc_if(struct ifnet *); static void carp_free_if(struct carp_if *); static void carp_set_state(struct carp_softc *, int, const char* reason); static void carp_sc_state(struct carp_softc *); static void carp_setrun(struct carp_softc *, sa_family_t); static void carp_master_down(void *); static void carp_master_down_locked(struct carp_softc *, const char* reason); static void carp_send_ad(void *); static void carp_send_ad_locked(struct carp_softc *); static void carp_addroute(struct carp_softc *); static void carp_ifa_addroute(struct ifaddr *); static void carp_delroute(struct carp_softc *); static void carp_ifa_delroute(struct ifaddr *); static void carp_send_ad_all(void *, int); static void carp_demote_adj(int, char *); static LIST_HEAD(, carp_softc) carp_list; static struct mtx carp_mtx; static struct sx carp_sx; static struct task carp_sendall_task = TASK_INITIALIZER(0, carp_send_ad_all, NULL); static void carp_hmac_prepare(struct carp_softc *sc) { uint8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT; uint8_t vhid = sc->sc_vhid & 0xff; struct ifaddr *ifa; int i, found; #ifdef INET struct in_addr last, cur, in; #endif #ifdef INET6 struct in6_addr last6, cur6, in6; #endif CARP_LOCK_ASSERT(sc); /* Compute ipad from key. */ bzero(sc->sc_pad, sizeof(sc->sc_pad)); bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key)); for (i = 0; i < sizeof(sc->sc_pad); i++) sc->sc_pad[i] ^= 0x36; /* Precompute first part of inner hash. */ SHA1Init(&sc->sc_sha1); SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad)); SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version)); SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type)); SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid)); #ifdef INET cur.s_addr = 0; do { found = 0; last = cur; cur.s_addr = 0xffffffff; CARP_FOREACH_IFA(sc, ifa) { in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr; if (ifa->ifa_addr->sa_family == AF_INET && ntohl(in.s_addr) > ntohl(last.s_addr) && ntohl(in.s_addr) < ntohl(cur.s_addr)) { cur.s_addr = in.s_addr; found++; } } if (found) SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur)); } while (found); #endif /* INET */ #ifdef INET6 memset(&cur6, 0, sizeof(cur6)); do { found = 0; last6 = cur6; memset(&cur6, 0xff, sizeof(cur6)); CARP_FOREACH_IFA(sc, ifa) { in6 = ifatoia6(ifa)->ia_addr.sin6_addr; if (IN6_IS_SCOPE_EMBED(&in6)) in6.s6_addr16[1] = 0; if (ifa->ifa_addr->sa_family == AF_INET6 && memcmp(&in6, &last6, sizeof(in6)) > 0 && memcmp(&in6, &cur6, sizeof(in6)) < 0) { cur6 = in6; found++; } } if (found) SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6)); } while (found); #endif /* INET6 */ /* convert ipad to opad */ for (i = 0; i < sizeof(sc->sc_pad); i++) sc->sc_pad[i] ^= 0x36 ^ 0x5c; } static void carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2], unsigned char md[20]) { SHA1_CTX sha1ctx; CARP_LOCK_ASSERT(sc); /* fetch first half of inner hash */ bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx)); SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter)); SHA1Final(md, &sha1ctx); /* outer hash */ SHA1Init(&sha1ctx); SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad)); SHA1Update(&sha1ctx, md, 20); SHA1Final(md, &sha1ctx); } static int carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2], unsigned char md[20]) { unsigned char md2[20]; CARP_LOCK_ASSERT(sc); carp_hmac_generate(sc, counter, md2); return (bcmp(md, md2, sizeof(md2))); } /* * process input packet. * we have rearranged checks order compared to the rfc, * but it seems more efficient this way or not possible otherwise. */ #ifdef INET int carp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct carp_header *ch; int iplen, len; iplen = *offp; *mp = NULL; CARPSTATS_INC(carps_ipackets); if (!V_carp_allow) { m_freem(m); return (IPPROTO_DONE); } /* verify that the IP TTL is 255. */ if (ip->ip_ttl != CARP_DFLTTL) { CARPSTATS_INC(carps_badttl); CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__, ip->ip_ttl, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } iplen = ip->ip_hl << 2; if (m->m_pkthdr.len < iplen + sizeof(*ch)) { CARPSTATS_INC(carps_badlen); CARP_DEBUG("%s: received len %zd < sizeof(struct carp_header) " "on %s\n", __func__, m->m_len - sizeof(struct ip), m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } if (iplen + sizeof(*ch) < m->m_len) { if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) { CARPSTATS_INC(carps_hdrops); CARP_DEBUG("%s: pullup failed\n", __func__); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); } ch = (struct carp_header *)((char *)ip + iplen); /* * verify that the received packet length is * equal to the CARP header */ len = iplen + sizeof(*ch); if (len > m->m_pkthdr.len) { CARPSTATS_INC(carps_badlen); CARP_DEBUG("%s: packet too short %d on %s\n", __func__, m->m_pkthdr.len, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } if ((m = m_pullup(m, len)) == NULL) { CARPSTATS_INC(carps_hdrops); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); ch = (struct carp_header *)((char *)ip + iplen); /* verify the CARP checksum */ m->m_data += iplen; if (in_cksum(m, len - iplen)) { CARPSTATS_INC(carps_badsum); CARP_DEBUG("%s: checksum failed on %s\n", __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } m->m_data -= iplen; carp_input_c(m, ch, AF_INET); return (IPPROTO_DONE); } #endif #ifdef INET6 int carp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct carp_header *ch; u_int len; CARPSTATS_INC(carps_ipackets6); if (!V_carp_allow) { m_freem(m); return (IPPROTO_DONE); } /* check if received on a valid carp interface */ if (m->m_pkthdr.rcvif->if_carp == NULL) { CARPSTATS_INC(carps_badif); CARP_DEBUG("%s: packet received on non-carp interface: %s\n", __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } /* verify that the IP TTL is 255 */ if (ip6->ip6_hlim != CARP_DFLTTL) { CARPSTATS_INC(carps_badttl); CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__, ip6->ip6_hlim, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } /* verify that we have a complete carp packet */ if (m->m_len < *offp + sizeof(*ch)) { len = m->m_len; m = m_pullup(m, *offp + sizeof(*ch)); if (m == NULL) { CARPSTATS_INC(carps_badlen); CARP_DEBUG("%s: packet size %u too small\n", __func__, len); return (IPPROTO_DONE); } } ch = (struct carp_header *)(mtod(m, char *) + *offp); /* verify the CARP checksum */ m->m_data += *offp; if (in_cksum(m, sizeof(*ch))) { CARPSTATS_INC(carps_badsum); CARP_DEBUG("%s: checksum failed, on %s\n", __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } m->m_data -= *offp; carp_input_c(m, ch, AF_INET6); return (IPPROTO_DONE); } #endif /* INET6 */ /* * This routine should not be necessary at all, but some switches * (VMWare ESX vswitches) can echo our own packets back at us, * and we must ignore them or they will cause us to drop out of * MASTER mode. * * We cannot catch all cases of network loops. Instead, what we * do here is catch any packet that arrives with a carp header * with a VHID of 0, that comes from an address that is our own. * These packets are by definition "from us" (even if they are from * a misconfigured host that is pretending to be us). * * The VHID test is outside this mini-function. */ static int carp_source_is_self(struct mbuf *m, struct ifaddr *ifa, sa_family_t af) { #ifdef INET struct ip *ip4; struct in_addr in4; #endif #ifdef INET6 struct ip6_hdr *ip6; struct in6_addr in6; #endif switch (af) { #ifdef INET case AF_INET: ip4 = mtod(m, struct ip *); in4 = ifatoia(ifa)->ia_addr.sin_addr; return (in4.s_addr == ip4->ip_src.s_addr); #endif #ifdef INET6 case AF_INET6: ip6 = mtod(m, struct ip6_hdr *); in6 = ifatoia6(ifa)->ia_addr.sin6_addr; return (memcmp(&in6, &ip6->ip6_src, sizeof(in6)) == 0); #endif default: break; } return (0); } static void carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ifaddr *ifa, *match; struct carp_softc *sc; uint64_t tmp_counter; struct timeval sc_tv, ch_tv; int error; NET_EPOCH_ASSERT(); /* * Verify that the VHID is valid on the receiving interface. * * There should be just one match. If there are none * the VHID is not valid and we drop the packet. If * there are multiple VHID matches, take just the first * one, for compatibility with previous code. While we're * scanning, check for obvious loops in the network topology * (these should never happen, and as noted above, we may * miss real loops; this is just a double-check). */ error = 0; match = NULL; IFNET_FOREACH_IFA(ifp, ifa) { if (match == NULL && ifa->ifa_carp != NULL && ifa->ifa_addr->sa_family == af && ifa->ifa_carp->sc_vhid == ch->carp_vhid) match = ifa; if (ch->carp_vhid == 0 && carp_source_is_self(m, ifa, af)) error = ELOOP; } ifa = error ? NULL : match; if (ifa != NULL) ifa_ref(ifa); if (ifa == NULL) { if (error == ELOOP) { CARP_DEBUG("dropping looped packet on interface %s\n", ifp->if_xname); CARPSTATS_INC(carps_badif); /* ??? */ } else { CARPSTATS_INC(carps_badvhid); } m_freem(m); return; } /* verify the CARP version. */ if (ch->carp_version != CARP_VERSION) { CARPSTATS_INC(carps_badver); CARP_DEBUG("%s: invalid version %d\n", ifp->if_xname, ch->carp_version); ifa_free(ifa); m_freem(m); return; } sc = ifa->ifa_carp; CARP_LOCK(sc); ifa_free(ifa); if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) { CARPSTATS_INC(carps_badauth); CARP_DEBUG("%s: incorrect hash for VHID %u@%s\n", __func__, sc->sc_vhid, ifp->if_xname); goto out; } tmp_counter = ntohl(ch->carp_counter[0]); tmp_counter = tmp_counter<<32; tmp_counter += ntohl(ch->carp_counter[1]); /* XXX Replay protection goes here */ sc->sc_init_counter = 0; sc->sc_counter = tmp_counter; sc_tv.tv_sec = sc->sc_advbase; sc_tv.tv_usec = DEMOTE_ADVSKEW(sc) * 1000000 / 256; ch_tv.tv_sec = ch->carp_advbase; ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256; switch (sc->sc_state) { case INIT: break; case MASTER: /* * If we receive an advertisement from a master who's going to * be more frequent than us, go into BACKUP state. */ if (timevalcmp(&sc_tv, &ch_tv, >) || timevalcmp(&sc_tv, &ch_tv, ==)) { callout_stop(&sc->sc_ad_tmo); carp_set_state(sc, BACKUP, "more frequent advertisement received"); carp_setrun(sc, 0); carp_delroute(sc); } break; case BACKUP: /* * If we're pre-empting masters who advertise slower than us, * and this one claims to be slower, treat him as down. */ if (V_carp_preempt && timevalcmp(&sc_tv, &ch_tv, <)) { carp_master_down_locked(sc, "preempting a slower master"); break; } /* * If the master is going to advertise at such a low frequency * that he's guaranteed to time out, we'd might as well just * treat him as timed out now. */ sc_tv.tv_sec = sc->sc_advbase * 3; if (timevalcmp(&sc_tv, &ch_tv, <)) { carp_master_down_locked(sc, "master will time out"); break; } /* * Otherwise, we reset the counter and wait for the next * advertisement. */ carp_setrun(sc, af); break; } out: CARP_UNLOCK(sc); m_freem(m); } static int carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch) { struct m_tag *mtag; if (sc->sc_init_counter) { /* this could also be seconds since unix epoch */ sc->sc_counter = arc4random(); sc->sc_counter = sc->sc_counter << 32; sc->sc_counter += arc4random(); } else sc->sc_counter++; ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff); ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff); carp_hmac_generate(sc, ch->carp_counter, ch->carp_md); /* Tag packet for carp_output */ if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *), M_NOWAIT)) == NULL) { m_freem(m); CARPSTATS_INC(carps_onomem); return (ENOMEM); } bcopy(&sc, mtag + 1, sizeof(sc)); m_tag_prepend(m, mtag); return (0); } /* * To avoid LORs and possible recursions this function shouldn't * be called directly, but scheduled via taskqueue. */ static void carp_send_ad_all(void *ctx __unused, int pending __unused) { struct carp_softc *sc; struct epoch_tracker et; NET_EPOCH_ENTER(et); mtx_lock(&carp_mtx); LIST_FOREACH(sc, &carp_list, sc_next) if (sc->sc_state == MASTER) { CARP_LOCK(sc); CURVNET_SET(sc->sc_carpdev->if_vnet); carp_send_ad_locked(sc); CURVNET_RESTORE(); CARP_UNLOCK(sc); } mtx_unlock(&carp_mtx); NET_EPOCH_EXIT(et); } /* Send a periodic advertisement, executed in callout context. */ static void carp_send_ad(void *v) { struct carp_softc *sc = v; struct epoch_tracker et; NET_EPOCH_ENTER(et); CARP_LOCK_ASSERT(sc); CURVNET_SET(sc->sc_carpdev->if_vnet); carp_send_ad_locked(sc); CURVNET_RESTORE(); CARP_UNLOCK(sc); NET_EPOCH_EXIT(et); } static void carp_send_ad_error(struct carp_softc *sc, int error) { if (error) { if (sc->sc_sendad_errors < INT_MAX) sc->sc_sendad_errors++; if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { static const char fmt[] = "send error %d on %s"; char msg[sizeof(fmt) + IFNAMSIZ]; sprintf(msg, fmt, error, sc->sc_carpdev->if_xname); carp_demote_adj(V_carp_senderr_adj, msg); } sc->sc_sendad_success = 0; } else { if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS && ++sc->sc_sendad_success >= CARP_SENDAD_MIN_SUCCESS) { static const char fmt[] = "send ok on %s"; char msg[sizeof(fmt) + IFNAMSIZ]; sprintf(msg, fmt, sc->sc_carpdev->if_xname); carp_demote_adj(-V_carp_senderr_adj, msg); sc->sc_sendad_errors = 0; } else sc->sc_sendad_errors = 0; } } /* * Pick the best ifaddr on the given ifp for sending CARP * advertisements. * * "Best" here is defined by ifa_preferred(). This function is much * much like ifaof_ifpforaddr() except that we just use ifa_preferred(). * * (This could be simplified to return the actual address, except that * it has a different format in AF_INET and AF_INET6.) */ static struct ifaddr * carp_best_ifa(int af, struct ifnet *ifp) { struct ifaddr *ifa, *best; NET_EPOCH_ASSERT(); if (af >= AF_MAX) return (NULL); best = NULL; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == af && (best == NULL || ifa_preferred(best, ifa))) best = ifa; } if (best != NULL) ifa_ref(best); return (best); } static void carp_send_ad_locked(struct carp_softc *sc) { struct carp_header ch; struct timeval tv; struct ifaddr *ifa; struct carp_header *ch_ptr; struct mbuf *m; int len, advskew; NET_EPOCH_ASSERT(); CARP_LOCK_ASSERT(sc); advskew = DEMOTE_ADVSKEW(sc); tv.tv_sec = sc->sc_advbase; tv.tv_usec = advskew * 1000000 / 256; ch.carp_version = CARP_VERSION; ch.carp_type = CARP_ADVERTISEMENT; ch.carp_vhid = sc->sc_vhid; ch.carp_advbase = sc->sc_advbase; ch.carp_advskew = advskew; ch.carp_authlen = 7; /* XXX DEFINE */ ch.carp_pad1 = 0; /* must be zero */ ch.carp_cksum = 0; /* XXXGL: OpenBSD picks first ifaddr with needed family. */ #ifdef INET if (sc->sc_naddrs) { struct ip *ip; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CARPSTATS_INC(carps_onomem); goto resched; } len = sizeof(*ip) + sizeof(ch); m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m->m_len = len; M_ALIGN(m, m->m_len); m->m_flags |= M_MCAST; ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = V_carp_dscp << IPTOS_DSCP_OFFSET; ip->ip_len = htons(len); ip->ip_off = htons(IP_DF); ip->ip_ttl = CARP_DFLTTL; ip->ip_p = IPPROTO_CARP; ip->ip_sum = 0; ip_fillid(ip); ifa = carp_best_ifa(AF_INET, sc->sc_carpdev); if (ifa != NULL) { ip->ip_src.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr; ifa_free(ifa); } else ip->ip_src.s_addr = 0; ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP); ch_ptr = (struct carp_header *)(&ip[1]); bcopy(&ch, ch_ptr, sizeof(ch)); if (carp_prepare_ad(m, sc, ch_ptr)) goto resched; m->m_data += sizeof(*ip); ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip)); m->m_data -= sizeof(*ip); CARPSTATS_INC(carps_opackets); carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_carpdev->if_carp->cif_imo, NULL)); } #endif /* INET */ #ifdef INET6 if (sc->sc_naddrs6) { struct ip6_hdr *ip6; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CARPSTATS_INC(carps_onomem); goto resched; } len = sizeof(*ip6) + sizeof(ch); m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m->m_len = len; M_ALIGN(m, m->m_len); m->m_flags |= M_MCAST; ip6 = mtod(m, struct ip6_hdr *); bzero(ip6, sizeof(*ip6)); ip6->ip6_vfc |= IPV6_VERSION; /* Traffic class isn't defined in ip6 struct instead * it gets offset into flowid field */ ip6->ip6_flow |= htonl(V_carp_dscp << (IPV6_FLOWLABEL_LEN + IPTOS_DSCP_OFFSET)); ip6->ip6_hlim = CARP_DFLTTL; ip6->ip6_nxt = IPPROTO_CARP; /* set the source address */ ifa = carp_best_ifa(AF_INET6, sc->sc_carpdev); if (ifa != NULL) { bcopy(IFA_IN6(ifa), &ip6->ip6_src, sizeof(struct in6_addr)); ifa_free(ifa); } else /* This should never happen with IPv6. */ bzero(&ip6->ip6_src, sizeof(struct in6_addr)); /* Set the multicast destination. */ ip6->ip6_dst.s6_addr16[0] = htons(0xff02); ip6->ip6_dst.s6_addr8[15] = 0x12; if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) { m_freem(m); CARP_DEBUG("%s: in6_setscope failed\n", __func__); goto resched; } ch_ptr = (struct carp_header *)(&ip6[1]); bcopy(&ch, ch_ptr, sizeof(ch)); if (carp_prepare_ad(m, sc, ch_ptr)) goto resched; m->m_data += sizeof(*ip6); ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip6)); m->m_data -= sizeof(*ip6); CARPSTATS_INC(carps_opackets6); carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0, &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL)); } #endif /* INET6 */ resched: callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc); } static void carp_addroute(struct carp_softc *sc) { struct ifaddr *ifa; CARP_FOREACH_IFA(sc, ifa) carp_ifa_addroute(ifa); } static void carp_ifa_addroute(struct ifaddr *ifa) { switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: in_addprefix(ifatoia(ifa), RTF_UP); ifa_add_loopback_route(ifa, (struct sockaddr *)&ifatoia(ifa)->ia_addr); break; #endif #ifdef INET6 case AF_INET6: ifa_add_loopback_route(ifa, (struct sockaddr *)&ifatoia6(ifa)->ia_addr); nd6_add_ifa_lle(ifatoia6(ifa)); break; #endif } } static void carp_delroute(struct carp_softc *sc) { struct ifaddr *ifa; CARP_FOREACH_IFA(sc, ifa) carp_ifa_delroute(ifa); } static void carp_ifa_delroute(struct ifaddr *ifa) { switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifa_del_loopback_route(ifa, (struct sockaddr *)&ifatoia(ifa)->ia_addr); in_scrubprefix(ifatoia(ifa), LLE_STATIC); break; #endif #ifdef INET6 case AF_INET6: ifa_del_loopback_route(ifa, (struct sockaddr *)&ifatoia6(ifa)->ia_addr); nd6_rem_ifa_lle(ifatoia6(ifa), 1); break; #endif } } int carp_master(struct ifaddr *ifa) { struct carp_softc *sc = ifa->ifa_carp; return (sc->sc_state == MASTER); } #ifdef INET /* * Broadcast a gratuitous ARP request containing * the virtual router MAC address for each IP address * associated with the virtual router. */ static void carp_send_arp(struct carp_softc *sc) { struct ifaddr *ifa; struct in_addr addr; NET_EPOCH_ASSERT(); CARP_FOREACH_IFA(sc, ifa) { if (ifa->ifa_addr->sa_family != AF_INET) continue; addr = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr; arp_announce_ifaddr(sc->sc_carpdev, addr, LLADDR(&sc->sc_addr)); } } int carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr) { struct carp_softc *sc = ifa->ifa_carp; if (sc->sc_state == MASTER) { *enaddr = LLADDR(&sc->sc_addr); return (1); } return (0); } #endif #ifdef INET6 static void carp_send_na(struct carp_softc *sc) { static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT; struct ifaddr *ifa; struct in6_addr *in6; CARP_FOREACH_IFA(sc, ifa) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6 = IFA_IN6(ifa); nd6_na_output(sc->sc_carpdev, &mcast, in6, ND_NA_FLAG_OVERRIDE, 1, NULL); DELAY(1000); /* XXX */ } } /* * Returns ifa in case it's a carp address and it is MASTER, or if the address * matches and is not a carp address. Returns NULL otherwise. */ struct ifaddr * carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr) { struct ifaddr *ifa; NET_EPOCH_ASSERT(); ifa = NULL; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (!IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) continue; if (ifa->ifa_carp && ifa->ifa_carp->sc_state != MASTER) ifa = NULL; else ifa_ref(ifa); break; } return (ifa); } char * carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr) { struct ifaddr *ifa; NET_EPOCH_ASSERT(); IFNET_FOREACH_IFA(ifp, ifa) if (ifa->ifa_addr->sa_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) { struct carp_softc *sc = ifa->ifa_carp; struct m_tag *mtag; mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *), M_NOWAIT); if (mtag == NULL) /* Better a bit than nothing. */ return (LLADDR(&sc->sc_addr)); bcopy(&sc, mtag + 1, sizeof(sc)); m_tag_prepend(m, mtag); return (LLADDR(&sc->sc_addr)); } return (NULL); } #endif /* INET6 */ int carp_forus(struct ifnet *ifp, u_char *dhost) { struct carp_softc *sc; uint8_t *ena = dhost; if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1) return (0); CIF_LOCK(ifp->if_carp); IFNET_FOREACH_CARP(ifp, sc) { /* * CARP_LOCK() is not here, since would protect nothing, but * cause deadlock with if_bridge, calling this under its lock. */ if (sc->sc_state == MASTER && !bcmp(dhost, LLADDR(&sc->sc_addr), ETHER_ADDR_LEN)) { CIF_UNLOCK(ifp->if_carp); return (1); } } CIF_UNLOCK(ifp->if_carp); return (0); } /* Master down timeout event, executed in callout context. */ static void carp_master_down(void *v) { struct carp_softc *sc = v; struct epoch_tracker et; NET_EPOCH_ENTER(et); CARP_LOCK_ASSERT(sc); CURVNET_SET(sc->sc_carpdev->if_vnet); if (sc->sc_state == BACKUP) { carp_master_down_locked(sc, "master timed out"); } CURVNET_RESTORE(); CARP_UNLOCK(sc); NET_EPOCH_EXIT(et); } static void carp_master_down_locked(struct carp_softc *sc, const char *reason) { NET_EPOCH_ASSERT(); CARP_LOCK_ASSERT(sc); switch (sc->sc_state) { case BACKUP: carp_set_state(sc, MASTER, reason); carp_send_ad_locked(sc); #ifdef INET carp_send_arp(sc); #endif #ifdef INET6 carp_send_na(sc); #endif carp_setrun(sc, 0); carp_addroute(sc); break; case INIT: case MASTER: #ifdef INVARIANTS panic("carp: VHID %u@%s: master_down event in %s state\n", sc->sc_vhid, sc->sc_carpdev->if_xname, sc->sc_state ? "MASTER" : "INIT"); #endif break; } } /* * When in backup state, af indicates whether to reset the master down timer * for v4 or v6. If it's set to zero, reset the ones which are already pending. */ static void carp_setrun(struct carp_softc *sc, sa_family_t af) { struct timeval tv; CARP_LOCK_ASSERT(sc); if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 || sc->sc_carpdev->if_link_state != LINK_STATE_UP || (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) || !V_carp_allow) return; switch (sc->sc_state) { case INIT: carp_set_state(sc, BACKUP, "initialization complete"); carp_setrun(sc, 0); break; case BACKUP: callout_stop(&sc->sc_ad_tmo); tv.tv_sec = 3 * sc->sc_advbase; tv.tv_usec = sc->sc_advskew * 1000000 / 256; switch (af) { #ifdef INET case AF_INET: callout_reset(&sc->sc_md_tmo, tvtohz(&tv), carp_master_down, sc); break; #endif #ifdef INET6 case AF_INET6: callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), carp_master_down, sc); break; #endif default: #ifdef INET if (sc->sc_naddrs) callout_reset(&sc->sc_md_tmo, tvtohz(&tv), carp_master_down, sc); #endif #ifdef INET6 if (sc->sc_naddrs6) callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), carp_master_down, sc); #endif break; } break; case MASTER: tv.tv_sec = sc->sc_advbase; tv.tv_usec = sc->sc_advskew * 1000000 / 256; callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc); break; } } /* * Setup multicast structures. */ static int carp_multicast_setup(struct carp_if *cif, sa_family_t sa) { struct ifnet *ifp = cif->cif_ifp; int error = 0; switch (sa) { #ifdef INET case AF_INET: { struct ip_moptions *imo = &cif->cif_imo; struct in_mfilter *imf; struct in_addr addr; if (ip_mfilter_first(&imo->imo_head) != NULL) return (0); imf = ip_mfilter_alloc(M_WAITOK, 0, 0); ip_mfilter_init(&imo->imo_head); imo->imo_multicast_vif = -1; addr.s_addr = htonl(INADDR_CARP_GROUP); if ((error = in_joingroup(ifp, &addr, NULL, &imf->imf_inm)) != 0) { ip_mfilter_free(imf); break; } ip_mfilter_insert(&imo->imo_head, imf); imo->imo_multicast_ifp = ifp; imo->imo_multicast_ttl = CARP_DFLTTL; imo->imo_multicast_loop = 0; break; } #endif #ifdef INET6 case AF_INET6: { struct ip6_moptions *im6o = &cif->cif_im6o; struct in6_mfilter *im6f[2]; struct in6_addr in6; if (ip6_mfilter_first(&im6o->im6o_head)) return (0); im6f[0] = ip6_mfilter_alloc(M_WAITOK, 0, 0); im6f[1] = ip6_mfilter_alloc(M_WAITOK, 0, 0); ip6_mfilter_init(&im6o->im6o_head); im6o->im6o_multicast_hlim = CARP_DFLTTL; im6o->im6o_multicast_ifp = ifp; /* Join IPv6 CARP multicast group. */ bzero(&in6, sizeof(in6)); in6.s6_addr16[0] = htons(0xff02); in6.s6_addr8[15] = 0x12; if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { ip6_mfilter_free(im6f[0]); ip6_mfilter_free(im6f[1]); break; } if ((error = in6_joingroup(ifp, &in6, NULL, &im6f[0]->im6f_in6m, 0)) != 0) { ip6_mfilter_free(im6f[0]); ip6_mfilter_free(im6f[1]); break; } /* Join solicited multicast address. */ bzero(&in6, sizeof(in6)); in6.s6_addr16[0] = htons(0xff02); in6.s6_addr32[1] = 0; in6.s6_addr32[2] = htonl(1); in6.s6_addr32[3] = 0; in6.s6_addr8[12] = 0xff; if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { ip6_mfilter_free(im6f[0]); ip6_mfilter_free(im6f[1]); break; } if ((error = in6_joingroup(ifp, &in6, NULL, &im6f[1]->im6f_in6m, 0)) != 0) { in6_leavegroup(im6f[0]->im6f_in6m, NULL); ip6_mfilter_free(im6f[0]); ip6_mfilter_free(im6f[1]); break; } ip6_mfilter_insert(&im6o->im6o_head, im6f[0]); ip6_mfilter_insert(&im6o->im6o_head, im6f[1]); break; } #endif } return (error); } /* * Free multicast structures. */ static void carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa) { #ifdef INET struct ip_moptions *imo = &cif->cif_imo; struct in_mfilter *imf; #endif #ifdef INET6 struct ip6_moptions *im6o = &cif->cif_im6o; struct in6_mfilter *im6f; #endif sx_assert(&carp_sx, SA_XLOCKED); switch (sa) { #ifdef INET case AF_INET: if (cif->cif_naddrs != 0) break; while ((imf = ip_mfilter_first(&imo->imo_head)) != NULL) { ip_mfilter_remove(&imo->imo_head, imf); in_leavegroup(imf->imf_inm, NULL); ip_mfilter_free(imf); } break; #endif #ifdef INET6 case AF_INET6: if (cif->cif_naddrs6 != 0) break; while ((im6f = ip6_mfilter_first(&im6o->im6o_head)) != NULL) { ip6_mfilter_remove(&im6o->im6o_head, im6f); in6_leavegroup(im6f->im6f_in6m, NULL); ip6_mfilter_free(im6f); } break; #endif } } int carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa) { struct m_tag *mtag; struct carp_softc *sc; if (!sa) return (0); switch (sa->sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: return (0); } mtag = m_tag_find(m, PACKET_TAG_CARP, NULL); if (mtag == NULL) return (0); bcopy(mtag + 1, &sc, sizeof(sc)); /* Set the source MAC address to the Virtual Router MAC Address. */ switch (ifp->if_type) { case IFT_ETHER: case IFT_BRIDGE: case IFT_L2VLAN: { struct ether_header *eh; eh = mtod(m, struct ether_header *); eh->ether_shost[0] = 0; eh->ether_shost[1] = 0; eh->ether_shost[2] = 0x5e; eh->ether_shost[3] = 0; eh->ether_shost[4] = 1; eh->ether_shost[5] = sc->sc_vhid; } break; default: printf("%s: carp is not supported for the %d interface type\n", ifp->if_xname, ifp->if_type); return (EOPNOTSUPP); } return (0); } static struct carp_softc* carp_alloc(struct ifnet *ifp) { struct carp_softc *sc; struct carp_if *cif; sx_assert(&carp_sx, SA_XLOCKED); if ((cif = ifp->if_carp) == NULL) cif = carp_alloc_if(ifp); sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO); sc->sc_advbase = CARP_DFLTINTV; sc->sc_vhid = -1; /* required setting */ sc->sc_init_counter = 1; sc->sc_state = INIT; sc->sc_ifasiz = sizeof(struct ifaddr *); sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO); sc->sc_carpdev = ifp; CARP_LOCK_INIT(sc); #ifdef INET callout_init_mtx(&sc->sc_md_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); #endif #ifdef INET6 callout_init_mtx(&sc->sc_md6_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); #endif callout_init_mtx(&sc->sc_ad_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); CIF_LOCK(cif); TAILQ_INSERT_TAIL(&cif->cif_vrs, sc, sc_list); CIF_UNLOCK(cif); mtx_lock(&carp_mtx); LIST_INSERT_HEAD(&carp_list, sc, sc_next); mtx_unlock(&carp_mtx); return (sc); } static void carp_grow_ifas(struct carp_softc *sc) { struct ifaddr **new; new = malloc(sc->sc_ifasiz * 2, M_CARP, M_WAITOK | M_ZERO); CARP_LOCK(sc); bcopy(sc->sc_ifas, new, sc->sc_ifasiz); free(sc->sc_ifas, M_CARP); sc->sc_ifas = new; sc->sc_ifasiz *= 2; CARP_UNLOCK(sc); } static void carp_destroy(struct carp_softc *sc) { struct ifnet *ifp = sc->sc_carpdev; struct carp_if *cif = ifp->if_carp; sx_assert(&carp_sx, SA_XLOCKED); if (sc->sc_suppress) carp_demote_adj(-V_carp_ifdown_adj, "vhid removed"); CARP_UNLOCK(sc); CIF_LOCK(cif); TAILQ_REMOVE(&cif->cif_vrs, sc, sc_list); CIF_UNLOCK(cif); mtx_lock(&carp_mtx); LIST_REMOVE(sc, sc_next); mtx_unlock(&carp_mtx); callout_drain(&sc->sc_ad_tmo); #ifdef INET callout_drain(&sc->sc_md_tmo); #endif #ifdef INET6 callout_drain(&sc->sc_md6_tmo); #endif CARP_LOCK_DESTROY(sc); free(sc->sc_ifas, M_CARP); free(sc, M_CARP); } static struct carp_if* carp_alloc_if(struct ifnet *ifp) { struct carp_if *cif; int error; cif = malloc(sizeof(*cif), M_CARP, M_WAITOK|M_ZERO); if ((error = ifpromisc(ifp, 1)) != 0) printf("%s: ifpromisc(%s) failed: %d\n", __func__, ifp->if_xname, error); else cif->cif_flags |= CIF_PROMISC; CIF_LOCK_INIT(cif); cif->cif_ifp = ifp; TAILQ_INIT(&cif->cif_vrs); IF_ADDR_WLOCK(ifp); ifp->if_carp = cif; if_ref(ifp); IF_ADDR_WUNLOCK(ifp); return (cif); } static void carp_free_if(struct carp_if *cif) { struct ifnet *ifp = cif->cif_ifp; CIF_LOCK_ASSERT(cif); KASSERT(TAILQ_EMPTY(&cif->cif_vrs), ("%s: softc list not empty", __func__)); IF_ADDR_WLOCK(ifp); ifp->if_carp = NULL; IF_ADDR_WUNLOCK(ifp); CIF_LOCK_DESTROY(cif); if (cif->cif_flags & CIF_PROMISC) ifpromisc(ifp, 0); if_rele(ifp); free(cif, M_CARP); } static void carp_carprcp(struct carpreq *carpr, struct carp_softc *sc, int priv) { CARP_LOCK(sc); carpr->carpr_state = sc->sc_state; carpr->carpr_vhid = sc->sc_vhid; carpr->carpr_advbase = sc->sc_advbase; carpr->carpr_advskew = sc->sc_advskew; if (priv) bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key)); else bzero(carpr->carpr_key, sizeof(carpr->carpr_key)); CARP_UNLOCK(sc); } int carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td) { struct carpreq carpr; struct ifnet *ifp; struct carp_softc *sc = NULL; int error = 0, locked = 0; if ((error = copyin(ifr_data_get_ptr(ifr), &carpr, sizeof carpr))) return (error); ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) return (ENXIO); switch (ifp->if_type) { case IFT_ETHER: case IFT_L2VLAN: case IFT_BRIDGE: break; default: error = EOPNOTSUPP; goto out; } if ((ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; goto out; } sx_xlock(&carp_sx); switch (cmd) { case SIOCSVH: if ((error = priv_check(td, PRIV_NETINET_CARP))) break; if (carpr.carpr_vhid <= 0 || carpr.carpr_vhid > CARP_MAXVHID || carpr.carpr_advbase < 0 || carpr.carpr_advskew < 0) { error = EINVAL; break; } if (ifp->if_carp) { IFNET_FOREACH_CARP(ifp, sc) if (sc->sc_vhid == carpr.carpr_vhid) break; } if (sc == NULL) { sc = carp_alloc(ifp); CARP_LOCK(sc); sc->sc_vhid = carpr.carpr_vhid; LLADDR(&sc->sc_addr)[0] = 0; LLADDR(&sc->sc_addr)[1] = 0; LLADDR(&sc->sc_addr)[2] = 0x5e; LLADDR(&sc->sc_addr)[3] = 0; LLADDR(&sc->sc_addr)[4] = 1; LLADDR(&sc->sc_addr)[5] = sc->sc_vhid; } else CARP_LOCK(sc); locked = 1; if (carpr.carpr_advbase > 0) { if (carpr.carpr_advbase > 255 || carpr.carpr_advbase < CARP_DFLTINTV) { error = EINVAL; break; } sc->sc_advbase = carpr.carpr_advbase; } if (carpr.carpr_advskew >= 255) { error = EINVAL; break; } sc->sc_advskew = carpr.carpr_advskew; if (carpr.carpr_key[0] != '\0') { bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key)); carp_hmac_prepare(sc); } if (sc->sc_state != INIT && carpr.carpr_state != sc->sc_state) { switch (carpr.carpr_state) { case BACKUP: callout_stop(&sc->sc_ad_tmo); carp_set_state(sc, BACKUP, "user requested via ifconfig"); carp_setrun(sc, 0); carp_delroute(sc); break; case MASTER: carp_master_down_locked(sc, "user requested via ifconfig"); break; default: break; } } break; case SIOCGVH: { int priveleged; if (carpr.carpr_vhid < 0 || carpr.carpr_vhid > CARP_MAXVHID) { error = EINVAL; break; } if (carpr.carpr_count < 1) { error = EMSGSIZE; break; } if (ifp->if_carp == NULL) { error = ENOENT; break; } priveleged = (priv_check(td, PRIV_NETINET_CARP) == 0); if (carpr.carpr_vhid != 0) { IFNET_FOREACH_CARP(ifp, sc) if (sc->sc_vhid == carpr.carpr_vhid) break; if (sc == NULL) { error = ENOENT; break; } carp_carprcp(&carpr, sc, priveleged); error = copyout(&carpr, ifr_data_get_ptr(ifr), sizeof(carpr)); } else { int i, count; count = 0; IFNET_FOREACH_CARP(ifp, sc) count++; if (count > carpr.carpr_count) { CIF_UNLOCK(ifp->if_carp); error = EMSGSIZE; break; } i = 0; IFNET_FOREACH_CARP(ifp, sc) { carp_carprcp(&carpr, sc, priveleged); carpr.carpr_count = count; error = copyout(&carpr, (char *)ifr_data_get_ptr(ifr) + (i * sizeof(carpr)), sizeof(carpr)); if (error) { CIF_UNLOCK(ifp->if_carp); break; } i++; } } break; } default: error = EINVAL; } sx_xunlock(&carp_sx); out: if (locked) CARP_UNLOCK(sc); if_rele(ifp); return (error); } static int carp_get_vhid(struct ifaddr *ifa) { if (ifa == NULL || ifa->ifa_carp == NULL) return (0); return (ifa->ifa_carp->sc_vhid); } int carp_attach(struct ifaddr *ifa, int vhid) { struct ifnet *ifp = ifa->ifa_ifp; struct carp_if *cif = ifp->if_carp; struct carp_softc *sc; int index, error; KASSERT(ifa->ifa_carp == NULL, ("%s: ifa %p attached", __func__, ifa)); switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: #endif #ifdef INET6 case AF_INET6: #endif break; default: return (EPROTOTYPE); } sx_xlock(&carp_sx); if (ifp->if_carp == NULL) { sx_xunlock(&carp_sx); return (ENOPROTOOPT); } IFNET_FOREACH_CARP(ifp, sc) if (sc->sc_vhid == vhid) break; if (sc == NULL) { sx_xunlock(&carp_sx); return (ENOENT); } error = carp_multicast_setup(cif, ifa->ifa_addr->sa_family); if (error) { CIF_FREE(cif); sx_xunlock(&carp_sx); return (error); } index = sc->sc_naddrs + sc->sc_naddrs6 + 1; if (index > sc->sc_ifasiz / sizeof(struct ifaddr *)) carp_grow_ifas(sc); switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: cif->cif_naddrs++; sc->sc_naddrs++; break; #endif #ifdef INET6 case AF_INET6: cif->cif_naddrs6++; sc->sc_naddrs6++; break; #endif } ifa_ref(ifa); CARP_LOCK(sc); sc->sc_ifas[index - 1] = ifa; ifa->ifa_carp = sc; carp_hmac_prepare(sc); carp_sc_state(sc); CARP_UNLOCK(sc); sx_xunlock(&carp_sx); return (0); } void carp_detach(struct ifaddr *ifa, bool keep_cif) { struct ifnet *ifp = ifa->ifa_ifp; struct carp_if *cif = ifp->if_carp; struct carp_softc *sc = ifa->ifa_carp; int i, index; KASSERT(sc != NULL, ("%s: %p not attached", __func__, ifa)); sx_xlock(&carp_sx); CARP_LOCK(sc); /* Shift array. */ index = sc->sc_naddrs + sc->sc_naddrs6; for (i = 0; i < index; i++) if (sc->sc_ifas[i] == ifa) break; KASSERT(i < index, ("%s: %p no backref", __func__, ifa)); for (; i < index - 1; i++) sc->sc_ifas[i] = sc->sc_ifas[i+1]; sc->sc_ifas[index - 1] = NULL; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: cif->cif_naddrs--; sc->sc_naddrs--; break; #endif #ifdef INET6 case AF_INET6: cif->cif_naddrs6--; sc->sc_naddrs6--; break; #endif } carp_ifa_delroute(ifa); carp_multicast_cleanup(cif, ifa->ifa_addr->sa_family); ifa->ifa_carp = NULL; ifa_free(ifa); carp_hmac_prepare(sc); carp_sc_state(sc); if (!keep_cif && sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) carp_destroy(sc); else CARP_UNLOCK(sc); if (!keep_cif) CIF_FREE(cif); sx_xunlock(&carp_sx); } static void carp_set_state(struct carp_softc *sc, int state, const char *reason) { CARP_LOCK_ASSERT(sc); if (sc->sc_state != state) { const char *carp_states[] = { CARP_STATES }; char subsys[IFNAMSIZ+5]; snprintf(subsys, IFNAMSIZ+5, "%u@%s", sc->sc_vhid, sc->sc_carpdev->if_xname); CARP_LOG("%s: %s -> %s (%s)\n", subsys, carp_states[sc->sc_state], carp_states[state], reason); sc->sc_state = state; devctl_notify("CARP", subsys, carp_states[state], NULL); } } static void carp_linkstate(struct ifnet *ifp) { struct carp_softc *sc; CIF_LOCK(ifp->if_carp); IFNET_FOREACH_CARP(ifp, sc) { CARP_LOCK(sc); carp_sc_state(sc); CARP_UNLOCK(sc); } CIF_UNLOCK(ifp->if_carp); } static void carp_sc_state(struct carp_softc *sc) { CARP_LOCK_ASSERT(sc); if (sc->sc_carpdev->if_link_state != LINK_STATE_UP || !(sc->sc_carpdev->if_flags & IFF_UP) || !V_carp_allow) { callout_stop(&sc->sc_ad_tmo); #ifdef INET callout_stop(&sc->sc_md_tmo); #endif #ifdef INET6 callout_stop(&sc->sc_md6_tmo); #endif carp_set_state(sc, INIT, "hardware interface down"); carp_setrun(sc, 0); if (!sc->sc_suppress) carp_demote_adj(V_carp_ifdown_adj, "interface down"); sc->sc_suppress = 1; } else { carp_set_state(sc, INIT, "hardware interface up"); carp_setrun(sc, 0); if (sc->sc_suppress) carp_demote_adj(-V_carp_ifdown_adj, "interface up"); sc->sc_suppress = 0; } } static void carp_demote_adj(int adj, char *reason) { atomic_add_int(&V_carp_demotion, adj); CARP_LOG("demoted by %d to %d (%s)\n", adj, V_carp_demotion, reason); taskqueue_enqueue(taskqueue_swi, &carp_sendall_task); } static int carp_allow_sysctl(SYSCTL_HANDLER_ARGS) { int new, error; struct carp_softc *sc; new = V_carp_allow; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) return (error); if (V_carp_allow != new) { V_carp_allow = new; mtx_lock(&carp_mtx); LIST_FOREACH(sc, &carp_list, sc_next) { CARP_LOCK(sc); if (curvnet == sc->sc_carpdev->if_vnet) carp_sc_state(sc); CARP_UNLOCK(sc); } mtx_unlock(&carp_mtx); } return (0); } static int carp_dscp_sysctl(SYSCTL_HANDLER_ARGS) { int new, error; new = V_carp_dscp; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) return (error); if (new < 0 || new > 63) return (EINVAL); V_carp_dscp = new; return (0); } static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS) { int new, error; new = V_carp_demotion; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) return (error); carp_demote_adj(new, "sysctl"); return (0); } #ifdef INET extern struct domain inetdomain; static struct protosw in_carp_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_CARP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = carp_input, .pr_output = rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; #endif #ifdef INET6 extern struct domain inet6domain; static struct protosw in6_carp_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_CARP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = carp6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }; #endif static void carp_mod_cleanup(void) { #ifdef INET if (proto_reg[CARP_INET] == 0) { (void)ipproto_unregister(IPPROTO_CARP); pf_proto_unregister(PF_INET, IPPROTO_CARP, SOCK_RAW); proto_reg[CARP_INET] = -1; } carp_iamatch_p = NULL; #endif #ifdef INET6 if (proto_reg[CARP_INET6] == 0) { (void)ip6proto_unregister(IPPROTO_CARP); pf_proto_unregister(PF_INET6, IPPROTO_CARP, SOCK_RAW); proto_reg[CARP_INET6] = -1; } carp_iamatch6_p = NULL; carp_macmatch6_p = NULL; #endif carp_ioctl_p = NULL; carp_attach_p = NULL; carp_detach_p = NULL; carp_get_vhid_p = NULL; carp_linkstate_p = NULL; carp_forus_p = NULL; carp_output_p = NULL; carp_demote_adj_p = NULL; carp_master_p = NULL; mtx_unlock(&carp_mtx); taskqueue_drain(taskqueue_swi, &carp_sendall_task); mtx_destroy(&carp_mtx); sx_destroy(&carp_sx); } static int carp_mod_load(void) { int err; mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF); sx_init(&carp_sx, "carp_sx"); LIST_INIT(&carp_list); carp_get_vhid_p = carp_get_vhid; carp_forus_p = carp_forus; carp_output_p = carp_output; carp_linkstate_p = carp_linkstate; carp_ioctl_p = carp_ioctl; carp_attach_p = carp_attach; carp_detach_p = carp_detach; carp_demote_adj_p = carp_demote_adj; carp_master_p = carp_master; #ifdef INET6 carp_iamatch6_p = carp_iamatch6; carp_macmatch6_p = carp_macmatch6; proto_reg[CARP_INET6] = pf_proto_register(PF_INET6, (struct protosw *)&in6_carp_protosw); if (proto_reg[CARP_INET6]) { printf("carp: error %d attaching to PF_INET6\n", proto_reg[CARP_INET6]); carp_mod_cleanup(); return (proto_reg[CARP_INET6]); } err = ip6proto_register(IPPROTO_CARP); if (err) { printf("carp: error %d registering with INET6\n", err); carp_mod_cleanup(); return (err); } #endif #ifdef INET carp_iamatch_p = carp_iamatch; proto_reg[CARP_INET] = pf_proto_register(PF_INET, &in_carp_protosw); if (proto_reg[CARP_INET]) { printf("carp: error %d attaching to PF_INET\n", proto_reg[CARP_INET]); carp_mod_cleanup(); return (proto_reg[CARP_INET]); } err = ipproto_register(IPPROTO_CARP); if (err) { printf("carp: error %d registering with INET\n", err); carp_mod_cleanup(); return (err); } #endif return (0); } static int carp_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: return carp_mod_load(); /* NOTREACHED */ case MOD_UNLOAD: mtx_lock(&carp_mtx); if (LIST_EMPTY(&carp_list)) carp_mod_cleanup(); else { mtx_unlock(&carp_mtx); return (EBUSY); } break; default: return (EINVAL); } return (0); } static moduledata_t carp_mod = { "carp", carp_modevent, 0 }; DECLARE_MODULE(carp, carp_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);