diff --git a/sys/cam/scsi/scsi_cd.c b/sys/cam/scsi/scsi_cd.c index b9d5d1a76b9c..766e9c99f006 100644 --- a/sys/cam/scsi/scsi_cd.c +++ b/sys/cam/scsi/scsi_cd.c @@ -1,4262 +1,4261 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997 Justin T. Gibbs. * Copyright (c) 1997, 1998, 1999, 2000, 2001, 2002, 2003 Kenneth D. Merry. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Portions of this driver taken from the original FreeBSD cd driver. * Written by Julian Elischer (julian@tfs.com) * for TRW Financial Systems for use under the MACH(2.5) operating system. * * TRW Financial Systems, in accordance with their agreement with Carnegie * Mellon University, makes this software available to CMU to distribute * or use in any manner that they see fit as long as this message is kept with * the software. For this reason TFS also grants any other persons or * organisations permission to use or modify this software. * * TFS supplies this software to be publicly redistributed * on the understanding that TFS is not responsible for the correct * functioning of this software in any circumstances. * * Ported to run under 386BSD by Julian Elischer (julian@tfs.com) Sept 1992 * * from: cd.c,v 1.83 1997/05/04 15:24:22 joerg Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_cd.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #define LEADOUT 0xaa /* leadout toc entry */ struct cd_params { u_int32_t blksize; u_long disksize; }; typedef enum { CD_Q_NONE = 0x00, CD_Q_NO_TOUCH = 0x01, CD_Q_BCD_TRACKS = 0x02, CD_Q_10_BYTE_ONLY = 0x10, CD_Q_RETRY_BUSY = 0x40 } cd_quirks; #define CD_Q_BIT_STRING \ "\020" \ "\001NO_TOUCH" \ "\002BCD_TRACKS" \ "\00510_BYTE_ONLY" \ "\007RETRY_BUSY" typedef enum { CD_FLAG_INVALID = 0x0001, CD_FLAG_NEW_DISC = 0x0002, CD_FLAG_DISC_LOCKED = 0x0004, CD_FLAG_DISC_REMOVABLE = 0x0008, CD_FLAG_SAW_MEDIA = 0x0010, CD_FLAG_ACTIVE = 0x0080, CD_FLAG_SCHED_ON_COMP = 0x0100, CD_FLAG_RETRY_UA = 0x0200, CD_FLAG_VALID_MEDIA = 0x0400, CD_FLAG_VALID_TOC = 0x0800, CD_FLAG_SCTX_INIT = 0x1000, CD_FLAG_MEDIA_WAIT = 0x2000, CD_FLAG_MEDIA_SCAN_ACT = 0x4000 } cd_flags; typedef enum { CD_CCB_PROBE = 0x01, CD_CCB_BUFFER_IO = 0x02, CD_CCB_TUR = 0x03, CD_CCB_MEDIA_PREVENT = 0x04, CD_CCB_MEDIA_ALLOW = 0x05, CD_CCB_MEDIA_SIZE = 0x06, CD_CCB_MEDIA_TOC_HDR = 0x07, CD_CCB_MEDIA_TOC_FULL = 0x08, CD_CCB_MEDIA_TOC_LEAD = 0x09, CD_CCB_TYPE_MASK = 0x0F, CD_CCB_RETRY_UA = 0x10 } cd_ccb_state; #define ccb_state ppriv_field0 #define ccb_bp ppriv_ptr1 /* * According to the MMC-6 spec, 6.25.3.2.11, the lead-out is reported by * READ_TOC as logical track 170, so at most 169 tracks may be reported. */ struct cd_tocdata { struct ioc_toc_header header; struct cd_toc_entry entries[170]; }; struct cd_toc_single { struct ioc_toc_header header; struct cd_toc_entry entry; }; typedef enum { CD_STATE_PROBE, CD_STATE_NORMAL, CD_STATE_MEDIA_PREVENT, CD_STATE_MEDIA_ALLOW, CD_STATE_MEDIA_SIZE, CD_STATE_MEDIA_TOC_HDR, CD_STATE_MEDIA_TOC_FULL, CD_STATE_MEDIA_TOC_LEAD } cd_state; struct cd_softc { cam_pinfo pinfo; cd_state state; volatile cd_flags flags; struct bio_queue_head bio_queue; LIST_HEAD(, ccb_hdr) pending_ccbs; struct cd_params params; union ccb saved_ccb; cd_quirks quirks; struct cam_periph *periph; int minimum_command_size; int outstanding_cmds; int tur; struct task sysctl_task; struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; STAILQ_HEAD(, cd_mode_params) mode_queue; struct cd_tocdata toc; int toc_read_len; struct cd_toc_single leadout; struct disk *disk; struct callout mediapoll_c; #define CD_ANNOUNCETMP_SZ 120 char announce_temp[CD_ANNOUNCETMP_SZ]; #define CD_ANNOUNCE_SZ 400 char announce_buf[CD_ANNOUNCE_SZ]; }; struct cd_page_sizes { int page; int page_size; }; static struct cd_page_sizes cd_page_size_table[] = { { AUDIO_PAGE, sizeof(struct cd_audio_page)} }; struct cd_quirk_entry { struct scsi_inquiry_pattern inq_pat; cd_quirks quirks; }; /* * NOTE ON 10_BYTE_ONLY quirks: Any 10_BYTE_ONLY quirks MUST be because * your device hangs when it gets a 10 byte command. Adding a quirk just * to get rid of the informative diagnostic message is not acceptable. All * 10_BYTE_ONLY quirks must be documented in full in a PR (which should be * referenced in a comment along with the quirk) , and must be approved by * ken@FreeBSD.org. Any quirks added that don't adhere to this policy may * be removed until the submitter can explain why they are needed. * 10_BYTE_ONLY quirks will be removed (as they will no longer be necessary) * when the CAM_NEW_TRAN_CODE work is done. */ static struct cd_quirk_entry cd_quirk_table[] = { { { T_CDROM, SIP_MEDIA_REMOVABLE, "CHINON", "CD-ROM CDS-535","*"}, /* quirks */ CD_Q_BCD_TRACKS }, { /* * VMware returns BUSY status when storage has transient * connectivity problems, so better wait. */ {T_CDROM, SIP_MEDIA_REMOVABLE, "NECVMWar", "VMware IDE CDR10", "*"}, /*quirks*/ CD_Q_RETRY_BUSY } }; #ifdef COMPAT_FREEBSD32 struct ioc_read_toc_entry32 { u_char address_format; u_char starting_track; u_short data_len; uint32_t data; /* (struct cd_toc_entry *) */ }; #define CDIOREADTOCENTRYS_32 \ _IOC_NEWTYPE(CDIOREADTOCENTRYS, struct ioc_read_toc_entry32) #endif static disk_open_t cdopen; static disk_close_t cdclose; static disk_ioctl_t cdioctl; static disk_strategy_t cdstrategy; static periph_init_t cdinit; static periph_ctor_t cdregister; static periph_dtor_t cdcleanup; static periph_start_t cdstart; static periph_oninv_t cdoninvalidate; static void cdasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg); static int cdcmdsizesysctl(SYSCTL_HANDLER_ARGS); static int cdrunccb(union ccb *ccb, int (*error_routine)(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags), u_int32_t cam_flags, u_int32_t sense_flags); static void cddone(struct cam_periph *periph, union ccb *start_ccb); static union cd_pages *cdgetpage(struct cd_mode_params *mode_params); static int cdgetpagesize(int page_num); static void cdprevent(struct cam_periph *periph, int action); static void cdmediaprobedone(struct cam_periph *periph); static int cdcheckmedia(struct cam_periph *periph, int do_wait); #if 0 static int cdsize(struct cam_periph *periph, u_int32_t *size); #endif static int cd6byteworkaround(union ccb *ccb); static int cderror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags); static int cdreadtoc(struct cam_periph *periph, u_int32_t mode, u_int32_t start, u_int8_t *data, u_int32_t len, u_int32_t sense_flags); static int cdgetmode(struct cam_periph *periph, struct cd_mode_params *data, u_int32_t page); static int cdsetmode(struct cam_periph *periph, struct cd_mode_params *data); static int cdplay(struct cam_periph *periph, u_int32_t blk, u_int32_t len); static int cdreadsubchannel(struct cam_periph *periph, u_int32_t mode, u_int32_t format, int track, struct cd_sub_channel_info *data, u_int32_t len); static int cdplaymsf(struct cam_periph *periph, u_int32_t startm, u_int32_t starts, u_int32_t startf, u_int32_t endm, u_int32_t ends, u_int32_t endf); static int cdplaytracks(struct cam_periph *periph, u_int32_t strack, u_int32_t sindex, u_int32_t etrack, u_int32_t eindex); static int cdpause(struct cam_periph *periph, u_int32_t go); static int cdstopunit(struct cam_periph *periph, u_int32_t eject); static int cdstartunit(struct cam_periph *periph, int load); static int cdsetspeed(struct cam_periph *periph, u_int32_t rdspeed, u_int32_t wrspeed); static int cdreportkey(struct cam_periph *periph, struct dvd_authinfo *authinfo); static int cdsendkey(struct cam_periph *periph, struct dvd_authinfo *authinfo); static int cdreaddvdstructure(struct cam_periph *periph, struct dvd_struct *dvdstruct); static callout_func_t cdmediapoll; static struct periph_driver cddriver = { cdinit, "cd", TAILQ_HEAD_INITIALIZER(cddriver.units), /* generation */ 0 }; PERIPHDRIVER_DECLARE(cd, cddriver); #ifndef CD_DEFAULT_POLL_PERIOD #define CD_DEFAULT_POLL_PERIOD 3 #endif #ifndef CD_DEFAULT_RETRY #define CD_DEFAULT_RETRY 4 #endif #ifndef CD_DEFAULT_TIMEOUT #define CD_DEFAULT_TIMEOUT 30000 #endif static int cd_poll_period = CD_DEFAULT_POLL_PERIOD; static int cd_retry_count = CD_DEFAULT_RETRY; static int cd_timeout = CD_DEFAULT_TIMEOUT; static SYSCTL_NODE(_kern_cam, OID_AUTO, cd, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "CAM CDROM driver"); SYSCTL_INT(_kern_cam_cd, OID_AUTO, poll_period, CTLFLAG_RWTUN, &cd_poll_period, 0, "Media polling period in seconds"); SYSCTL_INT(_kern_cam_cd, OID_AUTO, retry_count, CTLFLAG_RWTUN, &cd_retry_count, 0, "Normal I/O retry count"); SYSCTL_INT(_kern_cam_cd, OID_AUTO, timeout, CTLFLAG_RWTUN, &cd_timeout, 0, "Timeout, in us, for read operations"); static MALLOC_DEFINE(M_SCSICD, "scsi_cd", "scsi_cd buffers"); static void cdinit(void) { cam_status status; /* * Install a global async callback. This callback will * receive async callbacks like "new device found". */ status = xpt_register_async(AC_FOUND_DEVICE, cdasync, NULL, NULL); if (status != CAM_REQ_CMP) { printf("cd: Failed to attach master async callback " "due to status 0x%x!\n", status); } } /* * Callback from GEOM, called when it has finished cleaning up its * resources. */ static void cddiskgonecb(struct disk *dp) { struct cam_periph *periph; periph = (struct cam_periph *)dp->d_drv1; cam_periph_release(periph); } static void cdoninvalidate(struct cam_periph *periph) { struct cd_softc *softc; cam_periph_assert(periph, MA_OWNED); softc = (struct cd_softc *)periph->softc; /* * De-register any async callbacks. */ xpt_register_async(0, cdasync, periph, periph->path); softc->flags |= CD_FLAG_INVALID; /* * Return all queued I/O with ENXIO. * XXX Handle any transactions queued to the card * with XPT_ABORT_CCB. */ bioq_flush(&softc->bio_queue, NULL, ENXIO); disk_gone(softc->disk); } static void cdcleanup(struct cam_periph *periph) { struct cd_softc *softc; softc = (struct cd_softc *)periph->softc; cam_periph_unlock(periph); if ((softc->flags & CD_FLAG_SCTX_INIT) != 0 && sysctl_ctx_free(&softc->sysctl_ctx) != 0) { xpt_print(periph->path, "can't remove sysctl context\n"); } callout_drain(&softc->mediapoll_c); disk_destroy(softc->disk); free(softc, M_DEVBUF); cam_periph_lock(periph); } static void cdasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg) { struct cam_periph *periph; struct cd_softc *softc; periph = (struct cam_periph *)callback_arg; switch (code) { case AC_FOUND_DEVICE: { struct ccb_getdev *cgd; cam_status status; cgd = (struct ccb_getdev *)arg; if (cgd == NULL) break; if (cgd->protocol != PROTO_SCSI) break; if (SID_QUAL(&cgd->inq_data) != SID_QUAL_LU_CONNECTED) break; if (SID_TYPE(&cgd->inq_data) != T_CDROM && SID_TYPE(&cgd->inq_data) != T_WORM) break; /* * Allocate a peripheral instance for * this device and start the probe * process. */ status = cam_periph_alloc(cdregister, cdoninvalidate, cdcleanup, cdstart, "cd", CAM_PERIPH_BIO, path, cdasync, AC_FOUND_DEVICE, cgd); if (status != CAM_REQ_CMP && status != CAM_REQ_INPROG) printf("cdasync: Unable to attach new device " "due to status 0x%x\n", status); return; } case AC_UNIT_ATTENTION: { union ccb *ccb; int error_code, sense_key, asc, ascq; softc = (struct cd_softc *)periph->softc; ccb = (union ccb *)arg; /* * Handle all media change UNIT ATTENTIONs except * our own, as they will be handled by cderror(). */ if (xpt_path_periph(ccb->ccb_h.path) != periph && scsi_extract_sense_ccb(ccb, &error_code, &sense_key, &asc, &ascq)) { if (asc == 0x28 && ascq == 0x00) disk_media_changed(softc->disk, M_NOWAIT); } break; } case AC_SCSI_AEN: cam_periph_assert(periph, MA_OWNED); softc = (struct cd_softc *)periph->softc; if (softc->state == CD_STATE_NORMAL && !softc->tur) { if (cam_periph_acquire(periph) == 0) { softc->tur = 1; xpt_schedule(periph, CAM_PRIORITY_NORMAL); } } /* FALLTHROUGH */ case AC_SENT_BDR: case AC_BUS_RESET: { struct ccb_hdr *ccbh; cam_periph_assert(periph, MA_OWNED); softc = (struct cd_softc *)periph->softc; /* * Don't fail on the expected unit attention * that will occur. */ softc->flags |= CD_FLAG_RETRY_UA; LIST_FOREACH(ccbh, &softc->pending_ccbs, periph_links.le) ccbh->ccb_state |= CD_CCB_RETRY_UA; break; } default: break; } cam_periph_async(periph, code, path, arg); } static void cdsysctlinit(void *context, int pending) { struct cam_periph *periph; struct cd_softc *softc; char tmpstr[32], tmpstr2[16]; periph = (struct cam_periph *)context; if (cam_periph_acquire(periph) != 0) return; softc = (struct cd_softc *)periph->softc; snprintf(tmpstr, sizeof(tmpstr), "CAM CD unit %d", periph->unit_number); snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number); sysctl_ctx_init(&softc->sysctl_ctx); cam_periph_lock(periph); softc->flags |= CD_FLAG_SCTX_INIT; cam_periph_unlock(periph); softc->sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&softc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kern_cam_cd), OID_AUTO, tmpstr2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, tmpstr, "device_index"); if (softc->sysctl_tree == NULL) { printf("cdsysctlinit: unable to allocate sysctl tree\n"); cam_periph_release(periph); return; } /* * Now register the sysctl handler, so the user can the value on * the fly. */ SYSCTL_ADD_PROC(&softc->sysctl_ctx,SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "minimum_cmd_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &softc->minimum_command_size, 0, cdcmdsizesysctl, "I", "Minimum CDB size"); cam_periph_release(periph); } /* * We have a handler function for this so we can check the values when the * user sets them, instead of every time we look at them. */ static int cdcmdsizesysctl(SYSCTL_HANDLER_ARGS) { int error, value; value = *(int *)arg1; error = sysctl_handle_int(oidp, &value, 0, req); if ((error != 0) || (req->newptr == NULL)) return (error); /* * The only real values we can have here are 6 or 10. I don't * really forsee having 12 be an option at any time in the future. * So if the user sets something less than or equal to 6, we'll set * it to 6. If he sets something greater than 6, we'll set it to 10. * * I suppose we could just return an error here for the wrong values, * but I don't think it's necessary to do so, as long as we can * determine the user's intent without too much trouble. */ if (value < 6) value = 6; else if (value > 6) value = 10; *(int *)arg1 = value; return (0); } static cam_status cdregister(struct cam_periph *periph, void *arg) { struct cd_softc *softc; struct ccb_pathinq cpi; struct ccb_getdev *cgd; char tmpstr[80]; caddr_t match; cgd = (struct ccb_getdev *)arg; if (cgd == NULL) { printf("cdregister: no getdev CCB, can't register device\n"); return(CAM_REQ_CMP_ERR); } softc = (struct cd_softc *)malloc(sizeof(*softc),M_DEVBUF, M_NOWAIT | M_ZERO); if (softc == NULL) { printf("cdregister: Unable to probe new device. " "Unable to allocate softc\n"); return(CAM_REQ_CMP_ERR); } LIST_INIT(&softc->pending_ccbs); STAILQ_INIT(&softc->mode_queue); softc->state = CD_STATE_PROBE; bioq_init(&softc->bio_queue); if (SID_IS_REMOVABLE(&cgd->inq_data)) softc->flags |= CD_FLAG_DISC_REMOVABLE; periph->softc = softc; softc->periph = periph; /* * See if this device has any quirks. */ match = cam_quirkmatch((caddr_t)&cgd->inq_data, (caddr_t)cd_quirk_table, nitems(cd_quirk_table), sizeof(*cd_quirk_table), scsi_inquiry_match); if (match != NULL) softc->quirks = ((struct cd_quirk_entry *)match)->quirks; else softc->quirks = CD_Q_NONE; /* Check if the SIM does not want 6 byte commands */ xpt_path_inq(&cpi, periph->path); if (cpi.ccb_h.status == CAM_REQ_CMP && (cpi.hba_misc & PIM_NO_6_BYTE)) softc->quirks |= CD_Q_10_BYTE_ONLY; TASK_INIT(&softc->sysctl_task, 0, cdsysctlinit, periph); /* The default is 6 byte commands, unless quirked otherwise */ if (softc->quirks & CD_Q_10_BYTE_ONLY) softc->minimum_command_size = 10; else softc->minimum_command_size = 6; /* * Refcount and block open attempts until we are setup * Can't block */ (void)cam_periph_hold(periph, PRIBIO); cam_periph_unlock(periph); /* * Load the user's default, if any. */ snprintf(tmpstr, sizeof(tmpstr), "kern.cam.cd.%d.minimum_cmd_size", periph->unit_number); TUNABLE_INT_FETCH(tmpstr, &softc->minimum_command_size); /* 6 and 10 are the only permissible values here. */ if (softc->minimum_command_size < 6) softc->minimum_command_size = 6; else if (softc->minimum_command_size > 6) softc->minimum_command_size = 10; /* * We need to register the statistics structure for this device, * but we don't have the blocksize yet for it. So, we register * the structure and indicate that we don't have the blocksize * yet. Unlike other SCSI peripheral drivers, we explicitly set * the device type here to be CDROM, rather than just ORing in * the device type. This is because this driver can attach to either * CDROM or WORM devices, and we want this peripheral driver to * show up in the devstat list as a CD peripheral driver, not a * WORM peripheral driver. WORM drives will also have the WORM * driver attached to them. */ softc->disk = disk_alloc(); softc->disk->d_devstat = devstat_new_entry("cd", periph->unit_number, 0, DEVSTAT_BS_UNAVAILABLE, DEVSTAT_TYPE_CDROM | XPORT_DEVSTAT_TYPE(cpi.transport), DEVSTAT_PRIORITY_CD); softc->disk->d_open = cdopen; softc->disk->d_close = cdclose; softc->disk->d_strategy = cdstrategy; softc->disk->d_gone = cddiskgonecb; softc->disk->d_ioctl = cdioctl; softc->disk->d_name = "cd"; cam_strvis(softc->disk->d_descr, cgd->inq_data.vendor, sizeof(cgd->inq_data.vendor), sizeof(softc->disk->d_descr)); strlcat(softc->disk->d_descr, " ", sizeof(softc->disk->d_descr)); cam_strvis(&softc->disk->d_descr[strlen(softc->disk->d_descr)], cgd->inq_data.product, sizeof(cgd->inq_data.product), sizeof(softc->disk->d_descr) - strlen(softc->disk->d_descr)); softc->disk->d_unit = periph->unit_number; softc->disk->d_drv1 = periph; if (cpi.maxio == 0) softc->disk->d_maxsize = DFLTPHYS; /* traditional default */ else if (cpi.maxio > maxphys) softc->disk->d_maxsize = maxphys; /* for safety */ else softc->disk->d_maxsize = cpi.maxio; softc->disk->d_flags = 0; softc->disk->d_hba_vendor = cpi.hba_vendor; softc->disk->d_hba_device = cpi.hba_device; softc->disk->d_hba_subvendor = cpi.hba_subvendor; softc->disk->d_hba_subdevice = cpi.hba_subdevice; snprintf(softc->disk->d_attachment, sizeof(softc->disk->d_attachment), "%s%d", cpi.dev_name, cpi.unit_number); /* * Acquire a reference to the periph before we register with GEOM. * We'll release this reference once GEOM calls us back (via * dadiskgonecb()) telling us that our provider has been freed. */ if (cam_periph_acquire(periph) != 0) { xpt_print(periph->path, "%s: lost periph during " "registration!\n", __func__); cam_periph_lock(periph); return (CAM_REQ_CMP_ERR); } disk_create(softc->disk, DISK_VERSION); cam_periph_lock(periph); /* * Add an async callback so that we get * notified if this device goes away. */ xpt_register_async(AC_SENT_BDR | AC_BUS_RESET | AC_LOST_DEVICE | AC_SCSI_AEN | AC_UNIT_ATTENTION, cdasync, periph, periph->path); /* * Schedule a periodic media polling events. */ callout_init_mtx(&softc->mediapoll_c, cam_periph_mtx(periph), 0); if ((softc->flags & CD_FLAG_DISC_REMOVABLE) && (cgd->inq_flags & SID_AEN) == 0 && cd_poll_period != 0) { callout_reset_sbt(&softc->mediapoll_c, cd_poll_period * SBT_1S, 0, cdmediapoll, periph, C_PREL(1)); } xpt_schedule(periph, CAM_PRIORITY_DEV); return(CAM_REQ_CMP); } static int cdopen(struct disk *dp) { struct cam_periph *periph; struct cd_softc *softc; int error; periph = (struct cam_periph *)dp->d_drv1; softc = (struct cd_softc *)periph->softc; if (cam_periph_acquire(periph) != 0) return(ENXIO); cam_periph_lock(periph); if (softc->flags & CD_FLAG_INVALID) { cam_periph_release_locked(periph); cam_periph_unlock(periph); return(ENXIO); } if ((error = cam_periph_hold(periph, PRIBIO | PCATCH)) != 0) { cam_periph_release_locked(periph); cam_periph_unlock(periph); return (error); } CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH, ("cdopen\n")); /* * Check for media, and set the appropriate flags. We don't bail * if we don't have media, but then we don't allow anything but the * CDIOCEJECT/CDIOCCLOSE ioctls if there is no media. */ cdcheckmedia(periph, /*do_wait*/ 1); CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("leaving cdopen\n")); cam_periph_unhold(periph); cam_periph_unlock(periph); return (0); } static int cdclose(struct disk *dp) { struct cam_periph *periph; struct cd_softc *softc; periph = (struct cam_periph *)dp->d_drv1; softc = (struct cd_softc *)periph->softc; cam_periph_lock(periph); if (cam_periph_hold(periph, PRIBIO) != 0) { cam_periph_unlock(periph); cam_periph_release(periph); return (0); } CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH, ("cdclose\n")); if ((softc->flags & CD_FLAG_DISC_REMOVABLE) != 0) cdprevent(periph, PR_ALLOW); /* * Since we're closing this CD, mark the blocksize as unavailable. * It will be marked as available when the CD is opened again. */ softc->disk->d_devstat->flags |= DEVSTAT_BS_UNAVAILABLE; /* * We'll check the media and toc again at the next open(). */ softc->flags &= ~(CD_FLAG_VALID_MEDIA|CD_FLAG_VALID_TOC); cam_periph_unhold(periph); cam_periph_release_locked(periph); cam_periph_unlock(periph); return (0); } static int cdrunccb(union ccb *ccb, int (*error_routine)(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags), u_int32_t cam_flags, u_int32_t sense_flags) { struct cd_softc *softc; struct cam_periph *periph; int error; periph = xpt_path_periph(ccb->ccb_h.path); softc = (struct cd_softc *)periph->softc; error = cam_periph_runccb(ccb, error_routine, cam_flags, sense_flags, softc->disk->d_devstat); return(error); } /* * Actually translate the requested transfer into one the physical driver * can understand. The transfer is described by a buf and will include * only one physical transfer. */ static void cdstrategy(struct bio *bp) { struct cam_periph *periph; struct cd_softc *softc; periph = (struct cam_periph *)bp->bio_disk->d_drv1; cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("cdstrategy(%p)\n", bp)); softc = (struct cd_softc *)periph->softc; /* * If the device has been made invalid, error out */ if ((softc->flags & CD_FLAG_INVALID)) { cam_periph_unlock(periph); biofinish(bp, NULL, ENXIO); return; } /* * Place it in the queue of disk activities for this disk */ bioq_disksort(&softc->bio_queue, bp); /* * If we don't know that we have valid media, schedule the media * check first. The I/O will get executed after the media check. */ if ((softc->flags & CD_FLAG_VALID_MEDIA) == 0) cdcheckmedia(periph, /*do_wait*/ 0); else xpt_schedule(periph, CAM_PRIORITY_NORMAL); cam_periph_unlock(periph); return; } static void cdstart(struct cam_periph *periph, union ccb *start_ccb) { struct cd_softc *softc; struct bio *bp; struct ccb_scsiio *csio; cam_periph_assert(periph, MA_OWNED); softc = (struct cd_softc *)periph->softc; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("entering cdstart\n")); switch (softc->state) { case CD_STATE_NORMAL: { bp = bioq_first(&softc->bio_queue); if (bp == NULL) { if (softc->tur) { softc->tur = 0; csio = &start_ccb->csio; scsi_test_unit_ready(csio, /*retries*/ cd_retry_count, cddone, MSG_SIMPLE_Q_TAG, SSD_FULL_SIZE, cd_timeout); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = CD_CCB_TUR; xpt_action(start_ccb); } else xpt_release_ccb(start_ccb); } else { if (softc->tur) { softc->tur = 0; cam_periph_release_locked(periph); } bioq_remove(&softc->bio_queue, bp); if ((bp->bio_cmd != BIO_READ) && (bp->bio_cmd != BIO_WRITE)) { biofinish(bp, NULL, EOPNOTSUPP); xpt_release_ccb(start_ccb); return; } scsi_read_write(&start_ccb->csio, /*retries*/ cd_retry_count, /* cbfcnp */ cddone, MSG_SIMPLE_Q_TAG, /* read */bp->bio_cmd == BIO_READ ? SCSI_RW_READ : SCSI_RW_WRITE, /* byte2 */ 0, /* minimum_cmd_size */ 10, /* lba */ bp->bio_offset / softc->params.blksize, bp->bio_bcount / softc->params.blksize, /* data_ptr */ bp->bio_data, /* dxfer_len */ bp->bio_bcount, /* sense_len */ cd_retry_count ? SSD_FULL_SIZE : SF_NO_PRINT, /* timeout */ cd_timeout); /* Use READ CD command for audio tracks. */ if (softc->params.blksize == 2352) { start_ccb->csio.cdb_io.cdb_bytes[0] = READ_CD; start_ccb->csio.cdb_io.cdb_bytes[9] = 0xf8; start_ccb->csio.cdb_io.cdb_bytes[10] = 0; start_ccb->csio.cdb_io.cdb_bytes[11] = 0; start_ccb->csio.cdb_len = 12; } start_ccb->ccb_h.ccb_state = CD_CCB_BUFFER_IO; LIST_INSERT_HEAD(&softc->pending_ccbs, &start_ccb->ccb_h, periph_links.le); softc->outstanding_cmds++; /* We expect a unit attention from this device */ if ((softc->flags & CD_FLAG_RETRY_UA) != 0) { start_ccb->ccb_h.ccb_state |= CD_CCB_RETRY_UA; softc->flags &= ~CD_FLAG_RETRY_UA; } start_ccb->ccb_h.ccb_bp = bp; bp = bioq_first(&softc->bio_queue); xpt_action(start_ccb); } if (bp != NULL || softc->tur) { /* Have more work to do, so ensure we stay scheduled */ xpt_schedule(periph, CAM_PRIORITY_NORMAL); } break; } case CD_STATE_PROBE: case CD_STATE_MEDIA_SIZE: { struct scsi_read_capacity_data *rcap; rcap = (struct scsi_read_capacity_data *)malloc(sizeof(*rcap), M_SCSICD, M_NOWAIT | M_ZERO); if (rcap == NULL) { xpt_print(periph->path, "%s: Couldn't malloc read_capacity data\n", __func__); xpt_release_ccb(start_ccb); /* * We can't probe because we can't allocate memory, * so invalidate the peripheral. The system probably * has larger problems at this stage. If we've * already probed (and are re-probing capacity), we * don't need to invalidate. * * XXX KDM need to reset probe state and kick out * pending I/O. */ if (softc->state == CD_STATE_PROBE) cam_periph_invalidate(periph); break; } /* * Set the default capacity and sector size to something that * GEOM can handle. This will get reset when a read capacity * completes successfully. */ softc->disk->d_sectorsize = 2048; softc->disk->d_mediasize = 0; csio = &start_ccb->csio; scsi_read_capacity(csio, /*retries*/ cd_retry_count, cddone, MSG_SIMPLE_Q_TAG, rcap, SSD_FULL_SIZE, /*timeout*/20000); start_ccb->ccb_h.ccb_bp = NULL; if (softc->state == CD_STATE_PROBE) start_ccb->ccb_h.ccb_state = CD_CCB_PROBE; else start_ccb->ccb_h.ccb_state = CD_CCB_MEDIA_SIZE; xpt_action(start_ccb); break; } case CD_STATE_MEDIA_ALLOW: case CD_STATE_MEDIA_PREVENT: { /* * If the CD is already locked, we don't need to do this. * Move on to the capacity check. */ if (softc->state == CD_STATE_MEDIA_PREVENT && (softc->flags & CD_FLAG_DISC_LOCKED) != 0) { softc->state = CD_STATE_MEDIA_SIZE; xpt_release_ccb(start_ccb); xpt_schedule(periph, CAM_PRIORITY_NORMAL); break; } scsi_prevent(&start_ccb->csio, /*retries*/ cd_retry_count, /*cbfcnp*/ cddone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*action*/ (softc->state == CD_STATE_MEDIA_ALLOW) ? PR_ALLOW : PR_PREVENT, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ 60000); start_ccb->ccb_h.ccb_bp = NULL; if (softc->state == CD_STATE_MEDIA_ALLOW) start_ccb->ccb_h.ccb_state = CD_CCB_MEDIA_ALLOW; else start_ccb->ccb_h.ccb_state = CD_CCB_MEDIA_PREVENT; xpt_action(start_ccb); break; } case CD_STATE_MEDIA_TOC_HDR: { struct ioc_toc_header *toch; bzero(&softc->toc, sizeof(softc->toc)); toch = &softc->toc.header; scsi_read_toc(&start_ccb->csio, /*retries*/ cd_retry_count, /*cbfcnp*/ cddone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*byte1_flags*/ 0, /*format*/ SRTOC_FORMAT_TOC, /*track*/ 0, /*data_ptr*/ (uint8_t *)toch, /*dxfer_len*/ sizeof(*toch), /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ 50000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = CD_CCB_MEDIA_TOC_HDR; xpt_action(start_ccb); break; } case CD_STATE_MEDIA_TOC_FULL: { bzero(&softc->toc, sizeof(softc->toc)); scsi_read_toc(&start_ccb->csio, /*retries*/ cd_retry_count, /*cbfcnp*/ cddone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*byte1_flags*/ 0, /*format*/ SRTOC_FORMAT_TOC, /*track*/ 0, /*data_ptr*/ (uint8_t *)&softc->toc, /*dxfer_len*/ softc->toc_read_len ? softc->toc_read_len : sizeof(softc->toc), /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ 50000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = CD_CCB_MEDIA_TOC_FULL; xpt_action(start_ccb); break; } case CD_STATE_MEDIA_TOC_LEAD: { struct cd_toc_single *leadout; leadout = &softc->leadout; bzero(leadout, sizeof(*leadout)); scsi_read_toc(&start_ccb->csio, /*retries*/ cd_retry_count, /*cbfcnp*/ cddone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*byte1_flags*/ CD_MSF, /*format*/ SRTOC_FORMAT_TOC, /*track*/ LEADOUT, /*data_ptr*/ (uint8_t *)leadout, /*dxfer_len*/ sizeof(*leadout), /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ 50000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = CD_CCB_MEDIA_TOC_LEAD; xpt_action(start_ccb); break; } } } static void cddone(struct cam_periph *periph, union ccb *done_ccb) { struct cd_softc *softc; struct ccb_scsiio *csio; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("entering cddone\n")); cam_periph_assert(periph, MA_OWNED); softc = (struct cd_softc *)periph->softc; csio = &done_ccb->csio; switch (csio->ccb_h.ccb_state & CD_CCB_TYPE_MASK) { case CD_CCB_BUFFER_IO: { struct bio *bp; int error; bp = (struct bio *)done_ccb->ccb_h.ccb_bp; error = 0; if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { int sf; if ((done_ccb->ccb_h.ccb_state & CD_CCB_RETRY_UA) != 0) sf = SF_RETRY_UA; else sf = 0; error = cderror(done_ccb, CAM_RETRY_SELTO, sf); if (error == ERESTART) { /* * A retry was scheuled, so * just return. */ return; } } if (error != 0) { xpt_print(periph->path, "cddone: got error %#x back\n", error); bioq_flush(&softc->bio_queue, NULL, EIO); bp->bio_resid = bp->bio_bcount; bp->bio_error = error; bp->bio_flags |= BIO_ERROR; if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } else { bp->bio_resid = csio->resid; bp->bio_error = 0; if (bp->bio_resid != 0) { /* * Short transfer ??? * XXX: not sure this is correct for partial * transfers at EOM */ bp->bio_flags |= BIO_ERROR; } } LIST_REMOVE(&done_ccb->ccb_h, periph_links.le); softc->outstanding_cmds--; biofinish(bp, NULL, 0); break; } case CD_CCB_PROBE: { struct scsi_read_capacity_data *rdcap; char *announce_buf; struct cd_params *cdp; int error; cdp = &softc->params; announce_buf = softc->announce_temp; bzero(announce_buf, CD_ANNOUNCETMP_SZ); rdcap = (struct scsi_read_capacity_data *)csio->data_ptr; cdp->disksize = scsi_4btoul (rdcap->addr) + 1; cdp->blksize = scsi_4btoul (rdcap->length); /* * Retry any UNIT ATTENTION type errors. They * are expected at boot. */ if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP || (error = cderror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA | SF_NO_PRINT)) == 0) { snprintf(announce_buf, CD_ANNOUNCETMP_SZ, "%juMB (%ju %u byte sectors)", ((uintmax_t)cdp->disksize * cdp->blksize) / (1024 * 1024), (uintmax_t)cdp->disksize, cdp->blksize); } else { if (error == ERESTART) { /* * A retry was scheuled, so * just return. */ return; } else { int asc, ascq; int sense_key, error_code; int have_sense; cam_status status; struct ccb_getdev cgd; /* Don't wedge this device's queue */ if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); status = done_ccb->ccb_h.status; xpt_setup_ccb(&cgd.ccb_h, done_ccb->ccb_h.path, CAM_PRIORITY_NORMAL); cgd.ccb_h.func_code = XPT_GDEV_TYPE; xpt_action((union ccb *)&cgd); if (scsi_extract_sense_ccb(done_ccb, &error_code, &sense_key, &asc, &ascq)) have_sense = TRUE; else have_sense = FALSE; /* * Attach to anything that claims to be a * CDROM or WORM device, as long as it * doesn't return a "Logical unit not * supported" (0x25) error. */ if ((have_sense) && (asc != 0x25) && (error_code == SSD_CURRENT_ERROR || error_code == SSD_DESC_CURRENT_ERROR)) { const char *sense_key_desc; const char *asc_desc; scsi_sense_desc(sense_key, asc, ascq, &cgd.inq_data, &sense_key_desc, &asc_desc); snprintf(announce_buf, CD_ANNOUNCETMP_SZ, "Attempt to query device " "size failed: %s, %s", sense_key_desc, asc_desc); } else if ((have_sense == 0) && ((status & CAM_STATUS_MASK) == CAM_SCSI_STATUS_ERROR) && (csio->scsi_status == SCSI_STATUS_BUSY)) { snprintf(announce_buf, CD_ANNOUNCETMP_SZ, "Attempt to query device " "size failed: SCSI Status: %s", scsi_status_string(csio)); } else if (SID_TYPE(&cgd.inq_data) == T_CDROM) { /* * We only print out an error for * CDROM type devices. For WORM * devices, we don't print out an * error since a few WORM devices * don't support CDROM commands. * If we have sense information, go * ahead and print it out. * Otherwise, just say that we * couldn't attach. */ /* * Just print out the error, not * the full probe message, when we * don't attach. */ if (have_sense) scsi_sense_print( &done_ccb->csio); else { xpt_print(periph->path, "got CAM status %#x\n", done_ccb->ccb_h.status); } xpt_print(periph->path, "fatal error, " "failed to attach to device\n"); /* * Invalidate this peripheral. */ cam_periph_invalidate(periph); announce_buf = NULL; } else { /* * Invalidate this peripheral. */ cam_periph_invalidate(periph); announce_buf = NULL; } } } free(rdcap, M_SCSICD); if (announce_buf != NULL) { struct sbuf sb; sbuf_new(&sb, softc->announce_buf, CD_ANNOUNCE_SZ, SBUF_FIXEDLEN); xpt_announce_periph_sbuf(periph, &sb, announce_buf); xpt_announce_quirks_sbuf(periph, &sb, softc->quirks, CD_Q_BIT_STRING); sbuf_finish(&sb); sbuf_putbuf(&sb); /* * Create our sysctl variables, now that we know * we have successfully attached. */ taskqueue_enqueue(taskqueue_thread,&softc->sysctl_task); } softc->state = CD_STATE_NORMAL; /* * Since our peripheral may be invalidated by an error * above or an external event, we must release our CCB * before releasing the probe lock on the peripheral. * The peripheral will only go away once the last lock * is removed, and we need it around for the CCB release * operation. */ xpt_release_ccb(done_ccb); cam_periph_unhold(periph); return; } case CD_CCB_TUR: { if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { if (cderror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA | SF_NO_RECOVERY | SF_NO_PRINT) == ERESTART) return; if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } xpt_release_ccb(done_ccb); cam_periph_release_locked(periph); return; } case CD_CCB_MEDIA_ALLOW: case CD_CCB_MEDIA_PREVENT: { int error; int is_prevent; error = 0; if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { error = cderror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA | SF_NO_PRINT); } if (error == ERESTART) return; if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); /* * Note that just like the original cdcheckmedia(), we do * a prevent without failing the whole operation if the * prevent fails. We try, but keep going if it doesn't * work. */ if ((done_ccb->ccb_h.ccb_state & CD_CCB_TYPE_MASK) == CD_CCB_MEDIA_PREVENT) is_prevent = 1; else is_prevent = 0; xpt_release_ccb(done_ccb); if (is_prevent != 0) { if (error == 0) softc->flags |= CD_FLAG_DISC_LOCKED; else softc->flags &= ~CD_FLAG_DISC_LOCKED; softc->state = CD_STATE_MEDIA_SIZE; xpt_schedule(periph, CAM_PRIORITY_NORMAL); } else { if (error == 0) softc->flags &= ~CD_FLAG_DISC_LOCKED; softc->state = CD_STATE_NORMAL; if (bioq_first(&softc->bio_queue) != NULL) xpt_schedule(periph, CAM_PRIORITY_NORMAL); } return; } case CD_CCB_MEDIA_SIZE: { struct scsi_read_capacity_data *rdcap; int error; error = 0; if ((csio->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { error = cderror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA | SF_NO_PRINT); } if (error == ERESTART) return; if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); rdcap = (struct scsi_read_capacity_data *)csio->data_ptr; if (error == 0) { softc->params.disksize =scsi_4btoul(rdcap->addr) + 1; softc->params.blksize = scsi_4btoul(rdcap->length); /* Make sure we got at least some block size. */ if (softc->params.blksize == 0) error = EIO; /* * SCSI-3 mandates that the reported blocksize shall be * 2048. Older drives sometimes report funny values, * trim it down to 2048, or other parts of the kernel * will get confused. * * XXX we leave drives alone that might report 512 * bytes, as well as drives reporting more weird * sizes like perhaps 4K. */ if (softc->params.blksize > 2048 && softc->params.blksize <= 2352) softc->params.blksize = 2048; } free(rdcap, M_SCSICD); if (error == 0) { softc->disk->d_sectorsize = softc->params.blksize; softc->disk->d_mediasize = (off_t)softc->params.blksize * softc->params.disksize; softc->flags |= CD_FLAG_SAW_MEDIA | CD_FLAG_VALID_MEDIA; softc->state = CD_STATE_MEDIA_TOC_HDR; } else { softc->flags &= ~(CD_FLAG_VALID_MEDIA | CD_FLAG_VALID_TOC); bioq_flush(&softc->bio_queue, NULL, EINVAL); softc->state = CD_STATE_MEDIA_ALLOW; cdmediaprobedone(periph); } xpt_release_ccb(done_ccb); xpt_schedule(periph, CAM_PRIORITY_NORMAL); return; } case CD_CCB_MEDIA_TOC_HDR: case CD_CCB_MEDIA_TOC_FULL: case CD_CCB_MEDIA_TOC_LEAD: { int error; struct ioc_toc_header *toch; int num_entries; int cdindex; error = 0; if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { error = cderror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA | SF_NO_PRINT); } if (error == ERESTART) return; if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); /* * We will get errors here for media that doesn't have a table * of contents. According to the MMC-3 spec: "When a Read * TOC/PMA/ATIP command is presented for a DDCD/CD-R/RW media, * where the first TOC has not been recorded (no complete * session) and the Format codes 0000b, 0001b, or 0010b are * specified, this command shall be rejected with an INVALID * FIELD IN CDB. Devices that are not capable of reading an * incomplete session on DDC/CD-R/RW media shall report * CANNOT READ MEDIUM - INCOMPATIBLE FORMAT." * * So this isn't fatal if we can't read the table of contents, * it just means that the user won't be able to issue the * play tracks ioctl, and likely lots of other stuff won't * work either. They need to burn the CD before we can do * a whole lot with it. So we don't print anything here if * we get an error back. * * We also bail out if the drive doesn't at least give us * the full TOC header. */ if ((error != 0) || ((csio->dxfer_len - csio->resid) < sizeof(struct ioc_toc_header))) { softc->flags &= ~CD_FLAG_VALID_TOC; bzero(&softc->toc, sizeof(softc->toc)); /* * Failing the TOC read is not an error. */ softc->state = CD_STATE_NORMAL; xpt_release_ccb(done_ccb); cdmediaprobedone(periph); /* * Go ahead and schedule I/O execution if there is * anything in the queue. It'll probably get * kicked out with an error. */ if (bioq_first(&softc->bio_queue) != NULL) xpt_schedule(periph, CAM_PRIORITY_NORMAL); return; } /* * Note that this is NOT the storage location used for the * leadout! */ toch = &softc->toc.header; if (softc->quirks & CD_Q_BCD_TRACKS) { toch->starting_track = bcd2bin(toch->starting_track); toch->ending_track = bcd2bin(toch->ending_track); } /* Number of TOC entries, plus leadout */ num_entries = toch->ending_track - toch->starting_track + 2; cdindex = toch->starting_track + num_entries - 1; if ((done_ccb->ccb_h.ccb_state & CD_CCB_TYPE_MASK) == CD_CCB_MEDIA_TOC_HDR) { if (num_entries <= 0 || num_entries > nitems(softc->toc.entries)) { softc->flags &= ~CD_FLAG_VALID_TOC; bzero(&softc->toc, sizeof(softc->toc)); /* * Failing the TOC read is not an error. */ softc->state = CD_STATE_NORMAL; xpt_release_ccb(done_ccb); cdmediaprobedone(periph); /* * Go ahead and schedule I/O execution if * there is anything in the queue. It'll * probably get kicked out with an error. */ if (bioq_first(&softc->bio_queue) != NULL) xpt_schedule(periph, CAM_PRIORITY_NORMAL); } else { softc->toc_read_len = num_entries * sizeof(struct cd_toc_entry); softc->toc_read_len += sizeof(*toch); softc->state = CD_STATE_MEDIA_TOC_FULL; xpt_release_ccb(done_ccb); xpt_schedule(periph, CAM_PRIORITY_NORMAL); } return; } else if ((done_ccb->ccb_h.ccb_state & CD_CCB_TYPE_MASK) == CD_CCB_MEDIA_TOC_LEAD) { struct cd_toc_single *leadout; leadout = (struct cd_toc_single *)csio->data_ptr; softc->toc.entries[cdindex - toch->starting_track] = leadout->entry; } else if (((done_ccb->ccb_h.ccb_state & CD_CCB_TYPE_MASK) == CD_CCB_MEDIA_TOC_FULL) && (cdindex == toch->ending_track + 1)) { /* * XXX KDM is this necessary? Probably only if the * drive doesn't return leadout information with the * table of contents. */ softc->state = CD_STATE_MEDIA_TOC_LEAD; xpt_release_ccb(done_ccb); xpt_schedule(periph, CAM_PRIORITY_NORMAL); return; } if (softc->quirks & CD_Q_BCD_TRACKS) { for (cdindex = 0; cdindex < num_entries - 1; cdindex++){ softc->toc.entries[cdindex].track = bcd2bin(softc->toc.entries[cdindex].track); } } softc->flags |= CD_FLAG_VALID_TOC; /* If the first track is audio, correct sector size. */ if ((softc->toc.entries[0].control & 4) == 0) { softc->disk->d_sectorsize =softc->params.blksize = 2352; softc->disk->d_mediasize = (off_t)softc->params.blksize * softc->params.disksize; } softc->state = CD_STATE_NORMAL; /* * We unconditionally (re)set the blocksize each time the * CD device is opened. This is because the CD can change, * and therefore the blocksize might change. * XXX problems here if some slice or partition is still * open with the old size? */ if ((softc->disk->d_devstat->flags & DEVSTAT_BS_UNAVAILABLE)!=0) softc->disk->d_devstat->flags &= ~DEVSTAT_BS_UNAVAILABLE; softc->disk->d_devstat->block_size = softc->params.blksize; xpt_release_ccb(done_ccb); cdmediaprobedone(periph); if (bioq_first(&softc->bio_queue) != NULL) xpt_schedule(periph, CAM_PRIORITY_NORMAL); return; } default: break; } xpt_release_ccb(done_ccb); } static union cd_pages * cdgetpage(struct cd_mode_params *mode_params) { union cd_pages *page; if (mode_params->cdb_size == 10) page = (union cd_pages *)find_mode_page_10( (struct scsi_mode_header_10 *)mode_params->mode_buf); else page = (union cd_pages *)find_mode_page_6( (struct scsi_mode_header_6 *)mode_params->mode_buf); return (page); } static int cdgetpagesize(int page_num) { u_int i; for (i = 0; i < nitems(cd_page_size_table); i++) { if (cd_page_size_table[i].page == page_num) return (cd_page_size_table[i].page_size); } return (-1); } static struct cd_toc_entry * te_data_get_ptr(void *irtep, u_long cmd) { union { struct ioc_read_toc_entry irte; #ifdef COMPAT_FREEBSD32 struct ioc_read_toc_entry32 irte32; #endif } *irteup; irteup = irtep; switch (IOCPARM_LEN(cmd)) { case sizeof(irteup->irte): return (irteup->irte.data); #ifdef COMPAT_FREEBSD32 case sizeof(irteup->irte32): return ((struct cd_toc_entry *)(uintptr_t)irteup->irte32.data); #endif default: panic("Unhandled ioctl command %ld", cmd); } } static int cdioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td) { struct cam_periph *periph; struct cd_softc *softc; int error = 0; periph = (struct cam_periph *)dp->d_drv1; cam_periph_lock(periph); softc = (struct cd_softc *)periph->softc; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("cdioctl(%#lx)\n", cmd)); if ((error = cam_periph_hold(periph, PRIBIO | PCATCH)) != 0) { cam_periph_unlock(periph); cam_periph_release(periph); return (error); } /* * If we don't have media loaded, check for it. If still don't * have media loaded, we can only do a load or eject. * * We only care whether media is loaded if this is a cd-specific ioctl * (thus the IOCGROUP check below). Note that this will break if * anyone adds any ioctls into the switch statement below that don't * have their ioctl group set to 'c'. */ if (((softc->flags & CD_FLAG_VALID_MEDIA) == 0) && ((cmd != CDIOCCLOSE) && (cmd != CDIOCEJECT)) && (IOCGROUP(cmd) == 'c')) { error = cdcheckmedia(periph, /*do_wait*/ 1); if (error != 0) { cam_periph_unhold(periph); cam_periph_unlock(periph); return (error); } } /* * Drop the lock here so later mallocs can use WAITOK. The periph * is essentially locked still with the cam_periph_hold call above. */ cam_periph_unlock(periph); switch (cmd) { case CDIOCPLAYTRACKS: { struct ioc_play_track *args = (struct ioc_play_track *) addr; struct cd_mode_params params; union cd_pages *page; params.alloc_len = sizeof(union cd_mode_data_6_10); params.mode_buf = malloc(params.alloc_len, M_SCSICD, M_WAITOK | M_ZERO); cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE, ("trying to do CDIOCPLAYTRACKS\n")); error = cdgetmode(periph, ¶ms, AUDIO_PAGE); if (error) { free(params.mode_buf, M_SCSICD); cam_periph_unlock(periph); break; } page = cdgetpage(¶ms); page->audio.flags &= ~CD_PA_SOTC; page->audio.flags |= CD_PA_IMMED; error = cdsetmode(periph, ¶ms); free(params.mode_buf, M_SCSICD); if (error) { cam_periph_unlock(periph); break; } /* * This was originally implemented with the PLAY * AUDIO TRACK INDEX command, but that command was * deprecated after SCSI-2. Most (all?) SCSI CDROM * drives support it but ATAPI and ATAPI-derivative * drives don't seem to support it. So we keep a * cache of the table of contents and translate * track numbers to MSF format. */ if (softc->flags & CD_FLAG_VALID_TOC) { union msf_lba *sentry, *eentry; struct ioc_toc_header *th; int st, et; th = &softc->toc.header; if (args->end_track < th->ending_track + 1) args->end_track++; if (args->end_track > th->ending_track + 1) args->end_track = th->ending_track + 1; st = args->start_track - th->starting_track; et = args->end_track - th->starting_track; if (st < 0 || et < 0 || st > th->ending_track - th->starting_track || et > th->ending_track - th->starting_track) { error = EINVAL; cam_periph_unlock(periph); break; } sentry = &softc->toc.entries[st].addr; eentry = &softc->toc.entries[et].addr; error = cdplaymsf(periph, sentry->msf.minute, sentry->msf.second, sentry->msf.frame, eentry->msf.minute, eentry->msf.second, eentry->msf.frame); } else { /* * If we don't have a valid TOC, try the * play track index command. It is part of * the SCSI-2 spec, but was removed in the * MMC specs. ATAPI and ATAPI-derived * drives don't support it. */ if (softc->quirks & CD_Q_BCD_TRACKS) { args->start_track = bin2bcd(args->start_track); args->end_track = bin2bcd(args->end_track); } error = cdplaytracks(periph, args->start_track, args->start_index, args->end_track, args->end_index); } cam_periph_unlock(periph); } break; case CDIOCPLAYMSF: { struct ioc_play_msf *args = (struct ioc_play_msf *) addr; struct cd_mode_params params; union cd_pages *page; params.alloc_len = sizeof(union cd_mode_data_6_10); params.mode_buf = malloc(params.alloc_len, M_SCSICD, M_WAITOK | M_ZERO); cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE, ("trying to do CDIOCPLAYMSF\n")); error = cdgetmode(periph, ¶ms, AUDIO_PAGE); if (error) { free(params.mode_buf, M_SCSICD); cam_periph_unlock(periph); break; } page = cdgetpage(¶ms); page->audio.flags &= ~CD_PA_SOTC; page->audio.flags |= CD_PA_IMMED; error = cdsetmode(periph, ¶ms); free(params.mode_buf, M_SCSICD); if (error) { cam_periph_unlock(periph); break; } error = cdplaymsf(periph, args->start_m, args->start_s, args->start_f, args->end_m, args->end_s, args->end_f); cam_periph_unlock(periph); } break; case CDIOCPLAYBLOCKS: { struct ioc_play_blocks *args = (struct ioc_play_blocks *) addr; struct cd_mode_params params; union cd_pages *page; params.alloc_len = sizeof(union cd_mode_data_6_10); params.mode_buf = malloc(params.alloc_len, M_SCSICD, M_WAITOK | M_ZERO); cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE, ("trying to do CDIOCPLAYBLOCKS\n")); error = cdgetmode(periph, ¶ms, AUDIO_PAGE); if (error) { free(params.mode_buf, M_SCSICD); cam_periph_unlock(periph); break; } page = cdgetpage(¶ms); page->audio.flags &= ~CD_PA_SOTC; page->audio.flags |= CD_PA_IMMED; error = cdsetmode(periph, ¶ms); free(params.mode_buf, M_SCSICD); if (error) { cam_periph_unlock(periph); break; } error = cdplay(periph, args->blk, args->len); cam_periph_unlock(periph); } break; case CDIOCREADSUBCHANNEL: { struct ioc_read_subchannel *args = (struct ioc_read_subchannel *) addr; struct cd_sub_channel_info *data; u_int32_t len = args->data_len; data = malloc(sizeof(struct cd_sub_channel_info), M_SCSICD, M_WAITOK | M_ZERO); cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE, ("trying to do CDIOCREADSUBCHANNEL\n")); if ((len > sizeof(struct cd_sub_channel_info)) || (len < sizeof(struct cd_sub_channel_header))) { printf( "scsi_cd: cdioctl: " "cdioreadsubchannel: error, len=%d\n", len); error = EINVAL; free(data, M_SCSICD); cam_periph_unlock(periph); break; } if (softc->quirks & CD_Q_BCD_TRACKS) args->track = bin2bcd(args->track); error = cdreadsubchannel(periph, args->address_format, args->data_format, args->track, data, len); if (error) { free(data, M_SCSICD); cam_periph_unlock(periph); break; } if (softc->quirks & CD_Q_BCD_TRACKS) data->what.track_info.track_number = bcd2bin(data->what.track_info.track_number); len = min(len, ((data->header.data_len[0] << 8) + data->header.data_len[1] + sizeof(struct cd_sub_channel_header))); cam_periph_unlock(periph); error = copyout(data, args->data, len); free(data, M_SCSICD); } break; case CDIOREADTOCHEADER: { struct ioc_toc_header *th; th = malloc(sizeof(struct ioc_toc_header), M_SCSICD, M_WAITOK | M_ZERO); cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE, ("trying to do CDIOREADTOCHEADER\n")); error = cdreadtoc(periph, 0, 0, (u_int8_t *)th, sizeof (*th), /*sense_flags*/SF_NO_PRINT); if (error) { free(th, M_SCSICD); cam_periph_unlock(periph); break; } if (softc->quirks & CD_Q_BCD_TRACKS) { /* we are going to have to convert the BCD * encoding on the cd to what is expected */ th->starting_track = bcd2bin(th->starting_track); th->ending_track = bcd2bin(th->ending_track); } th->len = ntohs(th->len); bcopy(th, addr, sizeof(*th)); free(th, M_SCSICD); cam_periph_unlock(periph); } break; case CDIOREADTOCENTRYS: #ifdef COMPAT_FREEBSD32 case CDIOREADTOCENTRYS_32: #endif { struct cd_tocdata *data; struct cd_toc_single *lead; struct ioc_read_toc_entry *te = (struct ioc_read_toc_entry *) addr; struct ioc_toc_header *th; u_int32_t len, readlen, idx, num; u_int32_t starting_track = te->starting_track; data = malloc(sizeof(*data), M_SCSICD, M_WAITOK | M_ZERO); lead = malloc(sizeof(*lead), M_SCSICD, M_WAITOK | M_ZERO); cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE, ("trying to do CDIOREADTOCENTRYS\n")); if (te->data_len < sizeof(struct cd_toc_entry) || (te->data_len % sizeof(struct cd_toc_entry)) != 0 || (te->address_format != CD_MSF_FORMAT && te->address_format != CD_LBA_FORMAT)) { error = EINVAL; printf("scsi_cd: error in readtocentries, " "returning EINVAL\n"); free(data, M_SCSICD); free(lead, M_SCSICD); cam_periph_unlock(periph); break; } th = &data->header; error = cdreadtoc(periph, 0, 0, (u_int8_t *)th, sizeof (*th), /*sense_flags*/0); if (error) { free(data, M_SCSICD); free(lead, M_SCSICD); cam_periph_unlock(periph); break; } if (softc->quirks & CD_Q_BCD_TRACKS) { /* we are going to have to convert the BCD * encoding on the cd to what is expected */ th->starting_track = bcd2bin(th->starting_track); th->ending_track = bcd2bin(th->ending_track); } if (starting_track == 0) starting_track = th->starting_track; else if (starting_track == LEADOUT) starting_track = th->ending_track + 1; else if (starting_track < th->starting_track || starting_track > th->ending_track + 1) { printf("scsi_cd: error in readtocentries, " "returning EINVAL\n"); free(data, M_SCSICD); free(lead, M_SCSICD); cam_periph_unlock(periph); error = EINVAL; break; } /* calculate reading length without leadout entry */ readlen = (th->ending_track - starting_track + 1) * sizeof(struct cd_toc_entry); /* and with leadout entry */ len = readlen + sizeof(struct cd_toc_entry); if (te->data_len < len) { len = te->data_len; if (readlen > len) readlen = len; } if (len > sizeof(data->entries)) { printf("scsi_cd: error in readtocentries, " "returning EINVAL\n"); error = EINVAL; free(data, M_SCSICD); free(lead, M_SCSICD); cam_periph_unlock(periph); break; } num = len / sizeof(struct cd_toc_entry); if (readlen > 0) { error = cdreadtoc(periph, te->address_format, starting_track, (u_int8_t *)data, readlen + sizeof (*th), /*sense_flags*/0); if (error) { free(data, M_SCSICD); free(lead, M_SCSICD); cam_periph_unlock(periph); break; } } /* make leadout entry if needed */ idx = starting_track + num - 1; if (softc->quirks & CD_Q_BCD_TRACKS) th->ending_track = bcd2bin(th->ending_track); if (idx == th->ending_track + 1) { error = cdreadtoc(periph, te->address_format, LEADOUT, (u_int8_t *)lead, sizeof(*lead), /*sense_flags*/0); if (error) { free(data, M_SCSICD); free(lead, M_SCSICD); cam_periph_unlock(periph); break; } data->entries[idx - starting_track] = lead->entry; } if (softc->quirks & CD_Q_BCD_TRACKS) { for (idx = 0; idx < num - 1; idx++) { data->entries[idx].track = bcd2bin(data->entries[idx].track); } } cam_periph_unlock(periph); error = copyout(data->entries, te_data_get_ptr(te, cmd), len); free(data, M_SCSICD); free(lead, M_SCSICD); } break; case CDIOREADTOCENTRY: { struct cd_toc_single *data; struct ioc_read_toc_single_entry *te = (struct ioc_read_toc_single_entry *) addr; struct ioc_toc_header *th; u_int32_t track; data = malloc(sizeof(*data), M_SCSICD, M_WAITOK | M_ZERO); cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE, ("trying to do CDIOREADTOCENTRY\n")); if (te->address_format != CD_MSF_FORMAT && te->address_format != CD_LBA_FORMAT) { printf("error in readtocentry, " " returning EINVAL\n"); free(data, M_SCSICD); error = EINVAL; cam_periph_unlock(periph); break; } th = &data->header; error = cdreadtoc(periph, 0, 0, (u_int8_t *)th, sizeof (*th), /*sense_flags*/0); if (error) { free(data, M_SCSICD); cam_periph_unlock(periph); break; } if (softc->quirks & CD_Q_BCD_TRACKS) { /* we are going to have to convert the BCD * encoding on the cd to what is expected */ th->starting_track = bcd2bin(th->starting_track); th->ending_track = bcd2bin(th->ending_track); } track = te->track; if (track == 0) track = th->starting_track; else if (track == LEADOUT) /* OK */; else if (track < th->starting_track || track > th->ending_track + 1) { printf("error in readtocentry, " " returning EINVAL\n"); free(data, M_SCSICD); error = EINVAL; cam_periph_unlock(periph); break; } error = cdreadtoc(periph, te->address_format, track, (u_int8_t *)data, sizeof(*data), /*sense_flags*/0); if (error) { free(data, M_SCSICD); cam_periph_unlock(periph); break; } if (softc->quirks & CD_Q_BCD_TRACKS) data->entry.track = bcd2bin(data->entry.track); bcopy(&data->entry, &te->entry, sizeof(struct cd_toc_entry)); free(data, M_SCSICD); cam_periph_unlock(periph); } break; case CDIOCSETPATCH: { struct ioc_patch *arg = (struct ioc_patch *)addr; struct cd_mode_params params; union cd_pages *page; params.alloc_len = sizeof(union cd_mode_data_6_10); params.mode_buf = malloc(params.alloc_len, M_SCSICD, M_WAITOK | M_ZERO); cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE, ("trying to do CDIOCSETPATCH\n")); error = cdgetmode(periph, ¶ms, AUDIO_PAGE); if (error) { free(params.mode_buf, M_SCSICD); cam_periph_unlock(periph); break; } page = cdgetpage(¶ms); page->audio.port[LEFT_PORT].channels = arg->patch[0]; page->audio.port[RIGHT_PORT].channels = arg->patch[1]; page->audio.port[2].channels = arg->patch[2]; page->audio.port[3].channels = arg->patch[3]; error = cdsetmode(periph, ¶ms); free(params.mode_buf, M_SCSICD); cam_periph_unlock(periph); } break; case CDIOCGETVOL: { struct ioc_vol *arg = (struct ioc_vol *) addr; struct cd_mode_params params; union cd_pages *page; params.alloc_len = sizeof(union cd_mode_data_6_10); params.mode_buf = malloc(params.alloc_len, M_SCSICD, M_WAITOK | M_ZERO); cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE, ("trying to do CDIOCGETVOL\n")); error = cdgetmode(periph, ¶ms, AUDIO_PAGE); if (error) { free(params.mode_buf, M_SCSICD); cam_periph_unlock(periph); break; } page = cdgetpage(¶ms); arg->vol[LEFT_PORT] = page->audio.port[LEFT_PORT].volume; arg->vol[RIGHT_PORT] = page->audio.port[RIGHT_PORT].volume; arg->vol[2] = page->audio.port[2].volume; arg->vol[3] = page->audio.port[3].volume; free(params.mode_buf, M_SCSICD); cam_periph_unlock(periph); } break; case CDIOCSETVOL: { struct ioc_vol *arg = (struct ioc_vol *) addr; struct cd_mode_params params; union cd_pages *page; params.alloc_len = sizeof(union cd_mode_data_6_10); params.mode_buf = malloc(params.alloc_len, M_SCSICD, M_WAITOK | M_ZERO); cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE, ("trying to do CDIOCSETVOL\n")); error = cdgetmode(periph, ¶ms, AUDIO_PAGE); if (error) { free(params.mode_buf, M_SCSICD); cam_periph_unlock(periph); break; } page = cdgetpage(¶ms); page->audio.port[LEFT_PORT].channels = CHANNEL_0; page->audio.port[LEFT_PORT].volume = arg->vol[LEFT_PORT]; page->audio.port[RIGHT_PORT].channels = CHANNEL_1; page->audio.port[RIGHT_PORT].volume = arg->vol[RIGHT_PORT]; page->audio.port[2].volume = arg->vol[2]; page->audio.port[3].volume = arg->vol[3]; error = cdsetmode(periph, ¶ms); cam_periph_unlock(periph); free(params.mode_buf, M_SCSICD); } break; case CDIOCSETMONO: { struct cd_mode_params params; union cd_pages *page; params.alloc_len = sizeof(union cd_mode_data_6_10); params.mode_buf = malloc(params.alloc_len, M_SCSICD, M_WAITOK | M_ZERO); cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE, ("trying to do CDIOCSETMONO\n")); error = cdgetmode(periph, ¶ms, AUDIO_PAGE); if (error) { free(params.mode_buf, M_SCSICD); cam_periph_unlock(periph); break; } page = cdgetpage(¶ms); page->audio.port[LEFT_PORT].channels = LEFT_CHANNEL | RIGHT_CHANNEL; page->audio.port[RIGHT_PORT].channels = LEFT_CHANNEL | RIGHT_CHANNEL; page->audio.port[2].channels = 0; page->audio.port[3].channels = 0; error = cdsetmode(periph, ¶ms); cam_periph_unlock(periph); free(params.mode_buf, M_SCSICD); } break; case CDIOCSETSTEREO: { struct cd_mode_params params; union cd_pages *page; params.alloc_len = sizeof(union cd_mode_data_6_10); params.mode_buf = malloc(params.alloc_len, M_SCSICD, M_WAITOK | M_ZERO); cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE, ("trying to do CDIOCSETSTEREO\n")); error = cdgetmode(periph, ¶ms, AUDIO_PAGE); if (error) { free(params.mode_buf, M_SCSICD); cam_periph_unlock(periph); break; } page = cdgetpage(¶ms); page->audio.port[LEFT_PORT].channels = LEFT_CHANNEL; page->audio.port[RIGHT_PORT].channels = RIGHT_CHANNEL; page->audio.port[2].channels = 0; page->audio.port[3].channels = 0; error = cdsetmode(periph, ¶ms); free(params.mode_buf, M_SCSICD); cam_periph_unlock(periph); } break; case CDIOCSETMUTE: { struct cd_mode_params params; union cd_pages *page; params.alloc_len = sizeof(union cd_mode_data_6_10); params.mode_buf = malloc(params.alloc_len, M_SCSICD, M_WAITOK | M_ZERO); cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE, ("trying to do CDIOCSETMUTE\n")); error = cdgetmode(periph, ¶ms, AUDIO_PAGE); if (error) { free(params.mode_buf, M_SCSICD); cam_periph_unlock(periph); break; } page = cdgetpage(¶ms); page->audio.port[LEFT_PORT].channels = 0; page->audio.port[RIGHT_PORT].channels = 0; page->audio.port[2].channels = 0; page->audio.port[3].channels = 0; error = cdsetmode(periph, ¶ms); free(params.mode_buf, M_SCSICD); cam_periph_unlock(periph); } break; case CDIOCSETLEFT: { struct cd_mode_params params; union cd_pages *page; params.alloc_len = sizeof(union cd_mode_data_6_10); params.mode_buf = malloc(params.alloc_len, M_SCSICD, M_WAITOK | M_ZERO); cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE, ("trying to do CDIOCSETLEFT\n")); error = cdgetmode(periph, ¶ms, AUDIO_PAGE); if (error) { free(params.mode_buf, M_SCSICD); cam_periph_unlock(periph); break; } page = cdgetpage(¶ms); page->audio.port[LEFT_PORT].channels = LEFT_CHANNEL; page->audio.port[RIGHT_PORT].channels = LEFT_CHANNEL; page->audio.port[2].channels = 0; page->audio.port[3].channels = 0; error = cdsetmode(periph, ¶ms); free(params.mode_buf, M_SCSICD); cam_periph_unlock(periph); } break; case CDIOCSETRIGHT: { struct cd_mode_params params; union cd_pages *page; params.alloc_len = sizeof(union cd_mode_data_6_10); params.mode_buf = malloc(params.alloc_len, M_SCSICD, M_WAITOK | M_ZERO); cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE, ("trying to do CDIOCSETRIGHT\n")); error = cdgetmode(periph, ¶ms, AUDIO_PAGE); if (error) { free(params.mode_buf, M_SCSICD); cam_periph_unlock(periph); break; } page = cdgetpage(¶ms); page->audio.port[LEFT_PORT].channels = RIGHT_CHANNEL; page->audio.port[RIGHT_PORT].channels = RIGHT_CHANNEL; page->audio.port[2].channels = 0; page->audio.port[3].channels = 0; error = cdsetmode(periph, ¶ms); free(params.mode_buf, M_SCSICD); cam_periph_unlock(periph); } break; case CDIOCRESUME: cam_periph_lock(periph); error = cdpause(periph, 1); cam_periph_unlock(periph); break; case CDIOCPAUSE: cam_periph_lock(periph); error = cdpause(periph, 0); cam_periph_unlock(periph); break; case CDIOCSTART: cam_periph_lock(periph); error = cdstartunit(periph, 0); cam_periph_unlock(periph); break; case CDIOCCLOSE: cam_periph_lock(periph); error = cdstartunit(periph, 1); cam_periph_unlock(periph); break; case CDIOCSTOP: cam_periph_lock(periph); error = cdstopunit(periph, 0); cam_periph_unlock(periph); break; case CDIOCEJECT: cam_periph_lock(periph); error = cdstopunit(periph, 1); cam_periph_unlock(periph); break; case CDIOCALLOW: cam_periph_lock(periph); cdprevent(periph, PR_ALLOW); cam_periph_unlock(periph); break; case CDIOCPREVENT: cam_periph_lock(periph); cdprevent(periph, PR_PREVENT); cam_periph_unlock(periph); break; case CDIOCSETDEBUG: /* sc_link->flags |= (SDEV_DB1 | SDEV_DB2); */ error = ENOTTY; break; case CDIOCCLRDEBUG: /* sc_link->flags &= ~(SDEV_DB1 | SDEV_DB2); */ error = ENOTTY; break; case CDIOCRESET: /* return (cd_reset(periph)); */ error = ENOTTY; break; case CDRIOCREADSPEED: cam_periph_lock(periph); error = cdsetspeed(periph, *(u_int32_t *)addr, CDR_MAX_SPEED); cam_periph_unlock(periph); break; case CDRIOCWRITESPEED: cam_periph_lock(periph); error = cdsetspeed(periph, CDR_MAX_SPEED, *(u_int32_t *)addr); cam_periph_unlock(periph); break; case CDRIOCGETBLOCKSIZE: *(int *)addr = softc->params.blksize; break; case CDRIOCSETBLOCKSIZE: if (*(int *)addr <= 0) { error = EINVAL; break; } softc->disk->d_sectorsize = softc->params.blksize = *(int *)addr; break; case DVDIOCSENDKEY: case DVDIOCREPORTKEY: { struct dvd_authinfo *authinfo; authinfo = (struct dvd_authinfo *)addr; if (cmd == DVDIOCREPORTKEY) error = cdreportkey(periph, authinfo); else error = cdsendkey(periph, authinfo); break; } case DVDIOCREADSTRUCTURE: { struct dvd_struct *dvdstruct; dvdstruct = (struct dvd_struct *)addr; error = cdreaddvdstructure(periph, dvdstruct); break; } default: cam_periph_lock(periph); error = cam_periph_ioctl(periph, cmd, addr, cderror); cam_periph_unlock(periph); break; } cam_periph_lock(periph); cam_periph_unhold(periph); CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("leaving cdioctl\n")); if (error && bootverbose) { printf("scsi_cd.c::ioctl cmd=%08lx error=%d\n", cmd, error); } cam_periph_unlock(periph); return (error); } static void cdprevent(struct cam_periph *periph, int action) { union ccb *ccb; struct cd_softc *softc; int error; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("entering cdprevent\n")); cam_periph_assert(periph, MA_OWNED); softc = (struct cd_softc *)periph->softc; if (((action == PR_ALLOW) && (softc->flags & CD_FLAG_DISC_LOCKED) == 0) || ((action == PR_PREVENT) && (softc->flags & CD_FLAG_DISC_LOCKED) != 0)) { return; } ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); scsi_prevent(&ccb->csio, /*retries*/ cd_retry_count, /*cbfcnp*/NULL, MSG_SIMPLE_Q_TAG, action, SSD_FULL_SIZE, /* timeout */60000); error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO, /*sense_flags*/SF_RETRY_UA|SF_NO_PRINT); xpt_release_ccb(ccb); if (error == 0) { if (action == PR_ALLOW) softc->flags &= ~CD_FLAG_DISC_LOCKED; else softc->flags |= CD_FLAG_DISC_LOCKED; } } static void cdmediaprobedone(struct cam_periph *periph) { struct cd_softc *softc; cam_periph_assert(periph, MA_OWNED); softc = (struct cd_softc *)periph->softc; softc->flags &= ~CD_FLAG_MEDIA_SCAN_ACT; if ((softc->flags & CD_FLAG_MEDIA_WAIT) != 0) { softc->flags &= ~CD_FLAG_MEDIA_WAIT; wakeup(&softc->toc); } } /* * XXX: the disk media and sector size is only really able to change * XXX: while the device is closed. */ static int cdcheckmedia(struct cam_periph *periph, int do_wait) { struct cd_softc *softc; int error; cam_periph_assert(periph, MA_OWNED); softc = (struct cd_softc *)periph->softc; error = 0; if ((do_wait != 0) && ((softc->flags & CD_FLAG_MEDIA_WAIT) == 0)) { softc->flags |= CD_FLAG_MEDIA_WAIT; } if ((softc->flags & CD_FLAG_MEDIA_SCAN_ACT) == 0) { softc->state = CD_STATE_MEDIA_PREVENT; softc->flags |= CD_FLAG_MEDIA_SCAN_ACT; xpt_schedule(periph, CAM_PRIORITY_NORMAL); } if (do_wait == 0) goto bailout; error = msleep(&softc->toc, cam_periph_mtx(periph), PRIBIO,"cdmedia",0); if (error != 0) goto bailout; /* * Check to see whether we have a valid size from the media. We * may or may not have a valid TOC. */ if ((softc->flags & CD_FLAG_VALID_MEDIA) == 0) error = EINVAL; bailout: return (error); } #if 0 static int cdcheckmedia(struct cam_periph *periph) { struct cd_softc *softc; struct ioc_toc_header *toch; struct cd_toc_single leadout; u_int32_t size, toclen; int error, num_entries, cdindex; softc = (struct cd_softc *)periph->softc; cdprevent(periph, PR_PREVENT); softc->disk->d_sectorsize = 2048; softc->disk->d_mediasize = 0; /* * Get the disc size and block size. If we can't get it, we don't * have media, most likely. */ if ((error = cdsize(periph, &size)) != 0) { softc->flags &= ~(CD_FLAG_VALID_MEDIA|CD_FLAG_VALID_TOC); cdprevent(periph, PR_ALLOW); return (error); } else { softc->flags |= CD_FLAG_SAW_MEDIA | CD_FLAG_VALID_MEDIA; softc->disk->d_sectorsize = softc->params.blksize; softc->disk->d_mediasize = (off_t)softc->params.blksize * softc->params.disksize; } /* * Now we check the table of contents. This (currently) is only * used for the CDIOCPLAYTRACKS ioctl. It may be used later to do * things like present a separate entry in /dev for each track, * like that acd(4) driver does. */ bzero(&softc->toc, sizeof(softc->toc)); toch = &softc->toc.header; /* * We will get errors here for media that doesn't have a table of * contents. According to the MMC-3 spec: "When a Read TOC/PMA/ATIP * command is presented for a DDCD/CD-R/RW media, where the first TOC * has not been recorded (no complete session) and the Format codes * 0000b, 0001b, or 0010b are specified, this command shall be rejected * with an INVALID FIELD IN CDB. Devices that are not capable of * reading an incomplete session on DDC/CD-R/RW media shall report * CANNOT READ MEDIUM - INCOMPATIBLE FORMAT." * * So this isn't fatal if we can't read the table of contents, it * just means that the user won't be able to issue the play tracks * ioctl, and likely lots of other stuff won't work either. They * need to burn the CD before we can do a whole lot with it. So * we don't print anything here if we get an error back. */ error = cdreadtoc(periph, 0, 0, (u_int8_t *)toch, sizeof(*toch), SF_NO_PRINT); /* * Errors in reading the table of contents aren't fatal, we just * won't have a valid table of contents cached. */ if (error != 0) { error = 0; bzero(&softc->toc, sizeof(softc->toc)); goto bailout; } if (softc->quirks & CD_Q_BCD_TRACKS) { toch->starting_track = bcd2bin(toch->starting_track); toch->ending_track = bcd2bin(toch->ending_track); } /* Number of TOC entries, plus leadout */ num_entries = (toch->ending_track - toch->starting_track) + 2; if (num_entries <= 0) goto bailout; toclen = num_entries * sizeof(struct cd_toc_entry); error = cdreadtoc(periph, CD_MSF_FORMAT, toch->starting_track, (u_int8_t *)&softc->toc, toclen + sizeof(*toch), SF_NO_PRINT); if (error != 0) { error = 0; bzero(&softc->toc, sizeof(softc->toc)); goto bailout; } if (softc->quirks & CD_Q_BCD_TRACKS) { toch->starting_track = bcd2bin(toch->starting_track); toch->ending_track = bcd2bin(toch->ending_track); } /* * XXX KDM is this necessary? Probably only if the drive doesn't * return leadout information with the table of contents. */ cdindex = toch->starting_track + num_entries -1; if (cdindex == toch->ending_track + 1) { error = cdreadtoc(periph, CD_MSF_FORMAT, LEADOUT, (u_int8_t *)&leadout, sizeof(leadout), SF_NO_PRINT); if (error != 0) { error = 0; goto bailout; } softc->toc.entries[cdindex - toch->starting_track] = leadout.entry; } if (softc->quirks & CD_Q_BCD_TRACKS) { for (cdindex = 0; cdindex < num_entries - 1; cdindex++) { softc->toc.entries[cdindex].track = bcd2bin(softc->toc.entries[cdindex].track); } } softc->flags |= CD_FLAG_VALID_TOC; /* If the first track is audio, correct sector size. */ if ((softc->toc.entries[0].control & 4) == 0) { softc->disk->d_sectorsize = softc->params.blksize = 2352; softc->disk->d_mediasize = (off_t)softc->params.blksize * softc->params.disksize; } bailout: /* * We unconditionally (re)set the blocksize each time the * CD device is opened. This is because the CD can change, * and therefore the blocksize might change. * XXX problems here if some slice or partition is still * open with the old size? */ if ((softc->disk->d_devstat->flags & DEVSTAT_BS_UNAVAILABLE) != 0) softc->disk->d_devstat->flags &= ~DEVSTAT_BS_UNAVAILABLE; softc->disk->d_devstat->block_size = softc->params.blksize; return (error); } static int cdsize(struct cam_periph *periph, u_int32_t *size) { struct cd_softc *softc; union ccb *ccb; struct scsi_read_capacity_data *rcap_buf; int error; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("entering cdsize\n")); softc = (struct cd_softc *)periph->softc; ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); /* XXX Should be M_WAITOK */ rcap_buf = malloc(sizeof(struct scsi_read_capacity_data), M_SCSICD, M_NOWAIT | M_ZERO); if (rcap_buf == NULL) return (ENOMEM); scsi_read_capacity(&ccb->csio, /*retries*/ cd_retry_count, /*cbfcnp*/NULL, MSG_SIMPLE_Q_TAG, rcap_buf, SSD_FULL_SIZE, /* timeout */20000); error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO, /*sense_flags*/SF_RETRY_UA|SF_NO_PRINT); xpt_release_ccb(ccb); softc->params.disksize = scsi_4btoul(rcap_buf->addr) + 1; softc->params.blksize = scsi_4btoul(rcap_buf->length); /* Make sure we got at least some block size. */ if (error == 0 && softc->params.blksize == 0) error = EIO; /* * SCSI-3 mandates that the reported blocksize shall be 2048. * Older drives sometimes report funny values, trim it down to * 2048, or other parts of the kernel will get confused. * * XXX we leave drives alone that might report 512 bytes, as * well as drives reporting more weird sizes like perhaps 4K. */ if (softc->params.blksize > 2048 && softc->params.blksize <= 2352) softc->params.blksize = 2048; free(rcap_buf, M_SCSICD); *size = softc->params.disksize; return (error); } #endif static int cd6byteworkaround(union ccb *ccb) { u_int8_t *cdb; struct cam_periph *periph; struct cd_softc *softc; struct cd_mode_params *params; int frozen, found; periph = xpt_path_periph(ccb->ccb_h.path); softc = (struct cd_softc *)periph->softc; cdb = ccb->csio.cdb_io.cdb_bytes; if ((ccb->ccb_h.flags & CAM_CDB_POINTER) || ((cdb[0] != MODE_SENSE_6) && (cdb[0] != MODE_SELECT_6))) return (0); /* * Because there is no convenient place to stash the overall * cd_mode_params structure pointer, we have to grab it like this. * This means that ALL MODE_SENSE and MODE_SELECT requests in the * cd(4) driver MUST go through cdgetmode() and cdsetmode()! * * XXX It would be nice if, at some point, we could increase the * number of available peripheral private pointers. Both pointers * are currently used in most every peripheral driver. */ found = 0; STAILQ_FOREACH(params, &softc->mode_queue, links) { if (params->mode_buf == ccb->csio.data_ptr) { found = 1; break; } } /* * This shouldn't happen. All mode sense and mode select * operations in the cd(4) driver MUST go through cdgetmode() and * cdsetmode()! */ if (found == 0) { xpt_print(periph->path, "mode buffer not found in mode queue!\n"); return (0); } params->cdb_size = 10; softc->minimum_command_size = 10; xpt_print(ccb->ccb_h.path, "%s(6) failed, increasing minimum CDB size to 10 bytes\n", (cdb[0] == MODE_SENSE_6) ? "MODE_SENSE" : "MODE_SELECT"); if (cdb[0] == MODE_SENSE_6) { struct scsi_mode_sense_10 ms10; struct scsi_mode_sense_6 *ms6; int len; ms6 = (struct scsi_mode_sense_6 *)cdb; bzero(&ms10, sizeof(ms10)); ms10.opcode = MODE_SENSE_10; ms10.byte2 = ms6->byte2; ms10.page = ms6->page; /* * 10 byte mode header, block descriptor, * sizeof(union cd_pages) */ len = sizeof(struct cd_mode_data_10); ccb->csio.dxfer_len = len; scsi_ulto2b(len, ms10.length); ms10.control = ms6->control; bcopy(&ms10, cdb, 10); ccb->csio.cdb_len = 10; } else { struct scsi_mode_select_10 ms10; struct scsi_mode_select_6 *ms6; struct scsi_mode_header_6 *header6; struct scsi_mode_header_10 *header10; struct scsi_mode_page_header *page_header; int blk_desc_len, page_num, page_size, len; ms6 = (struct scsi_mode_select_6 *)cdb; bzero(&ms10, sizeof(ms10)); ms10.opcode = MODE_SELECT_10; ms10.byte2 = ms6->byte2; header6 = (struct scsi_mode_header_6 *)params->mode_buf; header10 = (struct scsi_mode_header_10 *)params->mode_buf; page_header = find_mode_page_6(header6); page_num = page_header->page_code; blk_desc_len = header6->blk_desc_len; page_size = cdgetpagesize(page_num); if (page_size != (page_header->page_length + sizeof(*page_header))) page_size = page_header->page_length + sizeof(*page_header); len = sizeof(*header10) + blk_desc_len + page_size; len = min(params->alloc_len, len); /* * Since the 6 byte parameter header is shorter than the 10 * byte parameter header, we need to copy the actual mode * page data, and the block descriptor, if any, so things wind * up in the right place. The regions will overlap, but * bcopy() does the right thing. */ bcopy(params->mode_buf + sizeof(*header6), params->mode_buf + sizeof(*header10), len - sizeof(*header10)); /* Make sure these fields are set correctly. */ scsi_ulto2b(0, header10->data_length); header10->medium_type = 0; scsi_ulto2b(blk_desc_len, header10->blk_desc_len); ccb->csio.dxfer_len = len; scsi_ulto2b(len, ms10.length); ms10.control = ms6->control; bcopy(&ms10, cdb, 10); ccb->csio.cdb_len = 10; } frozen = (ccb->ccb_h.status & CAM_DEV_QFRZN) != 0; ccb->ccb_h.status = CAM_REQUEUE_REQ; xpt_action(ccb); if (frozen) { cam_release_devq(ccb->ccb_h.path, /*relsim_flags*/0, /*openings*/0, /*timeout*/0, /*getcount_only*/0); } return (ERESTART); } static int cderror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags) { struct cd_softc *softc; struct cam_periph *periph; int error, error_code, sense_key, asc, ascq; periph = xpt_path_periph(ccb->ccb_h.path); softc = (struct cd_softc *)periph->softc; cam_periph_assert(periph, MA_OWNED); /* * We use a status of CAM_REQ_INVALID as shorthand -- if a 6 byte * CDB comes back with this particular error, try transforming it * into the 10 byte version. */ error = 0; if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_INVALID) { error = cd6byteworkaround(ccb); } else if (scsi_extract_sense_ccb(ccb, &error_code, &sense_key, &asc, &ascq)) { if (sense_key == SSD_KEY_ILLEGAL_REQUEST) error = cd6byteworkaround(ccb); else if (sense_key == SSD_KEY_UNIT_ATTENTION && asc == 0x28 && ascq == 0x00) disk_media_changed(softc->disk, M_NOWAIT); else if (sense_key == SSD_KEY_NOT_READY && asc == 0x3a && (softc->flags & CD_FLAG_SAW_MEDIA)) { softc->flags &= ~CD_FLAG_SAW_MEDIA; disk_media_gone(softc->disk, M_NOWAIT); } } if (error == ERESTART) return (error); /* * XXX * Until we have a better way of doing pack validation, * don't treat UAs as errors. */ sense_flags |= SF_RETRY_UA; if (softc->quirks & CD_Q_RETRY_BUSY) sense_flags |= SF_RETRY_BUSY; return (cam_periph_error(ccb, cam_flags, sense_flags)); } static void cdmediapoll(void *arg) { struct cam_periph *periph = arg; struct cd_softc *softc = periph->softc; if (softc->state == CD_STATE_NORMAL && !softc->tur && softc->outstanding_cmds == 0) { if (cam_periph_acquire(periph) == 0) { softc->tur = 1; xpt_schedule(periph, CAM_PRIORITY_NORMAL); } } /* Queue us up again */ if (cd_poll_period != 0) { callout_schedule_sbt(&softc->mediapoll_c, cd_poll_period * SBT_1S, 0, C_PREL(1)); } } /* * Read table of contents */ static int cdreadtoc(struct cam_periph *periph, u_int32_t mode, u_int32_t start, u_int8_t *data, u_int32_t len, u_int32_t sense_flags) { struct ccb_scsiio *csio; union ccb *ccb; int error; error = 0; ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); csio = &ccb->csio; scsi_read_toc(csio, /* retries */ cd_retry_count, /* cbfcnp */ NULL, /* tag_action */ MSG_SIMPLE_Q_TAG, /* byte1_flags */ (mode == CD_MSF_FORMAT) ? CD_MSF : 0, /* format */ SRTOC_FORMAT_TOC, /* track*/ start, /* data_ptr */ data, /* dxfer_len */ len, /* sense_len */ SSD_FULL_SIZE, /* timeout */ 50000); error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO, /*sense_flags*/SF_RETRY_UA | sense_flags); xpt_release_ccb(ccb); return(error); } static int cdreadsubchannel(struct cam_periph *periph, u_int32_t mode, u_int32_t format, int track, struct cd_sub_channel_info *data, u_int32_t len) { struct scsi_read_subchannel *scsi_cmd; struct ccb_scsiio *csio; union ccb *ccb; int error; error = 0; ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); csio = &ccb->csio; cam_fill_csio(csio, /* retries */ cd_retry_count, /* cbfcnp */ NULL, /* flags */ CAM_DIR_IN, /* tag_action */ MSG_SIMPLE_Q_TAG, /* data_ptr */ (u_int8_t *)data, /* dxfer_len */ len, /* sense_len */ SSD_FULL_SIZE, sizeof(struct scsi_read_subchannel), /* timeout */ 50000); scsi_cmd = (struct scsi_read_subchannel *)&csio->cdb_io.cdb_bytes; bzero (scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->op_code = READ_SUBCHANNEL; if (mode == CD_MSF_FORMAT) scsi_cmd->byte1 |= CD_MSF; scsi_cmd->byte2 = SRS_SUBQ; scsi_cmd->subchan_format = format; scsi_cmd->track = track; scsi_ulto2b(len, (u_int8_t *)scsi_cmd->data_len); scsi_cmd->control = 0; error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO, /*sense_flags*/SF_RETRY_UA); xpt_release_ccb(ccb); return(error); } /* * All MODE_SENSE requests in the cd(4) driver MUST go through this * routine. See comments in cd6byteworkaround() for details. */ static int cdgetmode(struct cam_periph *periph, struct cd_mode_params *data, u_int32_t page) { struct ccb_scsiio *csio; struct cd_softc *softc; union ccb *ccb; int param_len; int error; softc = (struct cd_softc *)periph->softc; ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); csio = &ccb->csio; data->cdb_size = softc->minimum_command_size; if (data->cdb_size < 10) param_len = sizeof(struct cd_mode_data); else param_len = sizeof(struct cd_mode_data_10); /* Don't say we've got more room than we actually allocated */ param_len = min(param_len, data->alloc_len); scsi_mode_sense_len(csio, /* retries */ cd_retry_count, /* cbfcnp */ NULL, /* tag_action */ MSG_SIMPLE_Q_TAG, /* dbd */ 0, /* page_code */ SMS_PAGE_CTRL_CURRENT, /* page */ page, /* param_buf */ data->mode_buf, /* param_len */ param_len, /* minimum_cmd_size */ softc->minimum_command_size, /* sense_len */ SSD_FULL_SIZE, /* timeout */ 50000); /* * It would be nice not to have to do this, but there's no * available pointer in the CCB that would allow us to stuff the * mode params structure in there and retrieve it in * cd6byteworkaround(), so we can set the cdb size. The cdb size * lets the caller know what CDB size we ended up using, so they * can find the actual mode page offset. */ STAILQ_INSERT_TAIL(&softc->mode_queue, data, links); error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO, /*sense_flags*/SF_RETRY_UA); xpt_release_ccb(ccb); STAILQ_REMOVE(&softc->mode_queue, data, cd_mode_params, links); /* * This is a bit of belt-and-suspenders checking, but if we run * into a situation where the target sends back multiple block * descriptors, we might not have enough space in the buffer to * see the whole mode page. Better to return an error than * potentially access memory beyond our malloced region. */ if (error == 0) { u_int32_t data_len; if (data->cdb_size == 10) { struct scsi_mode_header_10 *hdr10; hdr10 = (struct scsi_mode_header_10 *)data->mode_buf; data_len = scsi_2btoul(hdr10->data_length); data_len += sizeof(hdr10->data_length); } else { struct scsi_mode_header_6 *hdr6; hdr6 = (struct scsi_mode_header_6 *)data->mode_buf; data_len = hdr6->data_length; data_len += sizeof(hdr6->data_length); } /* * Complain if there is more mode data available than we * allocated space for. This could potentially happen if * we miscalculated the page length for some reason, if the * drive returns multiple block descriptors, or if it sets * the data length incorrectly. */ if (data_len > data->alloc_len) { xpt_print(periph->path, "allocated modepage %d length " "%d < returned length %d\n", page, data->alloc_len, data_len); error = ENOSPC; } } return (error); } /* * All MODE_SELECT requests in the cd(4) driver MUST go through this * routine. See comments in cd6byteworkaround() for details. */ static int cdsetmode(struct cam_periph *periph, struct cd_mode_params *data) { struct ccb_scsiio *csio; struct cd_softc *softc; union ccb *ccb; int cdb_size, param_len; int error; softc = (struct cd_softc *)periph->softc; ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); csio = &ccb->csio; error = 0; /* * If the data is formatted for the 10 byte version of the mode * select parameter list, we need to use the 10 byte CDB. * Otherwise, we use whatever the stored minimum command size. */ if (data->cdb_size == 10) cdb_size = data->cdb_size; else cdb_size = softc->minimum_command_size; if (cdb_size >= 10) { struct scsi_mode_header_10 *mode_header; u_int32_t data_len; mode_header = (struct scsi_mode_header_10 *)data->mode_buf; data_len = scsi_2btoul(mode_header->data_length); scsi_ulto2b(0, mode_header->data_length); /* * SONY drives do not allow a mode select with a medium_type * value that has just been returned by a mode sense; use a * medium_type of 0 (Default) instead. */ mode_header->medium_type = 0; /* * Pass back whatever the drive passed to us, plus the size * of the data length field. */ param_len = data_len + sizeof(mode_header->data_length); } else { struct scsi_mode_header_6 *mode_header; mode_header = (struct scsi_mode_header_6 *)data->mode_buf; param_len = mode_header->data_length + 1; mode_header->data_length = 0; /* * SONY drives do not allow a mode select with a medium_type * value that has just been returned by a mode sense; use a * medium_type of 0 (Default) instead. */ mode_header->medium_type = 0; } /* Don't say we've got more room than we actually allocated */ param_len = min(param_len, data->alloc_len); scsi_mode_select_len(csio, /* retries */ cd_retry_count, /* cbfcnp */ NULL, /* tag_action */ MSG_SIMPLE_Q_TAG, /* scsi_page_fmt */ 1, /* save_pages */ 0, /* param_buf */ data->mode_buf, /* param_len */ param_len, /* minimum_cmd_size */ cdb_size, /* sense_len */ SSD_FULL_SIZE, /* timeout */ 50000); /* See comments in cdgetmode() and cd6byteworkaround(). */ STAILQ_INSERT_TAIL(&softc->mode_queue, data, links); error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO, /*sense_flags*/SF_RETRY_UA); xpt_release_ccb(ccb); STAILQ_REMOVE(&softc->mode_queue, data, cd_mode_params, links); return (error); } static int cdplay(struct cam_periph *periph, u_int32_t blk, u_int32_t len) { struct ccb_scsiio *csio; union ccb *ccb; int error; u_int8_t cdb_len; error = 0; ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); csio = &ccb->csio; /* * Use the smallest possible command to perform the operation. */ if ((len & 0xffff0000) == 0) { /* * We can fit in a 10 byte cdb. */ struct scsi_play_10 *scsi_cmd; scsi_cmd = (struct scsi_play_10 *)&csio->cdb_io.cdb_bytes; bzero (scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->op_code = PLAY_10; scsi_ulto4b(blk, (u_int8_t *)scsi_cmd->blk_addr); scsi_ulto2b(len, (u_int8_t *)scsi_cmd->xfer_len); cdb_len = sizeof(*scsi_cmd); } else { struct scsi_play_12 *scsi_cmd; scsi_cmd = (struct scsi_play_12 *)&csio->cdb_io.cdb_bytes; bzero (scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->op_code = PLAY_12; scsi_ulto4b(blk, (u_int8_t *)scsi_cmd->blk_addr); scsi_ulto4b(len, (u_int8_t *)scsi_cmd->xfer_len); cdb_len = sizeof(*scsi_cmd); } cam_fill_csio(csio, /*retries*/ cd_retry_count, /*cbfcnp*/NULL, /*flags*/CAM_DIR_NONE, MSG_SIMPLE_Q_TAG, /*dataptr*/NULL, /*datalen*/0, /*sense_len*/SSD_FULL_SIZE, cdb_len, /*timeout*/50 * 1000); error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO, /*sense_flags*/SF_RETRY_UA); xpt_release_ccb(ccb); return(error); } static int cdplaymsf(struct cam_periph *periph, u_int32_t startm, u_int32_t starts, u_int32_t startf, u_int32_t endm, u_int32_t ends, u_int32_t endf) { struct scsi_play_msf *scsi_cmd; struct ccb_scsiio *csio; union ccb *ccb; int error; error = 0; ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); csio = &ccb->csio; cam_fill_csio(csio, /* retries */ cd_retry_count, /* cbfcnp */ NULL, /* flags */ CAM_DIR_NONE, /* tag_action */ MSG_SIMPLE_Q_TAG, /* data_ptr */ NULL, /* dxfer_len */ 0, /* sense_len */ SSD_FULL_SIZE, sizeof(struct scsi_play_msf), /* timeout */ 50000); scsi_cmd = (struct scsi_play_msf *)&csio->cdb_io.cdb_bytes; bzero (scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->op_code = PLAY_MSF; scsi_cmd->start_m = startm; scsi_cmd->start_s = starts; scsi_cmd->start_f = startf; scsi_cmd->end_m = endm; scsi_cmd->end_s = ends; scsi_cmd->end_f = endf; error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO, /*sense_flags*/SF_RETRY_UA); xpt_release_ccb(ccb); return(error); } static int cdplaytracks(struct cam_periph *periph, u_int32_t strack, u_int32_t sindex, u_int32_t etrack, u_int32_t eindex) { struct scsi_play_track *scsi_cmd; struct ccb_scsiio *csio; union ccb *ccb; int error; error = 0; ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); csio = &ccb->csio; cam_fill_csio(csio, /* retries */ cd_retry_count, /* cbfcnp */ NULL, /* flags */ CAM_DIR_NONE, /* tag_action */ MSG_SIMPLE_Q_TAG, /* data_ptr */ NULL, /* dxfer_len */ 0, /* sense_len */ SSD_FULL_SIZE, sizeof(struct scsi_play_track), /* timeout */ 50000); scsi_cmd = (struct scsi_play_track *)&csio->cdb_io.cdb_bytes; bzero (scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->op_code = PLAY_TRACK; scsi_cmd->start_track = strack; scsi_cmd->start_index = sindex; scsi_cmd->end_track = etrack; scsi_cmd->end_index = eindex; error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO, /*sense_flags*/SF_RETRY_UA); xpt_release_ccb(ccb); return(error); } static int cdpause(struct cam_periph *periph, u_int32_t go) { struct scsi_pause *scsi_cmd; struct ccb_scsiio *csio; union ccb *ccb; int error; error = 0; ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); csio = &ccb->csio; cam_fill_csio(csio, /* retries */ cd_retry_count, /* cbfcnp */ NULL, /* flags */ CAM_DIR_NONE, /* tag_action */ MSG_SIMPLE_Q_TAG, /* data_ptr */ NULL, /* dxfer_len */ 0, /* sense_len */ SSD_FULL_SIZE, sizeof(struct scsi_pause), /* timeout */ 50000); scsi_cmd = (struct scsi_pause *)&csio->cdb_io.cdb_bytes; bzero (scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->op_code = PAUSE; scsi_cmd->resume = go; error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO, /*sense_flags*/SF_RETRY_UA); xpt_release_ccb(ccb); return(error); } static int cdstartunit(struct cam_periph *periph, int load) { union ccb *ccb; int error; error = 0; ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); scsi_start_stop(&ccb->csio, /* retries */ cd_retry_count, /* cbfcnp */ NULL, /* tag_action */ MSG_SIMPLE_Q_TAG, /* start */ TRUE, /* load_eject */ load, /* immediate */ FALSE, /* sense_len */ SSD_FULL_SIZE, /* timeout */ 50000); error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO, /*sense_flags*/SF_RETRY_UA); xpt_release_ccb(ccb); return(error); } static int cdstopunit(struct cam_periph *periph, u_int32_t eject) { union ccb *ccb; int error; error = 0; ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); scsi_start_stop(&ccb->csio, /* retries */ cd_retry_count, /* cbfcnp */ NULL, /* tag_action */ MSG_SIMPLE_Q_TAG, /* start */ FALSE, /* load_eject */ eject, /* immediate */ FALSE, /* sense_len */ SSD_FULL_SIZE, /* timeout */ 50000); error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO, /*sense_flags*/SF_RETRY_UA); xpt_release_ccb(ccb); return(error); } static int cdsetspeed(struct cam_periph *periph, u_int32_t rdspeed, u_int32_t wrspeed) { struct scsi_set_speed *scsi_cmd; struct ccb_scsiio *csio; union ccb *ccb; int error; error = 0; ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); csio = &ccb->csio; /* Preserve old behavior: units in multiples of CDROM speed */ if (rdspeed < 177) rdspeed *= 177; if (wrspeed < 177) wrspeed *= 177; cam_fill_csio(csio, /* retries */ cd_retry_count, /* cbfcnp */ NULL, /* flags */ CAM_DIR_NONE, /* tag_action */ MSG_SIMPLE_Q_TAG, /* data_ptr */ NULL, /* dxfer_len */ 0, /* sense_len */ SSD_FULL_SIZE, sizeof(struct scsi_set_speed), /* timeout */ 50000); scsi_cmd = (struct scsi_set_speed *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = SET_CD_SPEED; scsi_ulto2b(rdspeed, scsi_cmd->readspeed); scsi_ulto2b(wrspeed, scsi_cmd->writespeed); error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO, /*sense_flags*/SF_RETRY_UA); xpt_release_ccb(ccb); return(error); } static int cdreportkey(struct cam_periph *periph, struct dvd_authinfo *authinfo) { union ccb *ccb; u_int8_t *databuf; u_int32_t lba; int error; int length; error = 0; databuf = NULL; lba = 0; switch (authinfo->format) { case DVD_REPORT_AGID: length = sizeof(struct scsi_report_key_data_agid); break; case DVD_REPORT_CHALLENGE: length = sizeof(struct scsi_report_key_data_challenge); break; case DVD_REPORT_KEY1: length = sizeof(struct scsi_report_key_data_key1_key2); break; case DVD_REPORT_TITLE_KEY: length = sizeof(struct scsi_report_key_data_title); /* The lba field is only set for the title key */ lba = authinfo->lba; break; case DVD_REPORT_ASF: length = sizeof(struct scsi_report_key_data_asf); break; case DVD_REPORT_RPC: length = sizeof(struct scsi_report_key_data_rpc); break; case DVD_INVALIDATE_AGID: length = 0; break; default: return (EINVAL); } if (length != 0) { databuf = malloc(length, M_DEVBUF, M_WAITOK | M_ZERO); } else databuf = NULL; cam_periph_lock(periph); ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); scsi_report_key(&ccb->csio, /* retries */ cd_retry_count, /* cbfcnp */ NULL, /* tag_action */ MSG_SIMPLE_Q_TAG, /* lba */ lba, /* agid */ authinfo->agid, /* key_format */ authinfo->format, /* data_ptr */ databuf, /* dxfer_len */ length, /* sense_len */ SSD_FULL_SIZE, /* timeout */ 50000); error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO, /*sense_flags*/SF_RETRY_UA); if (error != 0) goto bailout; if (ccb->csio.resid != 0) { xpt_print(periph->path, "warning, residual for report key " "command is %d\n", ccb->csio.resid); } switch(authinfo->format) { case DVD_REPORT_AGID: { struct scsi_report_key_data_agid *agid_data; agid_data = (struct scsi_report_key_data_agid *)databuf; authinfo->agid = (agid_data->agid & RKD_AGID_MASK) >> RKD_AGID_SHIFT; break; } case DVD_REPORT_CHALLENGE: { struct scsi_report_key_data_challenge *chal_data; chal_data = (struct scsi_report_key_data_challenge *)databuf; bcopy(chal_data->challenge_key, authinfo->keychal, min(sizeof(chal_data->challenge_key), sizeof(authinfo->keychal))); break; } case DVD_REPORT_KEY1: { struct scsi_report_key_data_key1_key2 *key1_data; key1_data = (struct scsi_report_key_data_key1_key2 *)databuf; bcopy(key1_data->key1, authinfo->keychal, min(sizeof(key1_data->key1), sizeof(authinfo->keychal))); break; } case DVD_REPORT_TITLE_KEY: { struct scsi_report_key_data_title *title_data; title_data = (struct scsi_report_key_data_title *)databuf; authinfo->cpm = (title_data->byte0 & RKD_TITLE_CPM) >> RKD_TITLE_CPM_SHIFT; authinfo->cp_sec = (title_data->byte0 & RKD_TITLE_CP_SEC) >> RKD_TITLE_CP_SEC_SHIFT; authinfo->cgms = (title_data->byte0 & RKD_TITLE_CMGS_MASK) >> RKD_TITLE_CMGS_SHIFT; bcopy(title_data->title_key, authinfo->keychal, min(sizeof(title_data->title_key), sizeof(authinfo->keychal))); break; } case DVD_REPORT_ASF: { struct scsi_report_key_data_asf *asf_data; asf_data = (struct scsi_report_key_data_asf *)databuf; authinfo->asf = asf_data->success & RKD_ASF_SUCCESS; break; } case DVD_REPORT_RPC: { struct scsi_report_key_data_rpc *rpc_data; rpc_data = (struct scsi_report_key_data_rpc *)databuf; authinfo->reg_type = (rpc_data->byte4 & RKD_RPC_TYPE_MASK) >> RKD_RPC_TYPE_SHIFT; authinfo->vend_rsts = (rpc_data->byte4 & RKD_RPC_VENDOR_RESET_MASK) >> RKD_RPC_VENDOR_RESET_SHIFT; authinfo->user_rsts = rpc_data->byte4 & RKD_RPC_USER_RESET_MASK; authinfo->region = rpc_data->region_mask; authinfo->rpc_scheme = rpc_data->rpc_scheme1; break; } case DVD_INVALIDATE_AGID: break; default: /* This should be impossible, since we checked above */ error = EINVAL; goto bailout; break; /* NOTREACHED */ } bailout: xpt_release_ccb(ccb); cam_periph_unlock(periph); if (databuf != NULL) free(databuf, M_DEVBUF); return(error); } static int cdsendkey(struct cam_periph *periph, struct dvd_authinfo *authinfo) { union ccb *ccb; u_int8_t *databuf; int length; int error; error = 0; databuf = NULL; switch(authinfo->format) { case DVD_SEND_CHALLENGE: { struct scsi_report_key_data_challenge *challenge_data; length = sizeof(*challenge_data); challenge_data = malloc(length, M_DEVBUF, M_WAITOK | M_ZERO); databuf = (u_int8_t *)challenge_data; scsi_ulto2b(length - sizeof(challenge_data->data_len), challenge_data->data_len); bcopy(authinfo->keychal, challenge_data->challenge_key, min(sizeof(authinfo->keychal), sizeof(challenge_data->challenge_key))); break; } case DVD_SEND_KEY2: { struct scsi_report_key_data_key1_key2 *key2_data; length = sizeof(*key2_data); key2_data = malloc(length, M_DEVBUF, M_WAITOK | M_ZERO); databuf = (u_int8_t *)key2_data; scsi_ulto2b(length - sizeof(key2_data->data_len), key2_data->data_len); bcopy(authinfo->keychal, key2_data->key1, min(sizeof(authinfo->keychal), sizeof(key2_data->key1))); break; } case DVD_SEND_RPC: { struct scsi_send_key_data_rpc *rpc_data; length = sizeof(*rpc_data); rpc_data = malloc(length, M_DEVBUF, M_WAITOK | M_ZERO); databuf = (u_int8_t *)rpc_data; scsi_ulto2b(length - sizeof(rpc_data->data_len), rpc_data->data_len); rpc_data->region_code = authinfo->region; break; } default: return (EINVAL); } cam_periph_lock(periph); ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); scsi_send_key(&ccb->csio, /* retries */ cd_retry_count, /* cbfcnp */ NULL, /* tag_action */ MSG_SIMPLE_Q_TAG, /* agid */ authinfo->agid, /* key_format */ authinfo->format, /* data_ptr */ databuf, /* dxfer_len */ length, /* sense_len */ SSD_FULL_SIZE, /* timeout */ 50000); error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO, /*sense_flags*/SF_RETRY_UA); xpt_release_ccb(ccb); cam_periph_unlock(periph); if (databuf != NULL) free(databuf, M_DEVBUF); return(error); } static int cdreaddvdstructure(struct cam_periph *periph, struct dvd_struct *dvdstruct) { union ccb *ccb; u_int8_t *databuf; u_int32_t address; int error; int length; error = 0; databuf = NULL; /* The address is reserved for many of the formats */ address = 0; switch(dvdstruct->format) { case DVD_STRUCT_PHYSICAL: length = sizeof(struct scsi_read_dvd_struct_data_physical); break; case DVD_STRUCT_COPYRIGHT: length = sizeof(struct scsi_read_dvd_struct_data_copyright); break; case DVD_STRUCT_DISCKEY: length = sizeof(struct scsi_read_dvd_struct_data_disc_key); break; case DVD_STRUCT_BCA: length = sizeof(struct scsi_read_dvd_struct_data_bca); break; case DVD_STRUCT_MANUFACT: length = sizeof(struct scsi_read_dvd_struct_data_manufacturer); break; case DVD_STRUCT_CMI: return (ENODEV); case DVD_STRUCT_PROTDISCID: length = sizeof(struct scsi_read_dvd_struct_data_prot_discid); break; case DVD_STRUCT_DISCKEYBLOCK: length = sizeof(struct scsi_read_dvd_struct_data_disc_key_blk); break; case DVD_STRUCT_DDS: length = sizeof(struct scsi_read_dvd_struct_data_dds); break; case DVD_STRUCT_MEDIUM_STAT: length = sizeof(struct scsi_read_dvd_struct_data_medium_status); break; case DVD_STRUCT_SPARE_AREA: length = sizeof(struct scsi_read_dvd_struct_data_spare_area); break; case DVD_STRUCT_RMD_LAST: return (ENODEV); case DVD_STRUCT_RMD_RMA: return (ENODEV); case DVD_STRUCT_PRERECORDED: length = sizeof(struct scsi_read_dvd_struct_data_leadin); break; case DVD_STRUCT_UNIQUEID: length = sizeof(struct scsi_read_dvd_struct_data_disc_id); break; case DVD_STRUCT_DCB: return (ENODEV); case DVD_STRUCT_LIST: /* * This is the maximum allocation length for the READ DVD * STRUCTURE command. There's nothing in the MMC3 spec * that indicates a limit in the amount of data that can * be returned from this call, other than the limits * imposed by the 2-byte length variables. */ length = 65535; break; default: return (EINVAL); } if (length != 0) { databuf = malloc(length, M_DEVBUF, M_WAITOK | M_ZERO); } else databuf = NULL; cam_periph_lock(periph); ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); scsi_read_dvd_structure(&ccb->csio, /* retries */ cd_retry_count, /* cbfcnp */ NULL, /* tag_action */ MSG_SIMPLE_Q_TAG, /* lba */ address, /* layer_number */ dvdstruct->layer_num, /* key_format */ dvdstruct->format, /* agid */ dvdstruct->agid, /* data_ptr */ databuf, /* dxfer_len */ length, /* sense_len */ SSD_FULL_SIZE, /* timeout */ 50000); error = cdrunccb(ccb, cderror, /*cam_flags*/CAM_RETRY_SELTO, /*sense_flags*/SF_RETRY_UA); if (error != 0) goto bailout; switch(dvdstruct->format) { case DVD_STRUCT_PHYSICAL: { struct scsi_read_dvd_struct_data_layer_desc *inlayer; struct dvd_layer *outlayer; struct scsi_read_dvd_struct_data_physical *phys_data; phys_data = (struct scsi_read_dvd_struct_data_physical *)databuf; inlayer = &phys_data->layer_desc; outlayer = (struct dvd_layer *)&dvdstruct->data; dvdstruct->length = sizeof(*inlayer); outlayer->book_type = (inlayer->book_type_version & RDSD_BOOK_TYPE_MASK) >> RDSD_BOOK_TYPE_SHIFT; outlayer->book_version = (inlayer->book_type_version & RDSD_BOOK_VERSION_MASK); outlayer->disc_size = (inlayer->disc_size_max_rate & RDSD_DISC_SIZE_MASK) >> RDSD_DISC_SIZE_SHIFT; outlayer->max_rate = (inlayer->disc_size_max_rate & RDSD_MAX_RATE_MASK); outlayer->nlayers = (inlayer->layer_info & RDSD_NUM_LAYERS_MASK) >> RDSD_NUM_LAYERS_SHIFT; outlayer->track_path = (inlayer->layer_info & RDSD_TRACK_PATH_MASK) >> RDSD_TRACK_PATH_SHIFT; outlayer->layer_type = (inlayer->layer_info & RDSD_LAYER_TYPE_MASK); outlayer->linear_density = (inlayer->density & RDSD_LIN_DENSITY_MASK) >> RDSD_LIN_DENSITY_SHIFT; outlayer->track_density = (inlayer->density & RDSD_TRACK_DENSITY_MASK); outlayer->bca = (inlayer->bca & RDSD_BCA_MASK) >> RDSD_BCA_SHIFT; outlayer->start_sector = scsi_3btoul(inlayer->main_data_start); outlayer->end_sector = scsi_3btoul(inlayer->main_data_end); outlayer->end_sector_l0 = scsi_3btoul(inlayer->end_sector_layer0); break; } case DVD_STRUCT_COPYRIGHT: { struct scsi_read_dvd_struct_data_copyright *copy_data; copy_data = (struct scsi_read_dvd_struct_data_copyright *) databuf; dvdstruct->cpst = copy_data->cps_type; dvdstruct->rmi = copy_data->region_info; dvdstruct->length = 0; break; } default: /* * Tell the user what the overall length is, no matter * what we can actually fit in the data buffer. */ dvdstruct->length = length - ccb->csio.resid - sizeof(struct scsi_read_dvd_struct_data_header); /* * But only actually copy out the smaller of what we read * in or what the structure can take. */ bcopy(databuf + sizeof(struct scsi_read_dvd_struct_data_header), dvdstruct->data, min(sizeof(dvdstruct->data), dvdstruct->length)); break; } bailout: xpt_release_ccb(ccb); cam_periph_unlock(periph); if (databuf != NULL) free(databuf, M_DEVBUF); return(error); } void scsi_report_key(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int32_t lba, u_int8_t agid, u_int8_t key_format, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_report_key *scsi_cmd; scsi_cmd = (struct scsi_report_key *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = REPORT_KEY; scsi_ulto4b(lba, scsi_cmd->lba); scsi_ulto2b(dxfer_len, scsi_cmd->alloc_len); scsi_cmd->agid_keyformat = (agid << RK_KF_AGID_SHIFT) | (key_format & RK_KF_KEYFORMAT_MASK); cam_fill_csio(csio, retries, cbfcnp, /*flags*/ (dxfer_len == 0) ? CAM_DIR_NONE : CAM_DIR_IN, tag_action, /*data_ptr*/ data_ptr, /*dxfer_len*/ dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_send_key(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t agid, u_int8_t key_format, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_send_key *scsi_cmd; scsi_cmd = (struct scsi_send_key *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = SEND_KEY; scsi_ulto2b(dxfer_len, scsi_cmd->param_len); scsi_cmd->agid_keyformat = (agid << RK_KF_AGID_SHIFT) | (key_format & RK_KF_KEYFORMAT_MASK); cam_fill_csio(csio, retries, cbfcnp, /*flags*/ CAM_DIR_OUT, tag_action, /*data_ptr*/ data_ptr, /*dxfer_len*/ dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_read_dvd_structure(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int32_t address, u_int8_t layer_number, u_int8_t format, u_int8_t agid, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_read_dvd_structure *scsi_cmd; scsi_cmd = (struct scsi_read_dvd_structure *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = READ_DVD_STRUCTURE; scsi_ulto4b(address, scsi_cmd->address); scsi_cmd->layer_number = layer_number; scsi_cmd->format = format; scsi_ulto2b(dxfer_len, scsi_cmd->alloc_len); /* The AGID is the top two bits of this byte */ scsi_cmd->agid = agid << 6; cam_fill_csio(csio, retries, cbfcnp, /*flags*/ CAM_DIR_IN, tag_action, /*data_ptr*/ data_ptr, /*dxfer_len*/ dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_read_toc(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint8_t byte1_flags, uint8_t format, uint8_t track, uint8_t *data_ptr, uint32_t dxfer_len, int sense_len, int timeout) { struct scsi_read_toc *scsi_cmd; scsi_cmd = (struct scsi_read_toc *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->op_code = READ_TOC; /* * The structure is counting from 1, the function counting from 0. * The spec counts from 0. In MMC-6, there is only one flag, the * MSF flag. But we put the whole byte in for a bit a future-proofing. */ scsi_cmd->byte2 = byte1_flags; scsi_cmd->format = format; scsi_cmd->from_track = track; scsi_ulto2b(dxfer_len, scsi_cmd->data_len); cam_fill_csio(csio, /* retries */ retries, /* cbfcnp */ cbfcnp, /* flags */ CAM_DIR_IN, /* tag_action */ tag_action, /* data_ptr */ data_ptr, /* dxfer_len */ dxfer_len, /* sense_len */ sense_len, sizeof(*scsi_cmd), /* timeout */ timeout); } diff --git a/sys/compat/freebsd32/freebsd32_ioctl.c b/sys/compat/freebsd32/freebsd32_ioctl.c index b3ed457f7132..789e7a77ba80 100644 --- a/sys/compat/freebsd32/freebsd32_ioctl.c +++ b/sys/compat/freebsd32/freebsd32_ioctl.c @@ -1,247 +1,246 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2008 David E. O'Brien * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include CTASSERT(sizeof(struct mem_range_op32) == 12); static int freebsd32_ioctl_memrange(struct thread *td, struct freebsd32_ioctl_args *uap, struct file *fp) { struct mem_range_op mro; struct mem_range_op32 mro32; int error; u_long com; if ((error = copyin(uap->data, &mro32, sizeof(mro32))) != 0) return (error); PTRIN_CP(mro32, mro, mo_desc); CP(mro32, mro, mo_arg[0]); CP(mro32, mro, mo_arg[1]); com = 0; switch (uap->com) { case MEMRANGE_GET32: com = MEMRANGE_GET; break; case MEMRANGE_SET32: com = MEMRANGE_SET; break; default: panic("%s: unknown MEMRANGE %#x", __func__, uap->com); } if ((error = fo_ioctl(fp, com, (caddr_t)&mro, td->td_ucred, td)) != 0) return (error); if ( (com & IOC_OUT) ) { CP(mro, mro32, mo_arg[0]); CP(mro, mro32, mo_arg[1]); error = copyout(&mro32, uap->data, sizeof(mro32)); } return (error); } static int freebsd32_ioctl_barmmap(struct thread *td, struct freebsd32_ioctl_args *uap, struct file *fp) { struct pci_bar_mmap32 pbm32; struct pci_bar_mmap pbm; int error; error = copyin(uap->data, &pbm32, sizeof(pbm32)); if (error != 0) return (error); PTRIN_CP(pbm32, pbm, pbm_map_base); CP(pbm32, pbm, pbm_sel); CP(pbm32, pbm, pbm_reg); CP(pbm32, pbm, pbm_flags); CP(pbm32, pbm, pbm_memattr); pbm.pbm_bar_length = PAIR32TO64(uint64_t, pbm32.pbm_bar_length); error = fo_ioctl(fp, PCIOCBARMMAP, (caddr_t)&pbm, td->td_ucred, td); if (error == 0) { PTROUT_CP(pbm, pbm32, pbm_map_base); CP(pbm, pbm32, pbm_map_length); #if BYTE_ORDER == LITTLE_ENDIAN pbm32.pbm_bar_length1 = pbm.pbm_bar_length; pbm32.pbm_bar_length2 = pbm.pbm_bar_length >> 32; #else pbm32.pbm_bar_length1 = pbm.pbm_bar_length >> 32; pbm32.pbm_bar_length2 = pbm.pbm_bar_length; #endif CP(pbm, pbm32, pbm_bar_off); error = copyout(&pbm32, uap->data, sizeof(pbm32)); } return (error); } static int freebsd32_ioctl_sg(struct thread *td, struct freebsd32_ioctl_args *uap, struct file *fp) { struct sg_io_hdr io; struct sg_io_hdr32 io32; int error; if ((error = copyin(uap->data, &io32, sizeof(io32))) != 0) return (error); CP(io32, io, interface_id); CP(io32, io, dxfer_direction); CP(io32, io, cmd_len); CP(io32, io, mx_sb_len); CP(io32, io, iovec_count); CP(io32, io, dxfer_len); PTRIN_CP(io32, io, dxferp); PTRIN_CP(io32, io, cmdp); PTRIN_CP(io32, io, sbp); CP(io32, io, timeout); CP(io32, io, flags); CP(io32, io, pack_id); PTRIN_CP(io32, io, usr_ptr); CP(io32, io, status); CP(io32, io, masked_status); CP(io32, io, msg_status); CP(io32, io, sb_len_wr); CP(io32, io, host_status); CP(io32, io, driver_status); CP(io32, io, resid); CP(io32, io, duration); CP(io32, io, info); if ((error = fo_ioctl(fp, SG_IO, (caddr_t)&io, td->td_ucred, td)) != 0) return (error); CP(io, io32, interface_id); CP(io, io32, dxfer_direction); CP(io, io32, cmd_len); CP(io, io32, mx_sb_len); CP(io, io32, iovec_count); CP(io, io32, dxfer_len); PTROUT_CP(io, io32, dxferp); PTROUT_CP(io, io32, cmdp); PTROUT_CP(io, io32, sbp); CP(io, io32, timeout); CP(io, io32, flags); CP(io, io32, pack_id); PTROUT_CP(io, io32, usr_ptr); CP(io, io32, status); CP(io, io32, masked_status); CP(io, io32, msg_status); CP(io, io32, sb_len_wr); CP(io, io32, host_status); CP(io, io32, driver_status); CP(io, io32, resid); CP(io, io32, duration); CP(io, io32, info); error = copyout(&io32, uap->data, sizeof(io32)); return (error); } int freebsd32_ioctl(struct thread *td, struct freebsd32_ioctl_args *uap) { struct ioctl_args ap /*{ int fd; u_long com; caddr_t data; }*/ ; struct file *fp; cap_rights_t rights; int error; error = fget(td, uap->fd, cap_rights_init_one(&rights, CAP_IOCTL), &fp); if (error != 0) return (error); if ((fp->f_flag & (FREAD | FWRITE)) == 0) { fdrop(fp, td); return (EBADF); } switch (uap->com) { case MEMRANGE_GET32: /* FALLTHROUGH */ case MEMRANGE_SET32: error = freebsd32_ioctl_memrange(td, uap, fp); break; case SG_IO_32: error = freebsd32_ioctl_sg(td, uap, fp); break; case PCIOCBARMMAP_32: error = freebsd32_ioctl_barmmap(td, uap, fp); break; default: fdrop(fp, td); ap.fd = uap->fd; ap.com = uap->com; PTRIN_CP(*uap, ap, data); return sys_ioctl(td, &ap); } fdrop(fp, td); return (error); } diff --git a/sys/fs/nfsclient/nfs_clsubs.c b/sys/fs/nfsclient/nfs_clsubs.c index d361c175aa8e..7d45c213f6d3 100644 --- a/sys/fs/nfsclient/nfs_clsubs.c +++ b/sys/fs/nfsclient/nfs_clsubs.c @@ -1,391 +1,390 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from nfs_subs.c 8.8 (Berkeley) 5/22/95 */ #include __FBSDID("$FreeBSD$"); /* * These functions support the macros and help fiddle mbuf chains for * the nfs op functions. They do things like create the rpc header and * copy data between mbuf chains and uio lists. */ #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Note that stdarg.h and the ANSI style va_start macro is used for both * ANSI and traditional C compilers. */ #include extern struct mtx ncl_iod_mutex; extern enum nfsiod_state ncl_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsmount *ncl_iodmount[NFS_MAXASYNCDAEMON]; extern int ncl_numasync; extern unsigned int ncl_iodmax; extern struct nfsstatsv1 nfsstatsv1; struct task ncl_nfsiodnew_task; int ncl_uninit(struct vfsconf *vfsp) { /* * XXX: Unloading of nfscl module is unsupported. */ #if 0 int i; /* * Tell all nfsiod processes to exit. Clear ncl_iodmax, and wakeup * any sleeping nfsiods so they check ncl_iodmax and exit. */ NFSLOCKIOD(); ncl_iodmax = 0; for (i = 0; i < ncl_numasync; i++) if (ncl_iodwant[i] == NFSIOD_AVAILABLE) wakeup(&ncl_iodwant[i]); /* The last nfsiod to exit will wake us up when ncl_numasync hits 0 */ while (ncl_numasync) msleep(&ncl_numasync, &ncl_iod_mutex, PWAIT, "ioddie", 0); NFSUNLOCKIOD(); ncl_nhuninit(); return (0); #else return (EOPNOTSUPP); #endif } /* Returns with NFSLOCKNODE() held. */ void ncl_dircookie_lock(struct nfsnode *np) { NFSLOCKNODE(np); while (np->n_flag & NDIRCOOKIELK) (void) msleep(&np->n_flag, &np->n_mtx, PZERO, "nfsdirlk", 0); np->n_flag |= NDIRCOOKIELK; } void ncl_dircookie_unlock(struct nfsnode *np) { NFSLOCKNODE(np); np->n_flag &= ~NDIRCOOKIELK; wakeup(&np->n_flag); NFSUNLOCKNODE(np); } bool ncl_excl_start(struct vnode *vp) { struct nfsnode *np; int vn_lk; ASSERT_VOP_LOCKED(vp, "ncl_excl_start"); vn_lk = NFSVOPISLOCKED(vp); if (vn_lk == LK_EXCLUSIVE) return (false); KASSERT(vn_lk == LK_SHARED, ("ncl_excl_start: wrong vnode lock %d", vn_lk)); /* Ensure exclusive access, this might block */ np = VTONFS(vp); lockmgr(&np->n_excl, LK_EXCLUSIVE, NULL); return (true); } void ncl_excl_finish(struct vnode *vp, bool old_lock) { struct nfsnode *np; if (!old_lock) return; np = VTONFS(vp); lockmgr(&np->n_excl, LK_RELEASE, NULL); } #ifdef NFS_ACDEBUG #include SYSCTL_DECL(_vfs_nfs); static int nfs_acdebug; SYSCTL_INT(_vfs_nfs, OID_AUTO, acdebug, CTLFLAG_RW, &nfs_acdebug, 0, ""); #endif /* * Check the time stamp * If the cache is valid, copy contents to *vap and return 0 * otherwise return an error */ int ncl_getattrcache(struct vnode *vp, struct vattr *vaper) { struct nfsnode *np; struct vattr *vap; struct nfsmount *nmp; int timeo, mustflush; u_quad_t nsize; bool setnsize; np = VTONFS(vp); vap = &np->n_vattr.na_vattr; nmp = VFSTONFS(vp->v_mount); mustflush = nfscl_mustflush(vp); /* must be before mtx_lock() */ NFSLOCKNODE(np); /* XXX n_mtime doesn't seem to be updated on a miss-and-reload */ timeo = (time_second - np->n_mtime.tv_sec) / 10; #ifdef NFS_ACDEBUG if (nfs_acdebug>1) printf("ncl_getattrcache: initial timeo = %d\n", timeo); #endif if (vap->va_type == VDIR) { if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acdirmin) timeo = nmp->nm_acdirmin; else if (timeo > nmp->nm_acdirmax) timeo = nmp->nm_acdirmax; } else { if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acregmin) timeo = nmp->nm_acregmin; else if (timeo > nmp->nm_acregmax) timeo = nmp->nm_acregmax; } #ifdef NFS_ACDEBUG if (nfs_acdebug > 2) printf("acregmin %d; acregmax %d; acdirmin %d; acdirmax %d\n", nmp->nm_acregmin, nmp->nm_acregmax, nmp->nm_acdirmin, nmp->nm_acdirmax); if (nfs_acdebug) printf("ncl_getattrcache: age = %d; final timeo = %d\n", (time_second - np->n_attrstamp), timeo); #endif if ((time_second - np->n_attrstamp) >= timeo && (mustflush != 0 || np->n_attrstamp == 0)) { nfsstatsv1.attrcache_misses++; NFSUNLOCKNODE(np); KDTRACE_NFS_ATTRCACHE_GET_MISS(vp); return( ENOENT); } nfsstatsv1.attrcache_hits++; setnsize = false; if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else { np->n_size = vap->va_size; } setnsize = ncl_pager_setsize(vp, &nsize); } else { np->n_size = vap->va_size; } } bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } NFSUNLOCKNODE(np); if (setnsize) vnode_pager_setsize(vp, nsize); KDTRACE_NFS_ATTRCACHE_GET_HIT(vp, vap); return (0); } static nfsuint64 nfs_nullcookie = { { 0, 0 } }; /* * This function finds the directory cookie that corresponds to the * logical byte offset given. */ nfsuint64 * ncl_getcookie(struct nfsnode *np, off_t off, int add) { struct nfsdmap *dp, *dp2; int pos; nfsuint64 *retval = NULL; pos = (uoff_t)off / NFS_DIRBLKSIZ; if (pos == 0 || off < 0) { KASSERT(!add, ("nfs getcookie add at <= 0")); return (&nfs_nullcookie); } pos--; dp = LIST_FIRST(&np->n_cookies); if (!dp) { if (add) { dp = malloc(sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp->ndm_eocookie = 0; LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list); } else goto out; } while (pos >= NFSNUMCOOKIES) { pos -= NFSNUMCOOKIES; if (LIST_NEXT(dp, ndm_list)) { if (!add && dp->ndm_eocookie < NFSNUMCOOKIES && pos >= dp->ndm_eocookie) goto out; dp = LIST_NEXT(dp, ndm_list); } else if (add) { dp2 = malloc(sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp2->ndm_eocookie = 0; LIST_INSERT_AFTER(dp, dp2, ndm_list); dp = dp2; } else goto out; } if (pos >= dp->ndm_eocookie) { if (add) dp->ndm_eocookie = pos + 1; else goto out; } retval = &dp->ndm_cookies[pos]; out: return (retval); } /* * Invalidate cached directory information, except for the actual directory * blocks (which are invalidated separately). * Done mainly to avoid the use of stale offset cookies. */ void ncl_invaldir(struct vnode *vp) { struct nfsnode *np = VTONFS(vp); KASSERT(vp->v_type == VDIR, ("nfs: invaldir not dir")); ncl_dircookie_lock(np); np->n_direofoffset = 0; NFSUNLOCKNODE(np); np->n_cookieverf.nfsuquad[0] = 0; np->n_cookieverf.nfsuquad[1] = 0; if (LIST_FIRST(&np->n_cookies)) LIST_FIRST(&np->n_cookies)->ndm_eocookie = 0; ncl_dircookie_unlock(np); } /* * The write verifier has changed (probably due to a server reboot), so all * B_NEEDCOMMIT blocks will have to be written again. Since they are on the * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT * and B_CLUSTEROK flags. Once done the new write verifier can be set for the * mount point. * * B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data * writes are not clusterable. */ void ncl_clearcommit(struct mount *mp) { struct vnode *vp, *nvp; struct buf *bp, *nbp; struct bufobj *bo; MNT_VNODE_FOREACH_ALL(vp, mp, nvp) { bo = &vp->v_bufobj; vholdl(vp); VI_UNLOCK(vp); BO_LOCK(bo); TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (!BUF_ISLOCKED(bp) && (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); } BO_UNLOCK(bo); vdrop(vp); } } /* * Called once to initialize data structures... */ int ncl_init(struct vfsconf *vfsp) { int i; /* Ensure async daemons disabled */ for (i = 0; i < NFS_MAXASYNCDAEMON; i++) { ncl_iodwant[i] = NFSIOD_NOT_AVAILABLE; ncl_iodmount[i] = NULL; } TASK_INIT(&ncl_nfsiodnew_task, 0, ncl_nfsiodnew_tq, NULL); ncl_nhinit(); /* Init the nfsnode table */ return (0); } diff --git a/sys/fs/procfs/procfs_dbregs.c b/sys/fs/procfs/procfs_dbregs.c index acc0f7f8c5ac..f7bf2b792625 100644 --- a/sys/fs/procfs/procfs_dbregs.c +++ b/sys/fs/procfs/procfs_dbregs.c @@ -1,133 +1,133 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1999 Brian Scott Dean, brdean@unx.sas.com. * All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry under the following copyrights and conditions: * * Copyright (c) 1993 Jan-Simon Pendry * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)procfs_regs.c 8.4 (Berkeley) 6/15/94 * * From: * $Id: procfs_regs.c,v 3.2 1993/12/15 09:40:17 jsp Exp $ * $FreeBSD$ */ #include #include #include #include #include #include -#include #include #include #include #ifdef COMPAT_FREEBSD32 +#include #include #include /* * PROC(write, dbregs, td2, &r) becomes * proc_write_dbregs(td2, &r) or * proc_write_dbregs32(td2, &r32) * * UIOMOVE_FROMBUF(r, uio) becomes * uiomove_frombuf(&r, sizeof(r), uio) or * uiomove_frombuf(&r32, sizeof(r32), uio) */ #define PROC(d, w, t, r) wrap32 ? \ proc_ ## d ## _ ## w ## 32(t, r ## 32) : \ proc_ ## d ## _ ## w(t, r) #define UIOMOVE_FROMBUF(k, u) wrap32 ? \ uiomove_frombuf(& k ## 32, sizeof(k ## 32), u) : \ uiomove_frombuf(& k, sizeof(k), u) #else #define PROC(d, w, t, r) proc_ ## d ## _ ## w(t, r) #define UIOMOVE_FROMBUF(k, u) uiomove_frombuf(& k, sizeof(k), u) #endif int procfs_doprocdbregs(PFS_FILL_ARGS) { int error; struct dbreg r; struct thread *td2; #ifdef COMPAT_FREEBSD32 struct dbreg32 r32; int wrap32 = 0; #endif if (uio->uio_offset != 0) return (0); PROC_LOCK(p); PROC_ASSERT_HELD(p); if (p_candebug(td, p) != 0) { PROC_UNLOCK(p); return (EPERM); } td2 = FIRST_THREAD_IN_PROC(p); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { if (SV_PROC_FLAG(td2->td_proc, SV_ILP32) == 0) { PROC_UNLOCK(p); return (EINVAL); } wrap32 = 1; memset(&r32, 0, sizeof(r32)); } else #endif memset(&r, 0, sizeof(r)); error = PROC(read, dbregs, td2, &r); if (error == 0) { PROC_UNLOCK(p); error = UIOMOVE_FROMBUF(r, uio); PROC_LOCK(p); } if (error == 0 && uio->uio_rw == UIO_WRITE) { if (!P_SHOULDSTOP(p)) /* XXXKSE should be P_TRACED? */ error = EBUSY; else /* XXXKSE: */ error = PROC(write, dbregs, td2, &r); } PROC_UNLOCK(p); return (error); } diff --git a/sys/fs/procfs/procfs_fpregs.c b/sys/fs/procfs/procfs_fpregs.c index 1118eb68dcbc..2d680fe2621d 100644 --- a/sys/fs/procfs/procfs_fpregs.c +++ b/sys/fs/procfs/procfs_fpregs.c @@ -1,131 +1,131 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1993 Jan-Simon Pendry * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_fpregs.c 8.2 (Berkeley) 6/15/94 * * From: * $Id: procfs_regs.c,v 3.2 1993/12/15 09:40:17 jsp Exp $ * $FreeBSD$ */ #include #include #include #include #include #include -#include #include #include #include #ifdef COMPAT_FREEBSD32 +#include #include #include /* * PROC(write, fpregs, td2, &r) becomes * proc_write_fpregs(td2, &r) or * proc_write_fpregs32(td2, &r32) * * UIOMOVE_FROMBUF(r, uio) becomes * uiomove_frombuf(&r, sizeof(r), uio) or * uiomove_frombuf(&r32, sizeof(r32), uio) */ #define PROC(d, w, t, r) wrap32 ? \ proc_ ## d ## _ ## w ## 32(t, r ## 32) : \ proc_ ## d ## _ ## w(t, r) #define UIOMOVE_FROMBUF(k, u) wrap32 ? \ uiomove_frombuf(& k ## 32, sizeof(k ## 32), u) : \ uiomove_frombuf(& k, sizeof(k), u) #else #define PROC(d, w, t, r) proc_ ## d ## _ ## w(t, r) #define UIOMOVE_FROMBUF(k, u) uiomove_frombuf(& k, sizeof(k), u) #endif int procfs_doprocfpregs(PFS_FILL_ARGS) { int error; struct fpreg r; struct thread *td2; #ifdef COMPAT_FREEBSD32 struct fpreg32 r32; int wrap32 = 0; #endif if (uio->uio_offset != 0) return (0); PROC_LOCK(p); PROC_ASSERT_HELD(p); if (p_candebug(td, p)) { PROC_UNLOCK(p); return (EPERM); } if (!P_SHOULDSTOP(p)) { PROC_UNLOCK(p); return (EBUSY); } td2 = FIRST_THREAD_IN_PROC(p); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { if (SV_PROC_FLAG(td2->td_proc, SV_ILP32) == 0) { PROC_UNLOCK(p); return (EINVAL); } wrap32 = 1; memset(&r32, 0, sizeof(r32)); } else #endif memset(&r, 0, sizeof(r)); error = PROC(read, fpregs, td2, &r); if (error == 0) { PROC_UNLOCK(p); error = UIOMOVE_FROMBUF(r, uio); PROC_LOCK(p); } if (error == 0 && uio->uio_rw == UIO_WRITE) { if (!P_SHOULDSTOP(p)) error = EBUSY; else /* XXXKSE: */ error = PROC(write, fpregs, td2, &r); } PROC_UNLOCK(p); return (error); } diff --git a/sys/fs/procfs/procfs_regs.c b/sys/fs/procfs/procfs_regs.c index 703dad64a182..fd5d0cb2a209 100644 --- a/sys/fs/procfs/procfs_regs.c +++ b/sys/fs/procfs/procfs_regs.c @@ -1,131 +1,131 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1993 Jan-Simon Pendry * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_regs.c 8.4 (Berkeley) 6/15/94 * * From: * $Id: procfs_regs.c,v 3.2 1993/12/15 09:40:17 jsp Exp $ * $FreeBSD$ */ #include #include #include #include #include #include -#include #include #include #include #ifdef COMPAT_FREEBSD32 #include +#include #include /* * PROC(write, regs, td2, &r) becomes * proc_write_regs(td2, &r) or * proc_write_regs32(td2, &r32) * * UIOMOVE_FROMBUF(r, uio) becomes * uiomove_frombuf(&r, sizeof(r), uio) or * uiomove_frombuf(&r32, sizeof(r32), uio) */ #define PROC(d, w, t, r) wrap32 ? \ proc_ ## d ## _ ## w ## 32(t, r ## 32) : \ proc_ ## d ## _ ## w(t, r) #define UIOMOVE_FROMBUF(k, u) wrap32 ? \ uiomove_frombuf(& k ## 32, sizeof(k ## 32), u) : \ uiomove_frombuf(& k, sizeof(k), u) #else #define PROC(d, w, t, r) proc_ ## d ## _ ## w(t, r) #define UIOMOVE_FROMBUF(k, u) uiomove_frombuf(& k, sizeof(k), u) #endif int procfs_doprocregs(PFS_FILL_ARGS) { int error; struct reg r; struct thread *td2; #ifdef COMPAT_FREEBSD32 struct reg32 r32; int wrap32 = 0; #endif if (uio->uio_offset != 0) return (0); PROC_LOCK(p); PROC_ASSERT_HELD(p); if (p_candebug(td, p)) { PROC_UNLOCK(p); return (EPERM); } if (!P_SHOULDSTOP(p)) { PROC_UNLOCK(p); return (EBUSY); } td2 = FIRST_THREAD_IN_PROC(p); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { if ((SV_PROC_FLAG(td2->td_proc, SV_ILP32)) == 0) { PROC_UNLOCK(p); return (EINVAL); } wrap32 = 1; memset(&r32, 0, sizeof(r32)); } else #endif memset(&r, 0, sizeof(r)); error = PROC(read, regs, td2, &r); if (error == 0) { PROC_UNLOCK(p); error = UIOMOVE_FROMBUF(r, uio); PROC_LOCK(p); } if (error == 0 && uio->uio_rw == UIO_WRITE) { if (!P_SHOULDSTOP(p)) error = EBUSY; else /* XXXKSE: */ error = PROC(write, regs, td2, &r); } PROC_UNLOCK(p); return (error); } diff --git a/sys/fs/procfs/procfs_status.c b/sys/fs/procfs/procfs_status.c index e4e7e51ac02a..4592f458058d 100644 --- a/sys/fs/procfs/procfs_status.c +++ b/sys/fs/procfs/procfs_status.c @@ -1,201 +1,200 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1993 Jan-Simon Pendry * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_status.c 8.4 (Berkeley) 6/15/94 * * From: * $Id: procfs_status.c,v 3.1 1993/12/15 09:40:17 jsp Exp $ * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include int procfs_doprocstatus(PFS_FILL_ARGS) { struct session *sess; struct thread *tdfirst; struct tty *tp; struct ucred *cr; const char *wmesg; char *pc; char *sep; struct timeval boottime; int pid, ppid, pgid, sid; int i; pid = p->p_pid; PROC_LOCK(p); ppid = p->p_pptr ? p->p_pptr->p_pid : 0; pgid = p->p_pgrp->pg_id; sess = p->p_pgrp->pg_session; SESS_LOCK(sess); sid = sess->s_leader ? sess->s_leader->p_pid : 0; /* comm pid ppid pgid sid tty ctty,sldr start ut st wmsg euid ruid rgid,egid,groups[1 .. ngroups] */ pc = p->p_comm; do { if (*pc < 33 || *pc > 126 || *pc == '\\') sbuf_printf(sb, "\\%03o", *pc); else sbuf_putc(sb, *pc); } while (*++pc); sbuf_printf(sb, " %d %d %d %d ", pid, ppid, pgid, sid); if ((p->p_flag & P_CONTROLT) && (tp = sess->s_ttyp)) sbuf_printf(sb, "%s ", devtoname(tp->t_dev)); else sbuf_printf(sb, "- "); sep = ""; if (sess->s_ttyvp) { sbuf_printf(sb, "%sctty", sep); sep = ","; } if (SESS_LEADER(p)) { sbuf_printf(sb, "%ssldr", sep); sep = ","; } SESS_UNLOCK(sess); if (*sep != ',') { sbuf_printf(sb, "noflags"); } tdfirst = FIRST_THREAD_IN_PROC(p); thread_lock(tdfirst); if (tdfirst->td_wchan != NULL) { KASSERT(tdfirst->td_wmesg != NULL, ("wchan %p has no wmesg", tdfirst->td_wchan)); wmesg = tdfirst->td_wmesg; } else wmesg = "nochan"; thread_unlock(tdfirst); if (p->p_flag & P_INMEM) { struct timeval start, ut, st; PROC_STATLOCK(p); calcru(p, &ut, &st); PROC_STATUNLOCK(p); start = p->p_stats->p_start; getboottime(&boottime); timevaladd(&start, &boottime); sbuf_printf(sb, " %jd,%ld %jd,%ld %jd,%ld", (intmax_t)start.tv_sec, start.tv_usec, (intmax_t)ut.tv_sec, ut.tv_usec, (intmax_t)st.tv_sec, st.tv_usec); } else sbuf_printf(sb, " -1,-1 -1,-1 -1,-1"); sbuf_printf(sb, " %s", wmesg); cr = p->p_ucred; sbuf_printf(sb, " %lu %lu %lu", (u_long)cr->cr_uid, (u_long)cr->cr_ruid, (u_long)cr->cr_rgid); /* egid (cr->cr_svgid) is equal to cr_ngroups[0] see also getegid(2) in /sys/kern/kern_prot.c */ for (i = 0; i < cr->cr_ngroups; i++) { sbuf_printf(sb, ",%lu", (u_long)cr->cr_groups[i]); } if (jailed(cr)) { mtx_lock(&cr->cr_prison->pr_mtx); sbuf_printf(sb, " %s", prison_name(td->td_ucred->cr_prison, cr->cr_prison)); mtx_unlock(&cr->cr_prison->pr_mtx); } else { sbuf_printf(sb, " -"); } PROC_UNLOCK(p); sbuf_printf(sb, "\n"); return (0); } int procfs_doproccmdline(PFS_FILL_ARGS) { /* * If we are using the ps/cmdline caching, use that. Otherwise * read argv from the process space. * Note that if the argv is no longer available, we deliberately * don't fall back on p->p_comm or return an error: the authentic * Linux behaviour is to return zero-length in this case. */ PROC_LOCK(p); if (p->p_args && p_cansee(td, p) == 0) { sbuf_bcpy(sb, p->p_args->ar_args, p->p_args->ar_length); PROC_UNLOCK(p); return (0); } if ((p->p_flag & P_SYSTEM) != 0) { PROC_UNLOCK(p); return (0); } PROC_UNLOCK(p); return (proc_getargv(td, p, sb)); } diff --git a/sys/kern/kern_acct.c b/sys/kern/kern_acct.c index 2a8d2f892807..447fedeb1537 100644 --- a/sys/kern/kern_acct.c +++ b/sys/kern/kern_acct.c @@ -1,645 +1,644 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * Copyright (c) 2005 Robert N. M. Watson * All rights reserved. * * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1994 Christopher G. Demetriou * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_acct.c 8.1 (Berkeley) 6/14/93 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include _Static_assert(sizeof(struct acctv3) - offsetof(struct acctv3, ac_trailer) == sizeof(struct acctv2) - offsetof(struct acctv2, ac_trailer), "trailer"); _Static_assert(sizeof(struct acctv3) - offsetof(struct acctv3, ac_len2) == sizeof(struct acctv2) - offsetof(struct acctv2, ac_len2), "len2"); /* * The routines implemented in this file are described in: * Leffler, et al.: The Design and Implementation of the 4.3BSD * UNIX Operating System (Addison Welley, 1989) * on pages 62-63. * On May 2007 the historic 3 bits base 8 exponent, 13 bit fraction * compt_t representation described in the above reference was replaced * with that of IEEE-754 floats. * * Arguably, to simplify accounting operations, this mechanism should * be replaced by one in which an accounting log file (similar to /dev/klog) * is read by a user process, etc. However, that has its own problems. */ /* Floating point definitions from . */ #define FLT_MANT_DIG 24 /* p */ #define FLT_MAX_EXP 128 /* emax */ /* * Internal accounting functions. * The former's operation is described in Leffler, et al., and the latter * was provided by UCB with the 4.4BSD-Lite release */ static uint32_t encode_timeval(struct timeval); static uint32_t encode_long(long); static void acctwatch(void); static void acct_thread(void *); static int acct_disable(struct thread *, int); /* * Accounting vnode pointer, saved vnode pointer, and flags for each. * acct_sx protects against changes to the active vnode and credentials * while accounting records are being committed to disk. */ static int acct_configured; static int acct_suspended; static struct vnode *acct_vp; static struct ucred *acct_cred; static int acct_flags; static struct sx acct_sx; SX_SYSINIT(acct, &acct_sx, "acct_sx"); /* * State of the accounting kthread. */ static int acct_state; #define ACCT_RUNNING 1 /* Accounting kthread is running. */ #define ACCT_EXITREQ 2 /* Accounting kthread should exit. */ /* * Values associated with enabling and disabling accounting */ static int acctsuspend = 2; /* stop accounting when < 2% free space left */ SYSCTL_INT(_kern, OID_AUTO, acct_suspend, CTLFLAG_RW, &acctsuspend, 0, "percentage of free disk space below which accounting stops"); static int acctresume = 4; /* resume when free space risen to > 4% */ SYSCTL_INT(_kern, OID_AUTO, acct_resume, CTLFLAG_RW, &acctresume, 0, "percentage of free disk space above which accounting resumes"); static int acctchkfreq = 15; /* frequency (in seconds) to check space */ static int sysctl_acct_chkfreq(SYSCTL_HANDLER_ARGS) { int error, value; /* Write out the old value. */ error = SYSCTL_OUT(req, &acctchkfreq, sizeof(int)); if (error || req->newptr == NULL) return (error); /* Read in and verify the new value. */ error = SYSCTL_IN(req, &value, sizeof(int)); if (error) return (error); if (value <= 0) return (EINVAL); acctchkfreq = value; return (0); } SYSCTL_PROC(_kern, OID_AUTO, acct_chkfreq, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &acctchkfreq, 0, sysctl_acct_chkfreq, "I", "frequency for checking the free space"); SYSCTL_INT(_kern, OID_AUTO, acct_configured, CTLFLAG_RD, &acct_configured, 0, "Accounting configured or not"); SYSCTL_INT(_kern, OID_AUTO, acct_suspended, CTLFLAG_RD, &acct_suspended, 0, "Accounting suspended or not"); /* * Accounting system call. Written based on the specification and previous * implementation done by Mark Tinguely. */ int sys_acct(struct thread *td, struct acct_args *uap) { struct nameidata nd; int error, flags, replacing; error = priv_check(td, PRIV_ACCT); if (error) return (error); /* * If accounting is to be started to a file, open that file for * appending and make sure it's a 'normal'. */ if (uap->path != NULL) { NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); flags = FWRITE | O_APPEND; error = vn_open(&nd, &flags, 0, NULL); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); #ifdef MAC error = mac_system_check_acct(td->td_ucred, nd.ni_vp); if (error) { VOP_UNLOCK(nd.ni_vp); vn_close(nd.ni_vp, flags, td->td_ucred, td); return (error); } #endif VOP_UNLOCK(nd.ni_vp); if (nd.ni_vp->v_type != VREG) { vn_close(nd.ni_vp, flags, td->td_ucred, td); return (EACCES); } #ifdef MAC } else { error = mac_system_check_acct(td->td_ucred, NULL); if (error) return (error); #endif } /* * Disallow concurrent access to the accounting vnode while we swap * it out, in order to prevent access after close. */ sx_xlock(&acct_sx); /* * Don't log spurious disable/enable messages if we are * switching from one accounting file to another due to log * rotation. */ replacing = (acct_vp != NULL && uap->path != NULL); /* * If accounting was previously enabled, kill the old space-watcher, * close the file, and (if no new file was specified, leave). Reset * the suspended state regardless of whether accounting remains * enabled. */ acct_suspended = 0; if (acct_vp != NULL) error = acct_disable(td, !replacing); if (uap->path == NULL) { if (acct_state & ACCT_RUNNING) { acct_state |= ACCT_EXITREQ; wakeup(&acct_state); } sx_xunlock(&acct_sx); return (error); } /* * Save the new accounting file vnode, and schedule the new * free space watcher. */ acct_vp = nd.ni_vp; acct_cred = crhold(td->td_ucred); acct_flags = flags; if (acct_state & ACCT_RUNNING) acct_state &= ~ACCT_EXITREQ; else { /* * Try to start up an accounting kthread. We may start more * than one, but if so the extras will commit suicide as * soon as they start up. */ error = kproc_create(acct_thread, NULL, NULL, 0, 0, "accounting"); if (error) { (void) acct_disable(td, 0); sx_xunlock(&acct_sx); log(LOG_NOTICE, "Unable to start accounting thread\n"); return (error); } } acct_configured = 1; sx_xunlock(&acct_sx); if (!replacing) log(LOG_NOTICE, "Accounting enabled\n"); return (error); } /* * Disable currently in-progress accounting by closing the vnode, dropping * our reference to the credential, and clearing the vnode's flags. */ static int acct_disable(struct thread *td, int logging) { int error; sx_assert(&acct_sx, SX_XLOCKED); error = vn_close(acct_vp, acct_flags, acct_cred, td); crfree(acct_cred); acct_configured = 0; acct_vp = NULL; acct_cred = NULL; acct_flags = 0; if (logging) log(LOG_NOTICE, "Accounting disabled\n"); return (error); } /* * Write out process accounting information, on process exit. * Data to be written out is specified in Leffler, et al. * and are enumerated below. (They're also noted in the system * "acct.h" header file.) */ int acct_process(struct thread *td) { struct acctv3 acct; struct timeval ut, st, tmp; struct proc *p; struct rusage ru; int t, ret; /* * Lockless check of accounting condition before doing the hard * work. */ if (acct_vp == NULL || acct_suspended) return (0); memset(&acct, 0, sizeof(acct)); sx_slock(&acct_sx); /* * If accounting isn't enabled, don't bother. Have to check again * once we own the lock in case we raced with disabling of accounting * by another thread. */ if (acct_vp == NULL || acct_suspended) { sx_sunlock(&acct_sx); return (0); } p = td->td_proc; td->td_pflags2 |= TDP2_ACCT; /* * Get process accounting information. */ sx_slock(&proctree_lock); PROC_LOCK(p); /* (1) The terminal from which the process was started */ if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp) acct.ac_tty = tty_udev(p->p_pgrp->pg_session->s_ttyp); else acct.ac_tty = NODEV; sx_sunlock(&proctree_lock); /* (2) The name of the command that ran */ bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm); /* (3) The amount of user and system time that was used */ rufetchcalc(p, &ru, &ut, &st); acct.ac_utime = encode_timeval(ut); acct.ac_stime = encode_timeval(st); /* (4) The elapsed time the command ran (and its starting time) */ getboottime(&tmp); timevaladd(&tmp, &p->p_stats->p_start); acct.ac_btime = tmp.tv_sec; microuptime(&tmp); timevalsub(&tmp, &p->p_stats->p_start); acct.ac_etime = encode_timeval(tmp); /* (5) The average amount of memory used */ tmp = ut; timevaladd(&tmp, &st); /* Convert tmp (i.e. u + s) into hz units to match ru_i*. */ t = tmp.tv_sec * hz + tmp.tv_usec / tick; if (t) acct.ac_mem = encode_long((ru.ru_ixrss + ru.ru_idrss + + ru.ru_isrss) / t); else acct.ac_mem = 0; /* (6) The number of disk I/O operations done */ acct.ac_io = encode_long(ru.ru_inblock + ru.ru_oublock); /* (7) The UID and GID of the process */ acct.ac_uid = p->p_ucred->cr_ruid; acct.ac_gid = p->p_ucred->cr_rgid; /* (8) The boolean flags that tell how the process terminated, etc. */ acct.ac_flagx = p->p_acflag; PROC_UNLOCK(p); /* Setup ancillary structure fields. */ acct.ac_flagx |= ANVER; acct.ac_zero = 0; acct.ac_version = 3; acct.ac_len = acct.ac_len2 = sizeof(acct); /* * Write the accounting information to the file. */ ret = vn_rdwr(UIO_WRITE, acct_vp, (caddr_t)&acct, sizeof (acct), (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, acct_cred, NOCRED, NULL, td); sx_sunlock(&acct_sx); td->td_pflags2 &= ~TDP2_ACCT; return (ret); } /* FLOAT_CONVERSION_START (Regression testing; don't remove this line.) */ /* Convert timevals and longs into IEEE-754 bit patterns. */ /* Mantissa mask (MSB is implied, so subtract 1). */ #define MANT_MASK ((1 << (FLT_MANT_DIG - 1)) - 1) /* * We calculate integer values to a precision of approximately * 28 bits. * This is high-enough precision to fill the 24 float bits * and low-enough to avoid overflowing the 32 int bits. */ #define CALC_BITS 28 /* log_2(1000000). */ #define LOG2_1M 20 /* * Convert the elements of a timeval into a 32-bit word holding * the bits of a IEEE-754 float. * The float value represents the timeval's value in microsecond units. */ static uint32_t encode_timeval(struct timeval tv) { int log2_s; int val, exp; /* Unnormalized value and exponent */ int norm_exp; /* Normalized exponent */ int shift; /* * First calculate value and exponent to about CALC_BITS precision. * Note that the following conditionals have been ordered so that * the most common cases appear first. */ if (tv.tv_sec == 0) { if (tv.tv_usec == 0) return (0); exp = 0; val = tv.tv_usec; } else { /* * Calculate the value to a precision of approximately * CALC_BITS. */ log2_s = fls(tv.tv_sec) - 1; if (log2_s + LOG2_1M < CALC_BITS) { exp = 0; val = 1000000 * tv.tv_sec + tv.tv_usec; } else { exp = log2_s + LOG2_1M - CALC_BITS; val = (unsigned int)(((uint64_t)1000000 * tv.tv_sec + tv.tv_usec) >> exp); } } /* Now normalize and pack the value into an IEEE-754 float. */ norm_exp = fls(val) - 1; shift = FLT_MANT_DIG - norm_exp - 1; #ifdef ACCT_DEBUG printf("val=%d exp=%d shift=%d log2(val)=%d\n", val, exp, shift, norm_exp); printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp, ((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK)); #endif return (((FLT_MAX_EXP - 1 + exp + norm_exp) << (FLT_MANT_DIG - 1)) | ((shift > 0 ? val << shift : val >> -shift) & MANT_MASK)); } /* * Convert a non-negative long value into the bit pattern of * an IEEE-754 float value. */ static uint32_t encode_long(long val) { int norm_exp; /* Normalized exponent */ int shift; if (val == 0) return (0); if (val < 0) { log(LOG_NOTICE, "encode_long: negative value %ld in accounting record\n", val); val = LONG_MAX; } norm_exp = fls(val) - 1; shift = FLT_MANT_DIG - norm_exp - 1; #ifdef ACCT_DEBUG printf("val=%d shift=%d log2(val)=%d\n", val, shift, norm_exp); printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp, ((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK)); #endif return (((FLT_MAX_EXP - 1 + norm_exp) << (FLT_MANT_DIG - 1)) | ((shift > 0 ? val << shift : val >> -shift) & MANT_MASK)); } /* FLOAT_CONVERSION_END (Regression testing; don't remove this line.) */ /* * Periodically check the filesystem to see if accounting * should be turned on or off. Beware the case where the vnode * has been vgone()'d out from underneath us, e.g. when the file * system containing the accounting file has been forcibly unmounted. */ /* ARGSUSED */ static void acctwatch(void) { struct statfs *sp; sx_assert(&acct_sx, SX_XLOCKED); /* * If accounting was disabled before our kthread was scheduled, * then acct_vp might be NULL. If so, just ask our kthread to * exit and return. */ if (acct_vp == NULL) { acct_state |= ACCT_EXITREQ; return; } /* * If our vnode is no longer valid, tear it down and signal the * accounting thread to die. */ if (acct_vp->v_type == VBAD) { (void) acct_disable(NULL, 1); acct_state |= ACCT_EXITREQ; return; } /* * Stopping here is better than continuing, maybe it will be VBAD * next time around. */ sp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); if (VFS_STATFS(acct_vp->v_mount, sp) < 0) { free(sp, M_STATFS); return; } if (acct_suspended) { if (sp->f_bavail > (int64_t)(acctresume * sp->f_blocks / 100)) { acct_suspended = 0; log(LOG_NOTICE, "Accounting resumed\n"); } } else { if (sp->f_bavail <= (int64_t)(acctsuspend * sp->f_blocks / 100)) { acct_suspended = 1; log(LOG_NOTICE, "Accounting suspended\n"); } } free(sp, M_STATFS); } /* * The main loop for the dedicated kernel thread that periodically calls * acctwatch(). */ static void acct_thread(void *dummy) { u_char pri; /* This is a low-priority kernel thread. */ pri = PRI_MAX_KERN; thread_lock(curthread); sched_prio(curthread, pri); thread_unlock(curthread); /* If another accounting kthread is already running, just die. */ sx_xlock(&acct_sx); if (acct_state & ACCT_RUNNING) { sx_xunlock(&acct_sx); kproc_exit(0); } acct_state |= ACCT_RUNNING; /* Loop until we are asked to exit. */ while (!(acct_state & ACCT_EXITREQ)) { /* Perform our periodic checks. */ acctwatch(); /* * We check this flag again before sleeping since the * acctwatch() might have shut down accounting and asked us * to exit. */ if (!(acct_state & ACCT_EXITREQ)) { sx_sleep(&acct_state, &acct_sx, 0, "-", acctchkfreq * hz); } } /* * Acknowledge the exit request and shutdown. We clear both the * exit request and running flags. */ acct_state = 0; sx_xunlock(&acct_sx); kproc_exit(0); } diff --git a/sys/kern/kern_context.c b/sys/kern/kern_context.c index 516c54b75852..8612c95ae641 100644 --- a/sys/kern/kern_context.c +++ b/sys/kern/kern_context.c @@ -1,131 +1,130 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002 Daniel M. Eischen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include -#include #include #include #include #include /* * The first two fields of a ucontext_t are the signal mask and the machine * context. The next field is uc_link; we want to avoid destroying the link * when copying out contexts. */ #define UC_COPY_SIZE offsetof(ucontext_t, uc_link) #ifndef _SYS_SYSPROTO_H_ struct getcontext_args { struct __ucontext *ucp; } struct setcontext_args { const struct __ucontext_t *ucp; } struct swapcontext_args { struct __ucontext *oucp; const struct __ucontext_t *ucp; } #endif int sys_getcontext(struct thread *td, struct getcontext_args *uap) { ucontext_t uc; int ret; if (uap->ucp == NULL) ret = EINVAL; else { bzero(&uc, sizeof(ucontext_t)); get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); PROC_LOCK(td->td_proc); uc.uc_sigmask = td->td_sigmask; PROC_UNLOCK(td->td_proc); ret = copyout(&uc, uap->ucp, UC_COPY_SIZE); } return (ret); } int sys_setcontext(struct thread *td, struct setcontext_args *uap) { ucontext_t uc; int ret; if (uap->ucp == NULL) ret = EINVAL; else { ret = copyin(uap->ucp, &uc, UC_COPY_SIZE); if (ret == 0) { ret = set_mcontext(td, &uc.uc_mcontext); if (ret == 0) { kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); } } } return (ret == 0 ? EJUSTRETURN : ret); } int sys_swapcontext(struct thread *td, struct swapcontext_args *uap) { ucontext_t uc; int ret; if (uap->oucp == NULL || uap->ucp == NULL) ret = EINVAL; else { bzero(&uc, sizeof(ucontext_t)); get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); PROC_LOCK(td->td_proc); uc.uc_sigmask = td->td_sigmask; PROC_UNLOCK(td->td_proc); ret = copyout(&uc, uap->oucp, UC_COPY_SIZE); if (ret == 0) { ret = copyin(uap->ucp, &uc, UC_COPY_SIZE); if (ret == 0) { ret = set_mcontext(td, &uc.uc_mcontext); if (ret == 0) { kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); } } } } return (ret == 0 ? EJUSTRETURN : ret); } diff --git a/sys/kern/kern_environment.c b/sys/kern/kern_environment.c index 2a4c62d64a0f..77fdf833c530 100644 --- a/sys/kern/kern_environment.c +++ b/sys/kern/kern_environment.c @@ -1,1127 +1,1126 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1998 Michael Smith * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * The unified bootloader passes us a pointer to a preserved copy of * bootstrap/kernel environment variables. We convert them to a * dynamic array of strings later when the VM subsystem is up. * * We make these available through the kenv(2) syscall for userland * and through kern_getenv()/freeenv() kern_setenv() kern_unsetenv() testenv() for * the kernel. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include static char *_getenv_dynamic_locked(const char *name, int *idx); static char *_getenv_dynamic(const char *name, int *idx); static char *kenv_acquire(const char *name); static void kenv_release(const char *buf); static MALLOC_DEFINE(M_KENV, "kenv", "kernel environment"); #define KENV_SIZE 512 /* Maximum number of environment strings */ static uma_zone_t kenv_zone; static int kenv_mvallen = KENV_MVALLEN; /* pointer to the config-generated static environment */ char *kern_envp; /* pointer to the md-static environment */ char *md_envp; static int md_env_len; static int md_env_pos; static char *kernenv_next(char *); /* dynamic environment variables */ char **kenvp; struct mtx kenv_lock; /* * No need to protect this with a mutex since SYSINITS are single threaded. */ bool dynamic_kenv; #define KENV_CHECK if (!dynamic_kenv) \ panic("%s: called before SI_SUB_KMEM", __func__) static int kenv_dump(struct thread *td, char **envp, int what, char *value, int len) { char *buffer, *senv; size_t done, needed, buflen; int error; error = 0; buffer = NULL; done = needed = 0; MPASS(what == KENV_DUMP || what == KENV_DUMP_LOADER || what == KENV_DUMP_STATIC); /* * For non-dynamic kernel environment, we pass in either md_envp or * kern_envp and we must traverse with kernenv_next(). This shuffling * of pointers simplifies the below loop by only differing in how envp * is modified. */ if (what != KENV_DUMP) { senv = (char *)envp; envp = &senv; } buflen = len; if (buflen > KENV_SIZE * (KENV_MNAMELEN + kenv_mvallen + 2)) buflen = KENV_SIZE * (KENV_MNAMELEN + kenv_mvallen + 2); if (len > 0 && value != NULL) buffer = malloc(buflen, M_TEMP, M_WAITOK|M_ZERO); /* Only take the lock for the dynamic kenv. */ if (what == KENV_DUMP) mtx_lock(&kenv_lock); while (*envp != NULL) { len = strlen(*envp) + 1; needed += len; len = min(len, buflen - done); /* * If called with a NULL or insufficiently large * buffer, just keep computing the required size. */ if (value != NULL && buffer != NULL && len > 0) { bcopy(*envp, buffer + done, len); done += len; } /* Advance the pointer depending on the kenv format. */ if (what == KENV_DUMP) envp++; else senv = kernenv_next(senv); } if (what == KENV_DUMP) mtx_unlock(&kenv_lock); if (buffer != NULL) { error = copyout(buffer, value, done); free(buffer, M_TEMP); } td->td_retval[0] = ((done == needed) ? 0 : needed); return (error); } int sys_kenv(struct thread *td, struct kenv_args *uap) { char *name, *value; size_t len; int error; KASSERT(dynamic_kenv, ("kenv: dynamic_kenv = false")); error = 0; switch (uap->what) { case KENV_DUMP: #ifdef MAC error = mac_kenv_check_dump(td->td_ucred); if (error) return (error); #endif return (kenv_dump(td, kenvp, uap->what, uap->value, uap->len)); case KENV_DUMP_LOADER: case KENV_DUMP_STATIC: #ifdef MAC error = mac_kenv_check_dump(td->td_ucred); if (error) return (error); #endif #ifdef PRESERVE_EARLY_KENV return (kenv_dump(td, uap->what == KENV_DUMP_LOADER ? (char **)md_envp : (char **)kern_envp, uap->what, uap->value, uap->len)); #else return (ENOENT); #endif case KENV_SET: error = priv_check(td, PRIV_KENV_SET); if (error) return (error); break; case KENV_UNSET: error = priv_check(td, PRIV_KENV_UNSET); if (error) return (error); break; } name = malloc(KENV_MNAMELEN + 1, M_TEMP, M_WAITOK); error = copyinstr(uap->name, name, KENV_MNAMELEN + 1, NULL); if (error) goto done; switch (uap->what) { case KENV_GET: #ifdef MAC error = mac_kenv_check_get(td->td_ucred, name); if (error) goto done; #endif value = kern_getenv(name); if (value == NULL) { error = ENOENT; goto done; } len = strlen(value) + 1; if (len > uap->len) len = uap->len; error = copyout(value, uap->value, len); freeenv(value); if (error) goto done; td->td_retval[0] = len; break; case KENV_SET: len = uap->len; if (len < 1) { error = EINVAL; goto done; } if (len > kenv_mvallen + 1) len = kenv_mvallen + 1; value = malloc(len, M_TEMP, M_WAITOK); error = copyinstr(uap->value, value, len, NULL); if (error) { free(value, M_TEMP); goto done; } #ifdef MAC error = mac_kenv_check_set(td->td_ucred, name, value); if (error == 0) #endif kern_setenv(name, value); free(value, M_TEMP); break; case KENV_UNSET: #ifdef MAC error = mac_kenv_check_unset(td->td_ucred, name); if (error) goto done; #endif error = kern_unsetenv(name); if (error) error = ENOENT; break; default: error = EINVAL; break; } done: free(name, M_TEMP); return (error); } /* * Populate the initial kernel environment. * * This is called very early in MD startup, either to provide a copy of the * environment obtained from a boot loader, or to provide an empty buffer into * which MD code can store an initial environment using kern_setenv() calls. * * kern_envp is set to the static_env generated by config(8). This implements * the env keyword described in config(5). * * If len is non-zero, the caller is providing an empty buffer. The caller will * subsequently use kern_setenv() to add up to len bytes of initial environment * before the dynamic environment is available. * * If len is zero, the caller is providing a pre-loaded buffer containing * environment strings. Additional strings cannot be added until the dynamic * environment is available. The memory pointed to must remain stable at least * until sysinit runs init_dynamic_kenv() and preferably until after SI_SUB_KMEM * is finished so that subr_hints routines may continue to use it until the * environments have been fully merged at the end of the pass. If no initial * environment is available from the boot loader, passing a NULL pointer allows * the static_env to be installed if it is configured. In this case, any call * to kern_setenv() prior to the setup of the dynamic environment will result in * a panic. */ void init_static_kenv(char *buf, size_t len) { KASSERT(!dynamic_kenv, ("kenv: dynamic_kenv already initialized")); /* * Suitably sized means it must be able to hold at least one empty * variable, otherwise things go belly up if a kern_getenv call is * made without a prior call to kern_setenv as we have a malformed * environment. */ KASSERT(len == 0 || len >= 2, ("kenv: static env must be initialized or suitably sized")); KASSERT(len == 0 || (*buf == '\0' && *(buf + 1) == '\0'), ("kenv: sized buffer must be initially empty")); /* * We may be called twice, with the second call needed to relocate * md_envp after enabling paging. md_envp is then garbage if it is * not null and the relocation will move it. Discard it so as to * not crash using its old value in our first call to kern_getenv(). * * The second call gives the same environment as the first except * in silly configurations where the static env disables itself. * * Other env calls don't handle possibly-garbage pointers, so must * not be made between enabling paging and calling here. */ md_envp = NULL; md_env_len = 0; md_env_pos = 0; /* * Give the static environment a chance to disable the loader(8) * environment first. This is done with loader_env.disabled=1. * * static_env and static_hints may both be disabled, but in slightly * different ways. For static_env, we just don't setup kern_envp and * it's as if a static env wasn't even provided. For static_hints, * we effectively zero out the buffer to stop the rest of the kernel * from being able to use it. * * We're intentionally setting this up so that static_hints.disabled may * be specified in either the MD env or the static env. This keeps us * consistent in our new world view. * * As a warning, the static environment may not be disabled in any way * if the static environment has disabled the loader environment. */ kern_envp = static_env; if (!getenv_is_true("loader_env.disabled")) { md_envp = buf; md_env_len = len; md_env_pos = 0; if (getenv_is_true("static_env.disabled")) { kern_envp[0] = '\0'; kern_envp[1] = '\0'; } } if (getenv_is_true("static_hints.disabled")) { static_hints[0] = '\0'; static_hints[1] = '\0'; } } static void init_dynamic_kenv_from(char *init_env, int *curpos) { char *cp, *cpnext, *eqpos, *found; size_t len; int i; if (init_env && *init_env != '\0') { found = NULL; i = *curpos; for (cp = init_env; cp != NULL; cp = cpnext) { cpnext = kernenv_next(cp); len = strlen(cp) + 1; if (len > KENV_MNAMELEN + 1 + kenv_mvallen + 1) { printf( "WARNING: too long kenv string, ignoring %s\n", cp); goto sanitize; } eqpos = strchr(cp, '='); if (eqpos == NULL) { printf( "WARNING: malformed static env value, ignoring %s\n", cp); goto sanitize; } *eqpos = 0; /* * De-dupe the environment as we go. We don't add the * duplicated assignments because config(8) will flip * the order of the static environment around to make * kernel processing match the order of specification * in the kernel config. */ found = _getenv_dynamic_locked(cp, NULL); *eqpos = '='; if (found != NULL) goto sanitize; if (i > KENV_SIZE) { printf( "WARNING: too many kenv strings, ignoring %s\n", cp); goto sanitize; } kenvp[i] = malloc(len, M_KENV, M_WAITOK); strcpy(kenvp[i++], cp); sanitize: #ifdef PRESERVE_EARLY_KENV continue; #else explicit_bzero(cp, len - 1); #endif } *curpos = i; } } /* * Setup the dynamic kernel environment. */ static void init_dynamic_kenv(void *data __unused) { int dynamic_envpos; int size; TUNABLE_INT_FETCH("kenv_mvallen", &kenv_mvallen); size = KENV_MNAMELEN + 1 + kenv_mvallen + 1; kenv_zone = uma_zcreate("kenv", size, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); kenvp = malloc((KENV_SIZE + 1) * sizeof(char *), M_KENV, M_WAITOK | M_ZERO); dynamic_envpos = 0; init_dynamic_kenv_from(md_envp, &dynamic_envpos); init_dynamic_kenv_from(kern_envp, &dynamic_envpos); kenvp[dynamic_envpos] = NULL; mtx_init(&kenv_lock, "kernel environment", NULL, MTX_DEF); dynamic_kenv = true; } SYSINIT(kenv, SI_SUB_KMEM + 1, SI_ORDER_FIRST, init_dynamic_kenv, NULL); void freeenv(char *env) { if (dynamic_kenv && env != NULL) { explicit_bzero(env, strlen(env)); uma_zfree(kenv_zone, env); } } /* * Internal functions for string lookup. */ static char * _getenv_dynamic_locked(const char *name, int *idx) { char *cp; int len, i; len = strlen(name); for (cp = kenvp[0], i = 0; cp != NULL; cp = kenvp[++i]) { if ((strncmp(cp, name, len) == 0) && (cp[len] == '=')) { if (idx != NULL) *idx = i; return (cp + len + 1); } } return (NULL); } static char * _getenv_dynamic(const char *name, int *idx) { mtx_assert(&kenv_lock, MA_OWNED); return (_getenv_dynamic_locked(name, idx)); } static char * _getenv_static_from(char *chkenv, const char *name) { char *cp, *ep; int len; for (cp = chkenv; cp != NULL; cp = kernenv_next(cp)) { for (ep = cp; (*ep != '=') && (*ep != 0); ep++) ; if (*ep != '=') continue; len = ep - cp; ep++; if (!strncmp(name, cp, len) && name[len] == 0) return (ep); } return (NULL); } static char * _getenv_static(const char *name) { char *val; val = _getenv_static_from(md_envp, name); if (val != NULL) return (val); val = _getenv_static_from(kern_envp, name); if (val != NULL) return (val); return (NULL); } /* * Look up an environment variable by name. * Return a pointer to the string if found. * The pointer has to be freed with freeenv() * after use. */ char * kern_getenv(const char *name) { char *cp, *ret; int len; if (dynamic_kenv) { len = KENV_MNAMELEN + 1 + kenv_mvallen + 1; ret = uma_zalloc(kenv_zone, M_WAITOK | M_ZERO); mtx_lock(&kenv_lock); cp = _getenv_dynamic(name, NULL); if (cp != NULL) strlcpy(ret, cp, len); mtx_unlock(&kenv_lock); if (cp == NULL) { uma_zfree(kenv_zone, ret); ret = NULL; } } else ret = _getenv_static(name); return (ret); } /* * Test if an environment variable is defined. */ int testenv(const char *name) { char *cp; cp = kenv_acquire(name); kenv_release(cp); if (cp != NULL) return (1); return (0); } /* * Set an environment variable in the MD-static environment. This cannot * feasibly be done on config(8)-generated static environments as they don't * generally include space for extra variables. */ static int setenv_static(const char *name, const char *value) { int len; if (md_env_pos >= md_env_len) return (-1); /* Check space for x=y and two nuls */ len = strlen(name) + strlen(value); if (len + 3 < md_env_len - md_env_pos) { len = sprintf(&md_envp[md_env_pos], "%s=%s", name, value); md_env_pos += len+1; md_envp[md_env_pos] = '\0'; return (0); } else return (-1); } /* * Set an environment variable by name. */ int kern_setenv(const char *name, const char *value) { char *buf, *cp, *oldenv; int namelen, vallen, i; if (!dynamic_kenv && md_env_len > 0) return (setenv_static(name, value)); KENV_CHECK; namelen = strlen(name) + 1; if (namelen > KENV_MNAMELEN + 1) return (-1); vallen = strlen(value) + 1; if (vallen > kenv_mvallen + 1) return (-1); buf = malloc(namelen + vallen, M_KENV, M_WAITOK); sprintf(buf, "%s=%s", name, value); mtx_lock(&kenv_lock); cp = _getenv_dynamic(name, &i); if (cp != NULL) { oldenv = kenvp[i]; kenvp[i] = buf; mtx_unlock(&kenv_lock); free(oldenv, M_KENV); } else { /* We add the option if it wasn't found */ for (i = 0; (cp = kenvp[i]) != NULL; i++) ; /* Bounds checking */ if (i < 0 || i >= KENV_SIZE) { free(buf, M_KENV); mtx_unlock(&kenv_lock); return (-1); } kenvp[i] = buf; kenvp[i + 1] = NULL; mtx_unlock(&kenv_lock); } return (0); } /* * Unset an environment variable string. */ int kern_unsetenv(const char *name) { char *cp, *oldenv; int i, j; KENV_CHECK; mtx_lock(&kenv_lock); cp = _getenv_dynamic(name, &i); if (cp != NULL) { oldenv = kenvp[i]; for (j = i + 1; kenvp[j] != NULL; j++) kenvp[i++] = kenvp[j]; kenvp[i] = NULL; mtx_unlock(&kenv_lock); zfree(oldenv, M_KENV); return (0); } mtx_unlock(&kenv_lock); return (-1); } /* * Return the internal kenv buffer for the variable name, if it exists. * If the dynamic kenv is initialized and the name is present, return * with kenv_lock held. */ static char * kenv_acquire(const char *name) { char *value; if (dynamic_kenv) { mtx_lock(&kenv_lock); value = _getenv_dynamic(name, NULL); if (value == NULL) mtx_unlock(&kenv_lock); return (value); } else return (_getenv_static(name)); } /* * Undo a previous kenv_acquire() operation */ static void kenv_release(const char *buf) { if ((buf != NULL) && dynamic_kenv) mtx_unlock(&kenv_lock); } /* * Return a string value from an environment variable. */ int getenv_string(const char *name, char *data, int size) { char *cp; cp = kenv_acquire(name); if (cp != NULL) strlcpy(data, cp, size); kenv_release(cp); return (cp != NULL); } /* * Return an array of integers at the given type size and signedness. */ int getenv_array(const char *name, void *pdata, int size, int *psize, int type_size, bool allow_signed) { uint8_t shift; int64_t value; int64_t old; const char *buf; char *end; const char *ptr; int n; int rc; rc = 0; /* assume failure */ buf = kenv_acquire(name); if (buf == NULL) goto error; /* get maximum number of elements */ size /= type_size; n = 0; for (ptr = buf; *ptr != 0; ) { value = strtoq(ptr, &end, 0); /* check if signed numbers are allowed */ if (value < 0 && !allow_signed) goto error; /* check for invalid value */ if (ptr == end) goto error; /* check for valid suffix */ switch (*end) { case 't': case 'T': shift = 40; end++; break; case 'g': case 'G': shift = 30; end++; break; case 'm': case 'M': shift = 20; end++; break; case 'k': case 'K': shift = 10; end++; break; case ' ': case '\t': case ',': case 0: shift = 0; break; default: /* garbage after numeric value */ goto error; } /* skip till next value, if any */ while (*end == '\t' || *end == ',' || *end == ' ') end++; /* update pointer */ ptr = end; /* apply shift */ old = value; value <<= shift; /* overflow check */ if ((value >> shift) != old) goto error; /* check for buffer overflow */ if (n >= size) goto error; /* store value according to type size */ switch (type_size) { case 1: if (allow_signed) { if (value < SCHAR_MIN || value > SCHAR_MAX) goto error; } else { if (value < 0 || value > UCHAR_MAX) goto error; } ((uint8_t *)pdata)[n] = (uint8_t)value; break; case 2: if (allow_signed) { if (value < SHRT_MIN || value > SHRT_MAX) goto error; } else { if (value < 0 || value > USHRT_MAX) goto error; } ((uint16_t *)pdata)[n] = (uint16_t)value; break; case 4: if (allow_signed) { if (value < INT_MIN || value > INT_MAX) goto error; } else { if (value > UINT_MAX) goto error; } ((uint32_t *)pdata)[n] = (uint32_t)value; break; case 8: ((uint64_t *)pdata)[n] = (uint64_t)value; break; default: goto error; } n++; } *psize = n * type_size; if (n != 0) rc = 1; /* success */ error: kenv_release(buf); return (rc); } /* * Return an integer value from an environment variable. */ int getenv_int(const char *name, int *data) { quad_t tmp; int rval; rval = getenv_quad(name, &tmp); if (rval) *data = (int) tmp; return (rval); } /* * Return an unsigned integer value from an environment variable. */ int getenv_uint(const char *name, unsigned int *data) { quad_t tmp; int rval; rval = getenv_quad(name, &tmp); if (rval) *data = (unsigned int) tmp; return (rval); } /* * Return an int64_t value from an environment variable. */ int getenv_int64(const char *name, int64_t *data) { quad_t tmp; int64_t rval; rval = getenv_quad(name, &tmp); if (rval) *data = (int64_t) tmp; return (rval); } /* * Return an uint64_t value from an environment variable. */ int getenv_uint64(const char *name, uint64_t *data) { quad_t tmp; uint64_t rval; rval = getenv_quad(name, &tmp); if (rval) *data = (uint64_t) tmp; return (rval); } /* * Return a long value from an environment variable. */ int getenv_long(const char *name, long *data) { quad_t tmp; int rval; rval = getenv_quad(name, &tmp); if (rval) *data = (long) tmp; return (rval); } /* * Return an unsigned long value from an environment variable. */ int getenv_ulong(const char *name, unsigned long *data) { quad_t tmp; int rval; rval = getenv_quad(name, &tmp); if (rval) *data = (unsigned long) tmp; return (rval); } /* * Return a quad_t value from an environment variable. */ int getenv_quad(const char *name, quad_t *data) { const char *value; char suffix, *vtp; quad_t iv; value = kenv_acquire(name); if (value == NULL) { goto error; } iv = strtoq(value, &vtp, 0); if (vtp == value || (vtp[0] != '\0' && vtp[1] != '\0')) { goto error; } suffix = vtp[0]; kenv_release(value); switch (suffix) { case 't': case 'T': iv *= 1024; /* FALLTHROUGH */ case 'g': case 'G': iv *= 1024; /* FALLTHROUGH */ case 'm': case 'M': iv *= 1024; /* FALLTHROUGH */ case 'k': case 'K': iv *= 1024; case '\0': break; default: return (0); } *data = iv; return (1); error: kenv_release(value); return (0); } /* * Return a boolean value from an environment variable. This can be in * numerical or string form, i.e. "1" or "true". */ int getenv_bool(const char *name, bool *data) { char *val; int ret = 0; if (name == NULL) return (0); val = kern_getenv(name); if (val == NULL) return (0); if ((strcmp(val, "1") == 0) || (strcasecmp(val, "true") == 0)) { *data = true; ret = 1; } else if ((strcmp(val, "0") == 0) || (strcasecmp(val, "false") == 0)) { *data = false; ret = 1; } else { /* Spit out a warning for malformed boolean variables. */ printf("Environment variable %s has non-boolean value \"%s\"\n", name, val); } freeenv(val); return (ret); } /* * Wrapper around getenv_bool to easily check for true. */ bool getenv_is_true(const char *name) { bool val; if (getenv_bool(name, &val) != 0) return (val); return (false); } /* * Wrapper around getenv_bool to easily check for false. */ bool getenv_is_false(const char *name) { bool val; if (getenv_bool(name, &val) != 0) return (!val); return (false); } /* * Find the next entry after the one which (cp) falls within, return a * pointer to its start or NULL if there are no more. */ static char * kernenv_next(char *cp) { if (cp != NULL) { while (*cp != 0) cp++; cp++; if (*cp == 0) cp = NULL; } return (cp); } void tunable_int_init(void *data) { struct tunable_int *d = (struct tunable_int *)data; TUNABLE_INT_FETCH(d->path, d->var); } void tunable_long_init(void *data) { struct tunable_long *d = (struct tunable_long *)data; TUNABLE_LONG_FETCH(d->path, d->var); } void tunable_ulong_init(void *data) { struct tunable_ulong *d = (struct tunable_ulong *)data; TUNABLE_ULONG_FETCH(d->path, d->var); } void tunable_int64_init(void *data) { struct tunable_int64 *d = (struct tunable_int64 *)data; TUNABLE_INT64_FETCH(d->path, d->var); } void tunable_uint64_init(void *data) { struct tunable_uint64 *d = (struct tunable_uint64 *)data; TUNABLE_UINT64_FETCH(d->path, d->var); } void tunable_quad_init(void *data) { struct tunable_quad *d = (struct tunable_quad *)data; TUNABLE_QUAD_FETCH(d->path, d->var); } void tunable_bool_init(void *data) { struct tunable_bool *d = (struct tunable_bool *)data; TUNABLE_BOOL_FETCH(d->path, d->var); } void tunable_str_init(void *data) { struct tunable_str *d = (struct tunable_str *)data; TUNABLE_STR_FETCH(d->path, d->var, d->size); } diff --git a/sys/kern/kern_ffclock.c b/sys/kern/kern_ffclock.c index e76e2b6652f4..0ab8e35fbe02 100644 --- a/sys/kern/kern_ffclock.c +++ b/sys/kern/kern_ffclock.c @@ -1,486 +1,485 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 The University of Melbourne * All rights reserved. * * This software was developed by Julien Ridoux at the University of Melbourne * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ffclock.h" #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #ifdef FFCLOCK FEATURE(ffclock, "Feed-forward clock support"); extern struct ffclock_estimate ffclock_estimate; extern struct bintime ffclock_boottime; extern int8_t ffclock_updated; extern struct mtx ffclock_mtx; /* * Feed-forward clock absolute time. This should be the preferred way to read * the feed-forward clock for "wall-clock" type time. The flags allow to compose * various flavours of absolute time (e.g. with or without leap seconds taken * into account). If valid pointers are provided, the ffcounter value and an * upper bound on clock error associated with the bintime are provided. * NOTE: use ffclock_convert_abs() to differ the conversion of a ffcounter value * read earlier. */ void ffclock_abstime(ffcounter *ffcount, struct bintime *bt, struct bintime *error_bound, uint32_t flags) { struct ffclock_estimate cest; ffcounter ffc; ffcounter update_ffcount; ffcounter ffdelta_error; /* Get counter and corresponding time. */ if ((flags & FFCLOCK_FAST) == FFCLOCK_FAST) ffclock_last_tick(&ffc, bt, flags); else { ffclock_read_counter(&ffc); ffclock_convert_abs(ffc, bt, flags); } /* Current ffclock estimate, use update_ffcount as generation number. */ do { update_ffcount = ffclock_estimate.update_ffcount; bcopy(&ffclock_estimate, &cest, sizeof(struct ffclock_estimate)); } while (update_ffcount != ffclock_estimate.update_ffcount); /* * Leap second adjustment. Total as seen by synchronisation algorithm * since it started. cest.leapsec_next is the ffcounter prediction of * when the next leapsecond occurs. */ if ((flags & FFCLOCK_LEAPSEC) == FFCLOCK_LEAPSEC) { bt->sec -= cest.leapsec_total; if (ffc > cest.leapsec_next) bt->sec -= cest.leapsec; } /* Boot time adjustment, for uptime/monotonic clocks. */ if ((flags & FFCLOCK_UPTIME) == FFCLOCK_UPTIME) { bintime_sub(bt, &ffclock_boottime); } /* Compute error bound if a valid pointer has been passed. */ if (error_bound) { ffdelta_error = ffc - cest.update_ffcount; ffclock_convert_diff(ffdelta_error, error_bound); /* 18446744073709 = int(2^64/1e12), err_bound_rate in [ps/s] */ bintime_mul(error_bound, cest.errb_rate * (uint64_t)18446744073709LL); /* 18446744073 = int(2^64 / 1e9), since err_abs in [ns] */ bintime_addx(error_bound, cest.errb_abs * (uint64_t)18446744073LL); } if (ffcount) *ffcount = ffc; } /* * Feed-forward difference clock. This should be the preferred way to convert a * time interval in ffcounter values into a time interval in seconds. If a valid * pointer is passed, an upper bound on the error in computing the time interval * in seconds is provided. */ void ffclock_difftime(ffcounter ffdelta, struct bintime *bt, struct bintime *error_bound) { ffcounter update_ffcount; uint32_t err_rate; ffclock_convert_diff(ffdelta, bt); if (error_bound) { do { update_ffcount = ffclock_estimate.update_ffcount; err_rate = ffclock_estimate.errb_rate; } while (update_ffcount != ffclock_estimate.update_ffcount); ffclock_convert_diff(ffdelta, error_bound); /* 18446744073709 = int(2^64/1e12), err_bound_rate in [ps/s] */ bintime_mul(error_bound, err_rate * (uint64_t)18446744073709LL); } } /* * Create a new kern.sysclock sysctl node, which will be home to some generic * sysclock configuration variables. Feed-forward clock specific variables will * live under the ffclock subnode. */ SYSCTL_NODE(_kern, OID_AUTO, sysclock, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "System clock related configuration"); SYSCTL_NODE(_kern_sysclock, OID_AUTO, ffclock, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Feed-forward clock configuration"); static char *sysclocks[] = {"feedback", "feed-forward"}; #define MAX_SYSCLOCK_NAME_LEN 16 #define NUM_SYSCLOCKS nitems(sysclocks) static int ffclock_version = 2; SYSCTL_INT(_kern_sysclock_ffclock, OID_AUTO, version, CTLFLAG_RD, &ffclock_version, 0, "Feed-forward clock kernel version"); /* List available sysclocks. */ static int sysctl_kern_sysclock_available(SYSCTL_HANDLER_ARGS) { struct sbuf *s; int clk, error; s = sbuf_new_for_sysctl(NULL, NULL, MAX_SYSCLOCK_NAME_LEN * NUM_SYSCLOCKS, req); if (s == NULL) return (ENOMEM); for (clk = 0; clk < NUM_SYSCLOCKS; clk++) { sbuf_cat(s, sysclocks[clk]); if (clk + 1 < NUM_SYSCLOCKS) sbuf_cat(s, " "); } error = sbuf_finish(s); sbuf_delete(s); return (error); } SYSCTL_PROC(_kern_sysclock, OID_AUTO, available, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, 0, 0, sysctl_kern_sysclock_available, "A", "List of available system clocks"); /* * Return the name of the active system clock if read, or attempt to change * the active system clock to the user specified one if written to. The active * system clock is read when calling any of the [get]{bin,nano,micro}[up]time() * functions. */ static int sysctl_kern_sysclock_active(SYSCTL_HANDLER_ARGS) { char newclock[MAX_SYSCLOCK_NAME_LEN]; int error; int clk; /* Return the name of the current active sysclock. */ strlcpy(newclock, sysclocks[sysclock_active], sizeof(newclock)); error = sysctl_handle_string(oidp, newclock, sizeof(newclock), req); /* Check for error or no change */ if (error != 0 || req->newptr == NULL) goto done; /* Change the active sysclock to the user specified one: */ error = EINVAL; for (clk = 0; clk < NUM_SYSCLOCKS; clk++) { if (strncmp(newclock, sysclocks[clk], MAX_SYSCLOCK_NAME_LEN - 1)) { continue; } sysclock_active = clk; error = 0; break; } done: return (error); } SYSCTL_PROC(_kern_sysclock, OID_AUTO, active, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, 0, sysctl_kern_sysclock_active, "A", "Name of the active system clock which is currently serving time"); static int sysctl_kern_ffclock_ffcounter_bypass = 0; SYSCTL_INT(_kern_sysclock_ffclock, OID_AUTO, ffcounter_bypass, CTLFLAG_RW, &sysctl_kern_ffclock_ffcounter_bypass, 0, "Use reliable hardware timecounter as the feed-forward counter"); /* * High level functions to access the Feed-Forward Clock. */ void ffclock_bintime(struct bintime *bt) { ffclock_abstime(NULL, bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC); } void ffclock_nanotime(struct timespec *tsp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC); bintime2timespec(&bt, tsp); } void ffclock_microtime(struct timeval *tvp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC); bintime2timeval(&bt, tvp); } void ffclock_getbintime(struct bintime *bt) { ffclock_abstime(NULL, bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC | FFCLOCK_FAST); } void ffclock_getnanotime(struct timespec *tsp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC | FFCLOCK_FAST); bintime2timespec(&bt, tsp); } void ffclock_getmicrotime(struct timeval *tvp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC | FFCLOCK_FAST); bintime2timeval(&bt, tvp); } void ffclock_binuptime(struct bintime *bt) { ffclock_abstime(NULL, bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME); } void ffclock_nanouptime(struct timespec *tsp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME); bintime2timespec(&bt, tsp); } void ffclock_microuptime(struct timeval *tvp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME); bintime2timeval(&bt, tvp); } void ffclock_getbinuptime(struct bintime *bt) { ffclock_abstime(NULL, bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME | FFCLOCK_FAST); } void ffclock_getnanouptime(struct timespec *tsp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME | FFCLOCK_FAST); bintime2timespec(&bt, tsp); } void ffclock_getmicrouptime(struct timeval *tvp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME | FFCLOCK_FAST); bintime2timeval(&bt, tvp); } void ffclock_bindifftime(ffcounter ffdelta, struct bintime *bt) { ffclock_difftime(ffdelta, bt, NULL); } void ffclock_nanodifftime(ffcounter ffdelta, struct timespec *tsp) { struct bintime bt; ffclock_difftime(ffdelta, &bt, NULL); bintime2timespec(&bt, tsp); } void ffclock_microdifftime(ffcounter ffdelta, struct timeval *tvp) { struct bintime bt; ffclock_difftime(ffdelta, &bt, NULL); bintime2timeval(&bt, tvp); } /* * System call allowing userland applications to retrieve the current value of * the Feed-Forward Clock counter. */ #ifndef _SYS_SYSPROTO_H_ struct ffclock_getcounter_args { ffcounter *ffcount; }; #endif /* ARGSUSED */ int sys_ffclock_getcounter(struct thread *td, struct ffclock_getcounter_args *uap) { ffcounter ffcount; int error; ffcount = 0; ffclock_read_counter(&ffcount); if (ffcount == 0) return (EAGAIN); error = copyout(&ffcount, uap->ffcount, sizeof(ffcounter)); return (error); } /* * System call allowing the synchronisation daemon to push new feed-forward clock * estimates to the kernel. Acquire ffclock_mtx to prevent concurrent updates * and ensure data consistency. * NOTE: ffclock_updated signals the fftimehands that new estimates are * available. The updated estimates are picked up by the fftimehands on next * tick, which could take as long as 1/hz seconds (if ticks are not missed). */ #ifndef _SYS_SYSPROTO_H_ struct ffclock_setestimate_args { struct ffclock_estimate *cest; }; #endif /* ARGSUSED */ int sys_ffclock_setestimate(struct thread *td, struct ffclock_setestimate_args *uap) { struct ffclock_estimate cest; int error; /* Reuse of PRIV_CLOCK_SETTIME. */ if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0) return (error); if ((error = copyin(uap->cest, &cest, sizeof(struct ffclock_estimate))) != 0) return (error); mtx_lock(&ffclock_mtx); memcpy(&ffclock_estimate, &cest, sizeof(struct ffclock_estimate)); ffclock_updated++; mtx_unlock(&ffclock_mtx); return (error); } /* * System call allowing userland applications to retrieve the clock estimates * stored within the kernel. It is useful to kickstart the synchronisation * daemon with the kernel's knowledge of hardware timecounter. */ #ifndef _SYS_SYSPROTO_H_ struct ffclock_getestimate_args { struct ffclock_estimate *cest; }; #endif /* ARGSUSED */ int sys_ffclock_getestimate(struct thread *td, struct ffclock_getestimate_args *uap) { struct ffclock_estimate cest; int error; mtx_lock(&ffclock_mtx); memcpy(&cest, &ffclock_estimate, sizeof(struct ffclock_estimate)); mtx_unlock(&ffclock_mtx); error = copyout(&cest, uap->cest, sizeof(struct ffclock_estimate)); return (error); } #else /* !FFCLOCK */ int sys_ffclock_getcounter(struct thread *td, struct ffclock_getcounter_args *uap) { return (ENOSYS); } int sys_ffclock_setestimate(struct thread *td, struct ffclock_setestimate_args *uap) { return (ENOSYS); } int sys_ffclock_getestimate(struct thread *td, struct ffclock_getestimate_args *uap) { return (ENOSYS); } #endif /* FFCLOCK */ diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c index e3c82ca5f432..1617e4f0d5b4 100644 --- a/sys/kern/kern_linker.c +++ b/sys/kern/kern_linker.c @@ -1,2338 +1,2337 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997-2000 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kld.h" #include "opt_hwpmc_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #ifdef DDB #include #endif #include #include #include "linker_if.h" #ifdef HWPMC_HOOKS #include #endif #ifdef KLD_DEBUG int kld_debug = 0; SYSCTL_INT(_debug, OID_AUTO, kld_debug, CTLFLAG_RWTUN, &kld_debug, 0, "Set various levels of KLD debug"); #endif /* These variables are used by kernel debuggers to enumerate loaded files. */ const int kld_off_address = offsetof(struct linker_file, address); const int kld_off_filename = offsetof(struct linker_file, filename); const int kld_off_pathname = offsetof(struct linker_file, pathname); const int kld_off_next = offsetof(struct linker_file, link.tqe_next); /* * static char *linker_search_path(const char *name, struct mod_depend * *verinfo); */ static const char *linker_basename(const char *path); /* * Find a currently loaded file given its filename. */ static linker_file_t linker_find_file_by_name(const char* _filename); /* * Find a currently loaded file given its file id. */ static linker_file_t linker_find_file_by_id(int _fileid); /* Metadata from the static kernel */ SET_DECLARE(modmetadata_set, struct mod_metadata); MALLOC_DEFINE(M_LINKER, "linker", "kernel linker"); linker_file_t linker_kernel_file; static struct sx kld_sx; /* kernel linker lock */ static u_int kld_busy; static struct thread *kld_busy_owner; /* * Load counter used by clients to determine if a linker file has been * re-loaded. This counter is incremented for each file load. */ static int loadcnt; static linker_class_list_t classes; static linker_file_list_t linker_files; static int next_file_id = 1; static int linker_no_more_classes = 0; #define LINKER_GET_NEXT_FILE_ID(a) do { \ linker_file_t lftmp; \ \ if (!cold) \ sx_assert(&kld_sx, SA_XLOCKED); \ retry: \ TAILQ_FOREACH(lftmp, &linker_files, link) { \ if (next_file_id == lftmp->id) { \ next_file_id++; \ goto retry; \ } \ } \ (a) = next_file_id; \ } while(0) /* XXX wrong name; we're looking at version provision tags here, not modules */ typedef TAILQ_HEAD(, modlist) modlisthead_t; struct modlist { TAILQ_ENTRY(modlist) link; /* chain together all modules */ linker_file_t container; const char *name; int version; }; typedef struct modlist *modlist_t; static modlisthead_t found_modules; static int linker_file_add_dependency(linker_file_t file, linker_file_t dep); static caddr_t linker_file_lookup_symbol_internal(linker_file_t file, const char* name, int deps); static int linker_load_module(const char *kldname, const char *modname, struct linker_file *parent, const struct mod_depend *verinfo, struct linker_file **lfpp); static modlist_t modlist_lookup2(const char *name, const struct mod_depend *verinfo); static void linker_init(void *arg) { sx_init(&kld_sx, "kernel linker"); TAILQ_INIT(&classes); TAILQ_INIT(&linker_files); } SYSINIT(linker, SI_SUB_KLD, SI_ORDER_FIRST, linker_init, NULL); static void linker_stop_class_add(void *arg) { linker_no_more_classes = 1; } SYSINIT(linker_class, SI_SUB_KLD, SI_ORDER_ANY, linker_stop_class_add, NULL); int linker_add_class(linker_class_t lc) { /* * We disallow any class registration past SI_ORDER_ANY * of SI_SUB_KLD. We bump the reference count to keep the * ops from being freed. */ if (linker_no_more_classes == 1) return (EPERM); kobj_class_compile((kobj_class_t) lc); ((kobj_class_t)lc)->refs++; /* XXX: kobj_mtx */ TAILQ_INSERT_TAIL(&classes, lc, link); return (0); } static void linker_file_sysinit(linker_file_t lf) { struct sysinit **start, **stop, **sipp, **xipp, *save; KLD_DPF(FILE, ("linker_file_sysinit: calling SYSINITs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysinit_set", &start, &stop, NULL) != 0) return; /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). * * Since some things care about execution order, this is the operation * which ensures continued function. */ for (sipp = start; sipp < stop; sipp++) { for (xipp = sipp + 1; xipp < stop; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip */ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ sx_xunlock(&kld_sx); mtx_lock(&Giant); for (sipp = start; sipp < stop; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s) */ /* Call function */ (*((*sipp)->func)) ((*sipp)->udata); } mtx_unlock(&Giant); sx_xlock(&kld_sx); } static void linker_file_sysuninit(linker_file_t lf) { struct sysinit **start, **stop, **sipp, **xipp, *save; KLD_DPF(FILE, ("linker_file_sysuninit: calling SYSUNINITs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysuninit_set", &start, &stop, NULL) != 0) return; /* * Perform a reverse bubble sort of the system initialization objects * by their subsystem (primary key) and order (secondary key). * * Since some things care about execution order, this is the operation * which ensures continued function. */ for (sipp = start; sipp < stop; sipp++) { for (xipp = sipp + 1; xipp < stop; xipp++) { if ((*sipp)->subsystem > (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order >= (*xipp)->order)) continue; /* skip */ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ sx_xunlock(&kld_sx); mtx_lock(&Giant); for (sipp = start; sipp < stop; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s) */ /* Call function */ (*((*sipp)->func)) ((*sipp)->udata); } mtx_unlock(&Giant); sx_xlock(&kld_sx); } static void linker_file_register_sysctls(linker_file_t lf, bool enable) { struct sysctl_oid **start, **stop, **oidp; KLD_DPF(FILE, ("linker_file_register_sysctls: registering SYSCTLs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) return; sx_xunlock(&kld_sx); sysctl_wlock(); for (oidp = start; oidp < stop; oidp++) { if (enable) sysctl_register_oid(*oidp); else sysctl_register_disabled_oid(*oidp); } sysctl_wunlock(); sx_xlock(&kld_sx); } static void linker_file_enable_sysctls(linker_file_t lf) { struct sysctl_oid **start, **stop, **oidp; KLD_DPF(FILE, ("linker_file_enable_sysctls: enable SYSCTLs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) return; sx_xunlock(&kld_sx); sysctl_wlock(); for (oidp = start; oidp < stop; oidp++) sysctl_enable_oid(*oidp); sysctl_wunlock(); sx_xlock(&kld_sx); } static void linker_file_unregister_sysctls(linker_file_t lf) { struct sysctl_oid **start, **stop, **oidp; KLD_DPF(FILE, ("linker_file_unregister_sysctls: unregistering SYSCTLs" " for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) return; sx_xunlock(&kld_sx); sysctl_wlock(); for (oidp = start; oidp < stop; oidp++) sysctl_unregister_oid(*oidp); sysctl_wunlock(); sx_xlock(&kld_sx); } static int linker_file_register_modules(linker_file_t lf) { struct mod_metadata **start, **stop, **mdp; const moduledata_t *moddata; int first_error, error; KLD_DPF(FILE, ("linker_file_register_modules: registering modules" " in %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, NULL) != 0) { /* * This fallback should be unnecessary, but if we get booted * from boot2 instead of loader and we are missing our * metadata then we have to try the best we can. */ if (lf == linker_kernel_file) { start = SET_BEGIN(modmetadata_set); stop = SET_LIMIT(modmetadata_set); } else return (0); } first_error = 0; for (mdp = start; mdp < stop; mdp++) { if ((*mdp)->md_type != MDT_MODULE) continue; moddata = (*mdp)->md_data; KLD_DPF(FILE, ("Registering module %s in %s\n", moddata->name, lf->filename)); error = module_register(moddata, lf); if (error) { printf("Module %s failed to register: %d\n", moddata->name, error); if (first_error == 0) first_error = error; } } return (first_error); } static void linker_init_kernel_modules(void) { sx_xlock(&kld_sx); linker_file_register_modules(linker_kernel_file); sx_xunlock(&kld_sx); } SYSINIT(linker_kernel, SI_SUB_KLD, SI_ORDER_ANY, linker_init_kernel_modules, NULL); static int linker_load_file(const char *filename, linker_file_t *result) { linker_class_t lc; linker_file_t lf; int foundfile, error, modules; /* Refuse to load modules if securelevel raised */ if (prison0.pr_securelevel > 0) return (EPERM); sx_assert(&kld_sx, SA_XLOCKED); lf = linker_find_file_by_name(filename); if (lf) { KLD_DPF(FILE, ("linker_load_file: file %s is already loaded," " incrementing refs\n", filename)); *result = lf; lf->refs++; return (0); } foundfile = 0; error = 0; /* * We do not need to protect (lock) classes here because there is * no class registration past startup (SI_SUB_KLD, SI_ORDER_ANY) * and there is no class deregistration mechanism at this time. */ TAILQ_FOREACH(lc, &classes, link) { KLD_DPF(FILE, ("linker_load_file: trying to load %s\n", filename)); error = LINKER_LOAD_FILE(lc, filename, &lf); /* * If we got something other than ENOENT, then it exists but * we cannot load it for some other reason. */ if (error != ENOENT) foundfile = 1; if (lf) { error = linker_file_register_modules(lf); if (error == EEXIST) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (error); } modules = !TAILQ_EMPTY(&lf->modules); linker_file_register_sysctls(lf, false); linker_file_sysinit(lf); lf->flags |= LINKER_FILE_LINKED; /* * If all of the modules in this file failed * to load, unload the file and return an * error of ENOEXEC. */ if (modules && TAILQ_EMPTY(&lf->modules)) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (ENOEXEC); } linker_file_enable_sysctls(lf); EVENTHANDLER_INVOKE(kld_load, lf); *result = lf; return (0); } } /* * Less than ideal, but tells the user whether it failed to load or * the module was not found. */ if (foundfile) { /* * If the file type has not been recognized by the last try * printout a message before to fail. */ if (error == ENOSYS) printf("%s: %s - unsupported file type\n", __func__, filename); /* * Format not recognized or otherwise unloadable. * When loading a module that is statically built into * the kernel EEXIST percolates back up as the return * value. Preserve this so that apps like sysinstall * can recognize this special case and not post bogus * dialog boxes. */ if (error != EEXIST) error = ENOEXEC; } else error = ENOENT; /* Nothing found */ return (error); } int linker_reference_module(const char *modname, struct mod_depend *verinfo, linker_file_t *result) { modlist_t mod; int error; sx_xlock(&kld_sx); if ((mod = modlist_lookup2(modname, verinfo)) != NULL) { *result = mod->container; (*result)->refs++; sx_xunlock(&kld_sx); return (0); } error = linker_load_module(NULL, modname, NULL, verinfo, result); sx_xunlock(&kld_sx); return (error); } int linker_release_module(const char *modname, struct mod_depend *verinfo, linker_file_t lf) { modlist_t mod; int error; sx_xlock(&kld_sx); if (lf == NULL) { KASSERT(modname != NULL, ("linker_release_module: no file or name")); mod = modlist_lookup2(modname, verinfo); if (mod == NULL) { sx_xunlock(&kld_sx); return (ESRCH); } lf = mod->container; } else KASSERT(modname == NULL && verinfo == NULL, ("linker_release_module: both file and name")); error = linker_file_unload(lf, LINKER_UNLOAD_NORMAL); sx_xunlock(&kld_sx); return (error); } static linker_file_t linker_find_file_by_name(const char *filename) { linker_file_t lf; char *koname; koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK); sprintf(koname, "%s.ko", filename); sx_assert(&kld_sx, SA_XLOCKED); TAILQ_FOREACH(lf, &linker_files, link) { if (strcmp(lf->filename, koname) == 0) break; if (strcmp(lf->filename, filename) == 0) break; } free(koname, M_LINKER); return (lf); } static linker_file_t linker_find_file_by_id(int fileid) { linker_file_t lf; sx_assert(&kld_sx, SA_XLOCKED); TAILQ_FOREACH(lf, &linker_files, link) if (lf->id == fileid && lf->flags & LINKER_FILE_LINKED) break; return (lf); } int linker_file_foreach(linker_predicate_t *predicate, void *context) { linker_file_t lf; int retval = 0; sx_xlock(&kld_sx); TAILQ_FOREACH(lf, &linker_files, link) { retval = predicate(lf, context); if (retval != 0) break; } sx_xunlock(&kld_sx); return (retval); } linker_file_t linker_make_file(const char *pathname, linker_class_t lc) { linker_file_t lf; const char *filename; if (!cold) sx_assert(&kld_sx, SA_XLOCKED); filename = linker_basename(pathname); KLD_DPF(FILE, ("linker_make_file: new file, filename='%s' for pathname='%s'\n", filename, pathname)); lf = (linker_file_t)kobj_create((kobj_class_t)lc, M_LINKER, M_WAITOK); if (lf == NULL) return (NULL); lf->ctors_addr = 0; lf->ctors_size = 0; lf->dtors_addr = 0; lf->dtors_size = 0; lf->refs = 1; lf->userrefs = 0; lf->flags = 0; lf->filename = strdup(filename, M_LINKER); lf->pathname = strdup(pathname, M_LINKER); LINKER_GET_NEXT_FILE_ID(lf->id); lf->ndeps = 0; lf->deps = NULL; lf->loadcnt = ++loadcnt; #ifdef __arm__ lf->exidx_addr = 0; lf->exidx_size = 0; #endif STAILQ_INIT(&lf->common); TAILQ_INIT(&lf->modules); TAILQ_INSERT_TAIL(&linker_files, lf, link); return (lf); } int linker_file_unload(linker_file_t file, int flags) { module_t mod, next; modlist_t ml, nextml; struct common_symbol *cp; int error, i; /* Refuse to unload modules if securelevel raised. */ if (prison0.pr_securelevel > 0) return (EPERM); sx_assert(&kld_sx, SA_XLOCKED); KLD_DPF(FILE, ("linker_file_unload: lf->refs=%d\n", file->refs)); /* Easy case of just dropping a reference. */ if (file->refs > 1) { file->refs--; return (0); } /* Give eventhandlers a chance to prevent the unload. */ error = 0; EVENTHANDLER_INVOKE(kld_unload_try, file, &error); if (error != 0) return (EBUSY); KLD_DPF(FILE, ("linker_file_unload: file is unloading," " informing modules\n")); /* * Quiesce all the modules to give them a chance to veto the unload. */ MOD_SLOCK; for (mod = TAILQ_FIRST(&file->modules); mod; mod = module_getfnext(mod)) { error = module_quiesce(mod); if (error != 0 && flags != LINKER_UNLOAD_FORCE) { KLD_DPF(FILE, ("linker_file_unload: module %s" " vetoed unload\n", module_getname(mod))); /* * XXX: Do we need to tell all the quiesced modules * that they can resume work now via a new module * event? */ MOD_SUNLOCK; return (error); } } MOD_SUNLOCK; /* * Inform any modules associated with this file that they are * being unloaded. */ MOD_XLOCK; for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) { next = module_getfnext(mod); MOD_XUNLOCK; /* * Give the module a chance to veto the unload. */ if ((error = module_unload(mod)) != 0) { #ifdef KLD_DEBUG MOD_SLOCK; KLD_DPF(FILE, ("linker_file_unload: module %s" " failed unload\n", module_getname(mod))); MOD_SUNLOCK; #endif return (error); } MOD_XLOCK; module_release(mod); } MOD_XUNLOCK; TAILQ_FOREACH_SAFE(ml, &found_modules, link, nextml) { if (ml->container == file) { TAILQ_REMOVE(&found_modules, ml, link); free(ml, M_LINKER); } } /* * Don't try to run SYSUNINITs if we are unloaded due to a * link error. */ if (file->flags & LINKER_FILE_LINKED) { file->flags &= ~LINKER_FILE_LINKED; linker_file_unregister_sysctls(file); linker_file_sysuninit(file); } TAILQ_REMOVE(&linker_files, file, link); if (file->deps) { for (i = 0; i < file->ndeps; i++) linker_file_unload(file->deps[i], flags); free(file->deps, M_LINKER); file->deps = NULL; } while ((cp = STAILQ_FIRST(&file->common)) != NULL) { STAILQ_REMOVE_HEAD(&file->common, link); free(cp, M_LINKER); } LINKER_UNLOAD(file); EVENTHANDLER_INVOKE(kld_unload, file->filename, file->address, file->size); if (file->filename) { free(file->filename, M_LINKER); file->filename = NULL; } if (file->pathname) { free(file->pathname, M_LINKER); file->pathname = NULL; } kobj_delete((kobj_t) file, M_LINKER); return (0); } int linker_ctf_get(linker_file_t file, linker_ctf_t *lc) { return (LINKER_CTF_GET(file, lc)); } static int linker_file_add_dependency(linker_file_t file, linker_file_t dep) { linker_file_t *newdeps; sx_assert(&kld_sx, SA_XLOCKED); file->deps = realloc(file->deps, (file->ndeps + 1) * sizeof(*newdeps), M_LINKER, M_WAITOK | M_ZERO); file->deps[file->ndeps] = dep; file->ndeps++; KLD_DPF(FILE, ("linker_file_add_dependency:" " adding %s as dependency for %s\n", dep->filename, file->filename)); return (0); } /* * Locate a linker set and its contents. This is a helper function to avoid * linker_if.h exposure elsewhere. Note: firstp and lastp are really void **. * This function is used in this file so we can avoid having lots of (void **) * casts. */ int linker_file_lookup_set(linker_file_t file, const char *name, void *firstp, void *lastp, int *countp) { sx_assert(&kld_sx, SA_LOCKED); return (LINKER_LOOKUP_SET(file, name, firstp, lastp, countp)); } /* * List all functions in a file. */ int linker_file_function_listall(linker_file_t lf, linker_function_nameval_callback_t callback_func, void *arg) { return (LINKER_EACH_FUNCTION_NAMEVAL(lf, callback_func, arg)); } caddr_t linker_file_lookup_symbol(linker_file_t file, const char *name, int deps) { caddr_t sym; int locked; locked = sx_xlocked(&kld_sx); if (!locked) sx_xlock(&kld_sx); sym = linker_file_lookup_symbol_internal(file, name, deps); if (!locked) sx_xunlock(&kld_sx); return (sym); } static caddr_t linker_file_lookup_symbol_internal(linker_file_t file, const char *name, int deps) { c_linker_sym_t sym; linker_symval_t symval; caddr_t address; size_t common_size = 0; int i; sx_assert(&kld_sx, SA_XLOCKED); KLD_DPF(SYM, ("linker_file_lookup_symbol: file=%p, name=%s, deps=%d\n", file, name, deps)); if (LINKER_LOOKUP_SYMBOL(file, name, &sym) == 0) { LINKER_SYMBOL_VALUES(file, sym, &symval); if (symval.value == 0) /* * For commons, first look them up in the * dependencies and only allocate space if not found * there. */ common_size = symval.size; else { KLD_DPF(SYM, ("linker_file_lookup_symbol: symbol" ".value=%p\n", symval.value)); return (symval.value); } } if (deps) { for (i = 0; i < file->ndeps; i++) { address = linker_file_lookup_symbol_internal( file->deps[i], name, 0); if (address) { KLD_DPF(SYM, ("linker_file_lookup_symbol:" " deps value=%p\n", address)); return (address); } } } if (common_size > 0) { /* * This is a common symbol which was not found in the * dependencies. We maintain a simple common symbol table in * the file object. */ struct common_symbol *cp; STAILQ_FOREACH(cp, &file->common, link) { if (strcmp(cp->name, name) == 0) { KLD_DPF(SYM, ("linker_file_lookup_symbol:" " old common value=%p\n", cp->address)); return (cp->address); } } /* * Round the symbol size up to align. */ common_size = (common_size + sizeof(int) - 1) & -sizeof(int); cp = malloc(sizeof(struct common_symbol) + common_size + strlen(name) + 1, M_LINKER, M_WAITOK | M_ZERO); cp->address = (caddr_t)(cp + 1); cp->name = cp->address + common_size; strcpy(cp->name, name); bzero(cp->address, common_size); STAILQ_INSERT_TAIL(&file->common, cp, link); KLD_DPF(SYM, ("linker_file_lookup_symbol: new common" " value=%p\n", cp->address)); return (cp->address); } KLD_DPF(SYM, ("linker_file_lookup_symbol: fail\n")); return (0); } /* * Both DDB and stack(9) rely on the kernel linker to provide forward and * backward lookup of symbols. However, DDB and sometimes stack(9) need to * do this in a lockfree manner. We provide a set of internal helper * routines to perform these operations without locks, and then wrappers that * optionally lock. * * linker_debug_lookup() is ifdef DDB as currently it's only used by DDB. */ #ifdef DDB static int linker_debug_lookup(const char *symstr, c_linker_sym_t *sym) { linker_file_t lf; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_LOOKUP_DEBUG_SYMBOL(lf, symstr, sym) == 0) return (0); } return (ENOENT); } #endif static int linker_debug_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp) { linker_file_t lf; c_linker_sym_t best, es; u_long diff, bestdiff, off; best = 0; off = (uintptr_t)value; bestdiff = off; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_SEARCH_SYMBOL(lf, value, &es, &diff) != 0) continue; if (es != 0 && diff < bestdiff) { best = es; bestdiff = diff; } if (bestdiff == 0) break; } if (best) { *sym = best; *diffp = bestdiff; return (0); } else { *sym = 0; *diffp = off; return (ENOENT); } } static int linker_debug_symbol_values(c_linker_sym_t sym, linker_symval_t *symval) { linker_file_t lf; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_DEBUG_SYMBOL_VALUES(lf, sym, symval) == 0) return (0); } return (ENOENT); } static int linker_debug_search_symbol_name(caddr_t value, char *buf, u_int buflen, long *offset) { linker_symval_t symval; c_linker_sym_t sym; int error; *offset = 0; error = linker_debug_search_symbol(value, &sym, offset); if (error) return (error); error = linker_debug_symbol_values(sym, &symval); if (error) return (error); strlcpy(buf, symval.name, buflen); return (0); } /* * DDB Helpers. DDB has to look across multiple files with their own symbol * tables and string tables. * * Note that we do not obey list locking protocols here. We really don't need * DDB to hang because somebody's got the lock held. We'll take the chance * that the files list is inconsistent instead. */ #ifdef DDB int linker_ddb_lookup(const char *symstr, c_linker_sym_t *sym) { return (linker_debug_lookup(symstr, sym)); } #endif int linker_ddb_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp) { return (linker_debug_search_symbol(value, sym, diffp)); } int linker_ddb_symbol_values(c_linker_sym_t sym, linker_symval_t *symval) { return (linker_debug_symbol_values(sym, symval)); } int linker_ddb_search_symbol_name(caddr_t value, char *buf, u_int buflen, long *offset) { return (linker_debug_search_symbol_name(value, buf, buflen, offset)); } /* * stack(9) helper for non-debugging environemnts. Unlike DDB helpers, we do * obey locking protocols, and offer a significantly less complex interface. */ int linker_search_symbol_name_flags(caddr_t value, char *buf, u_int buflen, long *offset, int flags) { int error; KASSERT((flags & (M_NOWAIT | M_WAITOK)) != 0 && (flags & (M_NOWAIT | M_WAITOK)) != (M_NOWAIT | M_WAITOK), ("%s: bad flags: 0x%x", __func__, flags)); if (flags & M_NOWAIT) { if (!sx_try_slock(&kld_sx)) return (EWOULDBLOCK); } else sx_slock(&kld_sx); error = linker_debug_search_symbol_name(value, buf, buflen, offset); sx_sunlock(&kld_sx); return (error); } int linker_search_symbol_name(caddr_t value, char *buf, u_int buflen, long *offset) { return (linker_search_symbol_name_flags(value, buf, buflen, offset, M_WAITOK)); } int linker_kldload_busy(int flags) { int error; MPASS((flags & ~(LINKER_UB_UNLOCK | LINKER_UB_LOCKED | LINKER_UB_PCATCH)) == 0); if ((flags & LINKER_UB_LOCKED) != 0) sx_assert(&kld_sx, SA_XLOCKED); if ((flags & LINKER_UB_LOCKED) == 0) sx_xlock(&kld_sx); while (kld_busy > 0) { if (kld_busy_owner == curthread) break; error = sx_sleep(&kld_busy, &kld_sx, (flags & LINKER_UB_PCATCH) != 0 ? PCATCH : 0, "kldbusy", 0); if (error != 0) { if ((flags & LINKER_UB_UNLOCK) != 0) sx_xunlock(&kld_sx); return (error); } } kld_busy++; kld_busy_owner = curthread; if ((flags & LINKER_UB_UNLOCK) != 0) sx_xunlock(&kld_sx); return (0); } void linker_kldload_unbusy(int flags) { MPASS((flags & ~LINKER_UB_LOCKED) == 0); if ((flags & LINKER_UB_LOCKED) != 0) sx_assert(&kld_sx, SA_XLOCKED); if ((flags & LINKER_UB_LOCKED) == 0) sx_xlock(&kld_sx); MPASS(kld_busy > 0); if (kld_busy_owner != curthread) panic("linker_kldload_unbusy done by not owning thread %p", kld_busy_owner); kld_busy--; if (kld_busy == 0) { kld_busy_owner = NULL; wakeup(&kld_busy); } sx_xunlock(&kld_sx); } /* * Syscalls. */ int kern_kldload(struct thread *td, const char *file, int *fileid) { const char *kldname, *modname; linker_file_t lf; int error; if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); if ((error = priv_check(td, PRIV_KLD_LOAD)) != 0) return (error); /* * If file does not contain a qualified name or any dot in it * (kldname.ko, or kldname.ver.ko) treat it as an interface * name. */ if (strchr(file, '/') || strchr(file, '.')) { kldname = file; modname = NULL; } else { kldname = NULL; modname = file; } error = linker_kldload_busy(LINKER_UB_PCATCH); if (error != 0) { sx_xunlock(&kld_sx); return (error); } /* * It is possible that kldloaded module will attach a new ifnet, * so vnet context must be set when this ocurs. */ CURVNET_SET(TD_TO_VNET(td)); error = linker_load_module(kldname, modname, NULL, NULL, &lf); CURVNET_RESTORE(); if (error == 0) { lf->userrefs++; if (fileid != NULL) *fileid = lf->id; } linker_kldload_unbusy(LINKER_UB_LOCKED); return (error); } int sys_kldload(struct thread *td, struct kldload_args *uap) { char *pathname = NULL; int error, fileid; td->td_retval[0] = -1; pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL); if (error == 0) { error = kern_kldload(td, pathname, &fileid); if (error == 0) td->td_retval[0] = fileid; } free(pathname, M_TEMP); return (error); } int kern_kldunload(struct thread *td, int fileid, int flags) { linker_file_t lf; int error = 0; if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); if ((error = priv_check(td, PRIV_KLD_UNLOAD)) != 0) return (error); error = linker_kldload_busy(LINKER_UB_PCATCH); if (error != 0) { sx_xunlock(&kld_sx); return (error); } CURVNET_SET(TD_TO_VNET(td)); lf = linker_find_file_by_id(fileid); if (lf) { KLD_DPF(FILE, ("kldunload: lf->userrefs=%d\n", lf->userrefs)); if (lf->userrefs == 0) { /* * XXX: maybe LINKER_UNLOAD_FORCE should override ? */ printf("kldunload: attempt to unload file that was" " loaded by the kernel\n"); error = EBUSY; } else { lf->userrefs--; error = linker_file_unload(lf, flags); if (error) lf->userrefs++; } } else error = ENOENT; CURVNET_RESTORE(); linker_kldload_unbusy(LINKER_UB_LOCKED); return (error); } int sys_kldunload(struct thread *td, struct kldunload_args *uap) { return (kern_kldunload(td, uap->fileid, LINKER_UNLOAD_NORMAL)); } int sys_kldunloadf(struct thread *td, struct kldunloadf_args *uap) { if (uap->flags != LINKER_UNLOAD_NORMAL && uap->flags != LINKER_UNLOAD_FORCE) return (EINVAL); return (kern_kldunload(td, uap->fileid, uap->flags)); } int sys_kldfind(struct thread *td, struct kldfind_args *uap) { char *pathname; const char *filename; linker_file_t lf; int error; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif td->td_retval[0] = -1; pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); if ((error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL)) != 0) goto out; filename = linker_basename(pathname); sx_xlock(&kld_sx); lf = linker_find_file_by_name(filename); if (lf) td->td_retval[0] = lf->id; else error = ENOENT; sx_xunlock(&kld_sx); out: free(pathname, M_TEMP); return (error); } int sys_kldnext(struct thread *td, struct kldnext_args *uap) { linker_file_t lf; int error = 0; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif sx_xlock(&kld_sx); if (uap->fileid == 0) lf = TAILQ_FIRST(&linker_files); else { lf = linker_find_file_by_id(uap->fileid); if (lf == NULL) { error = ENOENT; goto out; } lf = TAILQ_NEXT(lf, link); } /* Skip partially loaded files. */ while (lf != NULL && !(lf->flags & LINKER_FILE_LINKED)) lf = TAILQ_NEXT(lf, link); if (lf) td->td_retval[0] = lf->id; else td->td_retval[0] = 0; out: sx_xunlock(&kld_sx); return (error); } int sys_kldstat(struct thread *td, struct kldstat_args *uap) { struct kld_file_stat *stat; int error, version; /* * Check the version of the user's structure. */ if ((error = copyin(&uap->stat->version, &version, sizeof(version))) != 0) return (error); if (version != sizeof(struct kld_file_stat_1) && version != sizeof(struct kld_file_stat)) return (EINVAL); stat = malloc(sizeof(*stat), M_TEMP, M_WAITOK | M_ZERO); error = kern_kldstat(td, uap->fileid, stat); if (error == 0) error = copyout(stat, uap->stat, version); free(stat, M_TEMP); return (error); } int kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat) { linker_file_t lf; int namelen; #ifdef MAC int error; error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif sx_xlock(&kld_sx); lf = linker_find_file_by_id(fileid); if (lf == NULL) { sx_xunlock(&kld_sx); return (ENOENT); } /* Version 1 fields: */ namelen = strlen(lf->filename) + 1; if (namelen > sizeof(stat->name)) namelen = sizeof(stat->name); bcopy(lf->filename, &stat->name[0], namelen); stat->refs = lf->refs; stat->id = lf->id; stat->address = lf->address; stat->size = lf->size; /* Version 2 fields: */ namelen = strlen(lf->pathname) + 1; if (namelen > sizeof(stat->pathname)) namelen = sizeof(stat->pathname); bcopy(lf->pathname, &stat->pathname[0], namelen); sx_xunlock(&kld_sx); td->td_retval[0] = 0; return (0); } #ifdef DDB DB_COMMAND(kldstat, db_kldstat) { linker_file_t lf; #define POINTER_WIDTH ((int)(sizeof(void *) * 2 + 2)) db_printf("Id Refs Address%*c Size Name\n", POINTER_WIDTH - 7, ' '); #undef POINTER_WIDTH TAILQ_FOREACH(lf, &linker_files, link) { if (db_pager_quit) return; db_printf("%2d %4d %p %-8zx %s\n", lf->id, lf->refs, lf->address, lf->size, lf->filename); } } #endif /* DDB */ int sys_kldfirstmod(struct thread *td, struct kldfirstmod_args *uap) { linker_file_t lf; module_t mp; int error = 0; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif sx_xlock(&kld_sx); lf = linker_find_file_by_id(uap->fileid); if (lf) { MOD_SLOCK; mp = TAILQ_FIRST(&lf->modules); if (mp != NULL) td->td_retval[0] = module_getid(mp); else td->td_retval[0] = 0; MOD_SUNLOCK; } else error = ENOENT; sx_xunlock(&kld_sx); return (error); } int sys_kldsym(struct thread *td, struct kldsym_args *uap) { char *symstr = NULL; c_linker_sym_t sym; linker_symval_t symval; linker_file_t lf; struct kld_sym_lookup lookup; int error = 0; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif if ((error = copyin(uap->data, &lookup, sizeof(lookup))) != 0) return (error); if (lookup.version != sizeof(lookup) || uap->cmd != KLDSYM_LOOKUP) return (EINVAL); symstr = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); if ((error = copyinstr(lookup.symname, symstr, MAXPATHLEN, NULL)) != 0) goto out; sx_xlock(&kld_sx); if (uap->fileid != 0) { lf = linker_find_file_by_id(uap->fileid); if (lf == NULL) error = ENOENT; else if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 && LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) { lookup.symvalue = (uintptr_t) symval.value; lookup.symsize = symval.size; error = copyout(&lookup, uap->data, sizeof(lookup)); } else error = ENOENT; } else { TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 && LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) { lookup.symvalue = (uintptr_t)symval.value; lookup.symsize = symval.size; error = copyout(&lookup, uap->data, sizeof(lookup)); break; } } if (lf == NULL) error = ENOENT; } sx_xunlock(&kld_sx); out: free(symstr, M_TEMP); return (error); } /* * Preloaded module support */ static modlist_t modlist_lookup(const char *name, int ver) { modlist_t mod; TAILQ_FOREACH(mod, &found_modules, link) { if (strcmp(mod->name, name) == 0 && (ver == 0 || mod->version == ver)) return (mod); } return (NULL); } static modlist_t modlist_lookup2(const char *name, const struct mod_depend *verinfo) { modlist_t mod, bestmod; int ver; if (verinfo == NULL) return (modlist_lookup(name, 0)); bestmod = NULL; TAILQ_FOREACH(mod, &found_modules, link) { if (strcmp(mod->name, name) != 0) continue; ver = mod->version; if (ver == verinfo->md_ver_preferred) return (mod); if (ver >= verinfo->md_ver_minimum && ver <= verinfo->md_ver_maximum && (bestmod == NULL || ver > bestmod->version)) bestmod = mod; } return (bestmod); } static modlist_t modlist_newmodule(const char *modname, int version, linker_file_t container) { modlist_t mod; mod = malloc(sizeof(struct modlist), M_LINKER, M_NOWAIT | M_ZERO); if (mod == NULL) panic("no memory for module list"); mod->container = container; mod->name = modname; mod->version = version; TAILQ_INSERT_TAIL(&found_modules, mod, link); return (mod); } static void linker_addmodules(linker_file_t lf, struct mod_metadata **start, struct mod_metadata **stop, int preload) { struct mod_metadata *mp, **mdp; const char *modname; int ver; for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; ver = ((const struct mod_version *)mp->md_data)->mv_version; if (modlist_lookup(modname, ver) != NULL) { printf("module %s already present!\n", modname); /* XXX what can we do? this is a build error. :-( */ continue; } modlist_newmodule(modname, ver, lf); } } static void linker_preload(void *arg) { caddr_t modptr; const char *modname, *nmodname; char *modtype; linker_file_t lf, nlf; linker_class_t lc; int error; linker_file_list_t loaded_files; linker_file_list_t depended_files; struct mod_metadata *mp, *nmp; struct mod_metadata **start, **stop, **mdp, **nmdp; const struct mod_depend *verinfo; int nver; int resolves; modlist_t mod; struct sysinit **si_start, **si_stop; TAILQ_INIT(&loaded_files); TAILQ_INIT(&depended_files); TAILQ_INIT(&found_modules); error = 0; modptr = NULL; sx_xlock(&kld_sx); while ((modptr = preload_search_next_name(modptr)) != NULL) { modname = (char *)preload_search_info(modptr, MODINFO_NAME); modtype = (char *)preload_search_info(modptr, MODINFO_TYPE); if (modname == NULL) { printf("Preloaded module at %p does not have a" " name!\n", modptr); continue; } if (modtype == NULL) { printf("Preloaded module at %p does not have a type!\n", modptr); continue; } if (bootverbose) printf("Preloaded %s \"%s\" at %p.\n", modtype, modname, modptr); lf = NULL; TAILQ_FOREACH(lc, &classes, link) { error = LINKER_LINK_PRELOAD(lc, modname, &lf); if (!error) break; lf = NULL; } if (lf) TAILQ_INSERT_TAIL(&loaded_files, lf, loaded); } /* * First get a list of stuff in the kernel. */ if (linker_file_lookup_set(linker_kernel_file, MDT_SETNAME, &start, &stop, NULL) == 0) linker_addmodules(linker_kernel_file, start, stop, 1); /* * This is a once-off kinky bubble sort to resolve relocation * dependency requirements. */ restart: TAILQ_FOREACH(lf, &loaded_files, loaded) { error = linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, NULL); /* * First, look to see if we would successfully link with this * stuff. */ resolves = 1; /* unless we know otherwise */ if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; for (nmdp = start; nmdp < stop; nmdp++) { nmp = *nmdp; if (nmp->md_type != MDT_VERSION) continue; nmodname = nmp->md_cval; if (strcmp(modname, nmodname) == 0) break; } if (nmdp < stop) /* it's a self reference */ continue; /* * ok, the module isn't here yet, we * are not finished */ if (modlist_lookup2(modname, verinfo) == NULL) resolves = 0; } } /* * OK, if we found our modules, we can link. So, "provide" * the modules inside and add it to the end of the link order * list. */ if (resolves) { if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; nver = ((const struct mod_version *) mp->md_data)->mv_version; if (modlist_lookup(modname, nver) != NULL) { printf("module %s already" " present!\n", modname); TAILQ_REMOVE(&loaded_files, lf, loaded); linker_file_unload(lf, LINKER_UNLOAD_FORCE); /* we changed tailq next ptr */ goto restart; } modlist_newmodule(modname, nver, lf); } } TAILQ_REMOVE(&loaded_files, lf, loaded); TAILQ_INSERT_TAIL(&depended_files, lf, loaded); /* * Since we provided modules, we need to restart the * sort so that the previous files that depend on us * have a chance. Also, we've busted the tailq next * pointer with the REMOVE. */ goto restart; } } /* * At this point, we check to see what could not be resolved.. */ while ((lf = TAILQ_FIRST(&loaded_files)) != NULL) { TAILQ_REMOVE(&loaded_files, lf, loaded); printf("KLD file %s is missing dependencies\n", lf->filename); linker_file_unload(lf, LINKER_UNLOAD_FORCE); } /* * We made it. Finish off the linking in the order we determined. */ TAILQ_FOREACH_SAFE(lf, &depended_files, loaded, nlf) { if (linker_kernel_file) { linker_kernel_file->refs++; error = linker_file_add_dependency(lf, linker_kernel_file); if (error) panic("cannot add dependency"); } error = linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, NULL); if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; mod = modlist_lookup2(modname, verinfo); if (mod == NULL) { printf("KLD file %s - cannot find " "dependency \"%s\"\n", lf->filename, modname); goto fail; } /* Don't count self-dependencies */ if (lf == mod->container) continue; mod->container->refs++; error = linker_file_add_dependency(lf, mod->container); if (error) panic("cannot add dependency"); } } /* * Now do relocation etc using the symbol search paths * established by the dependencies */ error = LINKER_LINK_PRELOAD_FINISH(lf); if (error) { printf("KLD file %s - could not finalize loading\n", lf->filename); goto fail; } linker_file_register_modules(lf); if (!TAILQ_EMPTY(&lf->modules)) lf->flags |= LINKER_FILE_MODULES; if (linker_file_lookup_set(lf, "sysinit_set", &si_start, &si_stop, NULL) == 0) sysinit_add(si_start, si_stop); linker_file_register_sysctls(lf, true); lf->flags |= LINKER_FILE_LINKED; continue; fail: TAILQ_REMOVE(&depended_files, lf, loaded); linker_file_unload(lf, LINKER_UNLOAD_FORCE); } sx_xunlock(&kld_sx); /* woohoo! we made it! */ } SYSINIT(preload, SI_SUB_KLD, SI_ORDER_MIDDLE, linker_preload, NULL); /* * Handle preload files that failed to load any modules. */ static void linker_preload_finish(void *arg) { linker_file_t lf, nlf; sx_xlock(&kld_sx); TAILQ_FOREACH_SAFE(lf, &linker_files, link, nlf) { /* * If all of the modules in this file failed to load, unload * the file and return an error of ENOEXEC. (Parity with * linker_load_file.) */ if ((lf->flags & LINKER_FILE_MODULES) != 0 && TAILQ_EMPTY(&lf->modules)) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); continue; } lf->flags &= ~LINKER_FILE_MODULES; lf->userrefs++; /* so we can (try to) kldunload it */ } sx_xunlock(&kld_sx); } /* * Attempt to run after all DECLARE_MODULE SYSINITs. Unfortunately they can be * scheduled at any subsystem and order, so run this as late as possible. init * becomes runnable in SI_SUB_KTHREAD_INIT, so go slightly before that. */ SYSINIT(preload_finish, SI_SUB_KTHREAD_INIT - 100, SI_ORDER_MIDDLE, linker_preload_finish, NULL); /* * Search for a not-loaded module by name. * * Modules may be found in the following locations: * * - preloaded (result is just the module name) - on disk (result is full path * to module) * * If the module name is qualified in any way (contains path, etc.) the we * simply return a copy of it. * * The search path can be manipulated via sysctl. Note that we use the ';' * character as a separator to be consistent with the bootloader. */ static char linker_hintfile[] = "linker.hints"; static char linker_path[MAXPATHLEN] = "/boot/kernel;/boot/modules"; SYSCTL_STRING(_kern, OID_AUTO, module_path, CTLFLAG_RWTUN, linker_path, sizeof(linker_path), "module load search path"); TUNABLE_STR("module_path", linker_path, sizeof(linker_path)); static const char * const linker_ext_list[] = { "", ".ko", NULL }; /* * Check if file actually exists either with or without extension listed in * the linker_ext_list. (probably should be generic for the rest of the * kernel) */ static char * linker_lookup_file(const char *path, int pathlen, const char *name, int namelen, struct vattr *vap) { struct nameidata nd; struct thread *td = curthread; /* XXX */ const char * const *cpp, *sep; char *result; int error, len, extlen, reclen, flags; enum vtype type; extlen = 0; for (cpp = linker_ext_list; *cpp; cpp++) { len = strlen(*cpp); if (len > extlen) extlen = len; } extlen++; /* trailing '\0' */ sep = (path[pathlen - 1] != '/') ? "/" : ""; reclen = pathlen + strlen(sep) + namelen + extlen + 1; result = malloc(reclen, M_LINKER, M_WAITOK); for (cpp = linker_ext_list; *cpp; cpp++) { snprintf(result, reclen, "%.*s%s%.*s%s", pathlen, path, sep, namelen, name, *cpp); /* * Attempt to open the file, and return the path if * we succeed and it's a regular file. */ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, result, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error == 0) { NDFREE(&nd, NDF_ONLY_PNBUF); type = nd.ni_vp->v_type; if (vap) VOP_GETATTR(nd.ni_vp, vap, td->td_ucred); VOP_UNLOCK(nd.ni_vp); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); if (type == VREG) return (result); } } free(result, M_LINKER); return (NULL); } #define INT_ALIGN(base, ptr) ptr = \ (base) + roundup2((ptr) - (base), sizeof(int)) /* * Lookup KLD which contains requested module in the "linker.hints" file. If * version specification is available, then try to find the best KLD. * Otherwise just find the latest one. */ static char * linker_hints_lookup(const char *path, int pathlen, const char *modname, int modnamelen, const struct mod_depend *verinfo) { struct thread *td = curthread; /* XXX */ struct ucred *cred = td ? td->td_ucred : NULL; struct nameidata nd; struct vattr vattr, mattr; const char *best, *sep; u_char *hints = NULL; u_char *cp, *recptr, *bufend, *result, *pathbuf; int error, ival, bestver, *intp, found, flags, clen, blen; ssize_t reclen; result = NULL; bestver = found = 0; sep = (path[pathlen - 1] != '/') ? "/" : ""; reclen = imax(modnamelen, strlen(linker_hintfile)) + pathlen + strlen(sep) + 1; pathbuf = malloc(reclen, M_LINKER, M_WAITOK); snprintf(pathbuf, reclen, "%.*s%s%s", pathlen, path, sep, linker_hintfile); NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, pathbuf, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) goto bad; NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp->v_type != VREG) goto bad; best = cp = NULL; error = VOP_GETATTR(nd.ni_vp, &vattr, cred); if (error) goto bad; /* * XXX: we need to limit this number to some reasonable value */ if (vattr.va_size > LINKER_HINTS_MAX) { printf("linker.hints file too large %ld\n", (long)vattr.va_size); goto bad; } hints = malloc(vattr.va_size, M_TEMP, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)hints, vattr.va_size, 0, UIO_SYSSPACE, IO_NODELOCKED, cred, NOCRED, &reclen, td); if (error) goto bad; VOP_UNLOCK(nd.ni_vp); vn_close(nd.ni_vp, FREAD, cred, td); nd.ni_vp = NULL; if (reclen != 0) { printf("can't read %zd\n", reclen); goto bad; } intp = (int *)hints; ival = *intp++; if (ival != LINKER_HINTS_VERSION) { printf("linker.hints file version mismatch %d\n", ival); goto bad; } bufend = hints + vattr.va_size; recptr = (u_char *)intp; clen = blen = 0; while (recptr < bufend && !found) { intp = (int *)recptr; reclen = *intp++; ival = *intp++; cp = (char *)intp; switch (ival) { case MDT_VERSION: clen = *cp++; if (clen != modnamelen || bcmp(cp, modname, clen) != 0) break; cp += clen; INT_ALIGN(hints, cp); ival = *(int *)cp; cp += sizeof(int); clen = *cp++; if (verinfo == NULL || ival == verinfo->md_ver_preferred) { found = 1; break; } if (ival >= verinfo->md_ver_minimum && ival <= verinfo->md_ver_maximum && ival > bestver) { bestver = ival; best = cp; blen = clen; } break; default: break; } recptr += reclen + sizeof(int); } /* * Finally check if KLD is in the place */ if (found) result = linker_lookup_file(path, pathlen, cp, clen, &mattr); else if (best) result = linker_lookup_file(path, pathlen, best, blen, &mattr); /* * KLD is newer than hints file. What we should do now? */ if (result && timespeccmp(&mattr.va_mtime, &vattr.va_mtime, >)) printf("warning: KLD '%s' is newer than the linker.hints" " file\n", result); bad: free(pathbuf, M_LINKER); if (hints) free(hints, M_TEMP); if (nd.ni_vp != NULL) { VOP_UNLOCK(nd.ni_vp); vn_close(nd.ni_vp, FREAD, cred, td); } /* * If nothing found or hints is absent - fallback to the old * way by using "kldname[.ko]" as module name. */ if (!found && !bestver && result == NULL) result = linker_lookup_file(path, pathlen, modname, modnamelen, NULL); return (result); } /* * Lookup KLD which contains requested module in the all directories. */ static char * linker_search_module(const char *modname, int modnamelen, const struct mod_depend *verinfo) { char *cp, *ep, *result; /* * traverse the linker path */ for (cp = linker_path; *cp; cp = ep + 1) { /* find the end of this component */ for (ep = cp; (*ep != 0) && (*ep != ';'); ep++); result = linker_hints_lookup(cp, ep - cp, modname, modnamelen, verinfo); if (result != NULL) return (result); if (*ep == 0) break; } return (NULL); } /* * Search for module in all directories listed in the linker_path. */ static char * linker_search_kld(const char *name) { char *cp, *ep, *result; int len; /* qualified at all? */ if (strchr(name, '/')) return (strdup(name, M_LINKER)); /* traverse the linker path */ len = strlen(name); for (ep = linker_path; *ep; ep++) { cp = ep; /* find the end of this component */ for (; *ep != 0 && *ep != ';'; ep++); result = linker_lookup_file(cp, ep - cp, name, len, NULL); if (result != NULL) return (result); } return (NULL); } static const char * linker_basename(const char *path) { const char *filename; filename = strrchr(path, '/'); if (filename == NULL) return path; if (filename[1]) filename++; return (filename); } #ifdef HWPMC_HOOKS /* * Inform hwpmc about the set of kernel modules currently loaded. */ void * linker_hwpmc_list_objects(void) { linker_file_t lf; struct pmckern_map_in *kobase; int i, nmappings; nmappings = 0; sx_slock(&kld_sx); TAILQ_FOREACH(lf, &linker_files, link) nmappings++; /* Allocate nmappings + 1 entries. */ kobase = malloc((nmappings + 1) * sizeof(struct pmckern_map_in), M_LINKER, M_WAITOK | M_ZERO); i = 0; TAILQ_FOREACH(lf, &linker_files, link) { /* Save the info for this linker file. */ kobase[i].pm_file = lf->filename; kobase[i].pm_address = (uintptr_t)lf->address; i++; } sx_sunlock(&kld_sx); KASSERT(i > 0, ("linker_hpwmc_list_objects: no kernel objects?")); /* The last entry of the malloced area comprises of all zeros. */ KASSERT(kobase[i].pm_file == NULL, ("linker_hwpmc_list_objects: last object not NULL")); return ((void *)kobase); } #endif /* check if root file system is not mounted */ static bool linker_root_mounted(void) { struct pwd *pwd; bool ret; if (rootvnode == NULL) return (false); pwd = pwd_hold(curthread); ret = pwd->pwd_rdir != NULL; pwd_drop(pwd); return (ret); } /* * Find a file which contains given module and load it, if "parent" is not * NULL, register a reference to it. */ static int linker_load_module(const char *kldname, const char *modname, struct linker_file *parent, const struct mod_depend *verinfo, struct linker_file **lfpp) { linker_file_t lfdep; const char *filename; char *pathname; int error; sx_assert(&kld_sx, SA_XLOCKED); if (modname == NULL) { /* * We have to load KLD */ KASSERT(verinfo == NULL, ("linker_load_module: verinfo" " is not NULL")); if (!linker_root_mounted()) return (ENXIO); pathname = linker_search_kld(kldname); } else { if (modlist_lookup2(modname, verinfo) != NULL) return (EEXIST); if (!linker_root_mounted()) return (ENXIO); if (kldname != NULL) pathname = strdup(kldname, M_LINKER); else /* * Need to find a KLD with required module */ pathname = linker_search_module(modname, strlen(modname), verinfo); } if (pathname == NULL) return (ENOENT); /* * Can't load more than one file with the same basename XXX: * Actually it should be possible to have multiple KLDs with * the same basename but different path because they can * provide different versions of the same modules. */ filename = linker_basename(pathname); if (linker_find_file_by_name(filename)) error = EEXIST; else do { error = linker_load_file(pathname, &lfdep); if (error) break; if (modname && verinfo && modlist_lookup2(modname, verinfo) == NULL) { linker_file_unload(lfdep, LINKER_UNLOAD_FORCE); error = ENOENT; break; } if (parent) { error = linker_file_add_dependency(parent, lfdep); if (error) break; } if (lfpp) *lfpp = lfdep; } while (0); free(pathname, M_LINKER); return (error); } /* * This routine is responsible for finding dependencies of userland initiated * kldload(2)'s of files. */ int linker_load_dependencies(linker_file_t lf) { linker_file_t lfdep; struct mod_metadata **start, **stop, **mdp, **nmdp; struct mod_metadata *mp, *nmp; const struct mod_depend *verinfo; modlist_t mod; const char *modname, *nmodname; int ver, error = 0; /* * All files are dependent on /kernel. */ sx_assert(&kld_sx, SA_XLOCKED); if (linker_kernel_file) { linker_kernel_file->refs++; error = linker_file_add_dependency(lf, linker_kernel_file); if (error) return (error); } if (linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, NULL) != 0) return (0); for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; ver = ((const struct mod_version *)mp->md_data)->mv_version; mod = modlist_lookup(modname, ver); if (mod != NULL) { printf("interface %s.%d already present in the KLD" " '%s'!\n", modname, ver, mod->container->filename); return (EEXIST); } } for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; nmodname = NULL; for (nmdp = start; nmdp < stop; nmdp++) { nmp = *nmdp; if (nmp->md_type != MDT_VERSION) continue; nmodname = nmp->md_cval; if (strcmp(modname, nmodname) == 0) break; } if (nmdp < stop)/* early exit, it's a self reference */ continue; mod = modlist_lookup2(modname, verinfo); if (mod) { /* woohoo, it's loaded already */ lfdep = mod->container; lfdep->refs++; error = linker_file_add_dependency(lf, lfdep); if (error) break; continue; } error = linker_load_module(NULL, modname, lf, verinfo, NULL); if (error) { printf("KLD %s: depends on %s - not available or" " version mismatch\n", lf->filename, modname); break; } } if (error) return (error); linker_addmodules(lf, start, stop, 0); return (error); } static int sysctl_kern_function_list_iterate(const char *name, void *opaque) { struct sysctl_req *req; req = opaque; return (SYSCTL_OUT(req, name, strlen(name) + 1)); } /* * Export a nul-separated, double-nul-terminated list of all function names * in the kernel. */ static int sysctl_kern_function_list(SYSCTL_HANDLER_ARGS) { linker_file_t lf; int error; #ifdef MAC error = mac_kld_check_stat(req->td->td_ucred); if (error) return (error); #endif error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sx_xlock(&kld_sx); TAILQ_FOREACH(lf, &linker_files, link) { error = LINKER_EACH_FUNCTION_NAME(lf, sysctl_kern_function_list_iterate, req); if (error) { sx_xunlock(&kld_sx); return (error); } } sx_xunlock(&kld_sx); return (SYSCTL_OUT(req, "", 1)); } SYSCTL_PROC(_kern, OID_AUTO, function_list, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_function_list, "", "kernel function list"); diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c index 341c94cc80aa..9710f6ca1fe0 100644 --- a/sys/kern/kern_prot.c +++ b/sys/kern/kern_prot.c @@ -1,2493 +1,2495 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993 * The Regents of the University of California. * (c) UNIX System Laboratories, Inc. * Copyright (c) 2000-2001 Robert N. M. Watson. * All rights reserved. * * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_prot.c 8.6 (Berkeley) 1/21/94 */ /* * System calls related to processes and protection */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#ifdef COMPAT_43 #include +#endif #include #include #include #include #include #include #include #include #include #ifdef REGRESSION FEATURE(regression, "Kernel support for interfaces necessary for regression testing (SECURITY RISK!)"); #endif #include #include static MALLOC_DEFINE(M_CRED, "cred", "credentials"); SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "BSD security policy"); static void crfree_final(struct ucred *cr); static void crsetgroups_locked(struct ucred *cr, int ngrp, gid_t *groups); #ifndef _SYS_SYSPROTO_H_ struct getpid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getpid(struct thread *td, struct getpid_args *uap) { struct proc *p = td->td_proc; td->td_retval[0] = p->p_pid; #if defined(COMPAT_43) if (SV_PROC_FLAG(p, SV_AOUT)) td->td_retval[1] = kern_getppid(td); #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct getppid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getppid(struct thread *td, struct getppid_args *uap) { td->td_retval[0] = kern_getppid(td); return (0); } int kern_getppid(struct thread *td) { struct proc *p = td->td_proc; return (p->p_oppid); } /* * Get process group ID; note that POSIX getpgrp takes no parameter. */ #ifndef _SYS_SYSPROTO_H_ struct getpgrp_args { int dummy; }; #endif int sys_getpgrp(struct thread *td, struct getpgrp_args *uap) { struct proc *p = td->td_proc; PROC_LOCK(p); td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return (0); } /* Get an arbitrary pid's process group id */ #ifndef _SYS_SYSPROTO_H_ struct getpgid_args { pid_t pid; }; #endif int sys_getpgid(struct thread *td, struct getpgid_args *uap) { struct proc *p; int error; if (uap->pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); error = p_cansee(td, p); if (error) { PROC_UNLOCK(p); return (error); } } td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return (0); } /* * Get an arbitrary pid's session id. */ #ifndef _SYS_SYSPROTO_H_ struct getsid_args { pid_t pid; }; #endif int sys_getsid(struct thread *td, struct getsid_args *uap) { return (kern_getsid(td, uap->pid)); } int kern_getsid(struct thread *td, pid_t pid) { struct proc *p; int error; if (pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(pid); if (p == NULL) return (ESRCH); error = p_cansee(td, p); if (error) { PROC_UNLOCK(p); return (error); } } td->td_retval[0] = p->p_session->s_sid; PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct getuid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getuid(struct thread *td, struct getuid_args *uap) { td->td_retval[0] = td->td_ucred->cr_ruid; #if defined(COMPAT_43) td->td_retval[1] = td->td_ucred->cr_uid; #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct geteuid_args { int dummy; }; #endif /* ARGSUSED */ int sys_geteuid(struct thread *td, struct geteuid_args *uap) { td->td_retval[0] = td->td_ucred->cr_uid; return (0); } #ifndef _SYS_SYSPROTO_H_ struct getgid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getgid(struct thread *td, struct getgid_args *uap) { td->td_retval[0] = td->td_ucred->cr_rgid; #if defined(COMPAT_43) td->td_retval[1] = td->td_ucred->cr_groups[0]; #endif return (0); } /* * Get effective group ID. The "egid" is groups[0], and could be obtained * via getgroups. This syscall exists because it is somewhat painful to do * correctly in a library function. */ #ifndef _SYS_SYSPROTO_H_ struct getegid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getegid(struct thread *td, struct getegid_args *uap) { td->td_retval[0] = td->td_ucred->cr_groups[0]; return (0); } #ifndef _SYS_SYSPROTO_H_ struct getgroups_args { int gidsetsize; gid_t *gidset; }; #endif int sys_getgroups(struct thread *td, struct getgroups_args *uap) { struct ucred *cred; int ngrp, error; cred = td->td_ucred; ngrp = cred->cr_ngroups; if (uap->gidsetsize == 0) { error = 0; goto out; } if (uap->gidsetsize < ngrp) return (EINVAL); error = copyout(cred->cr_groups, uap->gidset, ngrp * sizeof(gid_t)); out: td->td_retval[0] = ngrp; return (error); } #ifndef _SYS_SYSPROTO_H_ struct setsid_args { int dummy; }; #endif /* ARGSUSED */ int sys_setsid(struct thread *td, struct setsid_args *uap) { struct pgrp *pgrp; int error; struct proc *p = td->td_proc; struct pgrp *newpgrp; struct session *newsess; error = 0; pgrp = NULL; newpgrp = uma_zalloc(pgrp_zone, M_WAITOK); newsess = malloc(sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO); sx_xlock(&proctree_lock); if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) { if (pgrp != NULL) PGRP_UNLOCK(pgrp); error = EPERM; } else { (void)enterpgrp(p, p->p_pid, newpgrp, newsess); td->td_retval[0] = p->p_pid; newpgrp = NULL; newsess = NULL; } sx_xunlock(&proctree_lock); uma_zfree(pgrp_zone, newpgrp); free(newsess, M_SESSION); return (error); } /* * set process group (setpgid/old setpgrp) * * caller does setpgid(targpid, targpgid) * * pid must be caller or child of caller (ESRCH) * if a child * pid must be in same session (EPERM) * pid can't have done an exec (EACCES) * if pgid != pid * there must exist some pid in same session having pgid (EPERM) * pid must not be session leader (EPERM) */ #ifndef _SYS_SYSPROTO_H_ struct setpgid_args { int pid; /* target process id */ int pgid; /* target pgrp id */ }; #endif /* ARGSUSED */ int sys_setpgid(struct thread *td, struct setpgid_args *uap) { struct proc *curp = td->td_proc; struct proc *targp; /* target process */ struct pgrp *pgrp; /* target pgrp */ int error; struct pgrp *newpgrp; if (uap->pgid < 0) return (EINVAL); error = 0; newpgrp = uma_zalloc(pgrp_zone, M_WAITOK); sx_xlock(&proctree_lock); if (uap->pid != 0 && uap->pid != curp->p_pid) { if ((targp = pfind(uap->pid)) == NULL) { error = ESRCH; goto done; } if (!inferior(targp)) { PROC_UNLOCK(targp); error = ESRCH; goto done; } if ((error = p_cansee(td, targp))) { PROC_UNLOCK(targp); goto done; } if (targp->p_pgrp == NULL || targp->p_session != curp->p_session) { PROC_UNLOCK(targp); error = EPERM; goto done; } if (targp->p_flag & P_EXEC) { PROC_UNLOCK(targp); error = EACCES; goto done; } PROC_UNLOCK(targp); } else targp = curp; if (SESS_LEADER(targp)) { error = EPERM; goto done; } if (uap->pgid == 0) uap->pgid = targp->p_pid; if ((pgrp = pgfind(uap->pgid)) == NULL) { if (uap->pgid == targp->p_pid) { error = enterpgrp(targp, uap->pgid, newpgrp, NULL); if (error == 0) newpgrp = NULL; } else error = EPERM; } else { if (pgrp == targp->p_pgrp) { PGRP_UNLOCK(pgrp); goto done; } if (pgrp->pg_id != targp->p_pid && pgrp->pg_session != curp->p_session) { PGRP_UNLOCK(pgrp); error = EPERM; goto done; } PGRP_UNLOCK(pgrp); error = enterthispgrp(targp, pgrp); } done: sx_xunlock(&proctree_lock); KASSERT((error == 0) || (newpgrp != NULL), ("setpgid failed and newpgrp is NULL")); uma_zfree(pgrp_zone, newpgrp); return (error); } /* * Use the clause in B.4.2.2 that allows setuid/setgid to be 4.2/4.3BSD * compatible. It says that setting the uid/gid to euid/egid is a special * case of "appropriate privilege". Once the rules are expanded out, this * basically means that setuid(nnn) sets all three id's, in all permitted * cases unless _POSIX_SAVED_IDS is enabled. In that case, setuid(getuid()) * does not set the saved id - this is dangerous for traditional BSD * programs. For this reason, we *really* do not want to set * _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2. */ #define POSIX_APPENDIX_B_4_2_2 #ifndef _SYS_SYSPROTO_H_ struct setuid_args { uid_t uid; }; #endif /* ARGSUSED */ int sys_setuid(struct thread *td, struct setuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t uid; struct uidinfo *uip; int error; uid = uap->uid; AUDIT_ARG_UID(uid); newcred = crget(); uip = uifind(uid); PROC_LOCK(p); /* * Copy credentials so other references do not see our changes. */ oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setuid(oldcred, uid); if (error) goto fail; #endif /* * See if we have "permission" by POSIX 1003.1 rules. * * Note that setuid(geteuid()) is a special case of * "appropriate privileges" in appendix B.4.2.2. We need * to use this clause to be compatible with traditional BSD * semantics. Basically, it means that "setuid(xx)" sets all * three id's (assuming you have privs). * * Notes on the logic. We do things in three steps. * 1: We determine if the euid is going to change, and do EPERM * right away. We unconditionally change the euid later if this * test is satisfied, simplifying that part of the logic. * 2: We determine if the real and/or saved uids are going to * change. Determined by compile options. * 3: Change euid last. (after tests in #2 for "appropriate privs") */ if (uid != oldcred->cr_ruid && /* allow setuid(getuid()) */ #ifdef _POSIX_SAVED_IDS uid != oldcred->cr_svuid && /* allow setuid(saved gid) */ #endif #ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ uid != oldcred->cr_uid && /* allow setuid(geteuid()) */ #endif (error = priv_check_cred(oldcred, PRIV_CRED_SETUID)) != 0) goto fail; #ifdef _POSIX_SAVED_IDS /* * Do we have "appropriate privileges" (are we root or uid == euid) * If so, we are changing the real uid and/or saved uid. */ if ( #ifdef POSIX_APPENDIX_B_4_2_2 /* Use the clause from B.4.2.2 */ uid == oldcred->cr_uid || #endif /* We are using privs. */ priv_check_cred(oldcred, PRIV_CRED_SETUID) == 0) #endif { /* * Set the real uid and transfer proc count to new user. */ if (uid != oldcred->cr_ruid) { change_ruid(newcred, uip); setsugid(p); } /* * Set saved uid * * XXX always set saved uid even if not _POSIX_SAVED_IDS, as * the security of seteuid() depends on it. B.4.2.2 says it * is important that we should do this. */ if (uid != oldcred->cr_svuid) { change_svuid(newcred, uid); setsugid(p); } } /* * In all permitted cases, we are changing the euid. */ if (uid != oldcred->cr_uid) { change_euid(newcred, uip); setsugid(p); } proc_set_cred(p, newcred); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); crhold(newcred); #endif PROC_UNLOCK(p); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); crfree(newcred); #endif uifree(uip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(uip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct seteuid_args { uid_t euid; }; #endif /* ARGSUSED */ int sys_seteuid(struct thread *td, struct seteuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid; struct uidinfo *euip; int error; euid = uap->euid; AUDIT_ARG_EUID(euid); newcred = crget(); euip = uifind(euid); PROC_LOCK(p); /* * Copy credentials so other references do not see our changes. */ oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_seteuid(oldcred, euid); if (error) goto fail; #endif if (euid != oldcred->cr_ruid && /* allow seteuid(getuid()) */ euid != oldcred->cr_svuid && /* allow seteuid(saved uid) */ (error = priv_check_cred(oldcred, PRIV_CRED_SETEUID)) != 0) goto fail; /* * Everything's okay, do it. */ if (oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(euip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setgid_args { gid_t gid; }; #endif /* ARGSUSED */ int sys_setgid(struct thread *td, struct setgid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t gid; int error; gid = uap->gid; AUDIT_ARG_GID(gid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setgid(oldcred, gid); if (error) goto fail; #endif /* * See if we have "permission" by POSIX 1003.1 rules. * * Note that setgid(getegid()) is a special case of * "appropriate privileges" in appendix B.4.2.2. We need * to use this clause to be compatible with traditional BSD * semantics. Basically, it means that "setgid(xx)" sets all * three id's (assuming you have privs). * * For notes on the logic here, see setuid() above. */ if (gid != oldcred->cr_rgid && /* allow setgid(getgid()) */ #ifdef _POSIX_SAVED_IDS gid != oldcred->cr_svgid && /* allow setgid(saved gid) */ #endif #ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */ #endif (error = priv_check_cred(oldcred, PRIV_CRED_SETGID)) != 0) goto fail; #ifdef _POSIX_SAVED_IDS /* * Do we have "appropriate privileges" (are we root or gid == egid) * If so, we are changing the real uid and saved gid. */ if ( #ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */ gid == oldcred->cr_groups[0] || #endif /* We are using privs. */ priv_check_cred(oldcred, PRIV_CRED_SETGID) == 0) #endif { /* * Set real gid */ if (oldcred->cr_rgid != gid) { change_rgid(newcred, gid); setsugid(p); } /* * Set saved gid * * XXX always set saved gid even if not _POSIX_SAVED_IDS, as * the security of setegid() depends on it. B.4.2.2 says it * is important that we should do this. */ if (oldcred->cr_svgid != gid) { change_svgid(newcred, gid); setsugid(p); } } /* * In all cases permitted cases, we are changing the egid. * Copy credentials so other references do not see our changes. */ if (oldcred->cr_groups[0] != gid) { change_egid(newcred, gid); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setegid_args { gid_t egid; }; #endif /* ARGSUSED */ int sys_setegid(struct thread *td, struct setegid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid; int error; egid = uap->egid; AUDIT_ARG_EGID(egid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setegid(oldcred, egid); if (error) goto fail; #endif if (egid != oldcred->cr_rgid && /* allow setegid(getgid()) */ egid != oldcred->cr_svgid && /* allow setegid(saved gid) */ (error = priv_check_cred(oldcred, PRIV_CRED_SETEGID)) != 0) goto fail; if (oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setgroups_args { int gidsetsize; gid_t *gidset; }; #endif /* ARGSUSED */ int sys_setgroups(struct thread *td, struct setgroups_args *uap) { gid_t smallgroups[XU_NGROUPS]; gid_t *groups; int gidsetsize, error; gidsetsize = uap->gidsetsize; if (gidsetsize > ngroups_max + 1 || gidsetsize < 0) return (EINVAL); if (gidsetsize > XU_NGROUPS) groups = malloc(gidsetsize * sizeof(gid_t), M_TEMP, M_WAITOK); else groups = smallgroups; error = copyin(uap->gidset, groups, gidsetsize * sizeof(gid_t)); if (error == 0) error = kern_setgroups(td, gidsetsize, groups); if (gidsetsize > XU_NGROUPS) free(groups, M_TEMP); return (error); } int kern_setgroups(struct thread *td, u_int ngrp, gid_t *groups) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; int error; MPASS(ngrp <= ngroups_max + 1); AUDIT_ARG_GROUPSET(groups, ngrp); newcred = crget(); crextend(newcred, ngrp); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setgroups(oldcred, ngrp, groups); if (error) goto fail; #endif error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS); if (error) goto fail; if (ngrp == 0) { /* * setgroups(0, NULL) is a legitimate way of clearing the * groups vector on non-BSD systems (which generally do not * have the egid in the groups[0]). We risk security holes * when running non-BSD software if we do not do the same. */ newcred->cr_ngroups = 1; } else { crsetgroups_locked(newcred, ngrp, groups); } setsugid(p); proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setreuid_args { uid_t ruid; uid_t euid; }; #endif /* ARGSUSED */ int sys_setreuid(struct thread *td, struct setreuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid, ruid; struct uidinfo *euip, *ruip; int error; euid = uap->euid; ruid = uap->ruid; AUDIT_ARG_EUID(euid); AUDIT_ARG_RUID(ruid); newcred = crget(); euip = uifind(euid); ruip = uifind(ruid); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setreuid(oldcred, ruid, euid); if (error) goto fail; #endif if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && ruid != oldcred->cr_svuid) || (euid != (uid_t)-1 && euid != oldcred->cr_uid && euid != oldcred->cr_ruid && euid != oldcred->cr_svuid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETREUID)) != 0) goto fail; if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { change_ruid(newcred, ruip); setsugid(p); } if ((ruid != (uid_t)-1 || newcred->cr_uid != newcred->cr_ruid) && newcred->cr_svuid != newcred->cr_uid) { change_svuid(newcred, newcred->cr_uid); setsugid(p); } proc_set_cred(p, newcred); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); crhold(newcred); #endif PROC_UNLOCK(p); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); crfree(newcred); #endif uifree(ruip); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setregid_args { gid_t rgid; gid_t egid; }; #endif /* ARGSUSED */ int sys_setregid(struct thread *td, struct setregid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid, rgid; int error; egid = uap->egid; rgid = uap->rgid; AUDIT_ARG_EGID(egid); AUDIT_ARG_RGID(rgid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setregid(oldcred, rgid, egid); if (error) goto fail; #endif if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && rgid != oldcred->cr_svgid) || (egid != (gid_t)-1 && egid != oldcred->cr_groups[0] && egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETREGID)) != 0) goto fail; if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { change_rgid(newcred, rgid); setsugid(p); } if ((rgid != (gid_t)-1 || newcred->cr_groups[0] != newcred->cr_rgid) && newcred->cr_svgid != newcred->cr_groups[0]) { change_svgid(newcred, newcred->cr_groups[0]); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } /* * setresuid(ruid, euid, suid) is like setreuid except control over the saved * uid is explicit. */ #ifndef _SYS_SYSPROTO_H_ struct setresuid_args { uid_t ruid; uid_t euid; uid_t suid; }; #endif /* ARGSUSED */ int sys_setresuid(struct thread *td, struct setresuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid, ruid, suid; struct uidinfo *euip, *ruip; int error; euid = uap->euid; ruid = uap->ruid; suid = uap->suid; AUDIT_ARG_EUID(euid); AUDIT_ARG_RUID(ruid); AUDIT_ARG_SUID(suid); newcred = crget(); euip = uifind(euid); ruip = uifind(ruid); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setresuid(oldcred, ruid, euid, suid); if (error) goto fail; #endif if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && ruid != oldcred->cr_svuid && ruid != oldcred->cr_uid) || (euid != (uid_t)-1 && euid != oldcred->cr_ruid && euid != oldcred->cr_svuid && euid != oldcred->cr_uid) || (suid != (uid_t)-1 && suid != oldcred->cr_ruid && suid != oldcred->cr_svuid && suid != oldcred->cr_uid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETRESUID)) != 0) goto fail; if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { change_ruid(newcred, ruip); setsugid(p); } if (suid != (uid_t)-1 && oldcred->cr_svuid != suid) { change_svuid(newcred, suid); setsugid(p); } proc_set_cred(p, newcred); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); crhold(newcred); #endif PROC_UNLOCK(p); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); crfree(newcred); #endif uifree(ruip); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(newcred); return (error); } /* * setresgid(rgid, egid, sgid) is like setregid except control over the saved * gid is explicit. */ #ifndef _SYS_SYSPROTO_H_ struct setresgid_args { gid_t rgid; gid_t egid; gid_t sgid; }; #endif /* ARGSUSED */ int sys_setresgid(struct thread *td, struct setresgid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid, rgid, sgid; int error; egid = uap->egid; rgid = uap->rgid; sgid = uap->sgid; AUDIT_ARG_EGID(egid); AUDIT_ARG_RGID(rgid); AUDIT_ARG_SGID(sgid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setresgid(oldcred, rgid, egid, sgid); if (error) goto fail; #endif if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && rgid != oldcred->cr_svgid && rgid != oldcred->cr_groups[0]) || (egid != (gid_t)-1 && egid != oldcred->cr_rgid && egid != oldcred->cr_svgid && egid != oldcred->cr_groups[0]) || (sgid != (gid_t)-1 && sgid != oldcred->cr_rgid && sgid != oldcred->cr_svgid && sgid != oldcred->cr_groups[0])) && (error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID)) != 0) goto fail; if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { change_rgid(newcred, rgid); setsugid(p); } if (sgid != (gid_t)-1 && oldcred->cr_svgid != sgid) { change_svgid(newcred, sgid); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getresuid_args { uid_t *ruid; uid_t *euid; uid_t *suid; }; #endif /* ARGSUSED */ int sys_getresuid(struct thread *td, struct getresuid_args *uap) { struct ucred *cred; int error1 = 0, error2 = 0, error3 = 0; cred = td->td_ucred; if (uap->ruid) error1 = copyout(&cred->cr_ruid, uap->ruid, sizeof(cred->cr_ruid)); if (uap->euid) error2 = copyout(&cred->cr_uid, uap->euid, sizeof(cred->cr_uid)); if (uap->suid) error3 = copyout(&cred->cr_svuid, uap->suid, sizeof(cred->cr_svuid)); return (error1 ? error1 : error2 ? error2 : error3); } #ifndef _SYS_SYSPROTO_H_ struct getresgid_args { gid_t *rgid; gid_t *egid; gid_t *sgid; }; #endif /* ARGSUSED */ int sys_getresgid(struct thread *td, struct getresgid_args *uap) { struct ucred *cred; int error1 = 0, error2 = 0, error3 = 0; cred = td->td_ucred; if (uap->rgid) error1 = copyout(&cred->cr_rgid, uap->rgid, sizeof(cred->cr_rgid)); if (uap->egid) error2 = copyout(&cred->cr_groups[0], uap->egid, sizeof(cred->cr_groups[0])); if (uap->sgid) error3 = copyout(&cred->cr_svgid, uap->sgid, sizeof(cred->cr_svgid)); return (error1 ? error1 : error2 ? error2 : error3); } #ifndef _SYS_SYSPROTO_H_ struct issetugid_args { int dummy; }; #endif /* ARGSUSED */ int sys_issetugid(struct thread *td, struct issetugid_args *uap) { struct proc *p = td->td_proc; /* * Note: OpenBSD sets a P_SUGIDEXEC flag set at execve() time, * we use P_SUGID because we consider changing the owners as * "tainting" as well. * This is significant for procs that start as root and "become" * a user without an exec - programs cannot know *everything* * that libc *might* have put in their data segment. */ td->td_retval[0] = (p->p_flag & P_SUGID) ? 1 : 0; return (0); } int sys___setugid(struct thread *td, struct __setugid_args *uap) { #ifdef REGRESSION struct proc *p; p = td->td_proc; switch (uap->flag) { case 0: PROC_LOCK(p); p->p_flag &= ~P_SUGID; PROC_UNLOCK(p); return (0); case 1: PROC_LOCK(p); p->p_flag |= P_SUGID; PROC_UNLOCK(p); return (0); default: return (EINVAL); } #else /* !REGRESSION */ return (ENOSYS); #endif /* REGRESSION */ } /* * Check if gid is a member of the group set. */ int groupmember(gid_t gid, struct ucred *cred) { int l; int h; int m; if (cred->cr_groups[0] == gid) return(1); /* * If gid was not our primary group, perform a binary search * of the supplemental groups. This is possible because we * sort the groups in crsetgroups(). */ l = 1; h = cred->cr_ngroups; while (l < h) { m = l + ((h - l) / 2); if (cred->cr_groups[m] < gid) l = m + 1; else h = m; } if ((l < cred->cr_ngroups) && (cred->cr_groups[l] == gid)) return (1); return (0); } /* * Test the active securelevel against a given level. securelevel_gt() * implements (securelevel > level). securelevel_ge() implements * (securelevel >= level). Note that the logic is inverted -- these * functions return EPERM on "success" and 0 on "failure". * * Due to care taken when setting the securelevel, we know that no jail will * be less secure that its parent (or the physical system), so it is sufficient * to test the current jail only. * * XXXRW: Possibly since this has to do with privilege, it should move to * kern_priv.c. */ int securelevel_gt(struct ucred *cr, int level) { return (cr->cr_prison->pr_securelevel > level ? EPERM : 0); } int securelevel_ge(struct ucred *cr, int level) { return (cr->cr_prison->pr_securelevel >= level ? EPERM : 0); } /* * 'see_other_uids' determines whether or not visibility of processes * and sockets with credentials holding different real uids is possible * using a variety of system MIBs. * XXX: data declarations should be together near the beginning of the file. */ static int see_other_uids = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_other_uids, CTLFLAG_RW, &see_other_uids, 0, "Unprivileged processes may see subjects/objects with different real uid"); /*- * Determine if u1 "can see" the subject specified by u2, according to the * 'see_other_uids' policy. * Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ int cr_canseeotheruids(struct ucred *u1, struct ucred *u2) { if (!see_other_uids && u1->cr_ruid != u2->cr_ruid) { if (priv_check_cred(u1, PRIV_SEEOTHERUIDS) != 0) return (ESRCH); } return (0); } /* * 'see_other_gids' determines whether or not visibility of processes * and sockets with credentials holding different real gids is possible * using a variety of system MIBs. * XXX: data declarations should be together near the beginning of the file. */ static int see_other_gids = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_other_gids, CTLFLAG_RW, &see_other_gids, 0, "Unprivileged processes may see subjects/objects with different real gid"); /* * Determine if u1 can "see" the subject specified by u2, according to the * 'see_other_gids' policy. * Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ int cr_canseeothergids(struct ucred *u1, struct ucred *u2) { int i, match; if (!see_other_gids) { match = 0; for (i = 0; i < u1->cr_ngroups; i++) { if (groupmember(u1->cr_groups[i], u2)) match = 1; if (match) break; } if (!match) { if (priv_check_cred(u1, PRIV_SEEOTHERGIDS) != 0) return (ESRCH); } } return (0); } /* * 'see_jail_proc' determines whether or not visibility of processes and * sockets with credentials holding different jail ids is possible using a * variety of system MIBs. * * XXX: data declarations should be together near the beginning of the file. */ static int see_jail_proc = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_jail_proc, CTLFLAG_RW, &see_jail_proc, 0, "Unprivileged processes may see subjects/objects with different jail ids"); /*- * Determine if u1 "can see" the subject specified by u2, according to the * 'see_jail_proc' policy. * Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ int cr_canseejailproc(struct ucred *u1, struct ucred *u2) { if (u1->cr_uid == 0) return (0); return (!see_jail_proc && u1->cr_prison != u2->cr_prison ? ESRCH : 0); } /*- * Determine if u1 "can see" the subject specified by u2. * Returns: 0 for permitted, an errno value otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ int cr_cansee(struct ucred *u1, struct ucred *u2) { int error; if ((error = prison_check(u1, u2))) return (error); #ifdef MAC if ((error = mac_cred_check_visible(u1, u2))) return (error); #endif if ((error = cr_canseeotheruids(u1, u2))) return (error); if ((error = cr_canseeothergids(u1, u2))) return (error); if ((error = cr_canseejailproc(u1, u2))) return (error); return (0); } /*- * Determine if td "can see" the subject specified by p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect p->p_ucred must be held. td really * should be curthread. * References: td and p must be valid for the lifetime of the call */ int p_cansee(struct thread *td, struct proc *p) { /* Wrap cr_cansee() for all functionality. */ KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); return (cr_cansee(td->td_ucred, p->p_ucred)); } /* * 'conservative_signals' prevents the delivery of a broad class of * signals by unprivileged processes to processes that have changed their * credentials since the last invocation of execve(). This can prevent * the leakage of cached information or retained privileges as a result * of a common class of signal-related vulnerabilities. However, this * may interfere with some applications that expect to be able to * deliver these signals to peer processes after having given up * privilege. */ static int conservative_signals = 1; SYSCTL_INT(_security_bsd, OID_AUTO, conservative_signals, CTLFLAG_RW, &conservative_signals, 0, "Unprivileged processes prevented from " "sending certain signals to processes whose credentials have changed"); /*- * Determine whether cred may deliver the specified signal to proc. * Returns: 0 for permitted, an errno value otherwise. * Locks: A lock must be held for proc. * References: cred and proc must be valid for the lifetime of the call. */ int cr_cansignal(struct ucred *cred, struct proc *proc, int signum) { int error; PROC_LOCK_ASSERT(proc, MA_OWNED); /* * Jail semantics limit the scope of signalling to proc in the * same jail as cred, if cred is in jail. */ error = prison_check(cred, proc->p_ucred); if (error) return (error); #ifdef MAC if ((error = mac_proc_check_signal(cred, proc, signum))) return (error); #endif if ((error = cr_canseeotheruids(cred, proc->p_ucred))) return (error); if ((error = cr_canseeothergids(cred, proc->p_ucred))) return (error); /* * UNIX signal semantics depend on the status of the P_SUGID * bit on the target process. If the bit is set, then additional * restrictions are placed on the set of available signals. */ if (conservative_signals && (proc->p_flag & P_SUGID)) { switch (signum) { case 0: case SIGKILL: case SIGINT: case SIGTERM: case SIGALRM: case SIGSTOP: case SIGTTIN: case SIGTTOU: case SIGTSTP: case SIGHUP: case SIGUSR1: case SIGUSR2: /* * Generally, permit job and terminal control * signals. */ break; default: /* Not permitted without privilege. */ error = priv_check_cred(cred, PRIV_SIGNAL_SUGID); if (error) return (error); } } /* * Generally, the target credential's ruid or svuid must match the * subject credential's ruid or euid. */ if (cred->cr_ruid != proc->p_ucred->cr_ruid && cred->cr_ruid != proc->p_ucred->cr_svuid && cred->cr_uid != proc->p_ucred->cr_ruid && cred->cr_uid != proc->p_ucred->cr_svuid) { error = priv_check_cred(cred, PRIV_SIGNAL_DIFFCRED); if (error) return (error); } return (0); } /*- * Determine whether td may deliver the specified signal to p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must be * held for p. * References: td and p must be valid for the lifetime of the call */ int p_cansignal(struct thread *td, struct proc *p, int signum) { KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); /* * UNIX signalling semantics require that processes in the same * session always be able to deliver SIGCONT to one another, * overriding the remaining protections. */ /* XXX: This will require an additional lock of some sort. */ if (signum == SIGCONT && td->td_proc->p_session == p->p_session) return (0); /* * Some compat layers use SIGTHR and higher signals for * communication between different kernel threads of the same * process, so that they expect that it's always possible to * deliver them, even for suid applications where cr_cansignal() can * deny such ability for security consideration. It should be * pretty safe to do since the only way to create two processes * with the same p_leader is via rfork(2). */ if (td->td_proc->p_leader != NULL && signum >= SIGTHR && signum < SIGTHR + 4 && td->td_proc->p_leader == p->p_leader) return (0); return (cr_cansignal(td->td_ucred, p, signum)); } /*- * Determine whether td may reschedule p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_cansched(struct thread *td, struct proc *p) { int error; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_proc_check_sched(td->td_ucred, p))) return (error); #endif if ((error = cr_canseeotheruids(td->td_ucred, p->p_ucred))) return (error); if ((error = cr_canseeothergids(td->td_ucred, p->p_ucred))) return (error); if (td->td_ucred->cr_ruid != p->p_ucred->cr_ruid && td->td_ucred->cr_uid != p->p_ucred->cr_ruid) { error = priv_check(td, PRIV_SCHED_DIFFCRED); if (error) return (error); } return (0); } /* * Handle getting or setting the prison's unprivileged_proc_debug * value. */ static int sysctl_unprivileged_proc_debug(SYSCTL_HANDLER_ARGS) { int error, val; val = prison_allow(req->td->td_ucred, PR_ALLOW_UNPRIV_DEBUG); error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val != 0 && val != 1) return (EINVAL); prison_set_allow(req->td->td_ucred, PR_ALLOW_UNPRIV_DEBUG, val); return (0); } /* * The 'unprivileged_proc_debug' flag may be used to disable a variety of * unprivileged inter-process debugging services, including some procfs * functionality, ptrace(), and ktrace(). In the past, inter-process * debugging has been involved in a variety of security problems, and sites * not requiring the service might choose to disable it when hardening * systems. */ SYSCTL_PROC(_security_bsd, OID_AUTO, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_SECURE | CTLFLAG_MPSAFE, 0, 0, sysctl_unprivileged_proc_debug, "I", "Unprivileged processes may use process debugging facilities"); /*- * Determine whether td may debug p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_candebug(struct thread *td, struct proc *p) { int credentialchanged, error, grpsubset, i, uidsubset; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); if ((error = priv_check(td, PRIV_DEBUG_UNPRIV))) return (error); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_proc_check_debug(td->td_ucred, p))) return (error); #endif if ((error = cr_canseeotheruids(td->td_ucred, p->p_ucred))) return (error); if ((error = cr_canseeothergids(td->td_ucred, p->p_ucred))) return (error); /* * Is p's group set a subset of td's effective group set? This * includes p's egid, group access list, rgid, and svgid. */ grpsubset = 1; for (i = 0; i < p->p_ucred->cr_ngroups; i++) { if (!groupmember(p->p_ucred->cr_groups[i], td->td_ucred)) { grpsubset = 0; break; } } grpsubset = grpsubset && groupmember(p->p_ucred->cr_rgid, td->td_ucred) && groupmember(p->p_ucred->cr_svgid, td->td_ucred); /* * Are the uids present in p's credential equal to td's * effective uid? This includes p's euid, svuid, and ruid. */ uidsubset = (td->td_ucred->cr_uid == p->p_ucred->cr_uid && td->td_ucred->cr_uid == p->p_ucred->cr_svuid && td->td_ucred->cr_uid == p->p_ucred->cr_ruid); /* * Has the credential of the process changed since the last exec()? */ credentialchanged = (p->p_flag & P_SUGID); /* * If p's gids aren't a subset, or the uids aren't a subset, * or the credential has changed, require appropriate privilege * for td to debug p. */ if (!grpsubset || !uidsubset) { error = priv_check(td, PRIV_DEBUG_DIFFCRED); if (error) return (error); } if (credentialchanged) { error = priv_check(td, PRIV_DEBUG_SUGID); if (error) return (error); } /* Can't trace init when securelevel > 0. */ if (p == initproc) { error = securelevel_gt(td->td_ucred, 0); if (error) return (error); } /* * Can't trace a process that's currently exec'ing. * * XXX: Note, this is not a security policy decision, it's a * basic correctness/functionality decision. Therefore, this check * should be moved to the caller's of p_candebug(). */ if ((p->p_flag & P_INEXEC) != 0) return (EBUSY); /* Denied explicitly */ if ((p->p_flag2 & P2_NOTRACE) != 0) { error = priv_check(td, PRIV_DEBUG_DENIED); if (error != 0) return (error); } return (0); } /*- * Determine whether the subject represented by cred can "see" a socket. * Returns: 0 for permitted, ENOENT otherwise. */ int cr_canseesocket(struct ucred *cred, struct socket *so) { int error; error = prison_check(cred, so->so_cred); if (error) return (ENOENT); #ifdef MAC error = mac_socket_check_visible(cred, so); if (error) return (error); #endif if (cr_canseeotheruids(cred, so->so_cred)) return (ENOENT); if (cr_canseeothergids(cred, so->so_cred)) return (ENOENT); return (0); } /*- * Determine whether td can wait for the exit of p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_canwait(struct thread *td, struct proc *p) { int error; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_proc_check_wait(td->td_ucred, p))) return (error); #endif #if 0 /* XXXMAC: This could have odd effects on some shells. */ if ((error = cr_canseeotheruids(td->td_ucred, p->p_ucred))) return (error); #endif return (0); } /* * Credential management. * * struct ucred objects are rarely allocated but gain and lose references all * the time (e.g., on struct file alloc/dealloc) turning refcount updates into * a significant source of cache-line ping ponging. Common cases are worked * around by modifying thread-local counter instead if the cred to operate on * matches td_realucred. * * The counter is split into 2 parts: * - cr_users -- total count of all struct proc and struct thread objects * which have given cred in p_ucred and td_ucred respectively * - cr_ref -- the actual ref count, only valid if cr_users == 0 * * If users == 0 then cr_ref behaves similarly to refcount(9), in particular if * the count reaches 0 the object is freeable. * If users > 0 and curthread->td_realucred == cred, then updates are performed * against td_ucredref. * In other cases updates are performed against cr_ref. * * Changing td_realucred into something else decrements cr_users and transfers * accumulated updates. */ struct ucred * crcowget(struct ucred *cr) { mtx_lock(&cr->cr_mtx); KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); cr->cr_users++; cr->cr_ref++; mtx_unlock(&cr->cr_mtx); return (cr); } static struct ucred * crunuse(struct thread *td) { struct ucred *cr, *crold; MPASS(td->td_realucred == td->td_ucred); cr = td->td_realucred; mtx_lock(&cr->cr_mtx); cr->cr_ref += td->td_ucredref; td->td_ucredref = 0; KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); cr->cr_users--; if (cr->cr_users == 0) { KASSERT(cr->cr_ref > 0, ("%s: ref %d not > 0 on cred %p", __func__, cr->cr_ref, cr)); crold = cr; } else { cr->cr_ref--; crold = NULL; } mtx_unlock(&cr->cr_mtx); td->td_realucred = NULL; return (crold); } static void crunusebatch(struct ucred *cr, int users, int ref) { KASSERT(users > 0, ("%s: passed users %d not > 0 ; cred %p", __func__, users, cr)); mtx_lock(&cr->cr_mtx); KASSERT(cr->cr_users >= users, ("%s: users %d not > %d on cred %p", __func__, cr->cr_users, users, cr)); cr->cr_users -= users; cr->cr_ref += ref; cr->cr_ref -= users; if (cr->cr_users > 0) { mtx_unlock(&cr->cr_mtx); return; } KASSERT(cr->cr_ref >= 0, ("%s: ref %d not >= 0 on cred %p", __func__, cr->cr_ref, cr)); if (cr->cr_ref > 0) { mtx_unlock(&cr->cr_mtx); return; } crfree_final(cr); } void crcowfree(struct thread *td) { struct ucred *cr; cr = crunuse(td); if (cr != NULL) crfree(cr); } struct ucred * crcowsync(void) { struct thread *td; struct proc *p; struct ucred *crnew, *crold; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS(td->td_realucred == td->td_ucred); if (td->td_realucred == p->p_ucred) return (NULL); crnew = crcowget(p->p_ucred); crold = crunuse(td); td->td_realucred = crnew; td->td_ucred = td->td_realucred; return (crold); } /* * Batching. */ void credbatch_add(struct credbatch *crb, struct thread *td) { struct ucred *cr; MPASS(td->td_realucred != NULL); MPASS(td->td_realucred == td->td_ucred); MPASS(td->td_state == TDS_INACTIVE); cr = td->td_realucred; KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); if (crb->cred != cr) { if (crb->users > 0) { MPASS(crb->cred != NULL); crunusebatch(crb->cred, crb->users, crb->ref); crb->users = 0; crb->ref = 0; } } crb->cred = cr; crb->users++; crb->ref += td->td_ucredref; td->td_ucredref = 0; td->td_realucred = NULL; } void credbatch_final(struct credbatch *crb) { MPASS(crb->cred != NULL); MPASS(crb->users > 0); crunusebatch(crb->cred, crb->users, crb->ref); } /* * Allocate a zeroed cred structure. */ struct ucred * crget(void) { struct ucred *cr; cr = malloc(sizeof(*cr), M_CRED, M_WAITOK | M_ZERO); mtx_init(&cr->cr_mtx, "cred", NULL, MTX_DEF); cr->cr_ref = 1; #ifdef AUDIT audit_cred_init(cr); #endif #ifdef MAC mac_cred_init(cr); #endif cr->cr_groups = cr->cr_smallgroups; cr->cr_agroups = sizeof(cr->cr_smallgroups) / sizeof(cr->cr_smallgroups[0]); return (cr); } /* * Claim another reference to a ucred structure. */ struct ucred * crhold(struct ucred *cr) { struct thread *td; td = curthread; if (__predict_true(td->td_realucred == cr)) { KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); td->td_ucredref++; return (cr); } mtx_lock(&cr->cr_mtx); cr->cr_ref++; mtx_unlock(&cr->cr_mtx); return (cr); } /* * Free a cred structure. Throws away space when ref count gets to 0. */ void crfree(struct ucred *cr) { struct thread *td; td = curthread; if (__predict_true(td->td_realucred == cr)) { KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); td->td_ucredref--; return; } mtx_lock(&cr->cr_mtx); KASSERT(cr->cr_users >= 0, ("%s: users %d not >= 0 on cred %p", __func__, cr->cr_users, cr)); cr->cr_ref--; if (cr->cr_users > 0) { mtx_unlock(&cr->cr_mtx); return; } KASSERT(cr->cr_ref >= 0, ("%s: ref %d not >= 0 on cred %p", __func__, cr->cr_ref, cr)); if (cr->cr_ref > 0) { mtx_unlock(&cr->cr_mtx); return; } crfree_final(cr); } static void crfree_final(struct ucred *cr) { KASSERT(cr->cr_users == 0, ("%s: users %d not == 0 on cred %p", __func__, cr->cr_users, cr)); KASSERT(cr->cr_ref == 0, ("%s: ref %d not == 0 on cred %p", __func__, cr->cr_ref, cr)); /* * Some callers of crget(), such as nfs_statfs(), allocate a temporary * credential, but don't allocate a uidinfo structure. */ if (cr->cr_uidinfo != NULL) uifree(cr->cr_uidinfo); if (cr->cr_ruidinfo != NULL) uifree(cr->cr_ruidinfo); if (cr->cr_prison != NULL) prison_free(cr->cr_prison); if (cr->cr_loginclass != NULL) loginclass_free(cr->cr_loginclass); #ifdef AUDIT audit_cred_destroy(cr); #endif #ifdef MAC mac_cred_destroy(cr); #endif mtx_destroy(&cr->cr_mtx); if (cr->cr_groups != cr->cr_smallgroups) free(cr->cr_groups, M_CRED); free(cr, M_CRED); } /* * Copy a ucred's contents from a template. Does not block. */ void crcopy(struct ucred *dest, struct ucred *src) { KASSERT(dest->cr_ref == 1, ("crcopy of shared ucred")); bcopy(&src->cr_startcopy, &dest->cr_startcopy, (unsigned)((caddr_t)&src->cr_endcopy - (caddr_t)&src->cr_startcopy)); crsetgroups(dest, src->cr_ngroups, src->cr_groups); uihold(dest->cr_uidinfo); uihold(dest->cr_ruidinfo); prison_hold(dest->cr_prison); loginclass_hold(dest->cr_loginclass); #ifdef AUDIT audit_cred_copy(src, dest); #endif #ifdef MAC mac_cred_copy(src, dest); #endif } /* * Dup cred struct to a new held one. */ struct ucred * crdup(struct ucred *cr) { struct ucred *newcr; newcr = crget(); crcopy(newcr, cr); return (newcr); } /* * Fill in a struct xucred based on a struct ucred. */ void cru2x(struct ucred *cr, struct xucred *xcr) { int ngroups; bzero(xcr, sizeof(*xcr)); xcr->cr_version = XUCRED_VERSION; xcr->cr_uid = cr->cr_uid; ngroups = MIN(cr->cr_ngroups, XU_NGROUPS); xcr->cr_ngroups = ngroups; bcopy(cr->cr_groups, xcr->cr_groups, ngroups * sizeof(*cr->cr_groups)); } void cru2xt(struct thread *td, struct xucred *xcr) { cru2x(td->td_ucred, xcr); xcr->cr_pid = td->td_proc->p_pid; } /* * Set initial process credentials. * Callers are responsible for providing the reference for provided credentials. */ void proc_set_cred_init(struct proc *p, struct ucred *newcred) { p->p_ucred = crcowget(newcred); } /* * Change process credentials. * Callers are responsible for providing the reference for passed credentials * and for freeing old ones. * * Process has to be locked except when it does not have credentials (as it * should not be visible just yet) or when newcred is NULL (as this can be * only used when the process is about to be freed, at which point it should * not be visible anymore). */ void proc_set_cred(struct proc *p, struct ucred *newcred) { struct ucred *cr; cr = p->p_ucred; MPASS(cr != NULL); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(newcred->cr_users == 0, ("%s: users %d not 0 on cred %p", __func__, newcred->cr_users, newcred)); mtx_lock(&cr->cr_mtx); KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); cr->cr_users--; mtx_unlock(&cr->cr_mtx); p->p_ucred = newcred; newcred->cr_users = 1; PROC_UPDATE_COW(p); } void proc_unset_cred(struct proc *p) { struct ucred *cr; MPASS(p->p_state == PRS_ZOMBIE || p->p_state == PRS_NEW); cr = p->p_ucred; p->p_ucred = NULL; KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); mtx_lock(&cr->cr_mtx); cr->cr_users--; if (cr->cr_users == 0) KASSERT(cr->cr_ref > 0, ("%s: ref %d not > 0 on cred %p", __func__, cr->cr_ref, cr)); mtx_unlock(&cr->cr_mtx); crfree(cr); } struct ucred * crcopysafe(struct proc *p, struct ucred *cr) { struct ucred *oldcred; int groups; PROC_LOCK_ASSERT(p, MA_OWNED); oldcred = p->p_ucred; while (cr->cr_agroups < oldcred->cr_agroups) { groups = oldcred->cr_agroups; PROC_UNLOCK(p); crextend(cr, groups); PROC_LOCK(p); oldcred = p->p_ucred; } crcopy(cr, oldcred); return (oldcred); } /* * Extend the passed in credential to hold n items. */ void crextend(struct ucred *cr, int n) { int cnt; /* Truncate? */ if (n <= cr->cr_agroups) return; /* * We extend by 2 each time since we're using a power of two * allocator until we need enough groups to fill a page. * Once we're allocating multiple pages, only allocate as many * as we actually need. The case of processes needing a * non-power of two number of pages seems more likely than * a real world process that adds thousands of groups one at a * time. */ if ( n < PAGE_SIZE / sizeof(gid_t) ) { if (cr->cr_agroups == 0) cnt = MAX(1, MINALLOCSIZE / sizeof(gid_t)); else cnt = cr->cr_agroups * 2; while (cnt < n) cnt *= 2; } else cnt = roundup2(n, PAGE_SIZE / sizeof(gid_t)); /* Free the old array. */ if (cr->cr_groups != cr->cr_smallgroups) free(cr->cr_groups, M_CRED); cr->cr_groups = malloc(cnt * sizeof(gid_t), M_CRED, M_WAITOK | M_ZERO); cr->cr_agroups = cnt; } /* * Copy groups in to a credential, preserving any necessary invariants. * Currently this includes the sorting of all supplemental gids. * crextend() must have been called before hand to ensure sufficient * space is available. */ static void crsetgroups_locked(struct ucred *cr, int ngrp, gid_t *groups) { int i; int j; gid_t g; KASSERT(cr->cr_agroups >= ngrp, ("cr_ngroups is too small")); bcopy(groups, cr->cr_groups, ngrp * sizeof(gid_t)); cr->cr_ngroups = ngrp; /* * Sort all groups except cr_groups[0] to allow groupmember to * perform a binary search. * * XXX: If large numbers of groups become common this should * be replaced with shell sort like linux uses or possibly * heap sort. */ for (i = 2; i < ngrp; i++) { g = cr->cr_groups[i]; for (j = i-1; j >= 1 && g < cr->cr_groups[j]; j--) cr->cr_groups[j + 1] = cr->cr_groups[j]; cr->cr_groups[j + 1] = g; } } /* * Copy groups in to a credential after expanding it if required. * Truncate the list to (ngroups_max + 1) if it is too large. */ void crsetgroups(struct ucred *cr, int ngrp, gid_t *groups) { if (ngrp > ngroups_max + 1) ngrp = ngroups_max + 1; crextend(cr, ngrp); crsetgroups_locked(cr, ngrp, groups); } /* * Get login name, if available. */ #ifndef _SYS_SYSPROTO_H_ struct getlogin_args { char *namebuf; u_int namelen; }; #endif /* ARGSUSED */ int sys_getlogin(struct thread *td, struct getlogin_args *uap) { char login[MAXLOGNAME]; struct proc *p = td->td_proc; size_t len; if (uap->namelen > MAXLOGNAME) uap->namelen = MAXLOGNAME; PROC_LOCK(p); SESS_LOCK(p->p_session); len = strlcpy(login, p->p_session->s_login, uap->namelen) + 1; SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); if (len > uap->namelen) return (ERANGE); return (copyout(login, uap->namebuf, len)); } /* * Set login name. */ #ifndef _SYS_SYSPROTO_H_ struct setlogin_args { char *namebuf; }; #endif /* ARGSUSED */ int sys_setlogin(struct thread *td, struct setlogin_args *uap) { struct proc *p = td->td_proc; int error; char logintmp[MAXLOGNAME]; CTASSERT(sizeof(p->p_session->s_login) >= sizeof(logintmp)); error = priv_check(td, PRIV_PROC_SETLOGIN); if (error) return (error); error = copyinstr(uap->namebuf, logintmp, sizeof(logintmp), NULL); if (error != 0) { if (error == ENAMETOOLONG) error = EINVAL; return (error); } AUDIT_ARG_LOGIN(logintmp); PROC_LOCK(p); SESS_LOCK(p->p_session); strcpy(p->p_session->s_login, logintmp); SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); return (0); } void setsugid(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag |= P_SUGID; } /*- * Change a process's effective uid. * Side effects: newcred->cr_uid and newcred->cr_uidinfo will be modified. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_euid(struct ucred *newcred, struct uidinfo *euip) { newcred->cr_uid = euip->ui_uid; uihold(euip); uifree(newcred->cr_uidinfo); newcred->cr_uidinfo = euip; } /*- * Change a process's effective gid. * Side effects: newcred->cr_gid will be modified. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_egid(struct ucred *newcred, gid_t egid) { newcred->cr_groups[0] = egid; } /*- * Change a process's real uid. * Side effects: newcred->cr_ruid will be updated, newcred->cr_ruidinfo * will be updated, and the old and new cr_ruidinfo proc * counts will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_ruid(struct ucred *newcred, struct uidinfo *ruip) { (void)chgproccnt(newcred->cr_ruidinfo, -1, 0); newcred->cr_ruid = ruip->ui_uid; uihold(ruip); uifree(newcred->cr_ruidinfo); newcred->cr_ruidinfo = ruip; (void)chgproccnt(newcred->cr_ruidinfo, 1, 0); } /*- * Change a process's real gid. * Side effects: newcred->cr_rgid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_rgid(struct ucred *newcred, gid_t rgid) { newcred->cr_rgid = rgid; } /*- * Change a process's saved uid. * Side effects: newcred->cr_svuid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_svuid(struct ucred *newcred, uid_t svuid) { newcred->cr_svuid = svuid; } /*- * Change a process's saved gid. * Side effects: newcred->cr_svgid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_svgid(struct ucred *newcred, gid_t svgid) { newcred->cr_svgid = svgid; } bool allow_ptrace = true; SYSCTL_BOOL(_security_bsd, OID_AUTO, allow_ptrace, CTLFLAG_RWTUN, &allow_ptrace, 0, "Deny ptrace(2) use by returning ENOSYS"); diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c index 512e06814c24..94624e88af51 100644 --- a/sys/kern/kern_racct.c +++ b/sys/kern/kern_racct.c @@ -1,1366 +1,1365 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 The FreeBSD Foundation * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #ifdef RCTL #include #endif #ifdef RACCT FEATURE(racct, "Resource Accounting"); /* * Do not block processes that have their %cpu usage <= pcpu_threshold. */ static int pcpu_threshold = 1; #ifdef RACCT_DEFAULT_TO_DISABLED bool __read_frequently racct_enable = false; #else bool __read_frequently racct_enable = true; #endif SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Resource Accounting"); SYSCTL_BOOL(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable, 0, "Enable RACCT/RCTL"); SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold, 0, "Processes with higher %cpu usage than this value can be throttled."); /* * How many seconds it takes to use the scheduler %cpu calculations. When a * process starts, we compute its %cpu usage by dividing its runtime by the * process wall clock time. After RACCT_PCPU_SECS pass, we use the value * provided by the scheduler. */ #define RACCT_PCPU_SECS 3 struct mtx racct_lock; MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF); static uma_zone_t racct_zone; static void racct_sub_racct(struct racct *dest, const struct racct *src); static void racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount); static void racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount); SDT_PROVIDER_DEFINE(racct); SDT_PROBE_DEFINE3(racct, , rusage, add, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, add__failure, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, add__buf, "struct proc *", "const struct buf *", "int"); SDT_PROBE_DEFINE3(racct, , rusage, add__cred, "struct ucred *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, add__force, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, set, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, set__failure, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, set__force, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, sub, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, sub__cred, "struct ucred *", "int", "uint64_t"); SDT_PROBE_DEFINE1(racct, , racct, create, "struct racct *"); SDT_PROBE_DEFINE1(racct, , racct, destroy, "struct racct *"); SDT_PROBE_DEFINE2(racct, , racct, join, "struct racct *", "struct racct *"); SDT_PROBE_DEFINE2(racct, , racct, join__failure, "struct racct *", "struct racct *"); SDT_PROBE_DEFINE2(racct, , racct, leave, "struct racct *", "struct racct *"); int racct_types[] = { [RACCT_CPU] = RACCT_IN_MILLIONS, [RACCT_DATA] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_STACK] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_CORE] = RACCT_DENIABLE, [RACCT_RSS] = RACCT_RECLAIMABLE, [RACCT_MEMLOCK] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_NPROC] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_NOFILE] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_VMEM] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_NPTS] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_SWAP] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NTHR] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_MSGQQUEUED] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_MSGQSIZE] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NMSGQ] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NSEM] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NSEMOP] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_NSHM] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_SHMSIZE] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_WALLCLOCK] = RACCT_IN_MILLIONS, [RACCT_PCTCPU] = RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS, [RACCT_READBPS] = RACCT_DECAYING, [RACCT_WRITEBPS] = RACCT_DECAYING, [RACCT_READIOPS] = RACCT_DECAYING, [RACCT_WRITEIOPS] = RACCT_DECAYING }; static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE; #ifdef SCHED_4BSD /* * Contains intermediate values for %cpu calculations to avoid using floating * point in the kernel. * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20) * It is needed only for the 4BSD scheduler, because in ULE, the ccpu equals to * zero so the calculations are more straightforward. */ fixpt_t ccpu_exp[] = { [0] = FSCALE * 1, [1] = FSCALE * 0.95122942450071400909, [2] = FSCALE * 0.90483741803595957316, [3] = FSCALE * 0.86070797642505780722, [4] = FSCALE * 0.81873075307798185866, [5] = FSCALE * 0.77880078307140486824, [6] = FSCALE * 0.74081822068171786606, [7] = FSCALE * 0.70468808971871343435, [8] = FSCALE * 0.67032004603563930074, [9] = FSCALE * 0.63762815162177329314, [10] = FSCALE * 0.60653065971263342360, [11] = FSCALE * 0.57694981038048669531, [12] = FSCALE * 0.54881163609402643262, [13] = FSCALE * 0.52204577676101604789, [14] = FSCALE * 0.49658530379140951470, [15] = FSCALE * 0.47236655274101470713, [16] = FSCALE * 0.44932896411722159143, [17] = FSCALE * 0.42741493194872666992, [18] = FSCALE * 0.40656965974059911188, [19] = FSCALE * 0.38674102345450120691, [20] = FSCALE * 0.36787944117144232159, [21] = FSCALE * 0.34993774911115535467, [22] = FSCALE * 0.33287108369807955328, [23] = FSCALE * 0.31663676937905321821, [24] = FSCALE * 0.30119421191220209664, [25] = FSCALE * 0.28650479686019010032, [26] = FSCALE * 0.27253179303401260312, [27] = FSCALE * 0.25924026064589150757, [28] = FSCALE * 0.24659696394160647693, [29] = FSCALE * 0.23457028809379765313, [30] = FSCALE * 0.22313016014842982893, [31] = FSCALE * 0.21224797382674305771, [32] = FSCALE * 0.20189651799465540848, [33] = FSCALE * 0.19204990862075411423, [34] = FSCALE * 0.18268352405273465022, [35] = FSCALE * 0.17377394345044512668, [36] = FSCALE * 0.16529888822158653829, [37] = FSCALE * 0.15723716631362761621, [38] = FSCALE * 0.14956861922263505264, [39] = FSCALE * 0.14227407158651357185, [40] = FSCALE * 0.13533528323661269189, [41] = FSCALE * 0.12873490358780421886, [42] = FSCALE * 0.12245642825298191021, [43] = FSCALE * 0.11648415777349695786, [44] = FSCALE * 0.11080315836233388333, [45] = FSCALE * 0.10539922456186433678, [46] = FSCALE * 0.10025884372280373372, [47] = FSCALE * 0.09536916221554961888, [48] = FSCALE * 0.09071795328941250337, [49] = FSCALE * 0.08629358649937051097, [50] = FSCALE * 0.08208499862389879516, [51] = FSCALE * 0.07808166600115315231, [52] = FSCALE * 0.07427357821433388042, [53] = FSCALE * 0.07065121306042958674, [54] = FSCALE * 0.06720551273974976512, [55] = FSCALE * 0.06392786120670757270, [56] = FSCALE * 0.06081006262521796499, [57] = FSCALE * 0.05784432087483846296, [58] = FSCALE * 0.05502322005640722902, [59] = FSCALE * 0.05233970594843239308, [60] = FSCALE * 0.04978706836786394297, [61] = FSCALE * 0.04735892439114092119, [62] = FSCALE * 0.04504920239355780606, [63] = FSCALE * 0.04285212686704017991, [64] = FSCALE * 0.04076220397836621516, [65] = FSCALE * 0.03877420783172200988, [66] = FSCALE * 0.03688316740124000544, [67] = FSCALE * 0.03508435410084502588, [68] = FSCALE * 0.03337326996032607948, [69] = FSCALE * 0.03174563637806794323, [70] = FSCALE * 0.03019738342231850073, [71] = FSCALE * 0.02872463965423942912, [72] = FSCALE * 0.02732372244729256080, [73] = FSCALE * 0.02599112877875534358, [74] = FSCALE * 0.02472352647033939120, [75] = FSCALE * 0.02351774585600910823, [76] = FSCALE * 0.02237077185616559577, [77] = FSCALE * 0.02127973643837716938, [78] = FSCALE * 0.02024191144580438847, [79] = FSCALE * 0.01925470177538692429, [80] = FSCALE * 0.01831563888873418029, [81] = FSCALE * 0.01742237463949351138, [82] = FSCALE * 0.01657267540176124754, [83] = FSCALE * 0.01576441648485449082, [84] = FSCALE * 0.01499557682047770621, [85] = FSCALE * 0.01426423390899925527, [86] = FSCALE * 0.01356855901220093175, [87] = FSCALE * 0.01290681258047986886, [88] = FSCALE * 0.01227733990306844117, [89] = FSCALE * 0.01167856697039544521, [90] = FSCALE * 0.01110899653824230649, [91] = FSCALE * 0.01056720438385265337, [92] = FSCALE * 0.01005183574463358164, [93] = FSCALE * 0.00956160193054350793, [94] = FSCALE * 0.00909527710169581709, [95] = FSCALE * 0.00865169520312063417, [96] = FSCALE * 0.00822974704902002884, [97] = FSCALE * 0.00782837754922577143, [98] = FSCALE * 0.00744658307092434051, [99] = FSCALE * 0.00708340892905212004, [100] = FSCALE * 0.00673794699908546709, [101] = FSCALE * 0.00640933344625638184, [102] = FSCALE * 0.00609674656551563610, [103] = FSCALE * 0.00579940472684214321, [104] = FSCALE * 0.00551656442076077241, [105] = FSCALE * 0.00524751839918138427, [106] = FSCALE * 0.00499159390691021621, [107] = FSCALE * 0.00474815099941147558, [108] = FSCALE * 0.00451658094261266798, [109] = FSCALE * 0.00429630469075234057, [110] = FSCALE * 0.00408677143846406699, }; #endif #define CCPU_EXP_MAX 110 /* * This function is analogical to the getpcpu() function in the ps(1) command. * They should both calculate in the same way so that the racct %cpu * calculations are consistent with the values showed by the ps(1) tool. * The calculations are more complex in the 4BSD scheduler because of the value * of the ccpu variable. In ULE it is defined to be zero which saves us some * work. */ static uint64_t racct_getpcpu(struct proc *p, u_int pcpu) { u_int swtime; #ifdef SCHED_4BSD fixpt_t pctcpu, pctcpu_next; #endif #ifdef SMP struct pcpu *pc; int found; #endif fixpt_t p_pctcpu; struct thread *td; ASSERT_RACCT_ENABLED(); /* * If the process is swapped out, we count its %cpu usage as zero. * This behaviour is consistent with the userland ps(1) tool. */ if ((p->p_flag & P_INMEM) == 0) return (0); swtime = (ticks - p->p_swtick) / hz; /* * For short-lived processes, the sched_pctcpu() returns small * values even for cpu intensive processes. Therefore we use * our own estimate in this case. */ if (swtime < RACCT_PCPU_SECS) return (pcpu); p_pctcpu = 0; FOREACH_THREAD_IN_PROC(p, td) { if (td == PCPU_GET(idlethread)) continue; #ifdef SMP found = 0; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (td == pc->pc_idlethread) { found = 1; break; } } if (found) continue; #endif thread_lock(td); #ifdef SCHED_4BSD pctcpu = sched_pctcpu(td); /* Count also the yet unfinished second. */ pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT; pctcpu_next += sched_pctcpu_delta(td); p_pctcpu += max(pctcpu, pctcpu_next); #else /* * In ULE the %cpu statistics are updated on every * sched_pctcpu() call. So special calculations to * account for the latest (unfinished) second are * not needed. */ p_pctcpu += sched_pctcpu(td); #endif thread_unlock(td); } #ifdef SCHED_4BSD if (swtime <= CCPU_EXP_MAX) return ((100 * (uint64_t)p_pctcpu * 1000000) / (FSCALE - ccpu_exp[swtime])); #endif return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE); } static void racct_add_racct(struct racct *dest, const struct racct *src) { int i; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * Update resource usage in dest. */ for (i = 0; i <= RACCT_MAX; i++) { KASSERT(dest->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: dest < 0", __func__, i)); KASSERT(src->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: src < 0", __func__, i)); dest->r_resources[i] += src->r_resources[i]; } } static void racct_sub_racct(struct racct *dest, const struct racct *src) { int i; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * Update resource usage in dest. */ for (i = 0; i <= RACCT_MAX; i++) { if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) { KASSERT(dest->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: dest < 0", __func__, i)); KASSERT(src->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: src < 0", __func__, i)); KASSERT(src->r_resources[i] <= dest->r_resources[i], ("%s: resource %d propagation meltdown: src > dest", __func__, i)); } if (RACCT_CAN_DROP(i)) { dest->r_resources[i] -= src->r_resources[i]; if (dest->r_resources[i] < 0) dest->r_resources[i] = 0; } } } void racct_create(struct racct **racctp) { if (!racct_enable) return; SDT_PROBE1(racct, , racct, create, racctp); KASSERT(*racctp == NULL, ("racct already allocated")); *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO); } static void racct_destroy_locked(struct racct **racctp) { struct racct *racct; int i; ASSERT_RACCT_ENABLED(); SDT_PROBE1(racct, , racct, destroy, racctp); RACCT_LOCK_ASSERT(); KASSERT(racctp != NULL, ("NULL racctp")); KASSERT(*racctp != NULL, ("NULL racct")); racct = *racctp; for (i = 0; i <= RACCT_MAX; i++) { if (RACCT_IS_SLOPPY(i)) continue; if (!RACCT_IS_RECLAIMABLE(i)) continue; KASSERT(racct->r_resources[i] == 0, ("destroying non-empty racct: " "%ju allocated for resource %d\n", racct->r_resources[i], i)); } uma_zfree(racct_zone, racct); *racctp = NULL; } void racct_destroy(struct racct **racct) { if (!racct_enable) return; RACCT_LOCK(); racct_destroy_locked(racct); RACCT_UNLOCK(); } /* * Increase consumption of 'resource' by 'amount' for 'racct', * but not its parents. Differently from other cases, 'amount' here * may be less than zero. */ static void racct_adjust_resource(struct racct *racct, int resource, int64_t amount) { ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); KASSERT(racct != NULL, ("NULL racct")); racct->r_resources[resource] += amount; if (racct->r_resources[resource] < 0) { KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource), ("%s: resource %d usage < 0", __func__, resource)); racct->r_resources[resource] = 0; } /* * There are some cases where the racct %cpu resource would grow * beyond 100% per core. For example in racct_proc_exit() we add * the process %cpu usage to the ucred racct containers. If too * many processes terminated in a short time span, the ucred %cpu * resource could grow too much. Also, the 4BSD scheduler sometimes * returns for a thread more than 100% cpu usage. So we set a sane * boundary here to 100% * the maximum number of CPUs. */ if ((resource == RACCT_PCTCPU) && (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU)) racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU; } static int racct_add_locked(struct proc *p, int resource, uint64_t amount, int force) { #ifdef RCTL int error; #endif ASSERT_RACCT_ENABLED(); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); #ifdef RCTL error = rctl_enforce(p, resource, amount); if (error && !force && RACCT_IS_DENIABLE(resource)) { SDT_PROBE3(racct, , rusage, add__failure, p, resource, amount); return (error); } #endif racct_adjust_resource(p->p_racct, resource, amount); racct_add_cred_locked(p->p_ucred, resource, amount); return (0); } /* * Increase allocation of 'resource' by 'amount' for process 'p'. * Return 0 if it's below limits, or errno, if it's not. */ int racct_add(struct proc *p, int resource, uint64_t amount) { int error; if (!racct_enable) return (0); SDT_PROBE3(racct, , rusage, add, p, resource, amount); RACCT_LOCK(); error = racct_add_locked(p, resource, amount, 0); RACCT_UNLOCK(); return (error); } /* * Increase allocation of 'resource' by 'amount' for process 'p'. * Doesn't check for limits and never fails. */ void racct_add_force(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, add__force, p, resource, amount); RACCT_LOCK(); racct_add_locked(p, resource, amount, 1); RACCT_UNLOCK(); } static void racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) { struct prison *pr; ASSERT_RACCT_ENABLED(); racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, amount); for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, amount); racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, amount); } /* * Increase allocation of 'resource' by 'amount' for credential 'cred'. * Doesn't check for limits and never fails. */ void racct_add_cred(struct ucred *cred, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, add__cred, cred, resource, amount); RACCT_LOCK(); racct_add_cred_locked(cred, resource, amount); RACCT_UNLOCK(); } /* * Account for disk IO resource consumption. Checks for limits, * but never fails, due to disk limits being undeniable. */ void racct_add_buf(struct proc *p, const struct buf *bp, int is_write) { ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); SDT_PROBE3(racct, , rusage, add__buf, p, bp, is_write); RACCT_LOCK(); if (is_write) { racct_add_locked(curproc, RACCT_WRITEBPS, bp->b_bcount, 1); racct_add_locked(curproc, RACCT_WRITEIOPS, 1, 1); } else { racct_add_locked(curproc, RACCT_READBPS, bp->b_bcount, 1); racct_add_locked(curproc, RACCT_READIOPS, 1, 1); } RACCT_UNLOCK(); } static int racct_set_locked(struct proc *p, int resource, uint64_t amount, int force) { int64_t old_amount, decayed_amount, diff_proc, diff_cred; #ifdef RCTL int error; #endif ASSERT_RACCT_ENABLED(); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); old_amount = p->p_racct->r_resources[resource]; /* * The diffs may be negative. */ diff_proc = amount - old_amount; if (resource == RACCT_PCTCPU) { /* * Resources in per-credential racct containers may decay. * If this is the case, we need to calculate the difference * between the new amount and the proportional value of the * old amount that has decayed in the ucred racct containers. */ decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; diff_cred = amount - decayed_amount; } else diff_cred = diff_proc; #ifdef notyet KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource), ("%s: usage of non-droppable resource %d dropping", __func__, resource)); #endif #ifdef RCTL if (diff_proc > 0) { error = rctl_enforce(p, resource, diff_proc); if (error && !force && RACCT_IS_DENIABLE(resource)) { SDT_PROBE3(racct, , rusage, set__failure, p, resource, amount); return (error); } } #endif racct_adjust_resource(p->p_racct, resource, diff_proc); if (diff_cred > 0) racct_add_cred_locked(p->p_ucred, resource, diff_cred); else if (diff_cred < 0) racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); return (0); } /* * Set allocation of 'resource' to 'amount' for process 'p'. * Return 0 if it's below limits, or errno, if it's not. * * Note that decreasing the allocation always returns 0, * even if it's above the limit. */ int racct_set_unlocked(struct proc *p, int resource, uint64_t amount) { int error; ASSERT_RACCT_ENABLED(); PROC_LOCK(p); error = racct_set(p, resource, amount); PROC_UNLOCK(p); return (error); } int racct_set(struct proc *p, int resource, uint64_t amount) { int error; if (!racct_enable) return (0); SDT_PROBE3(racct, , rusage, set__force, p, resource, amount); RACCT_LOCK(); error = racct_set_locked(p, resource, amount, 0); RACCT_UNLOCK(); return (error); } void racct_set_force(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, set, p, resource, amount); RACCT_LOCK(); racct_set_locked(p, resource, amount, 1); RACCT_UNLOCK(); } /* * Returns amount of 'resource' the process 'p' can keep allocated. * Allocating more than that would be denied, unless the resource * is marked undeniable. Amount of already allocated resource does * not matter. */ uint64_t racct_get_limit(struct proc *p, int resource) { #ifdef RCTL uint64_t available; if (!racct_enable) return (UINT64_MAX); RACCT_LOCK(); available = rctl_get_limit(p, resource); RACCT_UNLOCK(); return (available); #else return (UINT64_MAX); #endif } /* * Returns amount of 'resource' the process 'p' can keep allocated. * Allocating more than that would be denied, unless the resource * is marked undeniable. Amount of already allocated resource does * matter. */ uint64_t racct_get_available(struct proc *p, int resource) { #ifdef RCTL uint64_t available; if (!racct_enable) return (UINT64_MAX); RACCT_LOCK(); available = rctl_get_available(p, resource); RACCT_UNLOCK(); return (available); #else return (UINT64_MAX); #endif } /* * Returns amount of the %cpu resource that process 'p' can add to its %cpu * utilization. Adding more than that would lead to the process being * throttled. */ static int64_t racct_pcpu_available(struct proc *p) { #ifdef RCTL uint64_t available; ASSERT_RACCT_ENABLED(); RACCT_LOCK(); available = rctl_pcpu_available(p); RACCT_UNLOCK(); return (available); #else return (INT64_MAX); #endif } /* * Decrease allocation of 'resource' by 'amount' for process 'p'. */ void racct_sub(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, sub, p, resource, amount); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(RACCT_CAN_DROP(resource), ("%s: called for non-droppable resource %d", __func__, resource)); RACCT_LOCK(); KASSERT(amount <= p->p_racct->r_resources[resource], ("%s: freeing %ju of resource %d, which is more " "than allocated %jd for %s (pid %d)", __func__, amount, resource, (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid)); racct_adjust_resource(p->p_racct, resource, -amount); racct_sub_cred_locked(p->p_ucred, resource, amount); RACCT_UNLOCK(); } static void racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) { struct prison *pr; ASSERT_RACCT_ENABLED(); racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, -amount); for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, -amount); racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, -amount); } /* * Decrease allocation of 'resource' by 'amount' for credential 'cred'. */ void racct_sub_cred(struct ucred *cred, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, sub__cred, cred, resource, amount); #ifdef notyet KASSERT(RACCT_CAN_DROP(resource), ("%s: called for resource %d which can not drop", __func__, resource)); #endif RACCT_LOCK(); racct_sub_cred_locked(cred, resource, amount); RACCT_UNLOCK(); } /* * Inherit resource usage information from the parent process. */ int racct_proc_fork(struct proc *parent, struct proc *child) { int i, error = 0; if (!racct_enable) return (0); /* * Create racct for the child process. */ racct_create(&child->p_racct); PROC_LOCK(parent); PROC_LOCK(child); RACCT_LOCK(); #ifdef RCTL error = rctl_proc_fork(parent, child); if (error != 0) goto out; #endif /* Init process cpu time. */ child->p_prev_runtime = 0; child->p_throttled = 0; /* * Inherit resource usage. */ for (i = 0; i <= RACCT_MAX; i++) { if (parent->p_racct->r_resources[i] == 0 || !RACCT_IS_INHERITABLE(i)) continue; error = racct_set_locked(child, i, parent->p_racct->r_resources[i], 0); if (error != 0) goto out; } error = racct_add_locked(child, RACCT_NPROC, 1, 0); error += racct_add_locked(child, RACCT_NTHR, 1, 0); out: RACCT_UNLOCK(); PROC_UNLOCK(child); PROC_UNLOCK(parent); if (error != 0) racct_proc_exit(child); return (error); } /* * Called at the end of fork1(), to handle rules that require the process * to be fully initialized. */ void racct_proc_fork_done(struct proc *child) { if (!racct_enable) return; #ifdef RCTL PROC_LOCK(child); RACCT_LOCK(); rctl_enforce(child, RACCT_NPROC, 0); rctl_enforce(child, RACCT_NTHR, 0); RACCT_UNLOCK(); PROC_UNLOCK(child); #endif } void racct_proc_exit(struct proc *p) { struct timeval wallclock; uint64_t pct_estimate, pct, runtime; int i; if (!racct_enable) return; PROC_LOCK(p); /* * We don't need to calculate rux, proc_reap() has already done this. */ runtime = cputick2usec(p->p_rux.rux_runtime); #ifdef notyet KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); #else if (runtime < p->p_prev_runtime) runtime = p->p_prev_runtime; #endif microuptime(&wallclock); timevalsub(&wallclock, &p->p_stats->p_start); if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { pct_estimate = (1000000 * runtime * 100) / ((uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec); } else pct_estimate = 0; pct = racct_getpcpu(p, pct_estimate); RACCT_LOCK(); racct_set_locked(p, RACCT_CPU, runtime, 0); racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct); KASSERT(p->p_racct->r_resources[RACCT_RSS] == 0, ("process reaped with %ju allocated for RSS\n", p->p_racct->r_resources[RACCT_RSS])); for (i = 0; i <= RACCT_MAX; i++) { if (p->p_racct->r_resources[i] == 0) continue; if (!RACCT_IS_RECLAIMABLE(i)) continue; racct_set_locked(p, i, 0, 0); } #ifdef RCTL rctl_racct_release(p->p_racct); #endif racct_destroy_locked(&p->p_racct); RACCT_UNLOCK(); PROC_UNLOCK(p); } /* * Called after credentials change, to move resource utilisation * between raccts. */ void racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred, struct ucred *newcred) { struct uidinfo *olduip, *newuip; struct loginclass *oldlc, *newlc; struct prison *oldpr, *newpr, *pr; if (!racct_enable) return; PROC_LOCK_ASSERT(p, MA_OWNED); newuip = newcred->cr_ruidinfo; olduip = oldcred->cr_ruidinfo; newlc = newcred->cr_loginclass; oldlc = oldcred->cr_loginclass; newpr = newcred->cr_prison; oldpr = oldcred->cr_prison; RACCT_LOCK(); if (newuip != olduip) { racct_sub_racct(olduip->ui_racct, p->p_racct); racct_add_racct(newuip->ui_racct, p->p_racct); } if (newlc != oldlc) { racct_sub_racct(oldlc->lc_racct, p->p_racct); racct_add_racct(newlc->lc_racct, p->p_racct); } if (newpr != oldpr) { for (pr = oldpr; pr != NULL; pr = pr->pr_parent) racct_sub_racct(pr->pr_prison_racct->prr_racct, p->p_racct); for (pr = newpr; pr != NULL; pr = pr->pr_parent) racct_add_racct(pr->pr_prison_racct->prr_racct, p->p_racct); } RACCT_UNLOCK(); } void racct_move(struct racct *dest, struct racct *src) { ASSERT_RACCT_ENABLED(); RACCT_LOCK(); racct_add_racct(dest, src); racct_sub_racct(src, src); RACCT_UNLOCK(); } void racct_proc_throttled(struct proc *p) { ASSERT_RACCT_ENABLED(); PROC_LOCK(p); while (p->p_throttled != 0) { msleep(p->p_racct, &p->p_mtx, 0, "racct", p->p_throttled < 0 ? 0 : p->p_throttled); if (p->p_throttled > 0) p->p_throttled = 0; } PROC_UNLOCK(p); } /* * Make the process sleep in userret() for 'timeout' ticks. Setting * timeout to -1 makes it sleep until woken up by racct_proc_wakeup(). */ void racct_proc_throttle(struct proc *p, int timeout) { struct thread *td; #ifdef SMP int cpuid; #endif KASSERT(timeout != 0, ("timeout %d", timeout)); ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); /* * Do not block kernel processes. Also do not block processes with * low %cpu utilization to improve interactivity. */ if ((p->p_flag & (P_SYSTEM | P_KPROC)) != 0) return; if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout)) return; p->p_throttled = timeout; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); td->td_flags |= TDF_ASTPENDING; switch (td->td_state) { case TDS_RUNQ: /* * If the thread is on the scheduler run-queue, we can * not just remove it from there. So we set the flag * TDF_NEEDRESCHED for the thread, so that once it is * running, it is taken off the cpu as soon as possible. */ td->td_flags |= TDF_NEEDRESCHED; break; case TDS_RUNNING: /* * If the thread is running, we request a context * switch for it by setting the TDF_NEEDRESCHED flag. */ td->td_flags |= TDF_NEEDRESCHED; #ifdef SMP cpuid = td->td_oncpu; if ((cpuid != NOCPU) && (td != curthread)) ipi_cpu(cpuid, IPI_AST); #endif break; default: break; } thread_unlock(td); } } static void racct_proc_wakeup(struct proc *p) { ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_throttled != 0) { p->p_throttled = 0; wakeup(p->p_racct); } } static void racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2) { int64_t r_old, r_new; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); #ifdef RCTL rctl_throttle_decay(racct, RACCT_READBPS); rctl_throttle_decay(racct, RACCT_WRITEBPS); rctl_throttle_decay(racct, RACCT_READIOPS); rctl_throttle_decay(racct, RACCT_WRITEIOPS); #endif r_old = racct->r_resources[RACCT_PCTCPU]; /* If there is nothing to decay, just exit. */ if (r_old <= 0) return; r_new = r_old * RACCT_DECAY_FACTOR / FSCALE; racct->r_resources[RACCT_PCTCPU] = r_new; } static void racct_decay_pre(void) { RACCT_LOCK(); } static void racct_decay_post(void) { RACCT_UNLOCK(); } static void racct_decay(void) { ASSERT_RACCT_ENABLED(); ui_racct_foreach(racct_decay_callback, racct_decay_pre, racct_decay_post, NULL, NULL); loginclass_racct_foreach(racct_decay_callback, racct_decay_pre, racct_decay_post, NULL, NULL); prison_racct_foreach(racct_decay_callback, racct_decay_pre, racct_decay_post, NULL, NULL); } static void racctd(void) { struct thread *td; struct proc *p; struct timeval wallclock; uint64_t pct, pct_estimate, runtime; ASSERT_RACCT_ENABLED(); for (;;) { racct_decay(); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NORMAL) { if (p->p_state == PRS_ZOMBIE) racct_set(p, RACCT_PCTCPU, 0); PROC_UNLOCK(p); continue; } microuptime(&wallclock); timevalsub(&wallclock, &p->p_stats->p_start); PROC_STATLOCK(p); FOREACH_THREAD_IN_PROC(p, td) ruxagg(p, td); runtime = cputick2usec(p->p_rux.rux_runtime); PROC_STATUNLOCK(p); #ifdef notyet KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); #else if (runtime < p->p_prev_runtime) runtime = p->p_prev_runtime; #endif p->p_prev_runtime = runtime; if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { pct_estimate = (1000000 * runtime * 100) / ((uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec); } else pct_estimate = 0; pct = racct_getpcpu(p, pct_estimate); RACCT_LOCK(); #ifdef RCTL rctl_throttle_decay(p->p_racct, RACCT_READBPS); rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS); rctl_throttle_decay(p->p_racct, RACCT_READIOPS); rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS); #endif racct_set_locked(p, RACCT_PCTCPU, pct, 1); racct_set_locked(p, RACCT_CPU, runtime, 0); racct_set_locked(p, RACCT_WALLCLOCK, (uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec, 0); RACCT_UNLOCK(); PROC_UNLOCK(p); } /* * To ensure that processes are throttled in a fair way, we need * to iterate over all processes again and check the limits * for %cpu resource only after ucred racct containers have been * properly filled. */ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NORMAL) { PROC_UNLOCK(p); continue; } if (racct_pcpu_available(p) <= 0) { if (p->p_racct->r_resources[RACCT_PCTCPU] > pcpu_threshold) racct_proc_throttle(p, -1); } else if (p->p_throttled == -1) { racct_proc_wakeup(p); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); pause("-", hz); } } static struct kproc_desc racctd_kp = { "racctd", racctd, NULL }; static void racctd_init(void) { if (!racct_enable) return; kproc_start(&racctd_kp); } SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL); static void racct_init(void) { if (!racct_enable) return; racct_zone = uma_zcreate("racct", sizeof(struct racct), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * XXX: Move this somewhere. */ prison0.pr_prison_racct = prison_racct_find("0"); } SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL); #endif /* !RACCT */ diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c index 90e2f7832691..f4356f802f0c 100644 --- a/sys/kern/kern_rctl.c +++ b/sys/kern/kern_rctl.c @@ -1,2247 +1,2246 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 The FreeBSD Foundation * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #ifdef RCTL #ifndef RACCT #error "The RCTL option requires the RACCT option" #endif FEATURE(rctl, "Resource Limits"); #define HRF_DEFAULT 0 #define HRF_DONT_INHERIT 1 #define HRF_DONT_ACCUMULATE 2 #define RCTL_MAX_INBUFSIZE 4 * 1024 #define RCTL_MAX_OUTBUFSIZE 16 * 1024 * 1024 #define RCTL_LOG_BUFSIZE 128 #define RCTL_PCPU_SHIFT (10 * 1000000) static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE; static int rctl_log_rate_limit = 10; static int rctl_devctl_rate_limit = 10; /* * Values below are initialized in rctl_init(). */ static int rctl_throttle_min = -1; static int rctl_throttle_max = -1; static int rctl_throttle_pct = -1; static int rctl_throttle_pct2 = -1; static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Resource Limits"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN, &rctl_maxbufsize, 0, "Maximum output buffer size"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW, &rctl_log_rate_limit, 0, "Maximum number of log messages per second"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN, &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second"); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, &rctl_throttle_min_sysctl, "IU", "Shortest throttling duration, in hz"); TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, &rctl_throttle_max_sysctl, "IU", "Longest throttling duration, in hz"); TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, &rctl_throttle_pct_sysctl, "IU", "Throttling penalty for process consumption, in percent"); TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, &rctl_throttle_pct2_sysctl, "IU", "Throttling penalty for container consumption, in percent"); TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2); /* * 'rctl_rule_link' connects a rule with every racct it's related to. * For example, rule 'user:X:openfiles:deny=N/process' is linked * with uidinfo for user X, and to each process of that user. */ struct rctl_rule_link { LIST_ENTRY(rctl_rule_link) rrl_next; struct rctl_rule *rrl_rule; int rrl_exceeded; }; struct dict { const char *d_name; int d_value; }; static struct dict subjectnames[] = { { "process", RCTL_SUBJECT_TYPE_PROCESS }, { "user", RCTL_SUBJECT_TYPE_USER }, { "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS }, { "jail", RCTL_SUBJECT_TYPE_JAIL }, { NULL, -1 }}; static struct dict resourcenames[] = { { "cputime", RACCT_CPU }, { "datasize", RACCT_DATA }, { "stacksize", RACCT_STACK }, { "coredumpsize", RACCT_CORE }, { "memoryuse", RACCT_RSS }, { "memorylocked", RACCT_MEMLOCK }, { "maxproc", RACCT_NPROC }, { "openfiles", RACCT_NOFILE }, { "vmemoryuse", RACCT_VMEM }, { "pseudoterminals", RACCT_NPTS }, { "swapuse", RACCT_SWAP }, { "nthr", RACCT_NTHR }, { "msgqqueued", RACCT_MSGQQUEUED }, { "msgqsize", RACCT_MSGQSIZE }, { "nmsgq", RACCT_NMSGQ }, { "nsem", RACCT_NSEM }, { "nsemop", RACCT_NSEMOP }, { "nshm", RACCT_NSHM }, { "shmsize", RACCT_SHMSIZE }, { "wallclock", RACCT_WALLCLOCK }, { "pcpu", RACCT_PCTCPU }, { "readbps", RACCT_READBPS }, { "writebps", RACCT_WRITEBPS }, { "readiops", RACCT_READIOPS }, { "writeiops", RACCT_WRITEIOPS }, { NULL, -1 }}; static struct dict actionnames[] = { { "sighup", RCTL_ACTION_SIGHUP }, { "sigint", RCTL_ACTION_SIGINT }, { "sigquit", RCTL_ACTION_SIGQUIT }, { "sigill", RCTL_ACTION_SIGILL }, { "sigtrap", RCTL_ACTION_SIGTRAP }, { "sigabrt", RCTL_ACTION_SIGABRT }, { "sigemt", RCTL_ACTION_SIGEMT }, { "sigfpe", RCTL_ACTION_SIGFPE }, { "sigkill", RCTL_ACTION_SIGKILL }, { "sigbus", RCTL_ACTION_SIGBUS }, { "sigsegv", RCTL_ACTION_SIGSEGV }, { "sigsys", RCTL_ACTION_SIGSYS }, { "sigpipe", RCTL_ACTION_SIGPIPE }, { "sigalrm", RCTL_ACTION_SIGALRM }, { "sigterm", RCTL_ACTION_SIGTERM }, { "sigurg", RCTL_ACTION_SIGURG }, { "sigstop", RCTL_ACTION_SIGSTOP }, { "sigtstp", RCTL_ACTION_SIGTSTP }, { "sigchld", RCTL_ACTION_SIGCHLD }, { "sigttin", RCTL_ACTION_SIGTTIN }, { "sigttou", RCTL_ACTION_SIGTTOU }, { "sigio", RCTL_ACTION_SIGIO }, { "sigxcpu", RCTL_ACTION_SIGXCPU }, { "sigxfsz", RCTL_ACTION_SIGXFSZ }, { "sigvtalrm", RCTL_ACTION_SIGVTALRM }, { "sigprof", RCTL_ACTION_SIGPROF }, { "sigwinch", RCTL_ACTION_SIGWINCH }, { "siginfo", RCTL_ACTION_SIGINFO }, { "sigusr1", RCTL_ACTION_SIGUSR1 }, { "sigusr2", RCTL_ACTION_SIGUSR2 }, { "sigthr", RCTL_ACTION_SIGTHR }, { "deny", RCTL_ACTION_DENY }, { "log", RCTL_ACTION_LOG }, { "devctl", RCTL_ACTION_DEVCTL }, { "throttle", RCTL_ACTION_THROTTLE }, { NULL, -1 }}; static void rctl_init(void); SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL); static uma_zone_t rctl_rule_zone; static uma_zone_t rctl_rule_link_zone; static int rctl_rule_fully_specified(const struct rctl_rule *rule); static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule); static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits"); static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_min; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 1 || val > rctl_throttle_max) return (EINVAL); RACCT_LOCK(); rctl_throttle_min = val; RACCT_UNLOCK(); return (0); } static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_max; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < rctl_throttle_min) return (EINVAL); RACCT_LOCK(); rctl_throttle_max = val; RACCT_UNLOCK(); return (0); } static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_pct; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 0) return (EINVAL); RACCT_LOCK(); rctl_throttle_pct = val; RACCT_UNLOCK(); return (0); } static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_pct2; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 0) return (EINVAL); RACCT_LOCK(); rctl_throttle_pct2 = val; RACCT_UNLOCK(); return (0); } static const char * rctl_subject_type_name(int subject) { int i; for (i = 0; subjectnames[i].d_name != NULL; i++) { if (subjectnames[i].d_value == subject) return (subjectnames[i].d_name); } panic("rctl_subject_type_name: unknown subject type %d", subject); } static const char * rctl_action_name(int action) { int i; for (i = 0; actionnames[i].d_name != NULL; i++) { if (actionnames[i].d_value == action) return (actionnames[i].d_name); } panic("rctl_action_name: unknown action %d", action); } const char * rctl_resource_name(int resource) { int i; for (i = 0; resourcenames[i].d_name != NULL; i++) { if (resourcenames[i].d_value == resource) return (resourcenames[i].d_name); } panic("rctl_resource_name: unknown resource %d", resource); } static struct racct * rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule) { struct ucred *cred = p->p_ucred; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); switch (rule->rr_per) { case RCTL_SUBJECT_TYPE_PROCESS: return (p->p_racct); case RCTL_SUBJECT_TYPE_USER: return (cred->cr_ruidinfo->ui_racct); case RCTL_SUBJECT_TYPE_LOGINCLASS: return (cred->cr_loginclass->lc_racct); case RCTL_SUBJECT_TYPE_JAIL: return (cred->cr_prison->pr_prison_racct->prr_racct); default: panic("%s: unknown per %d", __func__, rule->rr_per); } } /* * Return the amount of resource that can be allocated by 'p' before * hitting 'rule'. */ static int64_t rctl_available_resource(const struct proc *p, const struct rctl_rule *rule) { const struct racct *racct; int64_t available; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); racct = rctl_proc_rule_to_racct(p, rule); available = rule->rr_amount - racct->r_resources[rule->rr_resource]; return (available); } /* * Called every second for proc, uidinfo, loginclass, and jail containers. * If the limit isn't exceeded, it decreases the usage amount to zero. * Otherwise, it decreases it by the value of the limit. This way * resource consumption exceeding the limit "carries over" to the next * period. */ void rctl_throttle_decay(struct racct *racct, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t minavailable; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); minavailable = INT64_MAX; LIST_FOREACH(link, &racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_THROTTLE) continue; if (rule->rr_amount < minavailable) minavailable = rule->rr_amount; } if (racct->r_resources[resource] < minavailable) { racct->r_resources[resource] = 0; } else { /* * Cap utilization counter at ten times the limit. Otherwise, * if we changed the rule lowering the allowed amount, it could * take unreasonably long time for the accumulated resource * usage to drop. */ if (racct->r_resources[resource] > minavailable * 10) racct->r_resources[resource] = minavailable * 10; racct->r_resources[resource] -= minavailable; } } /* * Special version of rctl_get_available() for the %CPU resource. * We slightly cheat here and return less than we normally would. */ int64_t rctl_pcpu_available(const struct proc *p) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t available, minavailable, limit; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); minavailable = INT64_MAX; limit = 0; LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != RACCT_PCTCPU) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; available = rctl_available_resource(p, rule); if (available < minavailable) { minavailable = available; limit = rule->rr_amount; } } /* * Return slightly less than actual value of the available * %cpu resource. This makes %cpu throttling more aggressive * and lets us act sooner than the limits are already exceeded. */ if (limit != 0) { if (limit > 2 * RCTL_PCPU_SHIFT) minavailable -= RCTL_PCPU_SHIFT; else minavailable -= (limit / 2); } return (minavailable); } static uint64_t xadd(uint64_t a, uint64_t b) { uint64_t c; c = a + b; /* * Detect overflow. */ if (c < a || c < b) return (UINT64_MAX); return (c); } static uint64_t xmul(uint64_t a, uint64_t b) { if (b != 0 && a > UINT64_MAX / b) return (UINT64_MAX); return (a * b); } /* * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition * to what it keeps allocated now. Returns non-zero if the allocation should * be denied, 0 otherwise. */ int rctl_enforce(struct proc *p, int resource, uint64_t amount) { static struct timeval log_lasttime, devctl_lasttime; static int log_curtime = 0, devctl_curtime = 0; struct rctl_rule *rule; struct rctl_rule_link *link; struct sbuf sb; char *buf; int64_t available; uint64_t sleep_ms, sleep_ratio; int should_deny = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; available = rctl_available_resource(p, rule); if (available >= (int64_t)amount) { link->rrl_exceeded = 0; continue; } switch (rule->rr_action) { case RCTL_ACTION_DENY: should_deny = 1; continue; case RCTL_ACTION_LOG: /* * If rrl_exceeded != 0, it means we've already * logged a warning for this process. */ if (link->rrl_exceeded != 0) continue; /* * If the process state is not fully initialized yet, * we can't access most of the required fields, e.g. * p->p_comm. This happens when called from fork1(). * Ignore this rule for now; it will be processed just * after fork, when called from racct_proc_fork_done(). */ if (p->p_state != PRS_NORMAL) continue; if (!ppsratecheck(&log_lasttime, &log_curtime, rctl_log_rate_limit)) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); continue; } sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN); rctl_rule_to_sbuf(&sb, rule); sbuf_finish(&sb); printf("rctl: rule \"%s\" matched by pid %d " "(%s), uid %d, jail %s\n", sbuf_data(&sb), p->p_pid, p->p_comm, p->p_ucred->cr_uid, p->p_ucred->cr_prison->pr_prison_racct->prr_name); sbuf_delete(&sb); free(buf, M_RCTL); link->rrl_exceeded = 1; continue; case RCTL_ACTION_DEVCTL: if (link->rrl_exceeded != 0) continue; if (p->p_state != PRS_NORMAL) continue; if (!ppsratecheck(&devctl_lasttime, &devctl_curtime, rctl_devctl_rate_limit)) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); continue; } sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN); sbuf_printf(&sb, "rule="); rctl_rule_to_sbuf(&sb, rule); sbuf_printf(&sb, " pid=%d ruid=%d jail=%s", p->p_pid, p->p_ucred->cr_ruid, p->p_ucred->cr_prison->pr_prison_racct->prr_name); sbuf_finish(&sb); devctl_notify("RCTL", "rule", "matched", sbuf_data(&sb)); sbuf_delete(&sb); free(buf, M_RCTL); link->rrl_exceeded = 1; continue; case RCTL_ACTION_THROTTLE: if (p->p_state != PRS_NORMAL) continue; if (rule->rr_amount == 0) { racct_proc_throttle(p, rctl_throttle_max); continue; } /* * Make the process sleep for a fraction of second * proportional to the ratio of process' resource * utilization compared to the limit. The point is * to penalize resource hogs: processes that consume * more of the available resources sleep for longer. * * We're trying to defer division until the very end, * to minimize the rounding effects. The following * calculation could have been written in a clearer * way like this: * * sleep_ms = hz * p->p_racct->r_resources[resource] / * rule->rr_amount; * sleep_ms *= rctl_throttle_pct / 100; * if (sleep_ms < rctl_throttle_min) * sleep_ms = rctl_throttle_min; * */ sleep_ms = xmul(hz, p->p_racct->r_resources[resource]); sleep_ms = xmul(sleep_ms, rctl_throttle_pct) / 100; if (sleep_ms < rctl_throttle_min * rule->rr_amount) sleep_ms = rctl_throttle_min * rule->rr_amount; /* * Multiply that by the ratio of the resource * consumption for the container compared to the limit, * squared. In other words, a process in a container * that is two times over the limit will be throttled * four times as much for hitting the same rule. The * point is to penalize processes more if the container * itself (eg certain UID or jail) is above the limit. */ if (available < 0) sleep_ratio = -available / rule->rr_amount; else sleep_ratio = 0; sleep_ratio = xmul(sleep_ratio, sleep_ratio); sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100; sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio)); /* * Finally the division. */ sleep_ms /= rule->rr_amount; if (sleep_ms > rctl_throttle_max) sleep_ms = rctl_throttle_max; #if 0 printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n", __func__, p->p_pid, p->p_comm, p->p_racct->r_resources[resource], rule->rr_amount, (uintmax_t)sleep_ms, (uintmax_t)sleep_ratio, (intmax_t)available); #endif KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n", __func__, (uintmax_t)sleep_ms, rctl_throttle_min)); racct_proc_throttle(p, sleep_ms); continue; default: if (link->rrl_exceeded != 0) continue; if (p->p_state != PRS_NORMAL) continue; KASSERT(rule->rr_action > 0 && rule->rr_action <= RCTL_ACTION_SIGNAL_MAX, ("rctl_enforce: unknown action %d", rule->rr_action)); /* * We're using the fact that RCTL_ACTION_SIG* values * are equal to their counterparts from sys/signal.h. */ kern_psignal(p, rule->rr_action); link->rrl_exceeded = 1; continue; } } if (should_deny) { /* * Return fake error code; the caller should change it * into one proper for the situation - EFSIZ, ENOMEM etc. */ return (EDOOFUS); } return (0); } uint64_t rctl_get_limit(struct proc *p, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; uint64_t amount = UINT64_MAX; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; if (rule->rr_amount < amount) amount = rule->rr_amount; } return (amount); } uint64_t rctl_get_available(struct proc *p, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t available, minavailable, allocated; minavailable = INT64_MAX; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; available = rctl_available_resource(p, rule); if (available < minavailable) minavailable = available; } /* * XXX: Think about this _hard_. */ allocated = p->p_racct->r_resources[resource]; if (minavailable < INT64_MAX - allocated) minavailable += allocated; if (minavailable < 0) minavailable = 0; return (minavailable); } static int rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter) { ASSERT_RACCT_ENABLED(); if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) { if (rule->rr_subject_type != filter->rr_subject_type) return (0); switch (filter->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: if (filter->rr_subject.rs_proc != NULL && rule->rr_subject.rs_proc != filter->rr_subject.rs_proc) return (0); break; case RCTL_SUBJECT_TYPE_USER: if (filter->rr_subject.rs_uip != NULL && rule->rr_subject.rs_uip != filter->rr_subject.rs_uip) return (0); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (filter->rr_subject.rs_loginclass != NULL && rule->rr_subject.rs_loginclass != filter->rr_subject.rs_loginclass) return (0); break; case RCTL_SUBJECT_TYPE_JAIL: if (filter->rr_subject.rs_prison_racct != NULL && rule->rr_subject.rs_prison_racct != filter->rr_subject.rs_prison_racct) return (0); break; default: panic("rctl_rule_matches: unknown subject type %d", filter->rr_subject_type); } } if (filter->rr_resource != RACCT_UNDEFINED) { if (rule->rr_resource != filter->rr_resource) return (0); } if (filter->rr_action != RCTL_ACTION_UNDEFINED) { if (rule->rr_action != filter->rr_action) return (0); } if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) { if (rule->rr_amount != filter->rr_amount) return (0); } if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) { if (rule->rr_per != filter->rr_per) return (0); } return (1); } static int str2value(const char *str, int *value, struct dict *table) { int i; if (value == NULL) return (EINVAL); for (i = 0; table[i].d_name != NULL; i++) { if (strcasecmp(table[i].d_name, str) == 0) { *value = table[i].d_value; return (0); } } return (EINVAL); } static int str2id(const char *str, id_t *value) { char *end; if (str == NULL) return (EINVAL); *value = strtoul(str, &end, 10); if ((size_t)(end - str) != strlen(str)) return (EINVAL); return (0); } static int str2int64(const char *str, int64_t *value) { char *end; if (str == NULL) return (EINVAL); *value = strtoul(str, &end, 10); if ((size_t)(end - str) != strlen(str)) return (EINVAL); if (*value < 0) return (ERANGE); return (0); } /* * Connect the rule to the racct, increasing refcount for the rule. */ static void rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); rctl_rule_acquire(rule); link = uma_zalloc(rctl_rule_link_zone, M_WAITOK); link->rrl_rule = rule; link->rrl_exceeded = 0; RACCT_LOCK(); LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next); RACCT_UNLOCK(); } static int rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); RACCT_LOCK_ASSERT(); link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT); if (link == NULL) return (ENOMEM); rctl_rule_acquire(rule); link->rrl_rule = rule; link->rrl_exceeded = 0; LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next); return (0); } /* * Remove limits for a rules matching the filter and release * the refcounts for the rules, possibly freeing them. Returns * the number of limit structures removed. */ static int rctl_racct_remove_rules(struct racct *racct, const struct rctl_rule *filter) { struct rctl_rule_link *link, *linktmp; int removed = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) { if (!rctl_rule_matches(link->rrl_rule, filter)) continue; LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); removed++; } return (removed); } static void rctl_rule_acquire_subject(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct != NULL) prison_racct_hold(rule->rr_subject.rs_prison_racct); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip != NULL) uihold(rule->rr_subject.rs_uip); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass != NULL) loginclass_hold(rule->rr_subject.rs_loginclass); break; default: panic("rctl_rule_acquire_subject: unknown subject type %d", rule->rr_subject_type); } } static void rctl_rule_release_subject(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct != NULL) prison_racct_free(rule->rr_subject.rs_prison_racct); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip != NULL) uifree(rule->rr_subject.rs_uip); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass != NULL) loginclass_free(rule->rr_subject.rs_loginclass); break; default: panic("rctl_rule_release_subject: unknown subject type %d", rule->rr_subject_type); } } struct rctl_rule * rctl_rule_alloc(int flags) { struct rctl_rule *rule; ASSERT_RACCT_ENABLED(); rule = uma_zalloc(rctl_rule_zone, flags); if (rule == NULL) return (NULL); rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; rule->rr_subject.rs_proc = NULL; rule->rr_subject.rs_uip = NULL; rule->rr_subject.rs_loginclass = NULL; rule->rr_subject.rs_prison_racct = NULL; rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; rule->rr_resource = RACCT_UNDEFINED; rule->rr_action = RCTL_ACTION_UNDEFINED; rule->rr_amount = RCTL_AMOUNT_UNDEFINED; refcount_init(&rule->rr_refcount, 1); return (rule); } struct rctl_rule * rctl_rule_duplicate(const struct rctl_rule *rule, int flags) { struct rctl_rule *copy; ASSERT_RACCT_ENABLED(); copy = uma_zalloc(rctl_rule_zone, flags); if (copy == NULL) return (NULL); copy->rr_subject_type = rule->rr_subject_type; copy->rr_subject.rs_proc = rule->rr_subject.rs_proc; copy->rr_subject.rs_uip = rule->rr_subject.rs_uip; copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass; copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct; copy->rr_per = rule->rr_per; copy->rr_resource = rule->rr_resource; copy->rr_action = rule->rr_action; copy->rr_amount = rule->rr_amount; refcount_init(©->rr_refcount, 1); rctl_rule_acquire_subject(copy); return (copy); } void rctl_rule_acquire(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); refcount_acquire(&rule->rr_refcount); } static void rctl_rule_free(void *context, int pending) { struct rctl_rule *rule; rule = (struct rctl_rule *)context; ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0")); /* * We don't need locking here; rule is guaranteed to be inaccessible. */ rctl_rule_release_subject(rule); uma_zfree(rctl_rule_zone, rule); } void rctl_rule_release(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); if (refcount_release(&rule->rr_refcount)) { /* * rctl_rule_release() is often called when iterating * over all the uidinfo structures in the system, * holding uihashtbl_lock. Since rctl_rule_free() * might end up calling uifree(), this would lead * to lock recursion. Use taskqueue to avoid this. */ TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule); taskqueue_enqueue(taskqueue_thread, &rule->rr_task); } } static int rctl_rule_fully_specified(const struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: return (0); case RCTL_SUBJECT_TYPE_PROCESS: if (rule->rr_subject.rs_proc == NULL) return (0); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip == NULL) return (0); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass == NULL) return (0); break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct == NULL) return (0); break; default: panic("rctl_rule_fully_specified: unknown subject type %d", rule->rr_subject_type); } if (rule->rr_resource == RACCT_UNDEFINED) return (0); if (rule->rr_action == RCTL_ACTION_UNDEFINED) return (0); if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED) return (0); if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED) return (0); return (1); } static int rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep) { struct rctl_rule *rule; char *subjectstr, *subject_idstr, *resourcestr, *actionstr, *amountstr, *perstr; id_t id; int error = 0; ASSERT_RACCT_ENABLED(); rule = rctl_rule_alloc(M_WAITOK); subjectstr = strsep(&rulestr, ":"); subject_idstr = strsep(&rulestr, ":"); resourcestr = strsep(&rulestr, ":"); actionstr = strsep(&rulestr, "=/"); amountstr = strsep(&rulestr, "/"); perstr = rulestr; if (subjectstr == NULL || subjectstr[0] == '\0') rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; else { error = str2value(subjectstr, &rule->rr_subject_type, subjectnames); if (error != 0) goto out; } if (subject_idstr == NULL || subject_idstr[0] == '\0') { rule->rr_subject.rs_proc = NULL; rule->rr_subject.rs_uip = NULL; rule->rr_subject.rs_loginclass = NULL; rule->rr_subject.rs_prison_racct = NULL; } else { switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: error = EINVAL; goto out; case RCTL_SUBJECT_TYPE_PROCESS: error = str2id(subject_idstr, &id); if (error != 0) goto out; sx_assert(&allproc_lock, SA_LOCKED); rule->rr_subject.rs_proc = pfind(id); if (rule->rr_subject.rs_proc == NULL) { error = ESRCH; goto out; } PROC_UNLOCK(rule->rr_subject.rs_proc); break; case RCTL_SUBJECT_TYPE_USER: error = str2id(subject_idstr, &id); if (error != 0) goto out; rule->rr_subject.rs_uip = uifind(id); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: rule->rr_subject.rs_loginclass = loginclass_find(subject_idstr); if (rule->rr_subject.rs_loginclass == NULL) { error = ENAMETOOLONG; goto out; } break; case RCTL_SUBJECT_TYPE_JAIL: rule->rr_subject.rs_prison_racct = prison_racct_find(subject_idstr); if (rule->rr_subject.rs_prison_racct == NULL) { error = ENAMETOOLONG; goto out; } break; default: panic("rctl_string_to_rule: unknown subject type %d", rule->rr_subject_type); } } if (resourcestr == NULL || resourcestr[0] == '\0') rule->rr_resource = RACCT_UNDEFINED; else { error = str2value(resourcestr, &rule->rr_resource, resourcenames); if (error != 0) goto out; } if (actionstr == NULL || actionstr[0] == '\0') rule->rr_action = RCTL_ACTION_UNDEFINED; else { error = str2value(actionstr, &rule->rr_action, actionnames); if (error != 0) goto out; } if (amountstr == NULL || amountstr[0] == '\0') rule->rr_amount = RCTL_AMOUNT_UNDEFINED; else { error = str2int64(amountstr, &rule->rr_amount); if (error != 0) goto out; if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) { if (rule->rr_amount > INT64_MAX / 1000000) { error = ERANGE; goto out; } rule->rr_amount *= 1000000; } } if (perstr == NULL || perstr[0] == '\0') rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; else { error = str2value(perstr, &rule->rr_per, subjectnames); if (error != 0) goto out; } out: if (error == 0) *rulep = rule; else rctl_rule_release(rule); return (error); } /* * Link a rule with all the subjects it applies to. */ int rctl_rule_add(struct rctl_rule *rule) { struct proc *p; struct ucred *cred; struct uidinfo *uip; struct prison *pr; struct prison_racct *prr; struct loginclass *lc; struct rctl_rule *rule2; int match; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); /* * Some rules just don't make sense, like "deny" rule for an undeniable * resource. The exception are the RSS and %CPU resources - they are * not deniable in the racct sense, but the limit is enforced in * a different way. */ if (rule->rr_action == RCTL_ACTION_DENY && !RACCT_IS_DENIABLE(rule->rr_resource) && rule->rr_resource != RACCT_RSS && rule->rr_resource != RACCT_PCTCPU) { return (EOPNOTSUPP); } if (rule->rr_action == RCTL_ACTION_THROTTLE && !RACCT_IS_DECAYING(rule->rr_resource)) { return (EOPNOTSUPP); } if (rule->rr_action == RCTL_ACTION_THROTTLE && rule->rr_resource == RACCT_PCTCPU) { return (EOPNOTSUPP); } if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS && RACCT_IS_SLOPPY(rule->rr_resource)) { return (EOPNOTSUPP); } /* * Make sure there are no duplicated rules. Also, for the "deny" * rules, remove ones differing only by "amount". */ if (rule->rr_action == RCTL_ACTION_DENY) { rule2 = rctl_rule_duplicate(rule, M_WAITOK); rule2->rr_amount = RCTL_AMOUNT_UNDEFINED; rctl_rule_remove(rule2); rctl_rule_release(rule2); } else rctl_rule_remove(rule); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: p = rule->rr_subject.rs_proc; KASSERT(p != NULL, ("rctl_rule_add: NULL proc")); rctl_racct_add_rule(p->p_racct, rule); /* * In case of per-process rule, we don't have anything more * to do. */ return (0); case RCTL_SUBJECT_TYPE_USER: uip = rule->rr_subject.rs_uip; KASSERT(uip != NULL, ("rctl_rule_add: NULL uip")); rctl_racct_add_rule(uip->ui_racct, rule); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: lc = rule->rr_subject.rs_loginclass; KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass")); rctl_racct_add_rule(lc->lc_racct, rule); break; case RCTL_SUBJECT_TYPE_JAIL: prr = rule->rr_subject.rs_prison_racct; KASSERT(prr != NULL, ("rctl_rule_add: NULL pr")); rctl_racct_add_rule(prr->prr_racct, rule); break; default: panic("rctl_rule_add: unknown subject type %d", rule->rr_subject_type); } /* * Now go through all the processes and add the new rule to the ones * it applies to. */ sx_assert(&allproc_lock, SA_LOCKED); FOREACH_PROC_IN_SYSTEM(p) { cred = p->p_ucred; switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_USER: if (cred->cr_uidinfo == rule->rr_subject.rs_uip || cred->cr_ruidinfo == rule->rr_subject.rs_uip) break; continue; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (cred->cr_loginclass == rule->rr_subject.rs_loginclass) break; continue; case RCTL_SUBJECT_TYPE_JAIL: match = 0; for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) { if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) { match = 1; break; } } if (match) break; continue; default: panic("rctl_rule_add: unknown subject type %d", rule->rr_subject_type); } rctl_racct_add_rule(p->p_racct, rule); } return (0); } static void rctl_rule_pre_callback(void) { RACCT_LOCK(); } static void rctl_rule_post_callback(void) { RACCT_UNLOCK(); } static void rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3) { struct rctl_rule *filter = (struct rctl_rule *)arg2; int found = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); found += rctl_racct_remove_rules(racct, filter); *((int *)arg3) += found; } /* * Remove all rules that match the filter. */ int rctl_rule_remove(struct rctl_rule *filter) { struct proc *p; int found = 0; ASSERT_RACCT_ENABLED(); if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS && filter->rr_subject.rs_proc != NULL) { p = filter->rr_subject.rs_proc; RACCT_LOCK(); found = rctl_racct_remove_rules(p->p_racct, filter); RACCT_UNLOCK(); if (found) return (0); return (ESRCH); } loginclass_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); ui_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); prison_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); sx_assert(&allproc_lock, SA_LOCKED); RACCT_LOCK(); FOREACH_PROC_IN_SYSTEM(p) { found += rctl_racct_remove_rules(p->p_racct, filter); } RACCT_UNLOCK(); if (found) return (0); return (ESRCH); } /* * Appends a rule to the sbuf. */ static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule) { int64_t amount; ASSERT_RACCT_ENABLED(); sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type)); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: if (rule->rr_subject.rs_proc == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%d:", rule->rr_subject.rs_proc->p_pid); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%d:", rule->rr_subject.rs_uip->ui_uid); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%s:", rule->rr_subject.rs_loginclass->lc_name); break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%s:", rule->rr_subject.rs_prison_racct->prr_name); break; default: panic("rctl_rule_to_sbuf: unknown subject type %d", rule->rr_subject_type); } amount = rule->rr_amount; if (amount != RCTL_AMOUNT_UNDEFINED && RACCT_IS_IN_MILLIONS(rule->rr_resource)) amount /= 1000000; sbuf_printf(sb, "%s:%s=%jd", rctl_resource_name(rule->rr_resource), rctl_action_name(rule->rr_action), amount); if (rule->rr_per != rule->rr_subject_type) sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per)); } /* * Routine used by RCTL syscalls to read in input string. */ static int rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen) { char *str; int error; ASSERT_RACCT_ENABLED(); if (inbuflen <= 0) return (EINVAL); if (inbuflen > RCTL_MAX_INBUFSIZE) return (E2BIG); str = malloc(inbuflen + 1, M_RCTL, M_WAITOK); error = copyinstr(inbufp, str, inbuflen, NULL); if (error != 0) { free(str, M_RCTL); return (error); } *inputstr = str; return (0); } /* * Routine used by RCTL syscalls to write out output string. */ static int rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen) { int error; ASSERT_RACCT_ENABLED(); if (outputsbuf == NULL) return (0); sbuf_finish(outputsbuf); if (outbuflen < sbuf_len(outputsbuf) + 1) { sbuf_delete(outputsbuf); return (ERANGE); } error = copyout(sbuf_data(outputsbuf), outbufp, sbuf_len(outputsbuf) + 1); sbuf_delete(outputsbuf); return (error); } static struct sbuf * rctl_racct_to_sbuf(struct racct *racct, int sloppy) { struct sbuf *sb; int64_t amount; int i; ASSERT_RACCT_ENABLED(); sb = sbuf_new_auto(); for (i = 0; i <= RACCT_MAX; i++) { if (sloppy == 0 && RACCT_IS_SLOPPY(i)) continue; RACCT_LOCK(); amount = racct->r_resources[i]; RACCT_UNLOCK(); if (RACCT_IS_IN_MILLIONS(i)) amount /= 1000000; sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount); } sbuf_setpos(sb, sbuf_len(sb) - 1); return (sb); } int sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap) { struct rctl_rule *filter; struct sbuf *outputsbuf = NULL; struct proc *p; struct uidinfo *uip; struct loginclass *lc; struct prison_racct *prr; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_RACCT); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } switch (filter->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: p = filter->rr_subject.rs_proc; if (p == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0); break; case RCTL_SUBJECT_TYPE_USER: uip = filter->rr_subject.rs_uip; if (uip == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: lc = filter->rr_subject.rs_loginclass; if (lc == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1); break; case RCTL_SUBJECT_TYPE_JAIL: prr = filter->rr_subject.rs_prison_racct; if (prr == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1); break; default: error = EINVAL; } out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); if (error != 0) return (error); error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen); return (error); } static void rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3) { struct rctl_rule *filter = (struct rctl_rule *)arg2; struct rctl_rule_link *link; struct sbuf *sb = (struct sbuf *)arg3; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); LIST_FOREACH(link, &racct->r_rule_links, rrl_next) { if (!rctl_rule_matches(link->rrl_rule, filter)) continue; rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } } int sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) { struct sbuf *sb; struct rctl_rule *filter; struct rctl_rule_link *link; struct proc *p; char *inputstr, *buf; size_t bufsize; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_RULES); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { sx_sunlock(&allproc_lock); return (E2BIG); } buf = malloc(bufsize, M_RCTL, M_WAITOK); sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); KASSERT(sb != NULL, ("sbuf_new failed")); FOREACH_PROC_IN_SYSTEM(p) { RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { /* * Non-process rules will be added to the buffer later. * Adding them here would result in duplicated output. */ if (link->rrl_rule->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) continue; if (!rctl_rule_matches(link->rrl_rule, filter)) continue; rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } RACCT_UNLOCK(); } loginclass_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); ui_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); prison_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); if (sbuf_error(sb) == ENOMEM) { error = ERANGE; goto out; } /* * Remove trailing ",". */ if (sbuf_len(sb) > 0) sbuf_setpos(sb, sbuf_len(sb) - 1); error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); free(buf, M_RCTL); return (error); } int sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) { struct sbuf *sb; struct rctl_rule *filter; struct rctl_rule_link *link; char *inputstr, *buf; size_t bufsize; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_LIMITS); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EINVAL); } if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EOPNOTSUPP); } if (filter->rr_subject.rs_proc == NULL) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EINVAL); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (E2BIG); } buf = malloc(bufsize, M_RCTL, M_WAITOK); sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); KASSERT(sb != NULL, ("sbuf_new failed")); RACCT_LOCK(); LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links, rrl_next) { rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } RACCT_UNLOCK(); if (sbuf_error(sb) == ENOMEM) { error = ERANGE; sbuf_delete(sb); goto out; } /* * Remove trailing ",". */ if (sbuf_len(sb) > 0) sbuf_setpos(sb, sbuf_len(sb) - 1); error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); free(buf, M_RCTL); return (error); } int sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) { struct rctl_rule *rule; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_ADD_RULE); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &rule); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } /* * The 'per' part of a rule is optional. */ if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED && rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) rule->rr_per = rule->rr_subject_type; if (!rctl_rule_fully_specified(rule)) { error = EINVAL; goto out; } error = rctl_rule_add(rule); out: rctl_rule_release(rule); sx_sunlock(&allproc_lock); return (error); } int sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) { struct rctl_rule *filter; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_REMOVE_RULE); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } error = rctl_rule_remove(filter); rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (error); } /* * Update RCTL rule list after credential change. */ void rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred) { LIST_HEAD(, rctl_rule_link) newrules; struct rctl_rule_link *link, *newlink; struct uidinfo *newuip; struct loginclass *newlc; struct prison_racct *newprr; int rulecnt, i; if (!racct_enable) return; PROC_LOCK_ASSERT(p, MA_NOTOWNED); newuip = newcred->cr_ruidinfo; newlc = newcred->cr_loginclass; newprr = newcred->cr_prison->pr_prison_racct; LIST_INIT(&newrules); again: /* * First, count the rules that apply to the process with new * credentials. */ rulecnt = 0; RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) rulecnt++; } LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) rulecnt++; LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) rulecnt++; LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) rulecnt++; RACCT_UNLOCK(); /* * Create temporary list. We've dropped the rctl_lock in order * to use M_WAITOK. */ for (i = 0; i < rulecnt; i++) { newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK); newlink->rrl_rule = NULL; newlink->rrl_exceeded = 0; LIST_INSERT_HEAD(&newrules, newlink, rrl_next); } newlink = LIST_FIRST(&newrules); /* * Assign rules to the newly allocated list entries. */ RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } } LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } if (rulecnt == 0) { /* * Free the old rule list. */ while (!LIST_EMPTY(&p->p_racct->r_rule_links)) { link = LIST_FIRST(&p->p_racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } /* * Replace lists and we're done. * * XXX: Is there any way to switch list heads instead * of iterating here? */ while (!LIST_EMPTY(&newrules)) { newlink = LIST_FIRST(&newrules); LIST_REMOVE(newlink, rrl_next); LIST_INSERT_HEAD(&p->p_racct->r_rule_links, newlink, rrl_next); } RACCT_UNLOCK(); return; } goaround: RACCT_UNLOCK(); /* * Rule list changed while we were not holding the rctl_lock. * Free the new list and try again. */ while (!LIST_EMPTY(&newrules)) { newlink = LIST_FIRST(&newrules); LIST_REMOVE(newlink, rrl_next); if (newlink->rrl_rule != NULL) rctl_rule_release(newlink->rrl_rule); uma_zfree(rctl_rule_link_zone, newlink); } goto again; } /* * Assign RCTL rules to the newly created process. */ int rctl_proc_fork(struct proc *parent, struct proc *child) { struct rctl_rule *rule; struct rctl_rule_link *link; int error; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent)); LIST_INIT(&child->p_racct->r_rule_links); /* * Go through limits applicable to the parent and assign them * to the child. Rules with 'process' subject have to be duplicated * in order to make their rr_subject point to the new process. */ LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) { rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT); if (rule == NULL) goto fail; KASSERT(rule->rr_subject.rs_proc == parent, ("rule->rr_subject.rs_proc != parent")); rule->rr_subject.rs_proc = child; error = rctl_racct_add_rule_locked(child->p_racct, rule); rctl_rule_release(rule); if (error != 0) goto fail; } else { error = rctl_racct_add_rule_locked(child->p_racct, link->rrl_rule); if (error != 0) goto fail; } } return (0); fail: while (!LIST_EMPTY(&child->p_racct->r_rule_links)) { link = LIST_FIRST(&child->p_racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } return (EAGAIN); } /* * Release rules attached to the racct. */ void rctl_racct_release(struct racct *racct) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); while (!LIST_EMPTY(&racct->r_rule_links)) { link = LIST_FIRST(&racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } } static void rctl_init(void) { if (!racct_enable) return; rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); rctl_rule_link_zone = uma_zcreate("rctl_rule_link", sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Set default values, making sure not to overwrite the ones * fetched from tunables. Most of those could be set at the * declaration, except for the rctl_throttle_max - we cannot * set it there due to hz not being compile time constant. */ if (rctl_throttle_min < 1) rctl_throttle_min = 1; if (rctl_throttle_max < rctl_throttle_min) rctl_throttle_max = 2 * hz; if (rctl_throttle_pct < 0) rctl_throttle_pct = 100; if (rctl_throttle_pct2 < 0) rctl_throttle_pct2 = 100; } #else /* !RCTL */ int sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap) { return (ENOSYS); } int sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) { return (ENOSYS); } int sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) { return (ENOSYS); } int sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) { return (ENOSYS); } int sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) { return (ENOSYS); } #endif /* !RCTL */ diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c index 18d89f54ca69..aa7060c7cb7f 100644 --- a/sys/kern/kern_time.c +++ b/sys/kern/kern_time.c @@ -1,1836 +1,1835 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_time.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #define MAX_CLOCKS (CLOCK_MONOTONIC+1) #define CPUCLOCK_BIT 0x80000000 #define CPUCLOCK_PROCESS_BIT 0x40000000 #define CPUCLOCK_ID_MASK (~(CPUCLOCK_BIT|CPUCLOCK_PROCESS_BIT)) #define MAKE_THREAD_CPUCLOCK(tid) (CPUCLOCK_BIT|(tid)) #define MAKE_PROCESS_CPUCLOCK(pid) \ (CPUCLOCK_BIT|CPUCLOCK_PROCESS_BIT|(pid)) #define NS_PER_SEC 1000000000 static struct kclock posix_clocks[MAX_CLOCKS]; static uma_zone_t itimer_zone = NULL; /* * Time of day and interval timer support. * * These routines provide the kernel entry points to get and set * the time-of-day and per-process interval timers. Subroutines * here provide support for adding and subtracting timeval structures * and decrementing interval timers, optionally reloading the interval * timers when they expire. */ static int settime(struct thread *, struct timeval *); static void timevalfix(struct timeval *); static int user_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags, const struct timespec *ua_rqtp, struct timespec *ua_rmtp); static void itimer_start(void); static int itimer_init(void *, int, int); static void itimer_fini(void *, int); static void itimer_enter(struct itimer *); static void itimer_leave(struct itimer *); static struct itimer *itimer_find(struct proc *, int); static void itimers_alloc(struct proc *); static int realtimer_create(struct itimer *); static int realtimer_gettime(struct itimer *, struct itimerspec *); static int realtimer_settime(struct itimer *, int, struct itimerspec *, struct itimerspec *); static int realtimer_delete(struct itimer *); static void realtimer_clocktime(clockid_t, struct timespec *); static void realtimer_expire(void *); static void realtimer_expire_l(struct itimer *it, bool proc_locked); static int register_posix_clock(int, const struct kclock *); static void itimer_fire(struct itimer *it); static int itimespecfix(struct timespec *ts); #define CLOCK_CALL(clock, call, arglist) \ ((*posix_clocks[clock].call) arglist) SYSINIT(posix_timer, SI_SUB_P1003_1B, SI_ORDER_FIRST+4, itimer_start, NULL); static int settime(struct thread *td, struct timeval *tv) { struct timeval delta, tv1, tv2; static struct timeval maxtime, laststep; struct timespec ts; microtime(&tv1); delta = *tv; timevalsub(&delta, &tv1); /* * If the system is secure, we do not allow the time to be * set to a value earlier than 1 second less than the highest * time we have yet seen. The worst a miscreant can do in * this circumstance is "freeze" time. He couldn't go * back to the past. * * We similarly do not allow the clock to be stepped more * than one second, nor more than once per second. This allows * a miscreant to make the clock march double-time, but no worse. */ if (securelevel_gt(td->td_ucred, 1) != 0) { if (delta.tv_sec < 0 || delta.tv_usec < 0) { /* * Update maxtime to latest time we've seen. */ if (tv1.tv_sec > maxtime.tv_sec) maxtime = tv1; tv2 = *tv; timevalsub(&tv2, &maxtime); if (tv2.tv_sec < -1) { tv->tv_sec = maxtime.tv_sec - 1; printf("Time adjustment clamped to -1 second\n"); } } else { if (tv1.tv_sec == laststep.tv_sec) return (EPERM); if (delta.tv_sec > 1) { tv->tv_sec = tv1.tv_sec + 1; printf("Time adjustment clamped to +1 second\n"); } laststep = *tv; } } ts.tv_sec = tv->tv_sec; ts.tv_nsec = tv->tv_usec * 1000; tc_setclock(&ts); resettodr(); return (0); } #ifndef _SYS_SYSPROTO_H_ struct clock_getcpuclockid2_args { id_t id; int which, clockid_t *clock_id; }; #endif /* ARGSUSED */ int sys_clock_getcpuclockid2(struct thread *td, struct clock_getcpuclockid2_args *uap) { clockid_t clk_id; int error; error = kern_clock_getcpuclockid2(td, uap->id, uap->which, &clk_id); if (error == 0) error = copyout(&clk_id, uap->clock_id, sizeof(clockid_t)); return (error); } int kern_clock_getcpuclockid2(struct thread *td, id_t id, int which, clockid_t *clk_id) { struct proc *p; pid_t pid; lwpid_t tid; int error; switch (which) { case CPUCLOCK_WHICH_PID: if (id != 0) { error = pget(id, PGET_CANSEE | PGET_NOTID, &p); if (error != 0) return (error); PROC_UNLOCK(p); pid = id; } else { pid = td->td_proc->p_pid; } *clk_id = MAKE_PROCESS_CPUCLOCK(pid); return (0); case CPUCLOCK_WHICH_TID: tid = id == 0 ? td->td_tid : id; *clk_id = MAKE_THREAD_CPUCLOCK(tid); return (0); default: return (EINVAL); } } #ifndef _SYS_SYSPROTO_H_ struct clock_gettime_args { clockid_t clock_id; struct timespec *tp; }; #endif /* ARGSUSED */ int sys_clock_gettime(struct thread *td, struct clock_gettime_args *uap) { struct timespec ats; int error; error = kern_clock_gettime(td, uap->clock_id, &ats); if (error == 0) error = copyout(&ats, uap->tp, sizeof(ats)); return (error); } static inline void cputick2timespec(uint64_t runtime, struct timespec *ats) { runtime = cputick2usec(runtime); ats->tv_sec = runtime / 1000000; ats->tv_nsec = runtime % 1000000 * 1000; } void kern_thread_cputime(struct thread *targettd, struct timespec *ats) { uint64_t runtime, curtime, switchtime; if (targettd == NULL) { /* current thread */ spinlock_enter(); switchtime = PCPU_GET(switchtime); curtime = cpu_ticks(); runtime = curthread->td_runtime; spinlock_exit(); runtime += curtime - switchtime; } else { PROC_LOCK_ASSERT(targettd->td_proc, MA_OWNED); thread_lock(targettd); runtime = targettd->td_runtime; thread_unlock(targettd); } cputick2timespec(runtime, ats); } void kern_process_cputime(struct proc *targetp, struct timespec *ats) { uint64_t runtime; struct rusage ru; PROC_LOCK_ASSERT(targetp, MA_OWNED); PROC_STATLOCK(targetp); rufetch(targetp, &ru); runtime = targetp->p_rux.rux_runtime; if (curthread->td_proc == targetp) runtime += cpu_ticks() - PCPU_GET(switchtime); PROC_STATUNLOCK(targetp); cputick2timespec(runtime, ats); } static int get_cputime(struct thread *td, clockid_t clock_id, struct timespec *ats) { struct proc *p, *p2; struct thread *td2; lwpid_t tid; pid_t pid; int error; p = td->td_proc; if ((clock_id & CPUCLOCK_PROCESS_BIT) == 0) { tid = clock_id & CPUCLOCK_ID_MASK; td2 = tdfind(tid, p->p_pid); if (td2 == NULL) return (EINVAL); kern_thread_cputime(td2, ats); PROC_UNLOCK(td2->td_proc); } else { pid = clock_id & CPUCLOCK_ID_MASK; error = pget(pid, PGET_CANSEE, &p2); if (error != 0) return (EINVAL); kern_process_cputime(p2, ats); PROC_UNLOCK(p2); } return (0); } int kern_clock_gettime(struct thread *td, clockid_t clock_id, struct timespec *ats) { struct timeval sys, user; struct proc *p; p = td->td_proc; switch (clock_id) { case CLOCK_REALTIME: /* Default to precise. */ case CLOCK_REALTIME_PRECISE: nanotime(ats); break; case CLOCK_REALTIME_FAST: getnanotime(ats); break; case CLOCK_VIRTUAL: PROC_LOCK(p); PROC_STATLOCK(p); calcru(p, &user, &sys); PROC_STATUNLOCK(p); PROC_UNLOCK(p); TIMEVAL_TO_TIMESPEC(&user, ats); break; case CLOCK_PROF: PROC_LOCK(p); PROC_STATLOCK(p); calcru(p, &user, &sys); PROC_STATUNLOCK(p); PROC_UNLOCK(p); timevaladd(&user, &sys); TIMEVAL_TO_TIMESPEC(&user, ats); break; case CLOCK_MONOTONIC: /* Default to precise. */ case CLOCK_MONOTONIC_PRECISE: case CLOCK_UPTIME: case CLOCK_UPTIME_PRECISE: nanouptime(ats); break; case CLOCK_UPTIME_FAST: case CLOCK_MONOTONIC_FAST: getnanouptime(ats); break; case CLOCK_SECOND: ats->tv_sec = time_second; ats->tv_nsec = 0; break; case CLOCK_THREAD_CPUTIME_ID: kern_thread_cputime(NULL, ats); break; case CLOCK_PROCESS_CPUTIME_ID: PROC_LOCK(p); kern_process_cputime(p, ats); PROC_UNLOCK(p); break; default: if ((int)clock_id >= 0) return (EINVAL); return (get_cputime(td, clock_id, ats)); } return (0); } #ifndef _SYS_SYSPROTO_H_ struct clock_settime_args { clockid_t clock_id; const struct timespec *tp; }; #endif /* ARGSUSED */ int sys_clock_settime(struct thread *td, struct clock_settime_args *uap) { struct timespec ats; int error; if ((error = copyin(uap->tp, &ats, sizeof(ats))) != 0) return (error); return (kern_clock_settime(td, uap->clock_id, &ats)); } static int allow_insane_settime = 0; SYSCTL_INT(_debug, OID_AUTO, allow_insane_settime, CTLFLAG_RWTUN, &allow_insane_settime, 0, "do not perform possibly restrictive checks on settime(2) args"); int kern_clock_settime(struct thread *td, clockid_t clock_id, struct timespec *ats) { struct timeval atv; int error; if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0) return (error); if (clock_id != CLOCK_REALTIME) return (EINVAL); if (!timespecvalid_interval(ats)) return (EINVAL); if (!allow_insane_settime && (ats->tv_sec > 8000ULL * 365 * 24 * 60 * 60 || ats->tv_sec < utc_offset())) return (EINVAL); /* XXX Don't convert nsec->usec and back */ TIMESPEC_TO_TIMEVAL(&atv, ats); error = settime(td, &atv); return (error); } #ifndef _SYS_SYSPROTO_H_ struct clock_getres_args { clockid_t clock_id; struct timespec *tp; }; #endif int sys_clock_getres(struct thread *td, struct clock_getres_args *uap) { struct timespec ts; int error; if (uap->tp == NULL) return (0); error = kern_clock_getres(td, uap->clock_id, &ts); if (error == 0) error = copyout(&ts, uap->tp, sizeof(ts)); return (error); } int kern_clock_getres(struct thread *td, clockid_t clock_id, struct timespec *ts) { ts->tv_sec = 0; switch (clock_id) { case CLOCK_REALTIME: case CLOCK_REALTIME_FAST: case CLOCK_REALTIME_PRECISE: case CLOCK_MONOTONIC: case CLOCK_MONOTONIC_FAST: case CLOCK_MONOTONIC_PRECISE: case CLOCK_UPTIME: case CLOCK_UPTIME_FAST: case CLOCK_UPTIME_PRECISE: /* * Round up the result of the division cheaply by adding 1. * Rounding up is especially important if rounding down * would give 0. Perfect rounding is unimportant. */ ts->tv_nsec = NS_PER_SEC / tc_getfrequency() + 1; break; case CLOCK_VIRTUAL: case CLOCK_PROF: /* Accurately round up here because we can do so cheaply. */ ts->tv_nsec = howmany(NS_PER_SEC, hz); break; case CLOCK_SECOND: ts->tv_sec = 1; ts->tv_nsec = 0; break; case CLOCK_THREAD_CPUTIME_ID: case CLOCK_PROCESS_CPUTIME_ID: cputime: /* sync with cputick2usec */ ts->tv_nsec = 1000000 / cpu_tickrate(); if (ts->tv_nsec == 0) ts->tv_nsec = 1000; break; default: if ((int)clock_id < 0) goto cputime; return (EINVAL); } return (0); } int kern_nanosleep(struct thread *td, struct timespec *rqt, struct timespec *rmt) { return (kern_clock_nanosleep(td, CLOCK_REALTIME, TIMER_RELTIME, rqt, rmt)); } static uint8_t nanowait[MAXCPU]; int kern_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags, const struct timespec *rqt, struct timespec *rmt) { struct timespec ts, now; sbintime_t sbt, sbtt, prec, tmp; time_t over; int error; bool is_abs_real; if (rqt->tv_nsec < 0 || rqt->tv_nsec >= NS_PER_SEC) return (EINVAL); if ((flags & ~TIMER_ABSTIME) != 0) return (EINVAL); switch (clock_id) { case CLOCK_REALTIME: case CLOCK_REALTIME_PRECISE: case CLOCK_REALTIME_FAST: case CLOCK_SECOND: is_abs_real = (flags & TIMER_ABSTIME) != 0; break; case CLOCK_MONOTONIC: case CLOCK_MONOTONIC_PRECISE: case CLOCK_MONOTONIC_FAST: case CLOCK_UPTIME: case CLOCK_UPTIME_PRECISE: case CLOCK_UPTIME_FAST: is_abs_real = false; break; case CLOCK_VIRTUAL: case CLOCK_PROF: case CLOCK_PROCESS_CPUTIME_ID: return (ENOTSUP); case CLOCK_THREAD_CPUTIME_ID: default: return (EINVAL); } do { ts = *rqt; if ((flags & TIMER_ABSTIME) != 0) { if (is_abs_real) td->td_rtcgen = atomic_load_acq_int(&rtc_generation); error = kern_clock_gettime(td, clock_id, &now); KASSERT(error == 0, ("kern_clock_gettime: %d", error)); timespecsub(&ts, &now, &ts); } if (ts.tv_sec < 0 || (ts.tv_sec == 0 && ts.tv_nsec == 0)) { error = EWOULDBLOCK; break; } if (ts.tv_sec > INT32_MAX / 2) { over = ts.tv_sec - INT32_MAX / 2; ts.tv_sec -= over; } else over = 0; tmp = tstosbt(ts); prec = tmp; prec >>= tc_precexp; if (TIMESEL(&sbt, tmp)) sbt += tc_tick_sbt; sbt += tmp; error = tsleep_sbt(&nanowait[curcpu], PWAIT | PCATCH, "nanslp", sbt, prec, C_ABSOLUTE); } while (error == 0 && is_abs_real && td->td_rtcgen == 0); td->td_rtcgen = 0; if (error != EWOULDBLOCK) { if (TIMESEL(&sbtt, tmp)) sbtt += tc_tick_sbt; if (sbtt >= sbt) return (0); if (error == ERESTART) error = EINTR; if ((flags & TIMER_ABSTIME) == 0 && rmt != NULL) { ts = sbttots(sbt - sbtt); ts.tv_sec += over; if (ts.tv_sec < 0) timespecclear(&ts); *rmt = ts; } return (error); } return (0); } #ifndef _SYS_SYSPROTO_H_ struct nanosleep_args { struct timespec *rqtp; struct timespec *rmtp; }; #endif /* ARGSUSED */ int sys_nanosleep(struct thread *td, struct nanosleep_args *uap) { return (user_clock_nanosleep(td, CLOCK_REALTIME, TIMER_RELTIME, uap->rqtp, uap->rmtp)); } #ifndef _SYS_SYSPROTO_H_ struct clock_nanosleep_args { clockid_t clock_id; int flags; struct timespec *rqtp; struct timespec *rmtp; }; #endif /* ARGSUSED */ int sys_clock_nanosleep(struct thread *td, struct clock_nanosleep_args *uap) { int error; error = user_clock_nanosleep(td, uap->clock_id, uap->flags, uap->rqtp, uap->rmtp); return (kern_posix_error(td, error)); } static int user_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags, const struct timespec *ua_rqtp, struct timespec *ua_rmtp) { struct timespec rmt, rqt; int error, error2; error = copyin(ua_rqtp, &rqt, sizeof(rqt)); if (error) return (error); error = kern_clock_nanosleep(td, clock_id, flags, &rqt, &rmt); if (error == EINTR && ua_rmtp != NULL && (flags & TIMER_ABSTIME) == 0) { error2 = copyout(&rmt, ua_rmtp, sizeof(rmt)); if (error2 != 0) error = error2; } return (error); } #ifndef _SYS_SYSPROTO_H_ struct gettimeofday_args { struct timeval *tp; struct timezone *tzp; }; #endif /* ARGSUSED */ int sys_gettimeofday(struct thread *td, struct gettimeofday_args *uap) { struct timeval atv; struct timezone rtz; int error = 0; if (uap->tp) { microtime(&atv); error = copyout(&atv, uap->tp, sizeof (atv)); } if (error == 0 && uap->tzp != NULL) { rtz.tz_minuteswest = 0; rtz.tz_dsttime = 0; error = copyout(&rtz, uap->tzp, sizeof (rtz)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct settimeofday_args { struct timeval *tv; struct timezone *tzp; }; #endif /* ARGSUSED */ int sys_settimeofday(struct thread *td, struct settimeofday_args *uap) { struct timeval atv, *tvp; struct timezone atz, *tzp; int error; if (uap->tv) { error = copyin(uap->tv, &atv, sizeof(atv)); if (error) return (error); tvp = &atv; } else tvp = NULL; if (uap->tzp) { error = copyin(uap->tzp, &atz, sizeof(atz)); if (error) return (error); tzp = &atz; } else tzp = NULL; return (kern_settimeofday(td, tvp, tzp)); } int kern_settimeofday(struct thread *td, struct timeval *tv, struct timezone *tzp) { int error; error = priv_check(td, PRIV_SETTIMEOFDAY); if (error) return (error); /* Verify all parameters before changing time. */ if (tv) { if (tv->tv_usec < 0 || tv->tv_usec >= 1000000 || tv->tv_sec < 0) return (EINVAL); error = settime(td, tv); } return (error); } /* * Get value of an interval timer. The process virtual and profiling virtual * time timers are kept in the p_stats area, since they can be swapped out. * These are kept internally in the way they are specified externally: in * time until they expire. * * The real time interval timer is kept in the process table slot for the * process, and its value (it_value) is kept as an absolute time rather than * as a delta, so that it is easy to keep periodic real-time signals from * drifting. * * Virtual time timers are processed in the hardclock() routine of * kern_clock.c. The real time timer is processed by a timeout routine, * called from the softclock() routine. Since a callout may be delayed in * real time due to interrupt processing in the system, it is possible for * the real time timeout routine (realitexpire, given below), to be delayed * in real time past when it is supposed to occur. It does not suffice, * therefore, to reload the real timer .it_value from the real time timers * .it_interval. Rather, we compute the next time in absolute time the timer * should go off. */ #ifndef _SYS_SYSPROTO_H_ struct getitimer_args { u_int which; struct itimerval *itv; }; #endif int sys_getitimer(struct thread *td, struct getitimer_args *uap) { struct itimerval aitv; int error; error = kern_getitimer(td, uap->which, &aitv); if (error != 0) return (error); return (copyout(&aitv, uap->itv, sizeof (struct itimerval))); } int kern_getitimer(struct thread *td, u_int which, struct itimerval *aitv) { struct proc *p = td->td_proc; struct timeval ctv; if (which > ITIMER_PROF) return (EINVAL); if (which == ITIMER_REAL) { /* * Convert from absolute to relative time in .it_value * part of real time timer. If time for real time timer * has passed return 0, else return difference between * current time and time for the timer to go off. */ PROC_LOCK(p); *aitv = p->p_realtimer; PROC_UNLOCK(p); if (timevalisset(&aitv->it_value)) { microuptime(&ctv); if (timevalcmp(&aitv->it_value, &ctv, <)) timevalclear(&aitv->it_value); else timevalsub(&aitv->it_value, &ctv); } } else { PROC_ITIMLOCK(p); *aitv = p->p_stats->p_timer[which]; PROC_ITIMUNLOCK(p); } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktritimerval(aitv); #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct setitimer_args { u_int which; struct itimerval *itv, *oitv; }; #endif int sys_setitimer(struct thread *td, struct setitimer_args *uap) { struct itimerval aitv, oitv; int error; if (uap->itv == NULL) { uap->itv = uap->oitv; return (sys_getitimer(td, (struct getitimer_args *)uap)); } if ((error = copyin(uap->itv, &aitv, sizeof(struct itimerval)))) return (error); error = kern_setitimer(td, uap->which, &aitv, &oitv); if (error != 0 || uap->oitv == NULL) return (error); return (copyout(&oitv, uap->oitv, sizeof(struct itimerval))); } int kern_setitimer(struct thread *td, u_int which, struct itimerval *aitv, struct itimerval *oitv) { struct proc *p = td->td_proc; struct timeval ctv; sbintime_t sbt, pr; if (aitv == NULL) return (kern_getitimer(td, which, oitv)); if (which > ITIMER_PROF) return (EINVAL); #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktritimerval(aitv); #endif if (itimerfix(&aitv->it_value) || aitv->it_value.tv_sec > INT32_MAX / 2) return (EINVAL); if (!timevalisset(&aitv->it_value)) timevalclear(&aitv->it_interval); else if (itimerfix(&aitv->it_interval) || aitv->it_interval.tv_sec > INT32_MAX / 2) return (EINVAL); if (which == ITIMER_REAL) { PROC_LOCK(p); if (timevalisset(&p->p_realtimer.it_value)) callout_stop(&p->p_itcallout); microuptime(&ctv); if (timevalisset(&aitv->it_value)) { pr = tvtosbt(aitv->it_value) >> tc_precexp; timevaladd(&aitv->it_value, &ctv); sbt = tvtosbt(aitv->it_value); callout_reset_sbt(&p->p_itcallout, sbt, pr, realitexpire, p, C_ABSOLUTE); } *oitv = p->p_realtimer; p->p_realtimer = *aitv; PROC_UNLOCK(p); if (timevalisset(&oitv->it_value)) { if (timevalcmp(&oitv->it_value, &ctv, <)) timevalclear(&oitv->it_value); else timevalsub(&oitv->it_value, &ctv); } } else { if (aitv->it_interval.tv_sec == 0 && aitv->it_interval.tv_usec != 0 && aitv->it_interval.tv_usec < tick) aitv->it_interval.tv_usec = tick; if (aitv->it_value.tv_sec == 0 && aitv->it_value.tv_usec != 0 && aitv->it_value.tv_usec < tick) aitv->it_value.tv_usec = tick; PROC_ITIMLOCK(p); *oitv = p->p_stats->p_timer[which]; p->p_stats->p_timer[which] = *aitv; PROC_ITIMUNLOCK(p); } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktritimerval(oitv); #endif return (0); } static void realitexpire_reset_callout(struct proc *p, sbintime_t *isbtp) { sbintime_t prec; prec = isbtp == NULL ? tvtosbt(p->p_realtimer.it_interval) : *isbtp; callout_reset_sbt(&p->p_itcallout, tvtosbt(p->p_realtimer.it_value), prec >> tc_precexp, realitexpire, p, C_ABSOLUTE); } void itimer_proc_continue(struct proc *p) { struct timeval ctv; struct itimer *it; int id; PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag2 & P2_ITSTOPPED) != 0) { p->p_flag2 &= ~P2_ITSTOPPED; microuptime(&ctv); if (timevalcmp(&p->p_realtimer.it_value, &ctv, >=)) realitexpire(p); else realitexpire_reset_callout(p, NULL); } if (p->p_itimers != NULL) { for (id = 3; id < TIMER_MAX; id++) { it = p->p_itimers->its_timers[id]; if (it == NULL) continue; if ((it->it_flags & ITF_PSTOPPED) != 0) { ITIMER_LOCK(it); if ((it->it_flags & ITF_PSTOPPED) != 0) { it->it_flags &= ~ITF_PSTOPPED; if ((it->it_flags & ITF_DELETING) == 0) realtimer_expire_l(it, true); } ITIMER_UNLOCK(it); } } } } /* * Real interval timer expired: * send process whose timer expired an alarm signal. * If time is not set up to reload, then just return. * Else compute next time timer should go off which is > current time. * This is where delay in processing this timeout causes multiple * SIGALRM calls to be compressed into one. * tvtohz() always adds 1 to allow for the time until the next clock * interrupt being strictly less than 1 clock tick, but we don't want * that here since we want to appear to be in sync with the clock * interrupt even when we're delayed. */ void realitexpire(void *arg) { struct proc *p; struct timeval ctv; sbintime_t isbt; p = (struct proc *)arg; kern_psignal(p, SIGALRM); if (!timevalisset(&p->p_realtimer.it_interval)) { timevalclear(&p->p_realtimer.it_value); return; } isbt = tvtosbt(p->p_realtimer.it_interval); if (isbt >= sbt_timethreshold) getmicrouptime(&ctv); else microuptime(&ctv); do { timevaladd(&p->p_realtimer.it_value, &p->p_realtimer.it_interval); } while (timevalcmp(&p->p_realtimer.it_value, &ctv, <=)); if (P_SHOULDSTOP(p) || P_KILLED(p)) { p->p_flag2 |= P2_ITSTOPPED; return; } p->p_flag2 &= ~P2_ITSTOPPED; realitexpire_reset_callout(p, &isbt); } /* * Check that a proposed value to load into the .it_value or * .it_interval part of an interval timer is acceptable, and * fix it to have at least minimal value (i.e. if it is less * than the resolution of the clock, round it up.) */ int itimerfix(struct timeval *tv) { if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000) return (EINVAL); if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < (u_int)tick / 16) tv->tv_usec = (u_int)tick / 16; return (0); } /* * Decrement an interval timer by a specified number * of microseconds, which must be less than a second, * i.e. < 1000000. If the timer expires, then reload * it. In this case, carry over (usec - old value) to * reduce the value reloaded into the timer so that * the timer does not drift. This routine assumes * that it is called in a context where the timers * on which it is operating cannot change in value. */ int itimerdecr(struct itimerval *itp, int usec) { if (itp->it_value.tv_usec < usec) { if (itp->it_value.tv_sec == 0) { /* expired, and already in next interval */ usec -= itp->it_value.tv_usec; goto expire; } itp->it_value.tv_usec += 1000000; itp->it_value.tv_sec--; } itp->it_value.tv_usec -= usec; usec = 0; if (timevalisset(&itp->it_value)) return (1); /* expired, exactly at end of interval */ expire: if (timevalisset(&itp->it_interval)) { itp->it_value = itp->it_interval; itp->it_value.tv_usec -= usec; if (itp->it_value.tv_usec < 0) { itp->it_value.tv_usec += 1000000; itp->it_value.tv_sec--; } } else itp->it_value.tv_usec = 0; /* sec is already 0 */ return (0); } /* * Add and subtract routines for timevals. * N.B.: subtract routine doesn't deal with * results which are before the beginning, * it just gets very confused in this case. * Caveat emptor. */ void timevaladd(struct timeval *t1, const struct timeval *t2) { t1->tv_sec += t2->tv_sec; t1->tv_usec += t2->tv_usec; timevalfix(t1); } void timevalsub(struct timeval *t1, const struct timeval *t2) { t1->tv_sec -= t2->tv_sec; t1->tv_usec -= t2->tv_usec; timevalfix(t1); } static void timevalfix(struct timeval *t1) { if (t1->tv_usec < 0) { t1->tv_sec--; t1->tv_usec += 1000000; } if (t1->tv_usec >= 1000000) { t1->tv_sec++; t1->tv_usec -= 1000000; } } /* * ratecheck(): simple time-based rate-limit checking. */ int ratecheck(struct timeval *lasttime, const struct timeval *mininterval) { struct timeval tv, delta; int rv = 0; getmicrouptime(&tv); /* NB: 10ms precision */ delta = tv; timevalsub(&delta, lasttime); /* * check for 0,0 is so that the message will be seen at least once, * even if interval is huge. */ if (timevalcmp(&delta, mininterval, >=) || (lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) { *lasttime = tv; rv = 1; } return (rv); } /* * ppsratecheck(): packets (or events) per second limitation. * * Return 0 if the limit is to be enforced (e.g. the caller * should drop a packet because of the rate limitation). * * maxpps of 0 always causes zero to be returned. maxpps of -1 * always causes 1 to be returned; this effectively defeats rate * limiting. * * Note that we maintain the struct timeval for compatibility * with other bsd systems. We reuse the storage and just monitor * clock ticks for minimal overhead. */ int ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps) { int now; /* * Reset the last time and counter if this is the first call * or more than a second has passed since the last update of * lasttime. */ now = ticks; if (lasttime->tv_sec == 0 || (u_int)(now - lasttime->tv_sec) >= hz) { lasttime->tv_sec = now; *curpps = 1; return (maxpps != 0); } else { (*curpps)++; /* NB: ignore potential overflow */ return (maxpps < 0 || *curpps <= maxpps); } } static void itimer_start(void) { static const struct kclock rt_clock = { .timer_create = realtimer_create, .timer_delete = realtimer_delete, .timer_settime = realtimer_settime, .timer_gettime = realtimer_gettime, }; itimer_zone = uma_zcreate("itimer", sizeof(struct itimer), NULL, NULL, itimer_init, itimer_fini, UMA_ALIGN_PTR, 0); register_posix_clock(CLOCK_REALTIME, &rt_clock); register_posix_clock(CLOCK_MONOTONIC, &rt_clock); p31b_setcfg(CTL_P1003_1B_TIMERS, 200112L); p31b_setcfg(CTL_P1003_1B_DELAYTIMER_MAX, INT_MAX); p31b_setcfg(CTL_P1003_1B_TIMER_MAX, TIMER_MAX); } static int register_posix_clock(int clockid, const struct kclock *clk) { if ((unsigned)clockid >= MAX_CLOCKS) { printf("%s: invalid clockid\n", __func__); return (0); } posix_clocks[clockid] = *clk; return (1); } static int itimer_init(void *mem, int size, int flags) { struct itimer *it; it = (struct itimer *)mem; mtx_init(&it->it_mtx, "itimer lock", NULL, MTX_DEF); return (0); } static void itimer_fini(void *mem, int size) { struct itimer *it; it = (struct itimer *)mem; mtx_destroy(&it->it_mtx); } static void itimer_enter(struct itimer *it) { mtx_assert(&it->it_mtx, MA_OWNED); it->it_usecount++; } static void itimer_leave(struct itimer *it) { mtx_assert(&it->it_mtx, MA_OWNED); KASSERT(it->it_usecount > 0, ("invalid it_usecount")); if (--it->it_usecount == 0 && (it->it_flags & ITF_WANTED) != 0) wakeup(it); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_create_args { clockid_t clock_id; struct sigevent * evp; int * timerid; }; #endif int sys_ktimer_create(struct thread *td, struct ktimer_create_args *uap) { struct sigevent *evp, ev; int id; int error; if (uap->evp == NULL) { evp = NULL; } else { error = copyin(uap->evp, &ev, sizeof(ev)); if (error != 0) return (error); evp = &ev; } error = kern_ktimer_create(td, uap->clock_id, evp, &id, -1); if (error == 0) { error = copyout(&id, uap->timerid, sizeof(int)); if (error != 0) kern_ktimer_delete(td, id); } return (error); } int kern_ktimer_create(struct thread *td, clockid_t clock_id, struct sigevent *evp, int *timerid, int preset_id) { struct proc *p = td->td_proc; struct itimer *it; int id; int error; if (clock_id < 0 || clock_id >= MAX_CLOCKS) return (EINVAL); if (posix_clocks[clock_id].timer_create == NULL) return (EINVAL); if (evp != NULL) { if (evp->sigev_notify != SIGEV_NONE && evp->sigev_notify != SIGEV_SIGNAL && evp->sigev_notify != SIGEV_THREAD_ID) return (EINVAL); if ((evp->sigev_notify == SIGEV_SIGNAL || evp->sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(evp->sigev_signo)) return (EINVAL); } if (p->p_itimers == NULL) itimers_alloc(p); it = uma_zalloc(itimer_zone, M_WAITOK); it->it_flags = 0; it->it_usecount = 0; timespecclear(&it->it_time.it_value); timespecclear(&it->it_time.it_interval); it->it_overrun = 0; it->it_overrun_last = 0; it->it_clockid = clock_id; it->it_proc = p; ksiginfo_init(&it->it_ksi); it->it_ksi.ksi_flags |= KSI_INS | KSI_EXT; error = CLOCK_CALL(clock_id, timer_create, (it)); if (error != 0) goto out; PROC_LOCK(p); if (preset_id != -1) { KASSERT(preset_id >= 0 && preset_id < 3, ("invalid preset_id")); id = preset_id; if (p->p_itimers->its_timers[id] != NULL) { PROC_UNLOCK(p); error = 0; goto out; } } else { /* * Find a free timer slot, skipping those reserved * for setitimer(). */ for (id = 3; id < TIMER_MAX; id++) if (p->p_itimers->its_timers[id] == NULL) break; if (id == TIMER_MAX) { PROC_UNLOCK(p); error = EAGAIN; goto out; } } p->p_itimers->its_timers[id] = it; if (evp != NULL) it->it_sigev = *evp; else { it->it_sigev.sigev_notify = SIGEV_SIGNAL; switch (clock_id) { default: case CLOCK_REALTIME: it->it_sigev.sigev_signo = SIGALRM; break; case CLOCK_VIRTUAL: it->it_sigev.sigev_signo = SIGVTALRM; break; case CLOCK_PROF: it->it_sigev.sigev_signo = SIGPROF; break; } it->it_sigev.sigev_value.sival_int = id; } if (it->it_sigev.sigev_notify == SIGEV_SIGNAL || it->it_sigev.sigev_notify == SIGEV_THREAD_ID) { it->it_ksi.ksi_signo = it->it_sigev.sigev_signo; it->it_ksi.ksi_code = SI_TIMER; it->it_ksi.ksi_value = it->it_sigev.sigev_value; it->it_ksi.ksi_timerid = id; } PROC_UNLOCK(p); *timerid = id; return (0); out: ITIMER_LOCK(it); CLOCK_CALL(it->it_clockid, timer_delete, (it)); ITIMER_UNLOCK(it); uma_zfree(itimer_zone, it); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_delete_args { int timerid; }; #endif int sys_ktimer_delete(struct thread *td, struct ktimer_delete_args *uap) { return (kern_ktimer_delete(td, uap->timerid)); } static struct itimer * itimer_find(struct proc *p, int timerid) { struct itimer *it; PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_itimers == NULL) || (timerid < 0) || (timerid >= TIMER_MAX) || (it = p->p_itimers->its_timers[timerid]) == NULL) { return (NULL); } ITIMER_LOCK(it); if ((it->it_flags & ITF_DELETING) != 0) { ITIMER_UNLOCK(it); it = NULL; } return (it); } int kern_ktimer_delete(struct thread *td, int timerid) { struct proc *p = td->td_proc; struct itimer *it; PROC_LOCK(p); it = itimer_find(p, timerid); if (it == NULL) { PROC_UNLOCK(p); return (EINVAL); } PROC_UNLOCK(p); it->it_flags |= ITF_DELETING; while (it->it_usecount > 0) { it->it_flags |= ITF_WANTED; msleep(it, &it->it_mtx, PPAUSE, "itimer", 0); } it->it_flags &= ~ITF_WANTED; CLOCK_CALL(it->it_clockid, timer_delete, (it)); ITIMER_UNLOCK(it); PROC_LOCK(p); if (KSI_ONQ(&it->it_ksi)) sigqueue_take(&it->it_ksi); p->p_itimers->its_timers[timerid] = NULL; PROC_UNLOCK(p); uma_zfree(itimer_zone, it); return (0); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_settime_args { int timerid; int flags; const struct itimerspec * value; struct itimerspec * ovalue; }; #endif int sys_ktimer_settime(struct thread *td, struct ktimer_settime_args *uap) { struct itimerspec val, oval, *ovalp; int error; error = copyin(uap->value, &val, sizeof(val)); if (error != 0) return (error); ovalp = uap->ovalue != NULL ? &oval : NULL; error = kern_ktimer_settime(td, uap->timerid, uap->flags, &val, ovalp); if (error == 0 && uap->ovalue != NULL) error = copyout(ovalp, uap->ovalue, sizeof(*ovalp)); return (error); } int kern_ktimer_settime(struct thread *td, int timer_id, int flags, struct itimerspec *val, struct itimerspec *oval) { struct proc *p; struct itimer *it; int error; p = td->td_proc; PROC_LOCK(p); if (timer_id < 3 || (it = itimer_find(p, timer_id)) == NULL) { PROC_UNLOCK(p); error = EINVAL; } else { PROC_UNLOCK(p); itimer_enter(it); error = CLOCK_CALL(it->it_clockid, timer_settime, (it, flags, val, oval)); itimer_leave(it); ITIMER_UNLOCK(it); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_gettime_args { int timerid; struct itimerspec * value; }; #endif int sys_ktimer_gettime(struct thread *td, struct ktimer_gettime_args *uap) { struct itimerspec val; int error; error = kern_ktimer_gettime(td, uap->timerid, &val); if (error == 0) error = copyout(&val, uap->value, sizeof(val)); return (error); } int kern_ktimer_gettime(struct thread *td, int timer_id, struct itimerspec *val) { struct proc *p; struct itimer *it; int error; p = td->td_proc; PROC_LOCK(p); if (timer_id < 3 || (it = itimer_find(p, timer_id)) == NULL) { PROC_UNLOCK(p); error = EINVAL; } else { PROC_UNLOCK(p); itimer_enter(it); error = CLOCK_CALL(it->it_clockid, timer_gettime, (it, val)); itimer_leave(it); ITIMER_UNLOCK(it); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct timer_getoverrun_args { int timerid; }; #endif int sys_ktimer_getoverrun(struct thread *td, struct ktimer_getoverrun_args *uap) { return (kern_ktimer_getoverrun(td, uap->timerid)); } int kern_ktimer_getoverrun(struct thread *td, int timer_id) { struct proc *p = td->td_proc; struct itimer *it; int error ; PROC_LOCK(p); if (timer_id < 3 || (it = itimer_find(p, timer_id)) == NULL) { PROC_UNLOCK(p); error = EINVAL; } else { td->td_retval[0] = it->it_overrun_last; ITIMER_UNLOCK(it); PROC_UNLOCK(p); error = 0; } return (error); } static int realtimer_create(struct itimer *it) { callout_init_mtx(&it->it_callout, &it->it_mtx, 0); return (0); } static int realtimer_delete(struct itimer *it) { mtx_assert(&it->it_mtx, MA_OWNED); /* * clear timer's value and interval to tell realtimer_expire * to not rearm the timer. */ timespecclear(&it->it_time.it_value); timespecclear(&it->it_time.it_interval); ITIMER_UNLOCK(it); callout_drain(&it->it_callout); ITIMER_LOCK(it); return (0); } static int realtimer_gettime(struct itimer *it, struct itimerspec *ovalue) { struct timespec cts; mtx_assert(&it->it_mtx, MA_OWNED); realtimer_clocktime(it->it_clockid, &cts); *ovalue = it->it_time; if (ovalue->it_value.tv_sec != 0 || ovalue->it_value.tv_nsec != 0) { timespecsub(&ovalue->it_value, &cts, &ovalue->it_value); if (ovalue->it_value.tv_sec < 0 || (ovalue->it_value.tv_sec == 0 && ovalue->it_value.tv_nsec == 0)) { ovalue->it_value.tv_sec = 0; ovalue->it_value.tv_nsec = 1; } } return (0); } static int realtimer_settime(struct itimer *it, int flags, struct itimerspec *value, struct itimerspec *ovalue) { struct timespec cts, ts; struct timeval tv; struct itimerspec val; mtx_assert(&it->it_mtx, MA_OWNED); val = *value; if (itimespecfix(&val.it_value)) return (EINVAL); if (timespecisset(&val.it_value)) { if (itimespecfix(&val.it_interval)) return (EINVAL); } else { timespecclear(&val.it_interval); } if (ovalue != NULL) realtimer_gettime(it, ovalue); it->it_time = val; if (timespecisset(&val.it_value)) { realtimer_clocktime(it->it_clockid, &cts); ts = val.it_value; if ((flags & TIMER_ABSTIME) == 0) { /* Convert to absolute time. */ timespecadd(&it->it_time.it_value, &cts, &it->it_time.it_value); } else { timespecsub(&ts, &cts, &ts); /* * We don't care if ts is negative, tztohz will * fix it. */ } TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire, it); } else { callout_stop(&it->it_callout); } return (0); } static void realtimer_clocktime(clockid_t id, struct timespec *ts) { if (id == CLOCK_REALTIME) getnanotime(ts); else /* CLOCK_MONOTONIC */ getnanouptime(ts); } int itimer_accept(struct proc *p, int timerid, ksiginfo_t *ksi) { struct itimer *it; PROC_LOCK_ASSERT(p, MA_OWNED); it = itimer_find(p, timerid); if (it != NULL) { ksi->ksi_overrun = it->it_overrun; it->it_overrun_last = it->it_overrun; it->it_overrun = 0; ITIMER_UNLOCK(it); return (0); } return (EINVAL); } static int itimespecfix(struct timespec *ts) { if (!timespecvalid_interval(ts)) return (EINVAL); if ((UINT64_MAX - ts->tv_nsec) / NS_PER_SEC < ts->tv_sec) return (EINVAL); if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < tick * 1000) ts->tv_nsec = tick * 1000; return (0); } #define timespectons(tsp) \ ((uint64_t)(tsp)->tv_sec * NS_PER_SEC + (tsp)->tv_nsec) #define timespecfromns(ns) (struct timespec){ \ .tv_sec = (ns) / NS_PER_SEC, \ .tv_nsec = (ns) % NS_PER_SEC \ } static void realtimer_expire_l(struct itimer *it, bool proc_locked) { struct timespec cts, ts; struct timeval tv; struct proc *p; uint64_t interval, now, overruns, value; realtimer_clocktime(it->it_clockid, &cts); /* Only fire if time is reached. */ if (timespeccmp(&cts, &it->it_time.it_value, >=)) { if (timespecisset(&it->it_time.it_interval)) { timespecadd(&it->it_time.it_value, &it->it_time.it_interval, &it->it_time.it_value); interval = timespectons(&it->it_time.it_interval); value = timespectons(&it->it_time.it_value); now = timespectons(&cts); if (now >= value) { /* * We missed at least one period. */ overruns = howmany(now - value + 1, interval); if (it->it_overrun + overruns >= it->it_overrun && it->it_overrun + overruns <= INT_MAX) { it->it_overrun += (int)overruns; } else { it->it_overrun = INT_MAX; it->it_ksi.ksi_errno = ERANGE; } value = now + interval - (now - value) % interval; it->it_time.it_value = timespecfromns(value); } } else { /* single shot timer ? */ timespecclear(&it->it_time.it_value); } p = it->it_proc; if (timespecisset(&it->it_time.it_value)) { if (P_SHOULDSTOP(p) || P_KILLED(p)) { it->it_flags |= ITF_PSTOPPED; } else { timespecsub(&it->it_time.it_value, &cts, &ts); TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire, it); } } itimer_enter(it); ITIMER_UNLOCK(it); if (proc_locked) PROC_UNLOCK(p); itimer_fire(it); if (proc_locked) PROC_LOCK(p); ITIMER_LOCK(it); itimer_leave(it); } else if (timespecisset(&it->it_time.it_value)) { p = it->it_proc; if (P_SHOULDSTOP(p) || P_KILLED(p)) { it->it_flags |= ITF_PSTOPPED; } else { ts = it->it_time.it_value; timespecsub(&ts, &cts, &ts); TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire, it); } } } /* Timeout callback for realtime timer */ static void realtimer_expire(void *arg) { realtimer_expire_l(arg, false); } static void itimer_fire(struct itimer *it) { struct proc *p = it->it_proc; struct thread *td; if (it->it_sigev.sigev_notify == SIGEV_SIGNAL || it->it_sigev.sigev_notify == SIGEV_THREAD_ID) { if (sigev_findtd(p, &it->it_sigev, &td) != 0) { ITIMER_LOCK(it); timespecclear(&it->it_time.it_value); timespecclear(&it->it_time.it_interval); callout_stop(&it->it_callout); ITIMER_UNLOCK(it); return; } if (!KSI_ONQ(&it->it_ksi)) { it->it_ksi.ksi_errno = 0; ksiginfo_set_sigev(&it->it_ksi, &it->it_sigev); tdsendsignal(p, td, it->it_ksi.ksi_signo, &it->it_ksi); } else { if (it->it_overrun < INT_MAX) it->it_overrun++; else it->it_ksi.ksi_errno = ERANGE; } PROC_UNLOCK(p); } } static void itimers_alloc(struct proc *p) { struct itimers *its; its = malloc(sizeof (struct itimers), M_SUBPROC, M_WAITOK | M_ZERO); PROC_LOCK(p); if (p->p_itimers == NULL) { p->p_itimers = its; PROC_UNLOCK(p); } else { PROC_UNLOCK(p); free(its, M_SUBPROC); } } /* Clean up timers when some process events are being triggered. */ static void itimers_event_exit_exec(int start_idx, struct proc *p) { struct itimers *its; struct itimer *it; int i; its = p->p_itimers; if (its == NULL) return; for (i = start_idx; i < TIMER_MAX; ++i) { if ((it = its->its_timers[i]) != NULL) kern_ktimer_delete(curthread, i); } if (its->its_timers[0] == NULL && its->its_timers[1] == NULL && its->its_timers[2] == NULL) { /* Synchronize with itimer_proc_continue(). */ PROC_LOCK(p); p->p_itimers = NULL; PROC_UNLOCK(p); free(its, M_SUBPROC); } } void itimers_exec(struct proc *p) { /* * According to susv3, XSI interval timers should be inherited * by new image. */ itimers_event_exit_exec(3, p); } void itimers_exit(struct proc *p) { itimers_event_exit_exec(0, p); } diff --git a/sys/kern/kern_umtx.c b/sys/kern/kern_umtx.c index c250d890bf05..f847bd771a84 100644 --- a/sys/kern/kern_umtx.c +++ b/sys/kern/kern_umtx.c @@ -1,5088 +1,5087 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015, 2016 The FreeBSD Foundation * Copyright (c) 2004, David Xu * Copyright (c) 2002, Jeffrey Roberson * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_umtx_profiling.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #endif #define _UMUTEX_TRY 1 #define _UMUTEX_WAIT 2 #ifdef UMTX_PROFILING #define UPROF_PERC_BIGGER(w, f, sw, sf) \ (((w) > (sw)) || ((w) == (sw) && (f) > (sf))) #endif #define UMTXQ_LOCKED_ASSERT(uc) mtx_assert(&(uc)->uc_lock, MA_OWNED) #ifdef INVARIANTS #define UMTXQ_ASSERT_LOCKED_BUSY(key) do { \ struct umtxq_chain *uc; \ \ uc = umtxq_getchain(key); \ mtx_assert(&uc->uc_lock, MA_OWNED); \ KASSERT(uc->uc_busy != 0, ("umtx chain is not busy")); \ } while (0) #else #define UMTXQ_ASSERT_LOCKED_BUSY(key) do {} while (0) #endif /* * Don't propagate time-sharing priority, there is a security reason, * a user can simply introduce PI-mutex, let thread A lock the mutex, * and let another thread B block on the mutex, because B is * sleeping, its priority will be boosted, this causes A's priority to * be boosted via priority propagating too and will never be lowered even * if it is using 100%CPU, this is unfair to other processes. */ #define UPRI(td) (((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\ (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\ PRI_MAX_TIMESHARE : (td)->td_user_pri) #define GOLDEN_RATIO_PRIME 2654404609U #ifndef UMTX_CHAINS #define UMTX_CHAINS 512 #endif #define UMTX_SHIFTS (__WORD_BIT - 9) #define GET_SHARE(flags) \ (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE) #define BUSY_SPINS 200 struct umtx_copyops { int (*copyin_timeout)(const void *uaddr, struct timespec *tsp); int (*copyin_umtx_time)(const void *uaddr, size_t size, struct _umtx_time *tp); int (*copyin_robust_lists)(const void *uaddr, size_t size, struct umtx_robust_lists_params *rbp); int (*copyout_timeout)(void *uaddr, size_t size, struct timespec *tsp); const size_t timespec_sz; const size_t umtx_time_sz; const bool compat32; }; _Static_assert(sizeof(struct umutex) == sizeof(struct umutex32), "umutex32"); _Static_assert(__offsetof(struct umutex, m_spare[0]) == __offsetof(struct umutex32, m_spare[0]), "m_spare32"); int umtx_shm_vnobj_persistent = 0; SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_vnode_persistent, CTLFLAG_RWTUN, &umtx_shm_vnobj_persistent, 0, "False forces destruction of umtx attached to file, on last close"); static int umtx_max_rb = 1000; SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_max_robust, CTLFLAG_RWTUN, &umtx_max_rb, 0, "Maximum number of robust mutexes allowed for each thread"); static uma_zone_t umtx_pi_zone; static struct umtxq_chain umtxq_chains[2][UMTX_CHAINS]; static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory"); static int umtx_pi_allocated; static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "umtx debug"); SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD, &umtx_pi_allocated, 0, "Allocated umtx_pi"); static int umtx_verbose_rb = 1; SYSCTL_INT(_debug_umtx, OID_AUTO, robust_faults_verbose, CTLFLAG_RWTUN, &umtx_verbose_rb, 0, ""); #ifdef UMTX_PROFILING static long max_length; SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length"); static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "umtx chain stats"); #endif static inline void umtx_abs_timeout_init2(struct umtx_abs_timeout *timo, const struct _umtx_time *umtxtime); static int umtx_abs_timeout_gethz(struct umtx_abs_timeout *timo); static inline void umtx_abs_timeout_update(struct umtx_abs_timeout *timo); static void umtx_shm_init(void); static void umtxq_sysinit(void *); static void umtxq_hash(struct umtx_key *key); static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb); static void umtx_thread_cleanup(struct thread *td); SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL); #define umtxq_signal(key, nwake) umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE) static struct mtx umtx_lock; #ifdef UMTX_PROFILING static void umtx_init_profiling(void) { struct sysctl_oid *chain_oid; char chain_name[10]; int i; for (i = 0; i < UMTX_CHAINS; ++i) { snprintf(chain_name, sizeof(chain_name), "%d", i); chain_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO, chain_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "umtx hash stats"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL); } } static int sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS) { char buf[512]; struct sbuf sb; struct umtxq_chain *uc; u_int fract, i, j, tot, whole; u_int sf0, sf1, sf2, sf3, sf4; u_int si0, si1, si2, si3, si4; u_int sw0, sw1, sw2, sw3, sw4; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); for (i = 0; i < 2; i++) { tot = 0; for (j = 0; j < UMTX_CHAINS; ++j) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); tot += uc->max_length; mtx_unlock(&uc->uc_lock); } if (tot == 0) sbuf_printf(&sb, "%u) Empty ", i); else { sf0 = sf1 = sf2 = sf3 = sf4 = 0; si0 = si1 = si2 = si3 = si4 = 0; sw0 = sw1 = sw2 = sw3 = sw4 = 0; for (j = 0; j < UMTX_CHAINS; j++) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); whole = uc->max_length * 100; mtx_unlock(&uc->uc_lock); fract = (whole % tot) * 100; if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) { sf0 = fract; si0 = j; sw0 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw1, sf1)) { sf1 = fract; si1 = j; sw1 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw2, sf2)) { sf2 = fract; si2 = j; sw2 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw3, sf3)) { sf3 = fract; si3 = j; sw3 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw4, sf4)) { sf4 = fract; si4 = j; sw4 = whole; } } sbuf_printf(&sb, "queue %u:\n", i); sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot, sf0 / tot, si0); sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot, sf1 / tot, si1); sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot, sf2 / tot, si2); sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot, sf3 / tot, si3); sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot, sf4 / tot, si4); } } sbuf_trim(&sb); sbuf_finish(&sb); sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (0); } static int sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS) { struct umtxq_chain *uc; u_int i, j; int clear, error; clear = 0; error = sysctl_handle_int(oidp, &clear, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (clear != 0) { for (i = 0; i < 2; ++i) { for (j = 0; j < UMTX_CHAINS; ++j) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); uc->length = 0; uc->max_length = 0; mtx_unlock(&uc->uc_lock); } } } return (0); } SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics"); SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length"); #endif static void umtxq_sysinit(void *arg __unused) { int i, j; umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); for (i = 0; i < 2; ++i) { for (j = 0; j < UMTX_CHAINS; ++j) { mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL, MTX_DEF | MTX_DUPOK); LIST_INIT(&umtxq_chains[i][j].uc_queue[0]); LIST_INIT(&umtxq_chains[i][j].uc_queue[1]); LIST_INIT(&umtxq_chains[i][j].uc_spare_queue); TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list); umtxq_chains[i][j].uc_busy = 0; umtxq_chains[i][j].uc_waiters = 0; #ifdef UMTX_PROFILING umtxq_chains[i][j].length = 0; umtxq_chains[i][j].max_length = 0; #endif } } #ifdef UMTX_PROFILING umtx_init_profiling(); #endif mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF); umtx_shm_init(); } struct umtx_q * umtxq_alloc(void) { struct umtx_q *uq; uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO); uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO); TAILQ_INIT(&uq->uq_spare_queue->head); TAILQ_INIT(&uq->uq_pi_contested); uq->uq_inherited_pri = PRI_MAX; return (uq); } void umtxq_free(struct umtx_q *uq) { MPASS(uq->uq_spare_queue != NULL); free(uq->uq_spare_queue, M_UMTX); free(uq, M_UMTX); } static inline void umtxq_hash(struct umtx_key *key) { unsigned n; n = (uintptr_t)key->info.both.a + key->info.both.b; key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS; } struct umtxq_chain * umtxq_getchain(struct umtx_key *key) { if (key->type <= TYPE_SEM) return (&umtxq_chains[1][key->hash]); return (&umtxq_chains[0][key->hash]); } /* * Set chain to busy state when following operation * may be blocked (kernel mutex can not be used). */ void umtxq_busy(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_assert(&uc->uc_lock, MA_OWNED); if (uc->uc_busy) { #ifdef SMP if (smp_cpus > 1) { int count = BUSY_SPINS; if (count > 0) { umtxq_unlock(key); while (uc->uc_busy && --count > 0) cpu_spinwait(); umtxq_lock(key); } } #endif while (uc->uc_busy) { uc->uc_waiters++; msleep(uc, &uc->uc_lock, 0, "umtxqb", 0); uc->uc_waiters--; } } uc->uc_busy = 1; } /* * Unbusy a chain. */ void umtxq_unbusy(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_assert(&uc->uc_lock, MA_OWNED); KASSERT(uc->uc_busy != 0, ("not busy")); uc->uc_busy = 0; if (uc->uc_waiters) wakeup_one(uc); } void umtxq_unbusy_unlocked(struct umtx_key *key) { umtxq_lock(key); umtxq_unbusy(key); umtxq_unlock(key); } static struct umtxq_queue * umtxq_queue_lookup(struct umtx_key *key, int q) { struct umtxq_queue *uh; struct umtxq_chain *uc; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); LIST_FOREACH(uh, &uc->uc_queue[q], link) { if (umtx_key_match(&uh->key, key)) return (uh); } return (NULL); } void umtxq_insert_queue(struct umtx_q *uq, int q) { struct umtxq_queue *uh; struct umtxq_chain *uc; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue")); uh = umtxq_queue_lookup(&uq->uq_key, q); if (uh != NULL) { LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link); } else { uh = uq->uq_spare_queue; uh->key = uq->uq_key; LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link); #ifdef UMTX_PROFILING uc->length++; if (uc->length > uc->max_length) { uc->max_length = uc->length; if (uc->max_length > max_length) max_length = uc->max_length; } #endif } uq->uq_spare_queue = NULL; TAILQ_INSERT_TAIL(&uh->head, uq, uq_link); uh->length++; uq->uq_flags |= UQF_UMTXQ; uq->uq_cur_queue = uh; return; } void umtxq_remove_queue(struct umtx_q *uq, int q) { struct umtxq_chain *uc; struct umtxq_queue *uh; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); if (uq->uq_flags & UQF_UMTXQ) { uh = uq->uq_cur_queue; TAILQ_REMOVE(&uh->head, uq, uq_link); uh->length--; uq->uq_flags &= ~UQF_UMTXQ; if (TAILQ_EMPTY(&uh->head)) { KASSERT(uh->length == 0, ("inconsistent umtxq_queue length")); #ifdef UMTX_PROFILING uc->length--; #endif LIST_REMOVE(uh, link); } else { uh = LIST_FIRST(&uc->uc_spare_queue); KASSERT(uh != NULL, ("uc_spare_queue is empty")); LIST_REMOVE(uh, link); } uq->uq_spare_queue = uh; uq->uq_cur_queue = NULL; } } /* * Check if there are multiple waiters */ int umtxq_count(struct umtx_key *key) { struct umtxq_queue *uh; UMTXQ_LOCKED_ASSERT(umtxq_getchain(key)); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); if (uh != NULL) return (uh->length); return (0); } /* * Check if there are multiple PI waiters and returns first * waiter. */ static int umtxq_count_pi(struct umtx_key *key, struct umtx_q **first) { struct umtxq_queue *uh; *first = NULL; UMTXQ_LOCKED_ASSERT(umtxq_getchain(key)); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); if (uh != NULL) { *first = TAILQ_FIRST(&uh->head); return (uh->length); } return (0); } /* * Wake up threads waiting on an userland object by a bit mask. */ int umtxq_signal_mask(struct umtx_key *key, int n_wake, u_int bitset) { struct umtxq_queue *uh; struct umtx_q *uq, *uq_temp; int ret; ret = 0; UMTXQ_LOCKED_ASSERT(umtxq_getchain(key)); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); if (uh == NULL) return (0); TAILQ_FOREACH_SAFE(uq, &uh->head, uq_link, uq_temp) { if ((uq->uq_bitset & bitset) == 0) continue; umtxq_remove_queue(uq, UMTX_SHARED_QUEUE); wakeup_one(uq); if (++ret >= n_wake) break; } return (ret); } /* * Wake up threads waiting on an userland object. */ static int umtxq_signal_queue(struct umtx_key *key, int n_wake, int q) { struct umtxq_queue *uh; struct umtx_q *uq; int ret; ret = 0; UMTXQ_LOCKED_ASSERT(umtxq_getchain(key)); uh = umtxq_queue_lookup(key, q); if (uh != NULL) { while ((uq = TAILQ_FIRST(&uh->head)) != NULL) { umtxq_remove_queue(uq, q); wakeup(uq); if (++ret >= n_wake) return (ret); } } return (ret); } /* * Wake up specified thread. */ static inline void umtxq_signal_thread(struct umtx_q *uq) { UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key)); umtxq_remove(uq); wakeup(uq); } /* * Wake up a maximum of n_wake threads that are waiting on an userland * object identified by key. The remaining threads are removed from queue * identified by key and added to the queue identified by key2 (requeued). * The n_requeue specifies an upper limit on the number of threads that * are requeued to the second queue. */ int umtxq_requeue(struct umtx_key *key, int n_wake, struct umtx_key *key2, int n_requeue) { struct umtxq_queue *uh, *uh2; struct umtx_q *uq, *uq_temp; int ret; ret = 0; UMTXQ_LOCKED_ASSERT(umtxq_getchain(key)); UMTXQ_LOCKED_ASSERT(umtxq_getchain(key2)); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); uh2 = umtxq_queue_lookup(key2, UMTX_SHARED_QUEUE); if (uh == NULL) return (0); TAILQ_FOREACH_SAFE(uq, &uh->head, uq_link, uq_temp) { if (++ret <= n_wake) { umtxq_remove(uq); wakeup_one(uq); } else { umtxq_remove(uq); uq->uq_key = *key2; umtxq_insert(uq); if (ret - n_wake == n_requeue) break; } } return (ret); } static inline int tstohz(const struct timespec *tsp) { struct timeval tv; TIMESPEC_TO_TIMEVAL(&tv, tsp); return tvtohz(&tv); } void umtx_abs_timeout_init(struct umtx_abs_timeout *timo, int clockid, int absolute, const struct timespec *timeout) { timo->clockid = clockid; if (!absolute) { timo->is_abs_real = false; umtx_abs_timeout_update(timo); timespecadd(&timo->cur, timeout, &timo->end); } else { timo->end = *timeout; timo->is_abs_real = clockid == CLOCK_REALTIME || clockid == CLOCK_REALTIME_FAST || clockid == CLOCK_REALTIME_PRECISE; /* * If is_abs_real, umtxq_sleep will read the clock * after setting td_rtcgen; otherwise, read it here. */ if (!timo->is_abs_real) { umtx_abs_timeout_update(timo); } } } static void umtx_abs_timeout_init2(struct umtx_abs_timeout *timo, const struct _umtx_time *umtxtime) { umtx_abs_timeout_init(timo, umtxtime->_clockid, (umtxtime->_flags & UMTX_ABSTIME) != 0, &umtxtime->_timeout); } static void umtx_abs_timeout_update(struct umtx_abs_timeout *timo) { kern_clock_gettime(curthread, timo->clockid, &timo->cur); } static int umtx_abs_timeout_gethz(struct umtx_abs_timeout *timo) { struct timespec tts; if (timespeccmp(&timo->end, &timo->cur, <=)) return (-1); timespecsub(&timo->end, &timo->cur, &tts); return (tstohz(&tts)); } static uint32_t umtx_unlock_val(uint32_t flags, bool rb) { if (rb) return (UMUTEX_RB_OWNERDEAD); else if ((flags & UMUTEX_NONCONSISTENT) != 0) return (UMUTEX_RB_NOTRECOV); else return (UMUTEX_UNOWNED); } /* * Put thread into sleep state, before sleeping, check if * thread was removed from umtx queue. */ int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct umtx_abs_timeout *abstime) { struct umtxq_chain *uc; int error, timo; if (abstime != NULL && abstime->is_abs_real) { curthread->td_rtcgen = atomic_load_acq_int(&rtc_generation); umtx_abs_timeout_update(abstime); } uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); for (;;) { if (!(uq->uq_flags & UQF_UMTXQ)) { error = 0; break; } if (abstime != NULL) { timo = umtx_abs_timeout_gethz(abstime); if (timo < 0) { error = ETIMEDOUT; break; } } else timo = 0; error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo); if (error == EINTR || error == ERESTART) { umtxq_lock(&uq->uq_key); break; } if (abstime != NULL) { if (abstime->is_abs_real) curthread->td_rtcgen = atomic_load_acq_int(&rtc_generation); umtx_abs_timeout_update(abstime); } umtxq_lock(&uq->uq_key); } curthread->td_rtcgen = 0; return (error); } /* * Convert userspace address into unique logical address. */ int umtx_key_get(const void *addr, int type, int share, struct umtx_key *key) { struct thread *td = curthread; vm_map_t map; vm_map_entry_t entry; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; key->type = type; if (share == THREAD_SHARE) { key->shared = 0; key->info.private.vs = td->td_proc->p_vmspace; key->info.private.addr = (uintptr_t)addr; } else { MPASS(share == PROCESS_SHARE || share == AUTO_SHARE); map = &td->td_proc->p_vmspace->vm_map; if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE, &entry, &key->info.shared.object, &pindex, &prot, &wired) != KERN_SUCCESS) { return (EFAULT); } if ((share == PROCESS_SHARE) || (share == AUTO_SHARE && VM_INHERIT_SHARE == entry->inheritance)) { key->shared = 1; key->info.shared.offset = (vm_offset_t)addr - entry->start + entry->offset; vm_object_reference(key->info.shared.object); } else { key->shared = 0; key->info.private.vs = td->td_proc->p_vmspace; key->info.private.addr = (uintptr_t)addr; } vm_map_lookup_done(map, entry); } umtxq_hash(key); return (0); } /* * Release key. */ void umtx_key_release(struct umtx_key *key) { if (key->shared) vm_object_deallocate(key->info.shared.object); } #ifdef COMPAT_FREEBSD10 /* * Lock a umtx object. */ static int do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, const struct timespec *timeout) { struct umtx_abs_timeout timo; struct umtx_q *uq; u_long owner; u_long old; int error = 0; uq = td->td_umtxq; if (timeout != NULL) umtx_abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout); /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { /* * Try the uncontested case. This should be done in userland. */ owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id); /* The acquire succeeded. */ if (owner == UMTX_UNOWNED) return (0); /* The address was invalid. */ if (owner == -1) return (EFAULT); /* If no one owns it but it is contested try to acquire it. */ if (owner == UMTX_CONTESTED) { owner = casuword(&umtx->u_owner, UMTX_CONTESTED, id | UMTX_CONTESTED); if (owner == UMTX_CONTESTED) return (0); /* The address was invalid. */ if (owner == -1) return (EFAULT); error = thread_check_susp(td, false); if (error != 0) break; /* If this failed the lock has changed, restart. */ continue; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE, &uq->uq_key)) != 0) return (error); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED); /* The address was invalid. */ if (old == -1) { umtxq_lock(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); if (old == owner) error = umtxq_sleep(uq, "umtx", timeout == NULL ? NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0) error = thread_check_susp(td, false); } if (timeout == NULL) { /* Mutex locking is restarted if it is interrupted. */ if (error == EINTR) error = ERESTART; } else { /* Timed-locking is not restarted. */ if (error == ERESTART) error = EINTR; } return (error); } /* * Unlock a umtx object. */ static int do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id) { struct umtx_key key; u_long owner; u_long old; int error; int count; /* * Make sure we own this mtx. */ owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner)); if (owner == -1) return (EFAULT); if ((owner & ~UMTX_CONTESTED) != id) return (EPERM); /* This should be done in userland */ if ((owner & UMTX_CONTESTED) == 0) { old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED); if (old == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE, &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ old = casuword(&umtx->u_owner, owner, count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED); umtxq_lock(&key); umtxq_signal(&key,1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); if (old == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } #ifdef COMPAT_FREEBSD32 /* * Lock a umtx object. */ static int do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, const struct timespec *timeout) { struct umtx_abs_timeout timo; struct umtx_q *uq; uint32_t owner; uint32_t old; int error = 0; uq = td->td_umtxq; if (timeout != NULL) umtx_abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout); /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { /* * Try the uncontested case. This should be done in userland. */ owner = casuword32(m, UMUTEX_UNOWNED, id); /* The acquire succeeded. */ if (owner == UMUTEX_UNOWNED) return (0); /* The address was invalid. */ if (owner == -1) return (EFAULT); /* If no one owns it but it is contested try to acquire it. */ if (owner == UMUTEX_CONTESTED) { owner = casuword32(m, UMUTEX_CONTESTED, id | UMUTEX_CONTESTED); if (owner == UMUTEX_CONTESTED) return (0); /* The address was invalid. */ if (owner == -1) return (EFAULT); error = thread_check_susp(td, false); if (error != 0) break; /* If this failed the lock has changed, restart. */ continue; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) return (error); if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE, &uq->uq_key)) != 0) return (error); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ old = casuword32(m, owner, owner | UMUTEX_CONTESTED); /* The address was invalid. */ if (old == -1) { umtxq_lock(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); if (old == owner) error = umtxq_sleep(uq, "umtx", timeout == NULL ? NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0) error = thread_check_susp(td, false); } if (timeout == NULL) { /* Mutex locking is restarted if it is interrupted. */ if (error == EINTR) error = ERESTART; } else { /* Timed-locking is not restarted. */ if (error == ERESTART) error = EINTR; } return (error); } /* * Unlock a umtx object. */ static int do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id) { struct umtx_key key; uint32_t owner; uint32_t old; int error; int count; /* * Make sure we own this mtx. */ owner = fuword32(m); if (owner == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); /* This should be done in userland */ if ((owner & UMUTEX_CONTESTED) == 0) { old = casuword32(m, owner, UMUTEX_UNOWNED); if (old == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE, &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ old = casuword32(m, owner, count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED); umtxq_lock(&key); umtxq_signal(&key,1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); if (old == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } #endif /* COMPAT_FREEBSD32 */ #endif /* COMPAT_FREEBSD10 */ /* * Fetch and compare value, sleep on the address if value is not changed. */ static int do_wait(struct thread *td, void *addr, u_long id, struct _umtx_time *timeout, int compat32, int is_private) { struct umtx_abs_timeout timo; struct umtx_q *uq; u_long tmp; uint32_t tmp32; int error = 0; uq = td->td_umtxq; if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0) return (error); if (timeout != NULL) umtx_abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); if (compat32 == 0) { error = fueword(addr, &tmp); if (error != 0) error = EFAULT; } else { error = fueword32(addr, &tmp32); if (error == 0) tmp = tmp32; else error = EFAULT; } umtxq_lock(&uq->uq_key); if (error == 0) { if (tmp == id) error = umtxq_sleep(uq, "uwait", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else umtxq_remove(uq); } else if ((uq->uq_flags & UQF_UMTXQ) != 0) { umtxq_remove(uq); } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } /* * Wake up threads sleeping on the specified address. */ int kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private) { struct umtx_key key; int ret; if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0) return (ret); umtxq_lock(&key); umtxq_signal(&key, n_wake); umtxq_unlock(&key); umtx_key_release(&key); return (0); } /* * Lock PTHREAD_PRIO_NONE protocol POSIX mutex. */ static int do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int mode) { struct umtx_abs_timeout timo; struct umtx_q *uq; uint32_t owner, old, id; int error, rv; id = td->td_tid; uq = td->td_umtxq; error = 0; if (timeout != NULL) umtx_abs_timeout_init2(&timo, timeout); /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { rv = fueword32(&m->m_owner, &owner); if (rv == -1) return (EFAULT); if (mode == _UMUTEX_WAIT) { if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV) return (0); } else { /* * Robust mutex terminated. Kernel duty is to * return EOWNERDEAD to the userspace. The * umutex.m_flags UMUTEX_NONCONSISTENT is set * by the common userspace code. */ if (owner == UMUTEX_RB_OWNERDEAD) { rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD, &owner, id | UMUTEX_CONTESTED); if (rv == -1) return (EFAULT); if (rv == 0) { MPASS(owner == UMUTEX_RB_OWNERDEAD); return (EOWNERDEAD); /* success */ } MPASS(rv == 1); rv = thread_check_susp(td, false); if (rv != 0) return (rv); continue; } if (owner == UMUTEX_RB_NOTRECOV) return (ENOTRECOVERABLE); /* * Try the uncontested case. This should be * done in userland. */ rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id); /* The address was invalid. */ if (rv == -1) return (EFAULT); /* The acquire succeeded. */ if (rv == 0) { MPASS(owner == UMUTEX_UNOWNED); return (0); } /* * If no one owns it but it is contested try * to acquire it. */ MPASS(rv == 1); if (owner == UMUTEX_CONTESTED) { rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) return (EFAULT); if (rv == 0) { MPASS(owner == UMUTEX_CONTESTED); return (0); } if (rv == 1) { rv = thread_check_susp(td, false); if (rv != 0) return (rv); } /* * If this failed the lock has * changed, restart. */ continue; } /* rv == 1 but not contested, likely store failure */ rv = thread_check_susp(td, false); if (rv != 0) return (rv); } if (mode == _UMUTEX_TRY) return (EBUSY); /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) return (error); if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ rv = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); /* The address was invalid or casueword failed to store. */ if (rv == -1 || rv == 1) { umtxq_lock(&uq->uq_key); umtxq_remove(uq); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (rv == -1) return (EFAULT); if (rv == 1) { rv = thread_check_susp(td, false); if (rv != 0) return (rv); } continue; } /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); MPASS(old == owner); error = umtxq_sleep(uq, "umtxn", timeout == NULL ? NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0) error = thread_check_susp(td, false); } return (0); } /* * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex. */ static int do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags, bool rb) { struct umtx_key key; uint32_t owner, old, id, newlock; int error, count; id = td->td_tid; again: /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); newlock = umtx_unlock_val(flags, rb); if ((owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, newlock); if (error == -1) return (EFAULT); if (error == 1) { error = thread_check_susp(td, false); if (error != 0) return (error); goto again; } MPASS(old == owner); return (0); } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ if (count > 1) newlock |= UMUTEX_CONTESTED; error = casueword32(&m->m_owner, owner, &old, newlock); umtxq_lock(&key); umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); if (error == -1) return (EFAULT); if (error == 1) { if (old != owner) return (EINVAL); error = thread_check_susp(td, false); if (error != 0) return (error); goto again; } return (0); } /* * Check if the mutex is available and wake up a waiter, * only for simple mutex. */ static int do_wake_umutex(struct thread *td, struct umutex *m) { struct umtx_key key; uint32_t owner; uint32_t flags; int error; int count; again: error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != 0 && owner != UMUTEX_RB_OWNERDEAD && owner != UMUTEX_RB_NOTRECOV) return (0); error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); if (count <= 1 && owner != UMUTEX_RB_OWNERDEAD && owner != UMUTEX_RB_NOTRECOV) { error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, UMUTEX_UNOWNED); if (error == -1) { error = EFAULT; } else if (error == 1) { umtxq_lock(&key); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); error = thread_check_susp(td, false); if (error != 0) return (error); goto again; } } umtxq_lock(&key); if (error == 0 && count != 0) { MPASS((owner & ~UMUTEX_CONTESTED) == 0 || owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV); umtxq_signal(&key, 1); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } /* * Check if the mutex has waiters and tries to fix contention bit. */ static int do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags) { struct umtx_key key; uint32_t owner, old; int type; int error; int count; switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST)) { case 0: case UMUTEX_ROBUST: type = TYPE_NORMAL_UMUTEX; break; case UMUTEX_PRIO_INHERIT: type = TYPE_PI_UMUTEX; break; case (UMUTEX_PRIO_INHERIT | UMUTEX_ROBUST): type = TYPE_PI_ROBUST_UMUTEX; break; case UMUTEX_PRIO_PROTECT: type = TYPE_PP_UMUTEX; break; case (UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST): type = TYPE_PP_ROBUST_UMUTEX; break; default: return (EINVAL); } if ((error = umtx_key_get(m, type, GET_SHARE(flags), &key)) != 0) return (error); owner = 0; umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); error = fueword32(&m->m_owner, &owner); if (error == -1) error = EFAULT; /* * Only repair contention bit if there is a waiter, this means * the mutex is still being referenced by userland code, * otherwise don't update any memory. */ while (error == 0 && (owner & UMUTEX_CONTESTED) == 0 && (count > 1 || (count == 1 && (owner & ~UMUTEX_CONTESTED) != 0))) { error = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); if (error == -1) { error = EFAULT; break; } if (error == 0) { MPASS(old == owner); break; } owner = old; error = thread_check_susp(td, false); } umtxq_lock(&key); if (error == EFAULT) { umtxq_signal(&key, INT_MAX); } else if (count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 || owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV)) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } struct umtx_pi * umtx_pi_alloc(int flags) { struct umtx_pi *pi; pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags); TAILQ_INIT(&pi->pi_blocked); atomic_add_int(&umtx_pi_allocated, 1); return (pi); } void umtx_pi_free(struct umtx_pi *pi) { uma_zfree(umtx_pi_zone, pi); atomic_add_int(&umtx_pi_allocated, -1); } /* * Adjust the thread's position on a pi_state after its priority has been * changed. */ static int umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td) { struct umtx_q *uq, *uq1, *uq2; struct thread *td1; mtx_assert(&umtx_lock, MA_OWNED); if (pi == NULL) return (0); uq = td->td_umtxq; /* * Check if the thread needs to be moved on the blocked chain. * It needs to be moved if either its priority is lower than * the previous thread or higher than the next thread. */ uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq); uq2 = TAILQ_NEXT(uq, uq_lockq); if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) || (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) { /* * Remove thread from blocked chain and determine where * it should be moved to. */ TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq); TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) { td1 = uq1->uq_thread; MPASS(td1->td_proc->p_magic == P_MAGIC); if (UPRI(td1) > UPRI(td)) break; } if (uq1 == NULL) TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq); else TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq); } return (1); } static struct umtx_pi * umtx_pi_next(struct umtx_pi *pi) { struct umtx_q *uq_owner; if (pi->pi_owner == NULL) return (NULL); uq_owner = pi->pi_owner->td_umtxq; if (uq_owner == NULL) return (NULL); return (uq_owner->uq_pi_blocked); } /* * Floyd's Cycle-Finding Algorithm. */ static bool umtx_pi_check_loop(struct umtx_pi *pi) { struct umtx_pi *pi1; /* fast iterator */ mtx_assert(&umtx_lock, MA_OWNED); if (pi == NULL) return (false); pi1 = pi; for (;;) { pi = umtx_pi_next(pi); if (pi == NULL) break; pi1 = umtx_pi_next(pi1); if (pi1 == NULL) break; pi1 = umtx_pi_next(pi1); if (pi1 == NULL) break; if (pi == pi1) return (true); } return (false); } /* * Propagate priority when a thread is blocked on POSIX * PI mutex. */ static void umtx_propagate_priority(struct thread *td) { struct umtx_q *uq; struct umtx_pi *pi; int pri; mtx_assert(&umtx_lock, MA_OWNED); pri = UPRI(td); uq = td->td_umtxq; pi = uq->uq_pi_blocked; if (pi == NULL) return; if (umtx_pi_check_loop(pi)) return; for (;;) { td = pi->pi_owner; if (td == NULL || td == curthread) return; MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); thread_lock(td); if (td->td_lend_user_pri > pri) sched_lend_user_prio(td, pri); else { thread_unlock(td); break; } thread_unlock(td); /* * Pick up the lock that td is blocked on. */ uq = td->td_umtxq; pi = uq->uq_pi_blocked; if (pi == NULL) break; /* Resort td on the list if needed. */ umtx_pi_adjust_thread(pi, td); } } /* * Unpropagate priority for a PI mutex when a thread blocked on * it is interrupted by signal or resumed by others. */ static void umtx_repropagate_priority(struct umtx_pi *pi) { struct umtx_q *uq, *uq_owner; struct umtx_pi *pi2; int pri; mtx_assert(&umtx_lock, MA_OWNED); if (umtx_pi_check_loop(pi)) return; while (pi != NULL && pi->pi_owner != NULL) { pri = PRI_MAX; uq_owner = pi->pi_owner->td_umtxq; TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) { uq = TAILQ_FIRST(&pi2->pi_blocked); if (uq != NULL) { if (pri > UPRI(uq->uq_thread)) pri = UPRI(uq->uq_thread); } } if (pri > uq_owner->uq_inherited_pri) pri = uq_owner->uq_inherited_pri; thread_lock(pi->pi_owner); sched_lend_user_prio(pi->pi_owner, pri); thread_unlock(pi->pi_owner); if ((pi = uq_owner->uq_pi_blocked) != NULL) umtx_pi_adjust_thread(pi, uq_owner->uq_thread); } } /* * Insert a PI mutex into owned list. */ static void umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner) { struct umtx_q *uq_owner; uq_owner = owner->td_umtxq; mtx_assert(&umtx_lock, MA_OWNED); MPASS(pi->pi_owner == NULL); pi->pi_owner = owner; TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link); } /* * Disown a PI mutex, and remove it from the owned list. */ static void umtx_pi_disown(struct umtx_pi *pi) { mtx_assert(&umtx_lock, MA_OWNED); TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link); pi->pi_owner = NULL; } /* * Claim ownership of a PI mutex. */ int umtx_pi_claim(struct umtx_pi *pi, struct thread *owner) { struct umtx_q *uq; int pri; mtx_lock(&umtx_lock); if (pi->pi_owner == owner) { mtx_unlock(&umtx_lock); return (0); } if (pi->pi_owner != NULL) { /* * userland may have already messed the mutex, sigh. */ mtx_unlock(&umtx_lock); return (EPERM); } umtx_pi_setowner(pi, owner); uq = TAILQ_FIRST(&pi->pi_blocked); if (uq != NULL) { pri = UPRI(uq->uq_thread); thread_lock(owner); if (pri < UPRI(owner)) sched_lend_user_prio(owner, pri); thread_unlock(owner); } mtx_unlock(&umtx_lock); return (0); } /* * Adjust a thread's order position in its blocked PI mutex, * this may result new priority propagating process. */ void umtx_pi_adjust(struct thread *td, u_char oldpri) { struct umtx_q *uq; struct umtx_pi *pi; uq = td->td_umtxq; mtx_lock(&umtx_lock); /* * Pick up the lock that td is blocked on. */ pi = uq->uq_pi_blocked; if (pi != NULL) { umtx_pi_adjust_thread(pi, td); umtx_repropagate_priority(pi); } mtx_unlock(&umtx_lock); } /* * Sleep on a PI mutex. */ int umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner, const char *wmesg, struct umtx_abs_timeout *timo, bool shared) { struct thread *td, *td1; struct umtx_q *uq1; int error, pri; #ifdef INVARIANTS struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); #endif error = 0; td = uq->uq_thread; KASSERT(td == curthread, ("inconsistent uq_thread")); UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key)); KASSERT(uc->uc_busy != 0, ("umtx chain is not busy")); umtxq_insert(uq); mtx_lock(&umtx_lock); if (pi->pi_owner == NULL) { mtx_unlock(&umtx_lock); td1 = tdfind(owner, shared ? -1 : td->td_proc->p_pid); mtx_lock(&umtx_lock); if (td1 != NULL) { if (pi->pi_owner == NULL) umtx_pi_setowner(pi, td1); PROC_UNLOCK(td1->td_proc); } } TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) { pri = UPRI(uq1->uq_thread); if (pri > UPRI(td)) break; } if (uq1 != NULL) TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq); else TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq); uq->uq_pi_blocked = pi; thread_lock(td); td->td_flags |= TDF_UPIBLOCKED; thread_unlock(td); umtx_propagate_priority(td); mtx_unlock(&umtx_lock); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, wmesg, timo); umtxq_remove(uq); mtx_lock(&umtx_lock); uq->uq_pi_blocked = NULL; thread_lock(td); td->td_flags &= ~TDF_UPIBLOCKED; thread_unlock(td); TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq); umtx_repropagate_priority(pi); mtx_unlock(&umtx_lock); umtxq_unlock(&uq->uq_key); return (error); } /* * Add reference count for a PI mutex. */ void umtx_pi_ref(struct umtx_pi *pi) { UMTXQ_LOCKED_ASSERT(umtxq_getchain(&pi->pi_key)); pi->pi_refcount++; } /* * Decrease reference count for a PI mutex, if the counter * is decreased to zero, its memory space is freed. */ void umtx_pi_unref(struct umtx_pi *pi) { struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); KASSERT(pi->pi_refcount > 0, ("invalid reference count")); if (--pi->pi_refcount == 0) { mtx_lock(&umtx_lock); if (pi->pi_owner != NULL) umtx_pi_disown(pi); KASSERT(TAILQ_EMPTY(&pi->pi_blocked), ("blocked queue not empty")); mtx_unlock(&umtx_lock); TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink); umtx_pi_free(pi); } } /* * Find a PI mutex in hash table. */ struct umtx_pi * umtx_pi_lookup(struct umtx_key *key) { struct umtxq_chain *uc; struct umtx_pi *pi; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) { if (umtx_key_match(&pi->pi_key, key)) { return (pi); } } return (NULL); } /* * Insert a PI mutex into hash table. */ void umtx_pi_insert(struct umtx_pi *pi) { struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink); } /* * Drop a PI mutex and wakeup a top waiter. */ int umtx_pi_drop(struct thread *td, struct umtx_key *key, bool rb, int *count) { struct umtx_q *uq_first, *uq_first2, *uq_me; struct umtx_pi *pi, *pi2; int pri; UMTXQ_ASSERT_LOCKED_BUSY(key); *count = umtxq_count_pi(key, &uq_first); if (uq_first != NULL) { mtx_lock(&umtx_lock); pi = uq_first->uq_pi_blocked; KASSERT(pi != NULL, ("pi == NULL?")); if (pi->pi_owner != td && !(rb && pi->pi_owner == NULL)) { mtx_unlock(&umtx_lock); /* userland messed the mutex */ return (EPERM); } uq_me = td->td_umtxq; if (pi->pi_owner == td) umtx_pi_disown(pi); /* get highest priority thread which is still sleeping. */ uq_first = TAILQ_FIRST(&pi->pi_blocked); while (uq_first != NULL && (uq_first->uq_flags & UQF_UMTXQ) == 0) { uq_first = TAILQ_NEXT(uq_first, uq_lockq); } pri = PRI_MAX; TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) { uq_first2 = TAILQ_FIRST(&pi2->pi_blocked); if (uq_first2 != NULL) { if (pri > UPRI(uq_first2->uq_thread)) pri = UPRI(uq_first2->uq_thread); } } thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); if (uq_first) umtxq_signal_thread(uq_first); } else { pi = umtx_pi_lookup(key); /* * A umtx_pi can exist if a signal or timeout removed the * last waiter from the umtxq, but there is still * a thread in do_lock_pi() holding the umtx_pi. */ if (pi != NULL) { /* * The umtx_pi can be unowned, such as when a thread * has just entered do_lock_pi(), allocated the * umtx_pi, and unlocked the umtxq. * If the current thread owns it, it must disown it. */ mtx_lock(&umtx_lock); if (pi->pi_owner == td) umtx_pi_disown(pi); mtx_unlock(&umtx_lock); } } return (0); } /* * Lock a PI mutex. */ static int do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int try) { struct umtx_abs_timeout timo; struct umtx_q *uq; struct umtx_pi *pi, *new_pi; uint32_t id, old_owner, owner, old; int error, rv; id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); if (timeout != NULL) umtx_abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); pi = umtx_pi_lookup(&uq->uq_key); if (pi == NULL) { new_pi = umtx_pi_alloc(M_NOWAIT); if (new_pi == NULL) { umtxq_unlock(&uq->uq_key); new_pi = umtx_pi_alloc(M_WAITOK); umtxq_lock(&uq->uq_key); pi = umtx_pi_lookup(&uq->uq_key); if (pi != NULL) { umtx_pi_free(new_pi); new_pi = NULL; } } if (new_pi != NULL) { new_pi->pi_key = uq->uq_key; umtx_pi_insert(new_pi); pi = new_pi; } } umtx_pi_ref(pi); umtxq_unlock(&uq->uq_key); /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { /* * Try the uncontested case. This should be done in userland. */ rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } /* The acquire succeeded. */ if (rv == 0) { MPASS(owner == UMUTEX_UNOWNED); error = 0; break; } if (owner == UMUTEX_RB_NOTRECOV) { error = ENOTRECOVERABLE; break; } /* * Nobody owns it, but the acquire failed. This can happen * with ll/sc atomics. */ if (owner == UMUTEX_UNOWNED) { error = thread_check_susp(td, true); if (error != 0) break; continue; } /* * Avoid overwriting a possible error from sleep due * to the pending signal with suspension check result. */ if (error == 0) { error = thread_check_susp(td, true); if (error != 0) break; } /* If no one owns it but it is contested try to acquire it. */ if (owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD) { old_owner = owner; rv = casueword32(&m->m_owner, owner, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } if (rv == 1) { if (error == 0) { error = thread_check_susp(td, true); if (error != 0) break; } /* * If this failed the lock could * changed, restart. */ continue; } MPASS(rv == 0); MPASS(owner == old_owner); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); error = umtx_pi_claim(pi, td); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); if (error != 0) { /* * Since we're going to return an * error, restore the m_owner to its * previous, unowned state to avoid * compounding the problem. */ (void)casuword32(&m->m_owner, id | UMUTEX_CONTESTED, old_owner); } if (error == 0 && old_owner == UMUTEX_RB_OWNERDEAD) error = EOWNERDEAD; break; } if ((owner & ~UMUTEX_CONTESTED) == id) { error = EDEADLK; break; } if (try != 0) { error = EBUSY; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ rv = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } if (rv == 1) { umtxq_unbusy_unlocked(&uq->uq_key); error = thread_check_susp(td, true); if (error != 0) break; /* * The lock changed and we need to retry or we * lost a race to the thread unlocking the * umtx. Note that the UMUTEX_RB_OWNERDEAD * value for owner is impossible there. */ continue; } umtxq_lock(&uq->uq_key); /* We set the contested bit, sleep. */ MPASS(old == owner); error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED, "umtxpi", timeout == NULL ? NULL : &timo, (flags & USYNC_PROCESS_SHARED) != 0); if (error != 0) continue; error = thread_check_susp(td, false); if (error != 0) break; } umtxq_lock(&uq->uq_key); umtx_pi_unref(pi); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Unlock a PI mutex. */ static int do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags, bool rb) { struct umtx_key key; uint32_t id, new_owner, old, owner; int count, error; id = td->td_tid; usrloop: /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); new_owner = umtx_unlock_val(flags, rb); /* This should be done in userland */ if ((owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, new_owner); if (error == -1) return (EFAULT); if (error == 1) { error = thread_check_susp(td, true); if (error != 0) return (error); goto usrloop; } if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); error = umtx_pi_drop(td, &key, rb, &count); if (error != 0) { umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); /* userland messed the mutex */ return (error); } umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ if (count > 1) new_owner |= UMUTEX_CONTESTED; again: error = casueword32(&m->m_owner, owner, &old, new_owner); if (error == 1) { error = thread_check_susp(td, false); if (error == 0) goto again; } umtxq_unbusy_unlocked(&key); umtx_key_release(&key); if (error == -1) return (EFAULT); if (error == 0 && old != owner) return (EINVAL); return (error); } /* * Lock a PP mutex. */ static int do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int try) { struct umtx_abs_timeout timo; struct umtx_q *uq, *uq2; struct umtx_pi *pi; uint32_t ceiling; uint32_t owner, id; int error, pri, old_inherited_pri, su, rv; id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); if (timeout != NULL) umtx_abs_timeout_init2(&timo, timeout); su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0); for (;;) { old_inherited_pri = uq->uq_inherited_pri; umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); rv = fueword32(&m->m_ceilings[0], &ceiling); if (rv == -1) { error = EFAULT; goto out; } ceiling = RTP_PRIO_MAX - ceiling; if (ceiling > RTP_PRIO_MAX) { error = EINVAL; goto out; } mtx_lock(&umtx_lock); if (UPRI(td) < PRI_MIN_REALTIME + ceiling) { mtx_unlock(&umtx_lock); error = EINVAL; goto out; } if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) { uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling; thread_lock(td); if (uq->uq_inherited_pri < UPRI(td)) sched_lend_user_prio(td, uq->uq_inherited_pri); thread_unlock(td); } mtx_unlock(&umtx_lock); rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } if (rv == 0) { MPASS(owner == UMUTEX_CONTESTED); error = 0; break; } /* rv == 1 */ if (owner == UMUTEX_RB_OWNERDEAD) { rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD, &owner, id | UMUTEX_CONTESTED); if (rv == -1) { error = EFAULT; break; } if (rv == 0) { MPASS(owner == UMUTEX_RB_OWNERDEAD); error = EOWNERDEAD; /* success */ break; } /* * rv == 1, only check for suspension if we * did not already catched a signal. If we * get an error from the check, the same * condition is checked by the umtxq_sleep() * call below, so we should obliterate the * error to not skip the last loop iteration. */ if (error == 0) { error = thread_check_susp(td, false); if (error == 0) { if (try != 0) error = EBUSY; else continue; } error = 0; } } else if (owner == UMUTEX_RB_NOTRECOV) { error = ENOTRECOVERABLE; } if (try != 0) error = EBUSY; /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "umtxpp", timeout == NULL ? NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); mtx_lock(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } if (error != 0 && error != EOWNERDEAD) { mtx_lock(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } out: umtxq_unbusy_unlocked(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Unlock a PP mutex. */ static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb) { struct umtx_key key; struct umtx_q *uq, *uq2; struct umtx_pi *pi; uint32_t id, owner, rceiling; int error, pri, new_inherited_pri, su; id = td->td_tid; uq = td->td_umtxq; su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0); /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t)); if (error != 0) return (error); if (rceiling == -1) new_inherited_pri = PRI_MAX; else { rceiling = RTP_PRIO_MAX - rceiling; if (rceiling > RTP_PRIO_MAX) return (EINVAL); new_inherited_pri = PRI_MIN_REALTIME + rceiling; } if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); umtxq_unlock(&key); /* * For priority protected mutex, always set unlocked state * to UMUTEX_CONTESTED, so that userland always enters kernel * to lock the mutex, it is necessary because thread priority * has to be adjusted for such mutex. */ error = suword32(&m->m_owner, umtx_unlock_val(flags, rb) | UMUTEX_CONTESTED); umtxq_lock(&key); if (error == 0) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); if (error == -1) error = EFAULT; else { mtx_lock(&umtx_lock); if (su != 0) uq->uq_inherited_pri = new_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } umtx_key_release(&key); return (error); } static int do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling, uint32_t *old_ceiling) { struct umtx_q *uq; uint32_t flags, id, owner, save_ceiling; int error, rv, rv1; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); if ((flags & UMUTEX_PRIO_PROTECT) == 0) return (EINVAL); if (ceiling > RTP_PRIO_MAX) return (EINVAL); id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); for (;;) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); rv = fueword32(&m->m_ceilings[0], &save_ceiling); if (rv == -1) { error = EFAULT; break; } rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); if (rv == -1) { error = EFAULT; break; } if (rv == 0) { MPASS(owner == UMUTEX_CONTESTED); rv = suword32(&m->m_ceilings[0], ceiling); rv1 = suword32(&m->m_owner, UMUTEX_CONTESTED); error = (rv == 0 && rv1 == 0) ? 0: EFAULT; break; } if ((owner & ~UMUTEX_CONTESTED) == id) { rv = suword32(&m->m_ceilings[0], ceiling); error = rv == 0 ? 0 : EFAULT; break; } if (owner == UMUTEX_RB_OWNERDEAD) { error = EOWNERDEAD; break; } else if (owner == UMUTEX_RB_NOTRECOV) { error = ENOTRECOVERABLE; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "umtxpp", NULL); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); } umtxq_lock(&uq->uq_key); if (error == 0) umtxq_signal(&uq->uq_key, INT_MAX); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0 && old_ceiling != NULL) { rv = suword32(old_ceiling, save_ceiling); error = rv == 0 ? 0 : EFAULT; } return (error); } /* * Lock a userland POSIX mutex. */ static int do_lock_umutex(struct thread *td, struct umutex *m, struct _umtx_time *timeout, int mode) { uint32_t flags; int error; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) { case 0: error = do_lock_normal(td, m, flags, timeout, mode); break; case UMUTEX_PRIO_INHERIT: error = do_lock_pi(td, m, flags, timeout, mode); break; case UMUTEX_PRIO_PROTECT: error = do_lock_pp(td, m, flags, timeout, mode); break; default: return (EINVAL); } if (timeout == NULL) { if (error == EINTR && mode != _UMUTEX_WAIT) error = ERESTART; } else { /* Timed-locking is not restarted. */ if (error == ERESTART) error = EINTR; } return (error); } /* * Unlock a userland POSIX mutex. */ static int do_unlock_umutex(struct thread *td, struct umutex *m, bool rb) { uint32_t flags; int error; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) { case 0: return (do_unlock_normal(td, m, flags, rb)); case UMUTEX_PRIO_INHERIT: return (do_unlock_pi(td, m, flags, rb)); case UMUTEX_PRIO_PROTECT: return (do_unlock_pp(td, m, flags, rb)); } return (EINVAL); } static int do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m, struct timespec *timeout, u_long wflags) { struct umtx_abs_timeout timo; struct umtx_q *uq; uint32_t flags, clockid, hasw; int error; uq = td->td_umtxq; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if ((wflags & CVWAIT_CLOCKID) != 0) { error = fueword32(&cv->c_clockid, &clockid); if (error == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (clockid < CLOCK_REALTIME || clockid >= CLOCK_THREAD_CPUTIME_ID) { /* hmm, only HW clock id will work. */ umtx_key_release(&uq->uq_key); return (EINVAL); } } else { clockid = CLOCK_REALTIME; } umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); /* * Set c_has_waiters to 1 before releasing user mutex, also * don't modify cache line when unnecessary. */ error = fueword32(&cv->c_has_waiters, &hasw); if (error == 0 && hasw == 0) suword32(&cv->c_has_waiters, 1); umtxq_unbusy_unlocked(&uq->uq_key); error = do_unlock_umutex(td, m, false); if (timeout != NULL) umtx_abs_timeout_init(&timo, clockid, (wflags & CVWAIT_ABSTIME) != 0, timeout); umtxq_lock(&uq->uq_key); if (error == 0) { error = umtxq_sleep(uq, "ucond", timeout == NULL ? NULL : &timo); } if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { /* * This must be timeout,interrupted by signal or * surprious wakeup, clear c_has_waiter flag when * necessary. */ umtxq_busy(&uq->uq_key); if ((uq->uq_flags & UQF_UMTXQ) != 0) { int oldlen = uq->uq_cur_queue->length; umtxq_remove(uq); if (oldlen == 1) { umtxq_unlock(&uq->uq_key); suword32(&cv->c_has_waiters, 0); umtxq_lock(&uq->uq_key); } } umtxq_unbusy(&uq->uq_key); if (error == ERESTART) error = EINTR; } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland condition variable. */ static int do_cv_signal(struct thread *td, struct ucond *cv) { struct umtx_key key; int error, cnt, nwake; uint32_t flags; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); nwake = umtxq_signal(&key, 1); if (cnt <= nwake) { umtxq_unlock(&key); error = suword32(&cv->c_has_waiters, 0); if (error == -1) error = EFAULT; umtxq_lock(&key); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } static int do_cv_broadcast(struct thread *td, struct ucond *cv) { struct umtx_key key; int error; uint32_t flags; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); umtxq_signal(&key, INT_MAX); umtxq_unlock(&key); error = suword32(&cv->c_has_waiters, 0); if (error == -1) error = EFAULT; umtxq_unbusy_unlocked(&key); umtx_key_release(&key); return (error); } static int do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout) { struct umtx_abs_timeout timo; struct umtx_q *uq; uint32_t flags, wrflags; int32_t state, oldstate; int32_t blocked_readers; int error, error1, rv; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) umtx_abs_timeout_init2(&timo, timeout); wrflags = URWLOCK_WRITE_OWNER; if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER)) wrflags |= URWLOCK_WRITE_WAITERS; for (;;) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } /* try to lock it */ while (!(state & wrflags)) { if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) { umtx_key_release(&uq->uq_key); return (EAGAIN); } rv = casueword32(&rwlock->rw_state, state, &oldstate, state + 1); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (rv == 0) { MPASS(oldstate == state); umtx_key_release(&uq->uq_key); return (0); } error = thread_check_susp(td, true); if (error != 0) break; state = oldstate; } if (error) break; /* grab monitor lock */ umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * re-read the state, in case it changed between the try-lock above * and the check below */ rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) error = EFAULT; /* set read contention bit */ while (error == 0 && (state & wrflags) && !(state & URWLOCK_READ_WAITERS)) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_READ_WAITERS); if (rv == -1) { error = EFAULT; break; } if (rv == 0) { MPASS(oldstate == state); goto sleep; } state = oldstate; error = thread_check_susp(td, false); if (error != 0) break; } if (error != 0) { umtxq_unbusy_unlocked(&uq->uq_key); break; } /* state is changed while setting flags, restart */ if (!(state & wrflags)) { umtxq_unbusy_unlocked(&uq->uq_key); error = thread_check_susp(td, true); if (error != 0) break; continue; } sleep: /* * Contention bit is set, before sleeping, increase * read waiter count. */ rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_readers, blocked_readers+1); while (state & wrflags) { umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "urdlck", timeout == NULL ? NULL : &timo); umtxq_busy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); if (error) break; rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { error = EFAULT; break; } } /* decrease read waiter count, and may clear read contention bit */ rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_readers, blocked_readers-1); if (blocked_readers == 1) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_READ_WAITERS); if (rv == -1) { error = EFAULT; break; } if (rv == 0) { MPASS(oldstate == state); break; } state = oldstate; error1 = thread_check_susp(td, false); if (error1 != 0) { if (error == 0) error = error1; break; } } } umtxq_unbusy_unlocked(&uq->uq_key); if (error != 0) break; } umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } static int do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout) { struct umtx_abs_timeout timo; struct umtx_q *uq; uint32_t flags; int32_t state, oldstate; int32_t blocked_writers; int32_t blocked_readers; int error, error1, rv; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) umtx_abs_timeout_init2(&timo, timeout); blocked_readers = 0; for (;;) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } while ((state & URWLOCK_WRITE_OWNER) == 0 && URWLOCK_READER_COUNT(state) == 0) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_WRITE_OWNER); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (rv == 0) { MPASS(oldstate == state); umtx_key_release(&uq->uq_key); return (0); } state = oldstate; error = thread_check_susp(td, true); if (error != 0) break; } if (error) { if ((state & (URWLOCK_WRITE_OWNER | URWLOCK_WRITE_WAITERS)) == 0 && blocked_readers != 0) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } break; } /* grab monitor lock */ umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * Re-read the state, in case it changed between the * try-lock above and the check below. */ rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) error = EFAULT; while (error == 0 && ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) && (state & URWLOCK_WRITE_WAITERS) == 0) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_WRITE_WAITERS); if (rv == -1) { error = EFAULT; break; } if (rv == 0) { MPASS(oldstate == state); goto sleep; } state = oldstate; error = thread_check_susp(td, false); if (error != 0) break; } if (error != 0) { umtxq_unbusy_unlocked(&uq->uq_key); break; } if ((state & URWLOCK_WRITE_OWNER) == 0 && URWLOCK_READER_COUNT(state) == 0) { umtxq_unbusy_unlocked(&uq->uq_key); error = thread_check_susp(td, false); if (error != 0) break; continue; } sleep: rv = fueword32(&rwlock->rw_blocked_writers, &blocked_writers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_writers, blocked_writers + 1); while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) { umtxq_lock(&uq->uq_key); umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "uwrlck", timeout == NULL ? NULL : &timo); umtxq_busy(&uq->uq_key); umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE); umtxq_unlock(&uq->uq_key); if (error) break; rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { error = EFAULT; break; } } rv = fueword32(&rwlock->rw_blocked_writers, &blocked_writers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_writers, blocked_writers-1); if (blocked_writers == 1) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_WRITE_WAITERS); if (rv == -1) { error = EFAULT; break; } if (rv == 0) { MPASS(oldstate == state); break; } state = oldstate; error1 = thread_check_susp(td, false); /* * We are leaving the URWLOCK_WRITE_WAITERS * behind, but this should not harm the * correctness. */ if (error1 != 0) { if (error == 0) error = error1; break; } } rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } } else blocked_readers = 0; umtxq_unbusy_unlocked(&uq->uq_key); } umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } static int do_rw_unlock(struct thread *td, struct urwlock *rwlock) { struct umtx_q *uq; uint32_t flags; int32_t state, oldstate; int error, rv, q, count; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); error = fueword32(&rwlock->rw_state, &state); if (error == -1) { error = EFAULT; goto out; } if (state & URWLOCK_WRITE_OWNER) { for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_WRITE_OWNER); if (rv == -1) { error = EFAULT; goto out; } if (rv == 1) { state = oldstate; if (!(oldstate & URWLOCK_WRITE_OWNER)) { error = EPERM; goto out; } error = thread_check_susp(td, true); if (error != 0) goto out; } else break; } } else if (URWLOCK_READER_COUNT(state) != 0) { for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state - 1); if (rv == -1) { error = EFAULT; goto out; } if (rv == 1) { state = oldstate; if (URWLOCK_READER_COUNT(oldstate) == 0) { error = EPERM; goto out; } error = thread_check_susp(td, true); if (error != 0) goto out; } else break; } } else { error = EPERM; goto out; } count = 0; if (!(flags & URWLOCK_PREFER_READER)) { if (state & URWLOCK_WRITE_WAITERS) { count = 1; q = UMTX_EXCLUSIVE_QUEUE; } else if (state & URWLOCK_READ_WAITERS) { count = INT_MAX; q = UMTX_SHARED_QUEUE; } } else { if (state & URWLOCK_READ_WAITERS) { count = INT_MAX; q = UMTX_SHARED_QUEUE; } else if (state & URWLOCK_WRITE_WAITERS) { count = 1; q = UMTX_EXCLUSIVE_QUEUE; } } if (count) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_signal_queue(&uq->uq_key, count, q); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } out: umtx_key_release(&uq->uq_key); return (error); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout) { struct umtx_abs_timeout timo; struct umtx_q *uq; uint32_t flags, count, count1; int error, rv, rv1; uq = td->td_umtxq; error = fueword32(&sem->_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) umtx_abs_timeout_init2(&timo, timeout); again: umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); rv = casueword32(&sem->_has_waiters, 0, &count1, 1); if (rv == 0) rv1 = fueword32(&sem->_count, &count); if (rv == -1 || (rv == 0 && (rv1 == -1 || count != 0)) || (rv == 1 && count1 == 0)) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); if (rv == 1) { rv = thread_check_susp(td, true); if (rv == 0) goto again; error = rv; goto out; } if (rv == 0) rv = rv1; error = rv == -1 ? EFAULT : 0; goto out; } umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { umtxq_remove(uq); /* A relative timeout cannot be restarted. */ if (error == ERESTART && timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) error = EINTR; } umtxq_unlock(&uq->uq_key); out: umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland semaphore. */ static int do_sem_wake(struct thread *td, struct _usem *sem) { struct umtx_key key; int error, cnt; uint32_t flags; error = fueword32(&sem->_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); if (cnt > 0) { /* * Check if count is greater than 0, this means the memory is * still being referenced by user code, so we can safely * update _has_waiters flag. */ if (cnt == 1) { umtxq_unlock(&key); error = suword32(&sem->_has_waiters, 0); umtxq_lock(&key); if (error == -1) error = EFAULT; } umtxq_signal(&key, 1); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } #endif static int do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout) { struct umtx_abs_timeout timo; struct umtx_q *uq; uint32_t count, flags; int error, rv; uq = td->td_umtxq; flags = fuword32(&sem->_flags); if (timeout != NULL) umtx_abs_timeout_init2(&timo, timeout); again: error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); rv = fueword32(&sem->_count, &count); if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } for (;;) { if (USEM_COUNT(count) != 0) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (0); } if (count == USEM_HAS_WAITERS) break; rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS); if (rv == 0) break; umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (rv == -1) return (EFAULT); rv = thread_check_susp(td, true); if (rv != 0) return (rv); goto again; } umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { umtxq_remove(uq); if (timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) { /* A relative timeout cannot be restarted. */ if (error == ERESTART) error = EINTR; if (error == EINTR) { umtx_abs_timeout_update(&timo); timespecsub(&timo.end, &timo.cur, &timeout->_timeout); } } } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland semaphore. */ static int do_sem2_wake(struct thread *td, struct _usem2 *sem) { struct umtx_key key; int error, cnt, rv; uint32_t count, flags; rv = fueword32(&sem->_flags, &flags); if (rv == -1) return (EFAULT); if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); if (cnt > 0) { /* * If this was the last sleeping thread, clear the waiters * flag in _count. */ if (cnt == 1) { umtxq_unlock(&key); rv = fueword32(&sem->_count, &count); while (rv != -1 && count & USEM_HAS_WAITERS) { rv = casueword32(&sem->_count, count, &count, count & ~USEM_HAS_WAITERS); if (rv == 1) { rv = thread_check_susp(td, true); if (rv != 0) break; } } if (rv == -1) error = EFAULT; else if (rv > 0) { error = rv; } umtxq_lock(&key); } umtxq_signal(&key, 1); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } #ifdef COMPAT_FREEBSD10 int freebsd10__umtx_lock(struct thread *td, struct freebsd10__umtx_lock_args *uap) { return (do_lock_umtx(td, uap->umtx, td->td_tid, 0)); } int freebsd10__umtx_unlock(struct thread *td, struct freebsd10__umtx_unlock_args *uap) { return (do_unlock_umtx(td, uap->umtx, td->td_tid)); } #endif inline int umtx_copyin_timeout(const void *uaddr, struct timespec *tsp) { int error; error = copyin(uaddr, tsp, sizeof(*tsp)); if (error == 0) { if (!timespecvalid_interval(tsp)) error = EINVAL; } return (error); } static inline int umtx_copyin_umtx_time(const void *uaddr, size_t size, struct _umtx_time *tp) { int error; if (size <= sizeof(tp->_timeout)) { tp->_clockid = CLOCK_REALTIME; tp->_flags = 0; error = copyin(uaddr, &tp->_timeout, sizeof(tp->_timeout)); } else error = copyin(uaddr, tp, sizeof(*tp)); if (error != 0) return (error); if (!timespecvalid_interval(&tp->_timeout)) return (EINVAL); return (0); } static int umtx_copyin_robust_lists(const void *uaddr, size_t size, struct umtx_robust_lists_params *rb) { if (size > sizeof(*rb)) return (EINVAL); return (copyin(uaddr, rb, size)); } static int umtx_copyout_timeout(void *uaddr, size_t sz, struct timespec *tsp) { /* * Should be guaranteed by the caller, sz == uaddr1 - sizeof(_umtx_time) * and we're only called if sz >= sizeof(timespec) as supplied in the * copyops. */ KASSERT(sz >= sizeof(*tsp), ("umtx_copyops specifies incorrect sizes")); return (copyout(tsp, uaddr, sizeof(*tsp))); } #ifdef COMPAT_FREEBSD10 static int __umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = ops->copyin_timeout(uap->uaddr2, &timeout); if (error != 0) return (error); ts = &timeout; } #ifdef COMPAT_FREEBSD32 if (ops->compat32) return (do_lock_umtx32(td, uap->obj, uap->val, ts)); #endif return (do_lock_umtx(td, uap->obj, uap->val, ts)); } static int __umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { #ifdef COMPAT_FREEBSD32 if (ops->compat32) return (do_unlock_umtx32(td, uap->obj, uap->val)); #endif return (do_unlock_umtx(td, uap->obj, uap->val)); } #endif /* COMPAT_FREEBSD10 */ #if !defined(COMPAT_FREEBSD10) static int __umtx_op_unimpl(struct thread *td __unused, struct _umtx_op_args *uap __unused, const struct umtx_copyops *ops __unused) { return (EOPNOTSUPP); } #endif /* COMPAT_FREEBSD10 */ static int __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct _umtx_time timeout, *tm_p; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = ops->copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, ops->compat32, 0)); } static int __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct _umtx_time timeout, *tm_p; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = ops->copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0)); } static int __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct _umtx_time *tm_p, timeout; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = ops->copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1)); } static int __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (kern_umtx_wake(td, uap->obj, uap->val, 0)); } #define BATCH_SIZE 128 static int __umtx_op_nwake_private_native(struct thread *td, struct _umtx_op_args *uap) { char *uaddrs[BATCH_SIZE], **upp; int count, error, i, pos, tocopy; upp = (char **)uap->obj; error = 0; for (count = uap->val, pos = 0; count > 0; count -= tocopy, pos += tocopy) { tocopy = MIN(count, BATCH_SIZE); error = copyin(upp + pos, uaddrs, tocopy * sizeof(char *)); if (error != 0) break; for (i = 0; i < tocopy; ++i) { kern_umtx_wake(td, uaddrs[i], INT_MAX, 1); } maybe_yield(); } return (error); } static int __umtx_op_nwake_private_compat32(struct thread *td, struct _umtx_op_args *uap) { uint32_t uaddrs[BATCH_SIZE], *upp; int count, error, i, pos, tocopy; upp = (uint32_t *)uap->obj; error = 0; for (count = uap->val, pos = 0; count > 0; count -= tocopy, pos += tocopy) { tocopy = MIN(count, BATCH_SIZE); error = copyin(upp + pos, uaddrs, tocopy * sizeof(uint32_t)); if (error != 0) break; for (i = 0; i < tocopy; ++i) { kern_umtx_wake(td, (void *)(uintptr_t)uaddrs[i], INT_MAX, 1); } maybe_yield(); } return (error); } static int __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { if (ops->compat32) return (__umtx_op_nwake_private_compat32(td, uap)); return (__umtx_op_nwake_private_native(td, uap)); } static int __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (kern_umtx_wake(td, uap->obj, uap->val, 1)); } static int __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = ops->copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_lock_umutex(td, uap->obj, tm_p, 0)); } static int __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY)); } static int __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = ops->copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT)); } static int __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_wake_umutex(td, uap->obj)); } static int __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_unlock_umutex(td, uap->obj, false)); } static int __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1)); } static int __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = ops->copyin_timeout(uap->uaddr2, &timeout); if (error != 0) return (error); ts = &timeout; } return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val)); } static int __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_cv_signal(td, uap->obj)); } static int __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_cv_broadcast(td, uap->obj)); } static int __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_rdlock(td, uap->obj, uap->val, 0); } else { error = ops->copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_rdlock(td, uap->obj, uap->val, &timeout); } return (error); } static int __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_wrlock(td, uap->obj, 0); } else { error = ops->copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_wrlock(td, uap->obj, &timeout); } return (error); } static int __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_rw_unlock(td, uap->obj)); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = ops->copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_sem_wait(td, uap->obj, tm_p)); } static int __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_sem_wake(td, uap->obj)); } #endif static int __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_wake2_umutex(td, uap->obj, uap->val)); } static int __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct _umtx_time *tm_p, timeout; size_t uasize; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { uasize = 0; tm_p = NULL; } else { uasize = (size_t)uap->uaddr1; error = ops->copyin_umtx_time(uap->uaddr2, uasize, &timeout); if (error != 0) return (error); tm_p = &timeout; } error = do_sem2_wait(td, uap->obj, tm_p); if (error == EINTR && uap->uaddr2 != NULL && (timeout._flags & UMTX_ABSTIME) == 0 && uasize >= ops->umtx_time_sz + ops->timespec_sz) { error = ops->copyout_timeout( (void *)((uintptr_t)uap->uaddr2 + ops->umtx_time_sz), uasize - ops->umtx_time_sz, &timeout._timeout); if (error == 0) { error = EINTR; } } return (error); } static int __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_sem2_wake(td, uap->obj)); } #define USHM_OBJ_UMTX(o) \ ((struct umtx_shm_obj_list *)(&(o)->umtx_data)) #define USHMF_REG_LINKED 0x0001 #define USHMF_OBJ_LINKED 0x0002 struct umtx_shm_reg { TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link; LIST_ENTRY(umtx_shm_reg) ushm_obj_link; struct umtx_key ushm_key; struct ucred *ushm_cred; struct shmfd *ushm_obj; u_int ushm_refcnt; u_int ushm_flags; }; LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg); TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg); static uma_zone_t umtx_shm_reg_zone; static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS]; static struct mtx umtx_shm_lock; static struct umtx_shm_reg_head umtx_shm_reg_delfree = TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree); static void umtx_shm_free_reg(struct umtx_shm_reg *reg); static void umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused) { struct umtx_shm_reg_head d; struct umtx_shm_reg *reg, *reg1; TAILQ_INIT(&d); mtx_lock(&umtx_shm_lock); TAILQ_CONCAT(&d, &umtx_shm_reg_delfree, ushm_reg_link); mtx_unlock(&umtx_shm_lock); TAILQ_FOREACH_SAFE(reg, &d, ushm_reg_link, reg1) { TAILQ_REMOVE(&d, reg, ushm_reg_link); umtx_shm_free_reg(reg); } } static struct task umtx_shm_reg_delfree_task = TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL); static struct umtx_shm_reg * umtx_shm_find_reg_locked(const struct umtx_key *key) { struct umtx_shm_reg *reg; struct umtx_shm_reg_head *reg_head; KASSERT(key->shared, ("umtx_p_find_rg: private key")); mtx_assert(&umtx_shm_lock, MA_OWNED); reg_head = &umtx_shm_registry[key->hash]; TAILQ_FOREACH(reg, reg_head, ushm_reg_link) { KASSERT(reg->ushm_key.shared, ("non-shared key on reg %p %d", reg, reg->ushm_key.shared)); if (reg->ushm_key.info.shared.object == key->info.shared.object && reg->ushm_key.info.shared.offset == key->info.shared.offset) { KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM")); KASSERT(reg->ushm_refcnt > 0, ("reg %p refcnt 0 onlist", reg)); KASSERT((reg->ushm_flags & USHMF_REG_LINKED) != 0, ("reg %p not linked", reg)); reg->ushm_refcnt++; return (reg); } } return (NULL); } static struct umtx_shm_reg * umtx_shm_find_reg(const struct umtx_key *key) { struct umtx_shm_reg *reg; mtx_lock(&umtx_shm_lock); reg = umtx_shm_find_reg_locked(key); mtx_unlock(&umtx_shm_lock); return (reg); } static void umtx_shm_free_reg(struct umtx_shm_reg *reg) { chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0); crfree(reg->ushm_cred); shm_drop(reg->ushm_obj); uma_zfree(umtx_shm_reg_zone, reg); } static bool umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool force) { bool res; mtx_assert(&umtx_shm_lock, MA_OWNED); KASSERT(reg->ushm_refcnt > 0, ("ushm_reg %p refcnt 0", reg)); reg->ushm_refcnt--; res = reg->ushm_refcnt == 0; if (res || force) { if ((reg->ushm_flags & USHMF_REG_LINKED) != 0) { TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash], reg, ushm_reg_link); reg->ushm_flags &= ~USHMF_REG_LINKED; } if ((reg->ushm_flags & USHMF_OBJ_LINKED) != 0) { LIST_REMOVE(reg, ushm_obj_link); reg->ushm_flags &= ~USHMF_OBJ_LINKED; } } return (res); } static void umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool force) { vm_object_t object; bool dofree; if (force) { object = reg->ushm_obj->shm_object; VM_OBJECT_WLOCK(object); object->flags |= OBJ_UMTXDEAD; VM_OBJECT_WUNLOCK(object); } mtx_lock(&umtx_shm_lock); dofree = umtx_shm_unref_reg_locked(reg, force); mtx_unlock(&umtx_shm_lock); if (dofree) umtx_shm_free_reg(reg); } void umtx_shm_object_init(vm_object_t object) { LIST_INIT(USHM_OBJ_UMTX(object)); } void umtx_shm_object_terminated(vm_object_t object) { struct umtx_shm_reg *reg, *reg1; bool dofree; if (LIST_EMPTY(USHM_OBJ_UMTX(object))) return; dofree = false; mtx_lock(&umtx_shm_lock); LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, reg1) { if (umtx_shm_unref_reg_locked(reg, true)) { TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg, ushm_reg_link); dofree = true; } } mtx_unlock(&umtx_shm_lock); if (dofree) taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task); } static int umtx_shm_create_reg(struct thread *td, const struct umtx_key *key, struct umtx_shm_reg **res) { struct umtx_shm_reg *reg, *reg1; struct ucred *cred; int error; reg = umtx_shm_find_reg(key); if (reg != NULL) { *res = reg; return (0); } cred = td->td_ucred; if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP))) return (ENOMEM); reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO); reg->ushm_refcnt = 1; bcopy(key, ®->ushm_key, sizeof(*key)); reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR, false); reg->ushm_cred = crhold(cred); error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE); if (error != 0) { umtx_shm_free_reg(reg); return (error); } mtx_lock(&umtx_shm_lock); reg1 = umtx_shm_find_reg_locked(key); if (reg1 != NULL) { mtx_unlock(&umtx_shm_lock); umtx_shm_free_reg(reg); *res = reg1; return (0); } reg->ushm_refcnt++; TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link); LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg, ushm_obj_link); reg->ushm_flags = USHMF_REG_LINKED | USHMF_OBJ_LINKED; mtx_unlock(&umtx_shm_lock); *res = reg; return (0); } static int umtx_shm_alive(struct thread *td, void *addr) { vm_map_t map; vm_map_entry_t entry; vm_object_t object; vm_pindex_t pindex; vm_prot_t prot; int res, ret; boolean_t wired; map = &td->td_proc->p_vmspace->vm_map; res = vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry, &object, &pindex, &prot, &wired); if (res != KERN_SUCCESS) return (EFAULT); if (object == NULL) ret = EINVAL; else ret = (object->flags & OBJ_UMTXDEAD) != 0 ? ENOTTY : 0; vm_map_lookup_done(map, entry); return (ret); } static void umtx_shm_init(void) { int i; umtx_shm_reg_zone = uma_zcreate("umtx_shm", sizeof(struct umtx_shm_reg), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF); for (i = 0; i < nitems(umtx_shm_registry); i++) TAILQ_INIT(&umtx_shm_registry[i]); } static int umtx_shm(struct thread *td, void *addr, u_int flags) { struct umtx_key key; struct umtx_shm_reg *reg; struct file *fp; int error, fd; if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP | UMTX_SHM_DESTROY| UMTX_SHM_ALIVE)) != 1) return (EINVAL); if ((flags & UMTX_SHM_ALIVE) != 0) return (umtx_shm_alive(td, addr)); error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key); if (error != 0) return (error); KASSERT(key.shared == 1, ("non-shared key")); if ((flags & UMTX_SHM_CREAT) != 0) { error = umtx_shm_create_reg(td, &key, ®); } else { reg = umtx_shm_find_reg(&key); if (reg == NULL) error = ESRCH; } umtx_key_release(&key); if (error != 0) return (error); KASSERT(reg != NULL, ("no reg")); if ((flags & UMTX_SHM_DESTROY) != 0) { umtx_shm_unref_reg(reg, true); } else { #if 0 #ifdef MAC error = mac_posixshm_check_open(td->td_ucred, reg->ushm_obj, FFLAGS(O_RDWR)); if (error == 0) #endif error = shm_access(reg->ushm_obj, td->td_ucred, FFLAGS(O_RDWR)); if (error == 0) #endif error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL); if (error == 0) { shm_hold(reg->ushm_obj); finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj, &shm_ops); td->td_retval[0] = fd; fdrop(fp, td); } } umtx_shm_unref_reg(reg, false); return (error); } static int __umtx_op_shm(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (umtx_shm(td, uap->uaddr1, uap->val)); } static int __umtx_op_robust_lists(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct umtx_robust_lists_params rb; int error; if (ops->compat32) { if ((td->td_pflags2 & TDP2_COMPAT32RB) == 0 && (td->td_rb_list != 0 || td->td_rbp_list != 0 || td->td_rb_inact != 0)) return (EBUSY); } else if ((td->td_pflags2 & TDP2_COMPAT32RB) != 0) { return (EBUSY); } bzero(&rb, sizeof(rb)); error = ops->copyin_robust_lists(uap->uaddr1, uap->val, &rb); if (error != 0) return (error); if (ops->compat32) td->td_pflags2 |= TDP2_COMPAT32RB; td->td_rb_list = rb.robust_list_offset; td->td_rbp_list = rb.robust_priv_list_offset; td->td_rb_inact = rb.robust_inact_offset; return (0); } #if defined(__i386__) || defined(__amd64__) /* * Provide the standard 32-bit definitions for x86, since native/compat32 use a * 32-bit time_t there. Other architectures just need the i386 definitions * along with their standard compat32. */ struct timespecx32 { int64_t tv_sec; int32_t tv_nsec; }; struct umtx_timex32 { struct timespecx32 _timeout; uint32_t _flags; uint32_t _clockid; }; #ifndef __i386__ #define timespeci386 timespec32 #define umtx_timei386 umtx_time32 #endif #else /* !__i386__ && !__amd64__ */ /* 32-bit architectures can emulate i386, so define these almost everywhere. */ struct timespeci386 { int32_t tv_sec; int32_t tv_nsec; }; struct umtx_timei386 { struct timespeci386 _timeout; uint32_t _flags; uint32_t _clockid; }; #if defined(__LP64__) #define timespecx32 timespec32 #define umtx_timex32 umtx_time32 #endif #endif static int umtx_copyin_robust_lists32(const void *uaddr, size_t size, struct umtx_robust_lists_params *rbp) { struct umtx_robust_lists_params_compat32 rb32; int error; if (size > sizeof(rb32)) return (EINVAL); bzero(&rb32, sizeof(rb32)); error = copyin(uaddr, &rb32, size); if (error != 0) return (error); CP(rb32, *rbp, robust_list_offset); CP(rb32, *rbp, robust_priv_list_offset); CP(rb32, *rbp, robust_inact_offset); return (0); } #ifndef __i386__ static inline int umtx_copyin_timeouti386(const void *uaddr, struct timespec *tsp) { struct timespeci386 ts32; int error; error = copyin(uaddr, &ts32, sizeof(ts32)); if (error == 0) { if (!timespecvalid_interval(&ts32)) error = EINVAL; else { CP(ts32, *tsp, tv_sec); CP(ts32, *tsp, tv_nsec); } } return (error); } static inline int umtx_copyin_umtx_timei386(const void *uaddr, size_t size, struct _umtx_time *tp) { struct umtx_timei386 t32; int error; t32._clockid = CLOCK_REALTIME; t32._flags = 0; if (size <= sizeof(t32._timeout)) error = copyin(uaddr, &t32._timeout, sizeof(t32._timeout)); else error = copyin(uaddr, &t32, sizeof(t32)); if (error != 0) return (error); if (!timespecvalid_interval(&t32._timeout)) return (EINVAL); TS_CP(t32, *tp, _timeout); CP(t32, *tp, _flags); CP(t32, *tp, _clockid); return (0); } static int umtx_copyout_timeouti386(void *uaddr, size_t sz, struct timespec *tsp) { struct timespeci386 remain32 = { .tv_sec = tsp->tv_sec, .tv_nsec = tsp->tv_nsec, }; /* * Should be guaranteed by the caller, sz == uaddr1 - sizeof(_umtx_time) * and we're only called if sz >= sizeof(timespec) as supplied in the * copyops. */ KASSERT(sz >= sizeof(remain32), ("umtx_copyops specifies incorrect sizes")); return (copyout(&remain32, uaddr, sizeof(remain32))); } #endif /* !__i386__ */ #if defined(__i386__) || defined(__LP64__) static inline int umtx_copyin_timeoutx32(const void *uaddr, struct timespec *tsp) { struct timespecx32 ts32; int error; error = copyin(uaddr, &ts32, sizeof(ts32)); if (error == 0) { if (!timespecvalid_interval(&ts32)) error = EINVAL; else { CP(ts32, *tsp, tv_sec); CP(ts32, *tsp, tv_nsec); } } return (error); } static inline int umtx_copyin_umtx_timex32(const void *uaddr, size_t size, struct _umtx_time *tp) { struct umtx_timex32 t32; int error; t32._clockid = CLOCK_REALTIME; t32._flags = 0; if (size <= sizeof(t32._timeout)) error = copyin(uaddr, &t32._timeout, sizeof(t32._timeout)); else error = copyin(uaddr, &t32, sizeof(t32)); if (error != 0) return (error); if (!timespecvalid_interval(&t32._timeout)) return (EINVAL); TS_CP(t32, *tp, _timeout); CP(t32, *tp, _flags); CP(t32, *tp, _clockid); return (0); } static int umtx_copyout_timeoutx32(void *uaddr, size_t sz, struct timespec *tsp) { struct timespecx32 remain32 = { .tv_sec = tsp->tv_sec, .tv_nsec = tsp->tv_nsec, }; /* * Should be guaranteed by the caller, sz == uaddr1 - sizeof(_umtx_time) * and we're only called if sz >= sizeof(timespec) as supplied in the * copyops. */ KASSERT(sz >= sizeof(remain32), ("umtx_copyops specifies incorrect sizes")); return (copyout(&remain32, uaddr, sizeof(remain32))); } #endif /* __i386__ || __LP64__ */ typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *umtx_ops); static const _umtx_op_func op_table[] = { #ifdef COMPAT_FREEBSD10 [UMTX_OP_LOCK] = __umtx_op_lock_umtx, [UMTX_OP_UNLOCK] = __umtx_op_unlock_umtx, #else [UMTX_OP_LOCK] = __umtx_op_unimpl, [UMTX_OP_UNLOCK] = __umtx_op_unimpl, #endif [UMTX_OP_WAIT] = __umtx_op_wait, [UMTX_OP_WAKE] = __umtx_op_wake, [UMTX_OP_MUTEX_TRYLOCK] = __umtx_op_trylock_umutex, [UMTX_OP_MUTEX_LOCK] = __umtx_op_lock_umutex, [UMTX_OP_MUTEX_UNLOCK] = __umtx_op_unlock_umutex, [UMTX_OP_SET_CEILING] = __umtx_op_set_ceiling, [UMTX_OP_CV_WAIT] = __umtx_op_cv_wait, [UMTX_OP_CV_SIGNAL] = __umtx_op_cv_signal, [UMTX_OP_CV_BROADCAST] = __umtx_op_cv_broadcast, [UMTX_OP_WAIT_UINT] = __umtx_op_wait_uint, [UMTX_OP_RW_RDLOCK] = __umtx_op_rw_rdlock, [UMTX_OP_RW_WRLOCK] = __umtx_op_rw_wrlock, [UMTX_OP_RW_UNLOCK] = __umtx_op_rw_unlock, [UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private, [UMTX_OP_WAKE_PRIVATE] = __umtx_op_wake_private, [UMTX_OP_MUTEX_WAIT] = __umtx_op_wait_umutex, [UMTX_OP_MUTEX_WAKE] = __umtx_op_wake_umutex, #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) [UMTX_OP_SEM_WAIT] = __umtx_op_sem_wait, [UMTX_OP_SEM_WAKE] = __umtx_op_sem_wake, #else [UMTX_OP_SEM_WAIT] = __umtx_op_unimpl, [UMTX_OP_SEM_WAKE] = __umtx_op_unimpl, #endif [UMTX_OP_NWAKE_PRIVATE] = __umtx_op_nwake_private, [UMTX_OP_MUTEX_WAKE2] = __umtx_op_wake2_umutex, [UMTX_OP_SEM2_WAIT] = __umtx_op_sem2_wait, [UMTX_OP_SEM2_WAKE] = __umtx_op_sem2_wake, [UMTX_OP_SHM] = __umtx_op_shm, [UMTX_OP_ROBUST_LISTS] = __umtx_op_robust_lists, }; static const struct umtx_copyops umtx_native_ops = { .copyin_timeout = umtx_copyin_timeout, .copyin_umtx_time = umtx_copyin_umtx_time, .copyin_robust_lists = umtx_copyin_robust_lists, .copyout_timeout = umtx_copyout_timeout, .timespec_sz = sizeof(struct timespec), .umtx_time_sz = sizeof(struct _umtx_time), }; #ifndef __i386__ static const struct umtx_copyops umtx_native_opsi386 = { .copyin_timeout = umtx_copyin_timeouti386, .copyin_umtx_time = umtx_copyin_umtx_timei386, .copyin_robust_lists = umtx_copyin_robust_lists32, .copyout_timeout = umtx_copyout_timeouti386, .timespec_sz = sizeof(struct timespeci386), .umtx_time_sz = sizeof(struct umtx_timei386), .compat32 = true, }; #endif #if defined(__i386__) || defined(__LP64__) /* i386 can emulate other 32-bit archs, too! */ static const struct umtx_copyops umtx_native_opsx32 = { .copyin_timeout = umtx_copyin_timeoutx32, .copyin_umtx_time = umtx_copyin_umtx_timex32, .copyin_robust_lists = umtx_copyin_robust_lists32, .copyout_timeout = umtx_copyout_timeoutx32, .timespec_sz = sizeof(struct timespecx32), .umtx_time_sz = sizeof(struct umtx_timex32), .compat32 = true, }; #ifdef COMPAT_FREEBSD32 #ifdef __amd64__ #define umtx_native_ops32 umtx_native_opsi386 #else #define umtx_native_ops32 umtx_native_opsx32 #endif #endif /* COMPAT_FREEBSD32 */ #endif /* __i386__ || __LP64__ */ #define UMTX_OP__FLAGS (UMTX_OP__32BIT | UMTX_OP__I386) static int kern__umtx_op(struct thread *td, void *obj, int op, unsigned long val, void *uaddr1, void *uaddr2, const struct umtx_copyops *ops) { struct _umtx_op_args uap = { .obj = obj, .op = op & ~UMTX_OP__FLAGS, .val = val, .uaddr1 = uaddr1, .uaddr2 = uaddr2 }; if ((uap.op >= nitems(op_table))) return (EINVAL); return ((*op_table[uap.op])(td, &uap, ops)); } int sys__umtx_op(struct thread *td, struct _umtx_op_args *uap) { static const struct umtx_copyops *umtx_ops; umtx_ops = &umtx_native_ops; #ifdef __LP64__ if ((uap->op & (UMTX_OP__32BIT | UMTX_OP__I386)) != 0) { if ((uap->op & UMTX_OP__I386) != 0) umtx_ops = &umtx_native_opsi386; else umtx_ops = &umtx_native_opsx32; } #elif !defined(__i386__) /* We consider UMTX_OP__32BIT a nop on !i386 ILP32. */ if ((uap->op & UMTX_OP__I386) != 0) umtx_ops = &umtx_native_opsi386; #else /* Likewise, UMTX_OP__I386 is a nop on i386. */ if ((uap->op & UMTX_OP__32BIT) != 0) umtx_ops = &umtx_native_opsx32; #endif return (kern__umtx_op(td, uap->obj, uap->op, uap->val, uap->uaddr1, uap->uaddr2, umtx_ops)); } #ifdef COMPAT_FREEBSD32 #ifdef COMPAT_FREEBSD10 int freebsd10_freebsd32_umtx_lock(struct thread *td, struct freebsd10_freebsd32_umtx_lock_args *uap) { return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL)); } int freebsd10_freebsd32_umtx_unlock(struct thread *td, struct freebsd10_freebsd32_umtx_unlock_args *uap) { return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid)); } #endif /* COMPAT_FREEBSD10 */ int freebsd32__umtx_op(struct thread *td, struct freebsd32__umtx_op_args *uap) { return (kern__umtx_op(td, uap->obj, uap->op, uap->val, uap->uaddr, uap->uaddr2, &umtx_native_ops32)); } #endif /* COMPAT_FREEBSD32 */ void umtx_thread_init(struct thread *td) { td->td_umtxq = umtxq_alloc(); td->td_umtxq->uq_thread = td; } void umtx_thread_fini(struct thread *td) { umtxq_free(td->td_umtxq); } /* * It will be called when new thread is created, e.g fork(). */ void umtx_thread_alloc(struct thread *td) { struct umtx_q *uq; uq = td->td_umtxq; uq->uq_inherited_pri = PRI_MAX; KASSERT(uq->uq_flags == 0, ("uq_flags != 0")); KASSERT(uq->uq_thread == td, ("uq_thread != td")); KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL")); KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty")); } /* * exec() hook. * * Clear robust lists for all process' threads, not delaying the * cleanup to thread exit, since the relevant address space is * destroyed right now. */ void umtx_exec(struct proc *p) { struct thread *td; KASSERT(p == curproc, ("need curproc")); KASSERT((p->p_flag & P_HADTHREADS) == 0 || (p->p_flag & P_STOPPED_SINGLE) != 0, ("curproc must be single-threaded")); /* * There is no need to lock the list as only this thread can be * running. */ FOREACH_THREAD_IN_PROC(p, td) { KASSERT(td == curthread || ((td->td_flags & TDF_BOUNDARY) != 0 && TD_IS_SUSPENDED(td)), ("running thread %p %p", p, td)); umtx_thread_cleanup(td); td->td_rb_list = td->td_rbp_list = td->td_rb_inact = 0; } } /* * thread exit hook. */ void umtx_thread_exit(struct thread *td) { umtx_thread_cleanup(td); } static int umtx_read_uptr(struct thread *td, uintptr_t ptr, uintptr_t *res, bool compat32) { u_long res1; uint32_t res32; int error; if (compat32) { error = fueword32((void *)ptr, &res32); if (error == 0) res1 = res32; } else { error = fueword((void *)ptr, &res1); } if (error == 0) *res = res1; else error = EFAULT; return (error); } static void umtx_read_rb_list(struct thread *td, struct umutex *m, uintptr_t *rb_list, bool compat32) { struct umutex32 m32; if (compat32) { memcpy(&m32, m, sizeof(m32)); *rb_list = m32.m_rb_lnk; } else { *rb_list = m->m_rb_lnk; } } static int umtx_handle_rb(struct thread *td, uintptr_t rbp, uintptr_t *rb_list, bool inact, bool compat32) { struct umutex m; int error; KASSERT(td->td_proc == curproc, ("need current vmspace")); error = copyin((void *)rbp, &m, sizeof(m)); if (error != 0) return (error); if (rb_list != NULL) umtx_read_rb_list(td, &m, rb_list, compat32); if ((m.m_flags & UMUTEX_ROBUST) == 0) return (EINVAL); if ((m.m_owner & ~UMUTEX_CONTESTED) != td->td_tid) /* inact is cleared after unlock, allow the inconsistency */ return (inact ? 0 : EINVAL); return (do_unlock_umutex(td, (struct umutex *)rbp, true)); } static void umtx_cleanup_rb_list(struct thread *td, uintptr_t rb_list, uintptr_t *rb_inact, const char *name, bool compat32) { int error, i; uintptr_t rbp; bool inact; if (rb_list == 0) return; error = umtx_read_uptr(td, rb_list, &rbp, compat32); for (i = 0; error == 0 && rbp != 0 && i < umtx_max_rb; i++) { if (rbp == *rb_inact) { inact = true; *rb_inact = 0; } else inact = false; error = umtx_handle_rb(td, rbp, &rbp, inact, compat32); } if (i == umtx_max_rb && umtx_verbose_rb) { uprintf("comm %s pid %d: reached umtx %smax rb %d\n", td->td_proc->p_comm, td->td_proc->p_pid, name, umtx_max_rb); } if (error != 0 && umtx_verbose_rb) { uprintf("comm %s pid %d: handling %srb error %d\n", td->td_proc->p_comm, td->td_proc->p_pid, name, error); } } /* * Clean up umtx data. */ static void umtx_thread_cleanup(struct thread *td) { struct umtx_q *uq; struct umtx_pi *pi; uintptr_t rb_inact; bool compat32; /* * Disown pi mutexes. */ uq = td->td_umtxq; if (uq != NULL) { if (uq->uq_inherited_pri != PRI_MAX || !TAILQ_EMPTY(&uq->uq_pi_contested)) { mtx_lock(&umtx_lock); uq->uq_inherited_pri = PRI_MAX; while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) { pi->pi_owner = NULL; TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link); } mtx_unlock(&umtx_lock); } sched_lend_user_prio_cond(td, PRI_MAX); } compat32 = (td->td_pflags2 & TDP2_COMPAT32RB) != 0; td->td_pflags2 &= ~TDP2_COMPAT32RB; if (td->td_rb_inact == 0 && td->td_rb_list == 0 && td->td_rbp_list == 0) return; /* * Handle terminated robust mutexes. Must be done after * robust pi disown, otherwise unlock could see unowned * entries. */ rb_inact = td->td_rb_inact; if (rb_inact != 0) (void)umtx_read_uptr(td, rb_inact, &rb_inact, compat32); umtx_cleanup_rb_list(td, td->td_rb_list, &rb_inact, "", compat32); umtx_cleanup_rb_list(td, td->td_rbp_list, &rb_inact, "priv ", compat32); if (rb_inact != 0) (void)umtx_handle_rb(td, rb_inact, NULL, true, compat32); } diff --git a/sys/kern/p1003_1b.c b/sys/kern/p1003_1b.c index c1f51dd60a7a..0949414bbf00 100644 --- a/sys/kern/p1003_1b.c +++ b/sys/kern/p1003_1b.c @@ -1,384 +1,383 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1996, 1997, 1998 * HD Associates, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by HD Associates, Inc * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* p1003_1b: Real Time common code. */ #include __FBSDID("$FreeBSD$"); #include "opt_posix.h" #include #include #include #include #include #include #include #include #include #include #include -#include #include #include MALLOC_DEFINE(M_P31B, "p1003.1b", "Posix 1003.1B"); /* The system calls return ENOSYS if an entry is called that is not run-time * supported. I am also logging since some programs start to use this when * they shouldn't. That will be removed if annoying. */ int syscall_not_present(struct thread *td, const char *s, struct nosys_args *uap) { log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n", td->td_name, td->td_proc->p_pid, s); /* a " return nosys(p, uap); " here causes a core dump. */ return ENOSYS; } #if !defined(_KPOSIX_PRIORITY_SCHEDULING) /* Not configured but loadable via a module: */ static int sched_attach(void) { return 0; } SYSCALL_NOT_PRESENT_GEN(sched_setparam) SYSCALL_NOT_PRESENT_GEN(sched_getparam) SYSCALL_NOT_PRESENT_GEN(sched_setscheduler) SYSCALL_NOT_PRESENT_GEN(sched_getscheduler) SYSCALL_NOT_PRESENT_GEN(sched_yield) SYSCALL_NOT_PRESENT_GEN(sched_get_priority_max) SYSCALL_NOT_PRESENT_GEN(sched_get_priority_min) SYSCALL_NOT_PRESENT_GEN(sched_rr_get_interval) #else /* Configured in kernel version: */ static struct ksched *ksched; static int sched_attach(void) { int ret = ksched_attach(&ksched); if (ret == 0) p31b_setcfg(CTL_P1003_1B_PRIORITY_SCHEDULING, 200112L); return ret; } int sys_sched_setparam(struct thread *td, struct sched_setparam_args *uap) { struct thread *targettd; struct proc *targetp; int e; struct sched_param sched_param; e = copyin(uap->param, &sched_param, sizeof(sched_param)); if (e) return (e); if (uap->pid == 0) { targetp = td->td_proc; targettd = td; PROC_LOCK(targetp); } else { targetp = pfind(uap->pid); if (targetp == NULL) return (ESRCH); targettd = FIRST_THREAD_IN_PROC(targetp); } e = kern_sched_setparam(td, targettd, &sched_param); PROC_UNLOCK(targetp); return (e); } int kern_sched_setparam(struct thread *td, struct thread *targettd, struct sched_param *param) { struct proc *targetp; int error; targetp = targettd->td_proc; PROC_LOCK_ASSERT(targetp, MA_OWNED); error = p_cansched(td, targetp); if (error == 0) error = ksched_setparam(ksched, targettd, (const struct sched_param *)param); return (error); } int sys_sched_getparam(struct thread *td, struct sched_getparam_args *uap) { int e; struct sched_param sched_param; struct thread *targettd; struct proc *targetp; if (uap->pid == 0) { targetp = td->td_proc; targettd = td; PROC_LOCK(targetp); } else { targetp = pfind(uap->pid); if (targetp == NULL) { return (ESRCH); } targettd = FIRST_THREAD_IN_PROC(targetp); } e = kern_sched_getparam(td, targettd, &sched_param); PROC_UNLOCK(targetp); if (e == 0) e = copyout(&sched_param, uap->param, sizeof(sched_param)); return (e); } int kern_sched_getparam(struct thread *td, struct thread *targettd, struct sched_param *param) { struct proc *targetp; int error; targetp = targettd->td_proc; PROC_LOCK_ASSERT(targetp, MA_OWNED); error = p_cansee(td, targetp); if (error == 0) error = ksched_getparam(ksched, targettd, param); return (error); } int sys_sched_setscheduler(struct thread *td, struct sched_setscheduler_args *uap) { int e; struct sched_param sched_param; struct thread *targettd; struct proc *targetp; e = copyin(uap->param, &sched_param, sizeof(sched_param)); if (e) return (e); if (uap->pid == 0) { targetp = td->td_proc; targettd = td; PROC_LOCK(targetp); } else { targetp = pfind(uap->pid); if (targetp == NULL) return (ESRCH); targettd = FIRST_THREAD_IN_PROC(targetp); } e = kern_sched_setscheduler(td, targettd, uap->policy, &sched_param); PROC_UNLOCK(targetp); return (e); } int kern_sched_setscheduler(struct thread *td, struct thread *targettd, int policy, struct sched_param *param) { struct proc *targetp; int error; targetp = targettd->td_proc; PROC_LOCK_ASSERT(targetp, MA_OWNED); /* Don't allow non root user to set a scheduler policy. */ error = priv_check(td, PRIV_SCHED_SET); if (error) return (error); error = p_cansched(td, targetp); if (error == 0) error = ksched_setscheduler(ksched, targettd, policy, (const struct sched_param *)param); return (error); } int sys_sched_getscheduler(struct thread *td, struct sched_getscheduler_args *uap) { int e, policy; struct thread *targettd; struct proc *targetp; if (uap->pid == 0) { targetp = td->td_proc; targettd = td; PROC_LOCK(targetp); } else { targetp = pfind(uap->pid); if (targetp == NULL) return (ESRCH); targettd = FIRST_THREAD_IN_PROC(targetp); } e = kern_sched_getscheduler(td, targettd, &policy); PROC_UNLOCK(targetp); if (e == 0) td->td_retval[0] = policy; return (e); } int kern_sched_getscheduler(struct thread *td, struct thread *targettd, int *policy) { struct proc *targetp; int error; targetp = targettd->td_proc; PROC_LOCK_ASSERT(targetp, MA_OWNED); error = p_cansee(td, targetp); if (error == 0) error = ksched_getscheduler(ksched, targettd, policy); return (error); } int sys_sched_yield(struct thread *td, struct sched_yield_args *uap) { sched_relinquish(td); return (0); } int sys_sched_get_priority_max(struct thread *td, struct sched_get_priority_max_args *uap) { int error, prio; error = ksched_get_priority_max(ksched, uap->policy, &prio); td->td_retval[0] = prio; return (error); } int sys_sched_get_priority_min(struct thread *td, struct sched_get_priority_min_args *uap) { int error, prio; error = ksched_get_priority_min(ksched, uap->policy, &prio); td->td_retval[0] = prio; return (error); } int sys_sched_rr_get_interval(struct thread *td, struct sched_rr_get_interval_args *uap) { struct timespec timespec; int error; error = kern_sched_rr_get_interval(td, uap->pid, ×pec); if (error == 0) error = copyout(×pec, uap->interval, sizeof(timespec)); return (error); } int kern_sched_rr_get_interval(struct thread *td, pid_t pid, struct timespec *ts) { int e; struct thread *targettd; struct proc *targetp; if (pid == 0) { targettd = td; targetp = td->td_proc; PROC_LOCK(targetp); } else { targetp = pfind(pid); if (targetp == NULL) return (ESRCH); targettd = FIRST_THREAD_IN_PROC(targetp); } e = kern_sched_rr_get_interval_td(td, targettd, ts); PROC_UNLOCK(targetp); return (e); } int kern_sched_rr_get_interval_td(struct thread *td, struct thread *targettd, struct timespec *ts) { struct proc *p; int error; p = targettd->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); error = p_cansee(td, p); if (error == 0) error = ksched_rr_get_interval(ksched, targettd, ts); return (error); } #endif static void p31binit(void *notused) { (void) sched_attach(); p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE); } SYSINIT(p31b, SI_SUB_P1003_1B, SI_ORDER_FIRST, p31binit, NULL); diff --git a/sys/kern/tty_pts.c b/sys/kern/tty_pts.c index dbbf35cf1a12..eaf2f334ebc1 100644 --- a/sys/kern/tty_pts.c +++ b/sys/kern/tty_pts.c @@ -1,877 +1,876 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Ed Schouten * All rights reserved. * * Portions of this software were developed under sponsorship from Snow * B.V., the Netherlands. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* Add compatibility bits for FreeBSD. */ #define PTS_COMPAT /* Add pty(4) compat bits. */ #define PTS_EXTERNAL /* Add bits to make Linux binaries work. */ #define PTS_LINUX #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include /* * Our utmp(5) format is limited to 8-byte TTY line names. This means * we can at most allocate 1000 pseudo-terminals ("pts/999"). Allow * users to increase this number, assuming they have manually increased * UT_LINESIZE. */ static struct unrhdr *pts_pool; static MALLOC_DEFINE(M_PTS, "pts", "pseudo tty device"); /* * Per-PTS structure. * * List of locks * (t) locked by tty_lock() * (c) const until freeing */ struct pts_softc { int pts_unit; /* (c) Device unit number. */ unsigned int pts_flags; /* (t) Device flags. */ #define PTS_PKT 0x1 /* Packet mode. */ #define PTS_FINISHED 0x2 /* Return errors on read()/write(). */ char pts_pkt; /* (t) Unread packet mode data. */ struct cv pts_inwait; /* (t) Blocking write() on master. */ struct selinfo pts_inpoll; /* (t) Select queue for write(). */ struct cv pts_outwait; /* (t) Blocking read() on master. */ struct selinfo pts_outpoll; /* (t) Select queue for read(). */ #ifdef PTS_EXTERNAL struct cdev *pts_cdev; /* (c) Master device node. */ #endif /* PTS_EXTERNAL */ struct ucred *pts_cred; /* (c) Resource limit. */ }; /* * Controller-side file operations. */ static int ptsdev_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0; char pkt; if (uio->uio_resid == 0) return (0); tty_lock(tp); for (;;) { /* * Implement packet mode. When packet mode is turned on, * the first byte contains a bitmask of events that * occurred (start, stop, flush, window size, etc). */ if (psc->pts_flags & PTS_PKT && psc->pts_pkt) { pkt = psc->pts_pkt; psc->pts_pkt = 0; tty_unlock(tp); error = ureadc(pkt, uio); return (error); } /* * Transmit regular data. * * XXX: We shouldn't use ttydisc_getc_poll()! Even * though in this implementation, there is likely going * to be data, we should just call ttydisc_getc_uio() * and use its return value to sleep. */ if (ttydisc_getc_poll(tp)) { if (psc->pts_flags & PTS_PKT) { /* * XXX: Small race. Fortunately PTY * consumers aren't multithreaded. */ tty_unlock(tp); error = ureadc(TIOCPKT_DATA, uio); if (error) return (error); tty_lock(tp); } error = ttydisc_getc_uio(tp, uio); break; } /* Maybe the device isn't used anyway. */ if (psc->pts_flags & PTS_FINISHED) break; /* Wait for more data. */ if (fp->f_flag & O_NONBLOCK) { error = EWOULDBLOCK; break; } error = cv_wait_sig(&psc->pts_outwait, tp->t_mtx); if (error != 0) break; } tty_unlock(tp); return (error); } static int ptsdev_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); char ib[256], *ibstart; size_t iblen, rintlen; int error = 0; if (uio->uio_resid == 0) return (0); for (;;) { ibstart = ib; iblen = MIN(uio->uio_resid, sizeof ib); error = uiomove(ib, iblen, uio); tty_lock(tp); if (error != 0) { iblen = 0; goto done; } /* * When possible, avoid the slow path. rint_bypass() * copies all input to the input queue at once. */ MPASS(iblen > 0); do { rintlen = ttydisc_rint_simple(tp, ibstart, iblen); ibstart += rintlen; iblen -= rintlen; if (iblen == 0) { /* All data written. */ break; } /* Maybe the device isn't used anyway. */ if (psc->pts_flags & PTS_FINISHED) { error = EIO; goto done; } /* Wait for more data. */ if (fp->f_flag & O_NONBLOCK) { error = EWOULDBLOCK; goto done; } /* Wake up users on the slave side. */ ttydisc_rint_done(tp); error = cv_wait_sig(&psc->pts_inwait, tp->t_mtx); if (error != 0) goto done; } while (iblen > 0); if (uio->uio_resid == 0) break; tty_unlock(tp); } done: ttydisc_rint_done(tp); tty_unlock(tp); /* * Don't account for the part of the buffer that we couldn't * pass to the TTY. */ uio->uio_resid += iblen; return (error); } static int ptsdev_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0, sig; switch (cmd) { case FIODTYPE: *(int *)data = D_TTY; return (0); case FIONBIO: /* This device supports non-blocking operation. */ return (0); case FIONREAD: tty_lock(tp); if (psc->pts_flags & PTS_FINISHED) { /* Force read() to be called. */ *(int *)data = 1; } else { *(int *)data = ttydisc_getc_poll(tp); } tty_unlock(tp); return (0); case FIODGNAME: #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: #endif { struct fiodgname_arg *fgn; const char *p; int i; /* Reverse device name lookups, for ptsname() and ttyname(). */ fgn = data; p = tty_devname(tp); i = strlen(p) + 1; if (i > fgn->len) return (EINVAL); return (copyout(p, fiodgname_buf_get_ptr(fgn, cmd), i)); } /* * We need to implement TIOCGPGRP and TIOCGSID here again. When * called on the pseudo-terminal master, it should not check if * the terminal is the foreground terminal of the calling * process. * * TIOCGETA is also implemented here. Various Linux PTY routines * often call isatty(), which is implemented by tcgetattr(). */ #ifdef PTS_LINUX case TIOCGETA: /* Obtain terminal flags through tcgetattr(). */ tty_lock(tp); *(struct termios*)data = tp->t_termios; tty_unlock(tp); return (0); #endif /* PTS_LINUX */ case TIOCSETAF: case TIOCSETAW: /* * We must make sure we turn tcsetattr() calls of TCSAFLUSH and * TCSADRAIN into something different. If an application would * call TCSAFLUSH or TCSADRAIN on the master descriptor, it may * deadlock waiting for all data to be read. */ cmd = TIOCSETA; break; #if defined(PTS_COMPAT) || defined(PTS_LINUX) case TIOCGPTN: /* * Get the device unit number. */ if (psc->pts_unit < 0) return (ENOTTY); *(unsigned int *)data = psc->pts_unit; return (0); #endif /* PTS_COMPAT || PTS_LINUX */ case TIOCGPGRP: /* Get the foreground process group ID. */ tty_lock(tp); if (tp->t_pgrp != NULL) *(int *)data = tp->t_pgrp->pg_id; else *(int *)data = NO_PID; tty_unlock(tp); return (0); case TIOCGSID: /* Get the session leader process ID. */ tty_lock(tp); if (tp->t_session == NULL) error = ENOTTY; else *(int *)data = tp->t_session->s_sid; tty_unlock(tp); return (error); case TIOCPTMASTER: /* Yes, we are a pseudo-terminal master. */ return (0); case TIOCSIG: /* Signal the foreground process group. */ sig = *(int *)data; if (sig < 1 || sig >= NSIG) return (EINVAL); tty_lock(tp); tty_signal_pgrp(tp, sig); tty_unlock(tp); return (0); case TIOCPKT: /* Enable/disable packet mode. */ tty_lock(tp); if (*(int *)data) psc->pts_flags |= PTS_PKT; else psc->pts_flags &= ~PTS_PKT; tty_unlock(tp); return (0); } /* Just redirect this ioctl to the slave device. */ tty_lock(tp); error = tty_ioctl(tp, cmd, data, fp->f_flag, td); tty_unlock(tp); if (error == ENOIOCTL) error = ENOTTY; return (error); } static int ptsdev_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int revents = 0; tty_lock(tp); if (psc->pts_flags & PTS_FINISHED) { /* Slave device is not opened. */ tty_unlock(tp); return ((events & (POLLIN|POLLRDNORM)) | POLLHUP); } if (events & (POLLIN|POLLRDNORM)) { /* See if we can getc something. */ if (ttydisc_getc_poll(tp) || (psc->pts_flags & PTS_PKT && psc->pts_pkt)) revents |= events & (POLLIN|POLLRDNORM); } if (events & (POLLOUT|POLLWRNORM)) { /* See if we can rint something. */ if (ttydisc_rint_poll(tp)) revents |= events & (POLLOUT|POLLWRNORM); } /* * No need to check for POLLHUP here. This device cannot be used * as a callout device, which means we always have a carrier, * because the master is. */ if (revents == 0) { /* * This code might look misleading, but the naming of * poll events on this side is the opposite of the slave * device. */ if (events & (POLLIN|POLLRDNORM)) selrecord(td, &psc->pts_outpoll); if (events & (POLLOUT|POLLWRNORM)) selrecord(td, &psc->pts_inpoll); } tty_unlock(tp); return (revents); } /* * kqueue support. */ static void pts_kqops_read_detach(struct knote *kn) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); knlist_remove(&psc->pts_outpoll.si_note, kn, 0); } static int pts_kqops_read_event(struct knote *kn, long hint) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); if (psc->pts_flags & PTS_FINISHED) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_getc_poll(tp); return (kn->kn_data > 0); } } static void pts_kqops_write_detach(struct knote *kn) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); knlist_remove(&psc->pts_inpoll.si_note, kn, 0); } static int pts_kqops_write_event(struct knote *kn, long hint) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); if (psc->pts_flags & PTS_FINISHED) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_rint_poll(tp); return (kn->kn_data > 0); } } static struct filterops pts_kqops_read = { .f_isfd = 1, .f_detach = pts_kqops_read_detach, .f_event = pts_kqops_read_event, }; static struct filterops pts_kqops_write = { .f_isfd = 1, .f_detach = pts_kqops_write_detach, .f_event = pts_kqops_write_event, }; static int ptsdev_kqfilter(struct file *fp, struct knote *kn) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0; tty_lock(tp); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &pts_kqops_read; knlist_add(&psc->pts_outpoll.si_note, kn, 1); break; case EVFILT_WRITE: kn->kn_fop = &pts_kqops_write; knlist_add(&psc->pts_inpoll.si_note, kn, 1); break; default: error = EINVAL; break; } tty_unlock(tp); return (error); } static int ptsdev_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct tty *tp = fp->f_data; #ifdef PTS_EXTERNAL struct pts_softc *psc = tty_softc(tp); #endif /* PTS_EXTERNAL */ struct cdev *dev = tp->t_dev; /* * According to POSIX, we must implement an fstat(). This also * makes this implementation compatible with Linux binaries, * because Linux calls fstat() on the pseudo-terminal master to * obtain st_rdev. * * XXX: POSIX also mentions we must fill in st_dev, but how? */ bzero(sb, sizeof *sb); #ifdef PTS_EXTERNAL if (psc->pts_cdev != NULL) sb->st_ino = sb->st_rdev = dev2udev(psc->pts_cdev); else #endif /* PTS_EXTERNAL */ sb->st_ino = sb->st_rdev = tty_udev(tp); sb->st_atim = dev->si_atime; sb->st_ctim = dev->si_ctime; sb->st_mtim = dev->si_mtime; sb->st_uid = dev->si_uid; sb->st_gid = dev->si_gid; sb->st_mode = dev->si_mode | S_IFCHR; return (0); } static int ptsdev_close(struct file *fp, struct thread *td) { struct tty *tp = fp->f_data; /* Deallocate TTY device. */ tty_lock(tp); tty_rel_gone(tp); /* * Open of /dev/ptmx or /dev/ptyXX changes the type of file * from DTYPE_VNODE to DTYPE_PTS. vn_open() increases vnode * use count, we need to decrement it, and possibly do other * required cleanup. */ if (fp->f_vnode != NULL) return (vnops.fo_close(fp, td)); return (0); } static int ptsdev_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct tty *tp; kif->kf_type = KF_TYPE_PTS; tp = fp->f_data; kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp); kif->kf_un.kf_pts.kf_pts_dev_freebsd11 = kif->kf_un.kf_pts.kf_pts_dev; /* truncate */ strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path)); return (0); } static struct fileops ptsdev_ops = { .fo_read = ptsdev_read, .fo_write = ptsdev_write, .fo_truncate = invfo_truncate, .fo_ioctl = ptsdev_ioctl, .fo_poll = ptsdev_poll, .fo_kqfilter = ptsdev_kqfilter, .fo_stat = ptsdev_stat, .fo_close = ptsdev_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = ptsdev_fill_kinfo, .fo_flags = DFLAG_PASSABLE, }; /* * Driver-side hooks. */ static void ptsdrv_outwakeup(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); cv_broadcast(&psc->pts_outwait); selwakeup(&psc->pts_outpoll); KNOTE_LOCKED(&psc->pts_outpoll.si_note, 0); } static void ptsdrv_inwakeup(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); cv_broadcast(&psc->pts_inwait); selwakeup(&psc->pts_inpoll); KNOTE_LOCKED(&psc->pts_inpoll.si_note, 0); } static int ptsdrv_open(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); psc->pts_flags &= ~PTS_FINISHED; return (0); } static void ptsdrv_close(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); /* Wake up any blocked readers/writers. */ psc->pts_flags |= PTS_FINISHED; ptsdrv_outwakeup(tp); ptsdrv_inwakeup(tp); } static void ptsdrv_pktnotify(struct tty *tp, char event) { struct pts_softc *psc = tty_softc(tp); /* * Clear conflicting flags. */ switch (event) { case TIOCPKT_STOP: psc->pts_pkt &= ~TIOCPKT_START; break; case TIOCPKT_START: psc->pts_pkt &= ~TIOCPKT_STOP; break; case TIOCPKT_NOSTOP: psc->pts_pkt &= ~TIOCPKT_DOSTOP; break; case TIOCPKT_DOSTOP: psc->pts_pkt &= ~TIOCPKT_NOSTOP; break; } psc->pts_pkt |= event; ptsdrv_outwakeup(tp); } static void ptsdrv_free(void *softc) { struct pts_softc *psc = softc; /* Make device number available again. */ if (psc->pts_unit >= 0) free_unr(pts_pool, psc->pts_unit); chgptscnt(psc->pts_cred->cr_ruidinfo, -1, 0); racct_sub_cred(psc->pts_cred, RACCT_NPTS, 1); crfree(psc->pts_cred); seldrain(&psc->pts_inpoll); seldrain(&psc->pts_outpoll); knlist_destroy(&psc->pts_inpoll.si_note); knlist_destroy(&psc->pts_outpoll.si_note); #ifdef PTS_EXTERNAL /* Destroy master device as well. */ if (psc->pts_cdev != NULL) destroy_dev_sched(psc->pts_cdev); #endif /* PTS_EXTERNAL */ free(psc, M_PTS); } static struct ttydevsw pts_class = { .tsw_flags = TF_NOPREFIX, .tsw_outwakeup = ptsdrv_outwakeup, .tsw_inwakeup = ptsdrv_inwakeup, .tsw_open = ptsdrv_open, .tsw_close = ptsdrv_close, .tsw_pktnotify = ptsdrv_pktnotify, .tsw_free = ptsdrv_free, }; #ifndef PTS_EXTERNAL static #endif /* !PTS_EXTERNAL */ int pts_alloc(int fflags, struct thread *td, struct file *fp) { int unit, ok, error; struct tty *tp; struct pts_softc *psc; struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; /* Resource limiting. */ PROC_LOCK(p); error = racct_add(p, RACCT_NPTS, 1); if (error != 0) { PROC_UNLOCK(p); return (EAGAIN); } ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPTS)); if (!ok) { racct_sub(p, RACCT_NPTS, 1); PROC_UNLOCK(p); return (EAGAIN); } PROC_UNLOCK(p); /* Try to allocate a new pts unit number. */ unit = alloc_unr(pts_pool); if (unit < 0) { racct_sub(p, RACCT_NPTS, 1); chgptscnt(cred->cr_ruidinfo, -1, 0); return (EAGAIN); } /* Allocate TTY and softc. */ psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO); cv_init(&psc->pts_inwait, "ptsin"); cv_init(&psc->pts_outwait, "ptsout"); psc->pts_unit = unit; psc->pts_cred = crhold(cred); tp = tty_alloc(&pts_class, psc); knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx); knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx); /* Expose the slave device as well. */ tty_makedev(tp, td->td_ucred, "pts/%u", psc->pts_unit); finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops); return (0); } #ifdef PTS_EXTERNAL int pts_alloc_external(int fflags, struct thread *td, struct file *fp, struct cdev *dev, const char *name) { int ok, error; struct tty *tp; struct pts_softc *psc; struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; /* Resource limiting. */ PROC_LOCK(p); error = racct_add(p, RACCT_NPTS, 1); if (error != 0) { PROC_UNLOCK(p); return (EAGAIN); } ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPTS)); if (!ok) { racct_sub(p, RACCT_NPTS, 1); PROC_UNLOCK(p); return (EAGAIN); } PROC_UNLOCK(p); /* Allocate TTY and softc. */ psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO); cv_init(&psc->pts_inwait, "ptsin"); cv_init(&psc->pts_outwait, "ptsout"); psc->pts_unit = -1; psc->pts_cdev = dev; psc->pts_cred = crhold(cred); tp = tty_alloc(&pts_class, psc); knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx); knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx); /* Expose the slave device as well. */ tty_makedev(tp, td->td_ucred, "%s", name); finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops); return (0); } #endif /* PTS_EXTERNAL */ int sys_posix_openpt(struct thread *td, struct posix_openpt_args *uap) { int error, fd; struct file *fp; /* * POSIX states it's unspecified when other flags are passed. We * don't allow this. */ if (uap->flags & ~(O_RDWR|O_NOCTTY|O_CLOEXEC)) return (EINVAL); error = falloc(td, &fp, &fd, uap->flags); if (error) return (error); /* Allocate the actual pseudo-TTY. */ error = pts_alloc(FFLAGS(uap->flags & O_ACCMODE), td, fp); if (error != 0) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } /* Pass it back to userspace. */ td->td_retval[0] = fd; fdrop(fp, td); return (0); } static void pts_init(void *unused) { pts_pool = new_unrhdr(0, INT_MAX, NULL); } SYSINIT(pts, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, pts_init, NULL); diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index 44079bae1e9b..a5f27f6d3148 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -1,1624 +1,1626 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#ifdef COMPAT_43 #include +#endif #include #include #include #ifdef KTRACE #include #endif #ifdef COMPAT_FREEBSD32 #include #endif #include #include #include static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); static int accept1(struct thread *td, int s, struct sockaddr *uname, socklen_t *anamelen, int flags); static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat); static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat); static int sockargs(struct mbuf **, char *, socklen_t, int); /* * Convert a user file descriptor to a kernel file entry and check if required * capability rights are present. * If required copy of current set of capability rights is returned. * A reference on the file entry is held upon returning. */ int getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp, u_int *fflagp, struct filecaps *havecapsp) { struct file *fp; int error; error = fget_cap(td, fd, rightsp, &fp, havecapsp); if (error != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, td); if (havecapsp != NULL) filecaps_free(havecapsp); return (ENOTSOCK); } if (fflagp != NULL) *fflagp = fp->f_flag; *fpp = fp; return (0); } /* * System call interface to the socket abstraction. */ #if defined(COMPAT_43) #define COMPAT_OLDSOCK #endif int sys_socket(struct thread *td, struct socket_args *uap) { return (kern_socket(td, uap->domain, uap->type, uap->protocol)); } int kern_socket(struct thread *td, int domain, int type, int protocol) { struct socket *so; struct file *fp; int fd, error, oflag, fflag; AUDIT_ARG_SOCKET(domain, type, protocol); oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC error = mac_socket_check_create(td->td_ucred, domain, type, protocol); if (error != 0) return (error); #endif error = falloc(td, &fp, &fd, oflag); if (error != 0) return (error); /* An extra reference on `fp' has been held for us by falloc(). */ error = socreate(domain, &so, type, protocol, td->td_ucred, td); if (error != 0) { fdclose(td, fp, fd); } else { finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops); if ((fflag & FNONBLOCK) != 0) (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td); td->td_retval[0] = fd; } fdrop(fp, td); return (error); } int sys_bind(struct thread *td, struct bind_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; int error; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (dirfd == AT_FDCWD)) return (ECAPMODE); #endif AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td, fd, &cap_bind_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_bind(td->td_ucred, so, sa); if (error == 0) { #endif if (dirfd == AT_FDCWD) error = sobind(so, sa, td); else error = sobindat(dirfd, so, sa, td); #ifdef MAC } #endif fdrop(fp, td); return (error); } int sys_bindat(struct thread *td, struct bindat_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } int sys_listen(struct thread *td, struct listen_args *uap) { return (kern_listen(td, uap->s, uap->backlog)); } int kern_listen(struct thread *td, int s, int backlog) { struct socket *so; struct file *fp; int error; AUDIT_ARG_FD(s); error = getsock_cap(td, s, &cap_listen_rights, &fp, NULL, NULL); if (error == 0) { so = fp->f_data; #ifdef MAC error = mac_socket_check_listen(td->td_ucred, so); if (error == 0) #endif error = solisten(so, backlog, td); fdrop(fp, td); } return (error); } /* * accept1() */ static int accept1(td, s, uname, anamelen, flags) struct thread *td; int s; struct sockaddr *uname; socklen_t *anamelen; int flags; { struct sockaddr *name; socklen_t namelen; struct file *fp; int error; if (uname == NULL) return (kern_accept4(td, s, NULL, NULL, flags, NULL)); error = copyin(anamelen, &namelen, sizeof (namelen)); if (error != 0) return (error); error = kern_accept4(td, s, &name, &namelen, flags, &fp); if (error != 0) return (error); if (error == 0 && uname != NULL) { #ifdef COMPAT_OLDSOCK if (SV_PROC_FLAG(td->td_proc, SV_AOUT) && (flags & ACCEPT4_COMPAT) != 0) ((struct osockaddr *)name)->sa_family = name->sa_family; #endif error = copyout(name, uname, namelen); } if (error == 0) error = copyout(&namelen, anamelen, sizeof(namelen)); if (error != 0) fdclose(td, fp, td->td_retval[0]); fdrop(fp, td); free(name, M_SONAME); return (error); } int kern_accept(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, struct file **fp) { return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp)); } int kern_accept4(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, int flags, struct file **fp) { struct file *headfp, *nfp = NULL; struct sockaddr *sa = NULL; struct socket *head, *so; struct filecaps fcaps; u_int fflag; pid_t pgid; int error, fd, tmp; if (name != NULL) *name = NULL; AUDIT_ARG_FD(s); error = getsock_cap(td, s, &cap_accept_rights, &headfp, &fflag, &fcaps); if (error != 0) return (error); head = headfp->f_data; if (!SOLISTENING(head)) { error = EINVAL; goto done; } #ifdef MAC error = mac_socket_check_accept(td->td_ucred, head); if (error != 0) goto done; #endif error = falloc_caps(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps); if (error != 0) goto done; SOCK_LOCK(head); if (!SOLISTENING(head)) { SOCK_UNLOCK(head); error = EINVAL; goto noconnection; } error = solisten_dequeue(head, &so, flags); if (error != 0) goto noconnection; /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; /* Connection has been removed from the listen queue. */ KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0); if (flags & ACCEPT4_INHERIT) { pgid = fgetown(&head->so_sigio); if (pgid != 0) fsetown(pgid, &so->so_sigio); } else { fflag &= ~(FNONBLOCK | FASYNC); if (flags & SOCK_NONBLOCK) fflag |= FNONBLOCK; } finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); /* Sync socket nonblocking/async state with file flags */ tmp = fflag & FNONBLOCK; (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); error = soaccept(so, &sa); if (error != 0) goto noconnection; if (sa == NULL) { if (name) *namelen = 0; goto done; } AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa); if (name) { /* check sa_len before it is destroyed */ if (*namelen > sa->sa_len) *namelen = sa->sa_len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif *name = sa; sa = NULL; } noconnection: free(sa, M_SONAME); /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error != 0) fdclose(td, nfp, fd); /* * Release explicitly held references before returning. We return * a reference on nfp to the caller on success if they request it. */ done: if (nfp == NULL) filecaps_free(&fcaps); if (fp != NULL) { if (error == 0) { *fp = nfp; nfp = NULL; } else *fp = NULL; } if (nfp != NULL) fdrop(nfp, td); fdrop(headfp, td); return (error); } int sys_accept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT)); } int sys_accept4(td, uap) struct thread *td; struct accept4_args *uap; { if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return (EINVAL); return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags)); } #ifdef COMPAT_OLDSOCK int oaccept(struct thread *td, struct oaccept_args *uap) { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT | ACCEPT4_COMPAT)); } #endif /* COMPAT_OLDSOCK */ int sys_connect(struct thread *td, struct connect_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; int error; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (dirfd == AT_FDCWD)) return (ECAPMODE); #endif AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td, fd, &cap_connect_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; if (so->so_state & SS_ISCONNECTING) { error = EALREADY; goto done1; } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_connect(td->td_ucred, so, sa); if (error != 0) goto bad; #endif error = soconnectat(dirfd, so, sa, td); if (error != 0) goto bad; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { error = EINPROGRESS; goto done1; } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { error = msleep(&so->so_timeo, &so->so_lock, PSOCK | PCATCH, "connec", 0); if (error != 0) break; } if (error == 0) { error = so->so_error; so->so_error = 0; } SOCK_UNLOCK(so); bad: if (error == ERESTART) error = EINTR; done1: fdrop(fp, td); return (error); } int sys_connectat(struct thread *td, struct connectat_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_socketpair(struct thread *td, int domain, int type, int protocol, int *rsv) { struct file *fp1, *fp2; struct socket *so1, *so2; int fd, error, oflag, fflag; AUDIT_ARG_SOCKET(domain, type, protocol); oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC /* We might want to have a separate check for socket pairs. */ error = mac_socket_check_create(td->td_ucred, domain, type, protocol); if (error != 0) return (error); #endif error = socreate(domain, &so1, type, protocol, td->td_ucred, td); if (error != 0) return (error); error = socreate(domain, &so2, type, protocol, td->td_ucred, td); if (error != 0) goto free1; /* On success extra reference to `fp1' and 'fp2' is set by falloc. */ error = falloc(td, &fp1, &fd, oflag); if (error != 0) goto free2; rsv[0] = fd; fp1->f_data = so1; /* so1 already has ref count */ error = falloc(td, &fp2, &fd, oflag); if (error != 0) goto free3; fp2->f_data = so2; /* so2 already has ref count */ rsv[1] = fd; error = soconnect2(so1, so2); if (error != 0) goto free4; if (type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. */ error = soconnect2(so2, so1); if (error != 0) goto free4; } else if (so1->so_proto->pr_flags & PR_CONNREQUIRED) { struct unpcb *unp, *unp2; unp = sotounpcb(so1); unp2 = sotounpcb(so2); /* * No need to lock the unps, because the sockets are brand-new. * No other threads can be using them yet */ unp_copy_peercred(td, unp, unp2, unp); } finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data, &socketops); finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data, &socketops); if ((fflag & FNONBLOCK) != 0) { (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td); (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td); } fdrop(fp1, td); fdrop(fp2, td); return (0); free4: fdclose(td, fp2, rsv[1]); fdrop(fp2, td); free3: fdclose(td, fp1, rsv[0]); fdrop(fp1, td); free2: if (so2 != NULL) (void)soclose(so2); free1: if (so1 != NULL) (void)soclose(so1); return (error); } int sys_socketpair(struct thread *td, struct socketpair_args *uap) { int error, sv[2]; error = kern_socketpair(td, uap->domain, uap->type, uap->protocol, sv); if (error != 0) return (error); error = copyout(sv, uap->rsv, 2 * sizeof(int)); if (error != 0) { (void)kern_close(td, sv[0]); (void)kern_close(td, sv[1]); } return (error); } static int sendit(struct thread *td, int s, struct msghdr *mp, int flags) { struct mbuf *control; struct sockaddr *to; int error; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL)) return (ECAPMODE); #endif if (mp->msg_name != NULL) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error != 0) { to = NULL; goto bad; } mp->msg_name = to; } else { to = NULL; } if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr) #ifdef COMPAT_OLDSOCK && (mp->msg_flags != MSG_COMPAT || !SV_PROC_FLAG(td->td_proc, SV_AOUT)) #endif ) { error = EINVAL; goto bad; } error = sockargs(&control, mp->msg_control, mp->msg_controllen, MT_CONTROL); if (error != 0) goto bad; #ifdef COMPAT_OLDSOCK if (mp->msg_flags == MSG_COMPAT && SV_PROC_FLAG(td->td_proc, SV_AOUT)) { struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_WAITOK); cm = mtod(control, struct cmsghdr *); cm->cmsg_len = control->m_len; cm->cmsg_level = SOL_SOCKET; cm->cmsg_type = SCM_RIGHTS; } #endif } else { control = NULL; } error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE); bad: free(to, M_SONAME); return (error); } int kern_sendit(struct thread *td, int s, struct msghdr *mp, int flags, struct mbuf *control, enum uio_seg segflg) { struct file *fp; struct uio auio; struct iovec *iov; struct socket *so; cap_rights_t *rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int i, error; AUDIT_ARG_FD(s); rights = &cap_send_rights; if (mp->msg_name != NULL) { AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name); rights = &cap_send_connect_rights; } error = getsock_cap(td, s, rights, &fp, NULL, NULL); if (error != 0) { m_freem(control); return (error); } so = (struct socket *)fp->f_data; #ifdef KTRACE if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(mp->msg_name); #endif #ifdef MAC if (mp->msg_name != NULL) { error = mac_socket_check_connect(td->td_ucred, so, mp->msg_name); if (error != 0) { m_freem(control); goto bad; } } error = mac_socket_check_send(td->td_ucred, so); if (error != 0) { m_freem(control); goto bad; } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = segflg; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; m_freem(control); goto bad; } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = sosend(so, mp->msg_name, &auio, 0, control, flags, td); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(s, UIO_WRITE, ktruio, error); } #endif bad: fdrop(fp, td); return (error); } int sys_sendto(struct thread *td, struct sendto_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = __DECONST(void *, uap->to); msg.msg_namelen = uap->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; #ifdef COMPAT_OLDSOCK if (SV_PROC_FLAG(td->td_proc, SV_AOUT)) msg.msg_flags = 0; #endif aiov.iov_base = __DECONST(void *, uap->buf); aiov.iov_len = uap->len; return (sendit(td, uap->s, &msg, uap->flags)); } #ifdef COMPAT_OLDSOCK int osend(struct thread *td, struct osend_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = __DECONST(void *, uap->buf); aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = 0; return (sendit(td, uap->s, &msg, uap->flags)); } int osendmsg(struct thread *td, struct osendmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; msg.msg_flags = MSG_COMPAT; error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } #endif int sys_sendmsg(struct thread *td, struct sendmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK if (SV_PROC_FLAG(td->td_proc, SV_AOUT)) msg.msg_flags = 0; #endif error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } int kern_recvit(struct thread *td, int s, struct msghdr *mp, enum uio_seg fromseg, struct mbuf **controlp) { struct uio auio; struct iovec *iov; struct mbuf *control, *m; caddr_t ctlbuf; struct file *fp; struct socket *so; struct sockaddr *fromsa = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int error, i; if (controlp != NULL) *controlp = NULL; AUDIT_ARG_FD(s); error = getsock_cap(td, s, &cap_recv_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef MAC error = mac_socket_check_receive(td->td_ucred, so); if (error != 0) { fdrop(fp, td); return (error); } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { fdrop(fp, td); return (EINVAL); } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif control = NULL; len = auio.uio_resid; error = soreceive(so, &fromsa, &auio, NULL, (mp->msg_control || controlp) ? &control : NULL, &mp->msg_flags); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } if (fromsa != NULL) AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa); #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = len - auio.uio_resid; ktrgenio(s, UIO_READ, ktruio, error); } #endif if (error != 0) goto out; td->td_retval[0] = len - auio.uio_resid; if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == NULL) len = 0; else { /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); #ifdef COMPAT_OLDSOCK if ((mp->msg_flags & MSG_COMPAT) != 0 && SV_PROC_FLAG(td->td_proc, SV_AOUT)) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif if (fromseg == UIO_USERSPACE) { error = copyout(fromsa, mp->msg_name, (unsigned)len); if (error != 0) goto out; } else bcopy(fromsa, mp->msg_name, len); } mp->msg_namelen = len; } if (mp->msg_control && controlp == NULL) { #ifdef COMPAT_OLDSOCK /* * We assume that old recvmsg calls won't receive access * rights and other control info, esp. as control info * is always optional and those options didn't exist in 4.3. * If we receive rights, trim the cmsghdr; anything else * is tossed. */ if (control && (mp->msg_flags & MSG_COMPAT) != 0 && SV_PROC_FLAG(td->td_proc, SV_AOUT)) { if (mtod(control, struct cmsghdr *)->cmsg_level != SOL_SOCKET || mtod(control, struct cmsghdr *)->cmsg_type != SCM_RIGHTS) { mp->msg_controllen = 0; goto out; } control->m_len -= sizeof (struct cmsghdr); control->m_data += sizeof (struct cmsghdr); } #endif ctlbuf = mp->msg_control; len = mp->msg_controllen; mp->msg_controllen = 0; for (m = control; m != NULL && len >= m->m_len; m = m->m_next) { if ((error = copyout(mtod(m, caddr_t), ctlbuf, m->m_len)) != 0) goto out; ctlbuf += m->m_len; len -= m->m_len; mp->msg_controllen += m->m_len; } if (m != NULL) { mp->msg_flags |= MSG_CTRUNC; m_dispose_extcontrolm(m); } } out: fdrop(fp, td); #ifdef KTRACE if (fromsa && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(fromsa); #endif free(fromsa, M_SONAME); if (error == 0 && controlp != NULL) *controlp = control; else if (control != NULL) { if (error != 0) m_dispose_extcontrolm(control); m_freem(control); } return (error); } static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp) { int error; error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL); if (error != 0) return (error); if (namelenp != NULL) { error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); #ifdef COMPAT_OLDSOCK if ((mp->msg_flags & MSG_COMPAT) != 0 && SV_PROC_FLAG(td->td_proc, SV_AOUT)) error = 0; /* old recvfrom didn't check */ #endif } return (error); } int sys_recvfrom(struct thread *td, struct recvfrom_args *uap) { struct msghdr msg; struct iovec aiov; int error; if (uap->fromlenaddr) { error = copyin(uap->fromlenaddr, &msg.msg_namelen, sizeof (msg.msg_namelen)); if (error != 0) goto done2; } else { msg.msg_namelen = 0; } msg.msg_name = uap->from; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, uap->fromlenaddr); done2: return (error); } #ifdef COMPAT_OLDSOCK int orecvfrom(struct thread *td, struct recvfrom_args *uap) { uap->flags |= MSG_COMPAT; return (sys_recvfrom(td, uap)); } #endif #ifdef COMPAT_OLDSOCK int orecv(struct thread *td, struct orecv_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; return (recvit(td, uap->s, &msg, NULL)); } /* * Old recvmsg. This code takes advantage of the fact that the old msghdr * overlays the new one, missing only the flags, and with the (old) access * rights where the control fields are now. */ int orecvmsg(struct thread *td, struct orecvmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags | MSG_COMPAT; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); if (msg.msg_controllen && error == 0) error = copyout(&msg.msg_controllen, &uap->msg->msg_accrightslen, sizeof (int)); free(iov, M_IOV); return (error); } #endif int sys_recvmsg(struct thread *td, struct recvmsg_args *uap) { struct msghdr msg; struct iovec *uiov, *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags; #ifdef COMPAT_OLDSOCK if (SV_PROC_FLAG(td->td_proc, SV_AOUT)) msg.msg_flags &= ~MSG_COMPAT; #endif uiov = msg.msg_iov; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, NULL); if (error == 0) { msg.msg_iov = uiov; error = copyout(&msg, uap->msg, sizeof(msg)); } free(iov, M_IOV); return (error); } int sys_shutdown(struct thread *td, struct shutdown_args *uap) { return (kern_shutdown(td, uap->s, uap->how)); } int kern_shutdown(struct thread *td, int s, int how) { struct socket *so; struct file *fp; int error; AUDIT_ARG_FD(s); error = getsock_cap(td, s, &cap_shutdown_rights, &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = soshutdown(so, how); /* * Previous versions did not return ENOTCONN, but 0 in * case the socket was not connected. Some important * programs like syslogd up to r279016, 2015-02-19, * still depend on this behavior. */ if (error == ENOTCONN && td->td_proc->p_osrel < P_OSREL_SHUTDOWN_ENOTCONN) error = 0; fdrop(fp, td); } return (error); } int sys_setsockopt(struct thread *td, struct setsockopt_args *uap) { return (kern_setsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, uap->valsize)); } int kern_setsockopt(struct thread *td, int s, int level, int name, const void *val, enum uio_seg valseg, socklen_t valsize) { struct socket *so; struct file *fp; struct sockopt sopt; int error; if (val == NULL && valsize != 0) return (EFAULT); if ((int)valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = __DECONST(void *, val); sopt.sopt_valsize = valsize; switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_setsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock_cap(td, s, &cap_setsockopt_rights, &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = sosetopt(so, &sopt); fdrop(fp, td); } return(error); } int sys_getsockopt(struct thread *td, struct getsockopt_args *uap) { socklen_t valsize; int error; if (uap->val) { error = copyin(uap->avalsize, &valsize, sizeof (valsize)); if (error != 0) return (error); } error = kern_getsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, &valsize); if (error == 0) error = copyout(&valsize, uap->avalsize, sizeof (valsize)); return (error); } /* * Kernel version of getsockopt. * optval can be a userland or userspace. optlen is always a kernel pointer. */ int kern_getsockopt(struct thread *td, int s, int level, int name, void *val, enum uio_seg valseg, socklen_t *valsize) { struct socket *so; struct file *fp; struct sockopt sopt; int error; if (val == NULL) *valsize = 0; if ((int)*valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_GET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */ switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_getsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock_cap(td, s, &cap_getsockopt_rights, &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = sogetopt(so, &sopt); *valsize = sopt.sopt_valsize; fdrop(fp, td); } return (error); } /* * getsockname1() - Get socket name. */ static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat) { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof(len)); if (error != 0) return (error); error = kern_getsockname(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat && SV_PROC_FLAG(td->td_proc, SV_AOUT)) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; socklen_t len; int error; AUDIT_ARG_FD(fd); error = getsock_cap(td, fd, &cap_getsockname_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; *sa = NULL; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(*sa); #endif bad: fdrop(fp, td); if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } return (error); } int sys_getsockname(struct thread *td, struct getsockname_args *uap) { return (getsockname1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetsockname(struct thread *td, struct getsockname_args *uap) { return (getsockname1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* * getpeername1() - Get name of peer for connected socket. */ static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat) { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof (len)); if (error != 0) return (error); error = kern_getpeername(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat && SV_PROC_FLAG(td->td_proc, SV_AOUT)) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; socklen_t len; int error; AUDIT_ARG_FD(fd); error = getsock_cap(td, fd, &cap_getpeername_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { error = ENOTCONN; goto done; } *sa = NULL; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(*sa); #endif bad: if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } done: fdrop(fp, td); return (error); } int sys_getpeername(struct thread *td, struct getpeername_args *uap) { return (getpeername1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetpeername(struct thread *td, struct ogetpeername_args *uap) { /* XXX uap should have type `getpeername_args *' to begin with. */ return (getpeername1(td, (struct getpeername_args *)uap, 1)); } #endif /* COMPAT_OLDSOCK */ static int sockargs(struct mbuf **mp, char *buf, socklen_t buflen, int type) { struct sockaddr *sa; struct mbuf *m; int error; if (buflen > MLEN) { #ifdef COMPAT_OLDSOCK if (type == MT_SONAME && buflen <= 112 && SV_CURPROC_FLAG(SV_AOUT)) buflen = MLEN; /* unix domain compat. hack */ else #endif if (buflen > MCLBYTES) return (EINVAL); } m = m_get2(buflen, M_WAITOK, type, 0); m->m_len = buflen; error = copyin(buf, mtod(m, void *), buflen); if (error != 0) (void) m_free(m); else { *mp = m; if (type == MT_SONAME) { sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX && SV_CURPROC_FLAG(SV_AOUT)) sa->sa_family = sa->sa_len; #endif sa->sa_len = buflen; } } return (error); } int getsockaddr(struct sockaddr **namp, const struct sockaddr *uaddr, size_t len) { struct sockaddr *sa; int error; if (len > SOCK_MAXADDRLEN) return (ENAMETOOLONG); if (len < offsetof(struct sockaddr, sa_data[0])) return (EINVAL); sa = malloc(len, M_SONAME, M_WAITOK); error = copyin(uaddr, sa, len); if (error != 0) { free(sa, M_SONAME); } else { #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX && SV_CURPROC_FLAG(SV_AOUT)) sa->sa_family = sa->sa_len; #endif sa->sa_len = len; *namp = sa; } return (error); } /* * Dispose of externalized rights from an SCM_RIGHTS message. This function * should be used in error or truncation cases to avoid leaking file descriptors * into the recipient's (the current thread's) table. */ void m_dispose_extcontrolm(struct mbuf *m) { struct cmsghdr *cm; struct file *fp; struct thread *td; socklen_t clen, datalen; int error, fd, *fds, nfd; td = curthread; for (; m != NULL; m = m->m_next) { if (m->m_type != MT_EXTCONTROL) continue; cm = mtod(m, struct cmsghdr *); clen = m->m_len; while (clen > 0) { if (clen < sizeof(*cm)) panic("%s: truncated mbuf %p", __func__, m); datalen = CMSG_SPACE(cm->cmsg_len - CMSG_SPACE(0)); if (clen < datalen) panic("%s: truncated mbuf %p", __func__, m); if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { fds = (int *)CMSG_DATA(cm); nfd = (cm->cmsg_len - CMSG_SPACE(0)) / sizeof(int); while (nfd-- > 0) { fd = *fds++; error = fget(td, fd, &cap_no_rights, &fp); if (error == 0) { fdclose(td, fp, fd); fdrop(fp, td); } } } clen -= datalen; cm = (struct cmsghdr *)((uint8_t *)cm + datalen); } m_chtype(m, MT_CONTROL); } } diff --git a/sys/kern/vfs_acl.c b/sys/kern/vfs_acl.c index 6bae01bd9d85..596a26f8acea 100644 --- a/sys/kern/vfs_acl.c +++ b/sys/kern/vfs_acl.c @@ -1,599 +1,598 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1999-2006, 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Developed by the TrustedBSD Project. * * ACL system calls and other functions common across different ACL types. * Type-specific routines go into subr_acl_.c. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include CTASSERT(ACL_MAX_ENTRIES >= OLDACL_MAX_ENTRIES); MALLOC_DEFINE(M_ACL, "acl", "Access Control Lists"); static int kern___acl_aclcheck_path(struct thread *td, const char *path, acl_type_t type, struct acl *aclp, int follow); static int kern___acl_delete_path(struct thread *td, const char *path, acl_type_t type, int follow); static int kern___acl_get_path(struct thread *td, const char *path, acl_type_t type, struct acl *aclp, int follow); static int kern___acl_set_path(struct thread *td, const char *path, acl_type_t type, const struct acl *aclp, int follow); static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, const struct acl *aclp); static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, const struct acl *aclp); int acl_copy_oldacl_into_acl(const struct oldacl *source, struct acl *dest) { int i; if (source->acl_cnt < 0 || source->acl_cnt > OLDACL_MAX_ENTRIES) return (EINVAL); bzero(dest, sizeof(*dest)); dest->acl_cnt = source->acl_cnt; dest->acl_maxcnt = ACL_MAX_ENTRIES; for (i = 0; i < dest->acl_cnt; i++) { dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag; dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id; dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm; } return (0); } int acl_copy_acl_into_oldacl(const struct acl *source, struct oldacl *dest) { int i; if (source->acl_cnt > OLDACL_MAX_ENTRIES) return (EINVAL); bzero(dest, sizeof(*dest)); dest->acl_cnt = source->acl_cnt; for (i = 0; i < dest->acl_cnt; i++) { dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag; dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id; dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm; } return (0); } /* * At one time, "struct ACL" was extended in order to add support for NFSv4 * ACLs. Instead of creating compatibility versions of all the ACL-related * syscalls, they were left intact. It's possible to find out what the code * calling these syscalls (libc) expects basing on "type" argument - if it's * either ACL_TYPE_ACCESS_OLD or ACL_TYPE_DEFAULT_OLD (which previously were * known as ACL_TYPE_ACCESS and ACL_TYPE_DEFAULT), then it's the "struct * oldacl". If it's something else, then it's the new "struct acl". In the * latter case, the routines below just copyin/copyout the contents. In the * former case, they copyin the "struct oldacl" and convert it to the new * format. */ static int acl_copyin(const void *user_acl, struct acl *kernel_acl, acl_type_t type) { int error; struct oldacl old; switch (type) { case ACL_TYPE_ACCESS_OLD: case ACL_TYPE_DEFAULT_OLD: error = copyin(user_acl, &old, sizeof(old)); if (error != 0) break; acl_copy_oldacl_into_acl(&old, kernel_acl); break; default: error = copyin(user_acl, kernel_acl, sizeof(*kernel_acl)); if (kernel_acl->acl_maxcnt != ACL_MAX_ENTRIES) return (EINVAL); } return (error); } static int acl_copyout(const struct acl *kernel_acl, void *user_acl, acl_type_t type) { uint32_t am; int error; struct oldacl old; switch (type) { case ACL_TYPE_ACCESS_OLD: case ACL_TYPE_DEFAULT_OLD: error = acl_copy_acl_into_oldacl(kernel_acl, &old); if (error != 0) break; error = copyout(&old, user_acl, sizeof(old)); break; default: error = fueword32((char *)user_acl + offsetof(struct acl, acl_maxcnt), &am); if (error == -1) return (EFAULT); if (am != ACL_MAX_ENTRIES) return (EINVAL); error = copyout(kernel_acl, user_acl, sizeof(*kernel_acl)); } return (error); } /* * Convert "old" type - ACL_TYPE_{ACCESS,DEFAULT}_OLD - into its "new" * counterpart. It's required for old (pre-NFSv4 ACLs) libc to work * with new kernel. Fixing 'type' for old binaries with new libc * is being done in lib/libc/posix1e/acl_support.c:_acl_type_unold(). */ static int acl_type_unold(int type) { switch (type) { case ACL_TYPE_ACCESS_OLD: return (ACL_TYPE_ACCESS); case ACL_TYPE_DEFAULT_OLD: return (ACL_TYPE_DEFAULT); default: return (type); } } /* * These calls wrap the real vnode operations, and are called by the syscall * code once the syscall has converted the path or file descriptor to a vnode * (unlocked). The aclp pointer is assumed still to point to userland, so * this should not be consumed within the kernel except by syscall code. * Other code should directly invoke VOP_{SET,GET}ACL. */ /* * Given a vnode, set its ACL. */ static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, const struct acl *aclp) { struct acl *inkernelacl; struct mount *mp; int error; AUDIT_ARG_VALUE(type); inkernelacl = acl_alloc(M_WAITOK); error = acl_copyin(aclp, inkernelacl, type); if (error != 0) goto out; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) goto out; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); #ifdef MAC error = mac_vnode_check_setacl(td->td_ucred, vp, type, inkernelacl); if (error != 0) goto out_unlock; #endif error = VOP_SETACL(vp, acl_type_unold(type), inkernelacl, td->td_ucred, td); #ifdef MAC out_unlock: #endif VOP_UNLOCK(vp); vn_finished_write(mp); out: acl_free(inkernelacl); return (error); } /* * Given a vnode, get its ACL. */ static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl *inkernelacl; int error; AUDIT_ARG_VALUE(type); inkernelacl = acl_alloc(M_WAITOK | M_ZERO); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); #ifdef MAC error = mac_vnode_check_getacl(td->td_ucred, vp, type); if (error != 0) goto out; #endif error = VOP_GETACL(vp, acl_type_unold(type), inkernelacl, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp); if (error == 0) error = acl_copyout(inkernelacl, aclp, type); acl_free(inkernelacl); return (error); } /* * Given a vnode, delete its ACL. */ static int vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) { struct mount *mp; int error; AUDIT_ARG_VALUE(type); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); #ifdef MAC error = mac_vnode_check_deleteacl(td->td_ucred, vp, type); if (error != 0) goto out; #endif error = VOP_SETACL(vp, acl_type_unold(type), 0, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Given a vnode, check whether an ACL is appropriate for it * * XXXRW: No vnode lock held so can't audit vnode state...? */ static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, const struct acl *aclp) { struct acl *inkernelacl; int error; inkernelacl = acl_alloc(M_WAITOK); error = acl_copyin(aclp, inkernelacl, type); if (error != 0) goto out; error = VOP_ACLCHECK(vp, acl_type_unold(type), inkernelacl, td->td_ucred, td); out: acl_free(inkernelacl); return (error); } /* * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. Don't * need to lock, as the vacl_ code will get/release any locks required. */ /* * Given a file path, get an ACL for it */ int sys___acl_get_file(struct thread *td, struct __acl_get_file_args *uap) { return (kern___acl_get_path(td, uap->path, uap->type, uap->aclp, FOLLOW)); } /* * Given a file path, get an ACL for it; don't follow links. */ int sys___acl_get_link(struct thread *td, struct __acl_get_link_args *uap) { return(kern___acl_get_path(td, uap->path, uap->type, uap->aclp, NOFOLLOW)); } static int kern___acl_get_path(struct thread *td, const char *path, acl_type_t type, struct acl *aclp, int follow) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td); error = namei(&nd); if (error == 0) { error = vacl_get_acl(td, nd.ni_vp, type, aclp); NDFREE(&nd, 0); } return (error); } /* * Given a file path, set an ACL for it. */ int sys___acl_set_file(struct thread *td, struct __acl_set_file_args *uap) { return(kern___acl_set_path(td, uap->path, uap->type, uap->aclp, FOLLOW)); } /* * Given a file path, set an ACL for it; don't follow links. */ int sys___acl_set_link(struct thread *td, struct __acl_set_link_args *uap) { return(kern___acl_set_path(td, uap->path, uap->type, uap->aclp, NOFOLLOW)); } static int kern___acl_set_path(struct thread *td, const char *path, acl_type_t type, const struct acl *aclp, int follow) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td); error = namei(&nd); if (error == 0) { error = vacl_set_acl(td, nd.ni_vp, type, aclp); NDFREE(&nd, 0); } return (error); } /* * Given a file descriptor, get an ACL for it. */ int sys___acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->filedes); error = getvnode(td, uap->filedes, cap_rights_init_one(&rights, CAP_ACL_GET), &fp); if (error == 0) { error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); } return (error); } /* * Given a file descriptor, set an ACL for it. */ int sys___acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->filedes); error = getvnode(td, uap->filedes, cap_rights_init_one(&rights, CAP_ACL_SET), &fp); if (error == 0) { error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); } return (error); } /* * Given a file path, delete an ACL from it. */ int sys___acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) { return (kern___acl_delete_path(td, uap->path, uap->type, FOLLOW)); } /* * Given a file path, delete an ACL from it; don't follow links. */ int sys___acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap) { return (kern___acl_delete_path(td, uap->path, uap->type, NOFOLLOW)); } static int kern___acl_delete_path(struct thread *td, const char *path, acl_type_t type, int follow) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, follow, UIO_USERSPACE, path, td); error = namei(&nd); if (error == 0) { error = vacl_delete(td, nd.ni_vp, type); NDFREE(&nd, 0); } return (error); } /* * Given a file path, delete an ACL from it. */ int sys___acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->filedes); error = getvnode(td, uap->filedes, cap_rights_init_one(&rights, CAP_ACL_DELETE), &fp); if (error == 0) { error = vacl_delete(td, fp->f_vnode, uap->type); fdrop(fp, td); } return (error); } /* * Given a file path, check an ACL for it. */ int sys___acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) { return (kern___acl_aclcheck_path(td, uap->path, uap->type, uap->aclp, FOLLOW)); } /* * Given a file path, check an ACL for it; don't follow links. */ int sys___acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap) { return (kern___acl_aclcheck_path(td, uap->path, uap->type, uap->aclp, NOFOLLOW)); } static int kern___acl_aclcheck_path(struct thread *td, const char *path, acl_type_t type, struct acl *aclp, int follow) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, follow, UIO_USERSPACE, path, td); error = namei(&nd); if (error == 0) { error = vacl_aclcheck(td, nd.ni_vp, type, aclp); NDFREE(&nd, 0); } return (error); } /* * Given a file descriptor, check an ACL for it. */ int sys___acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->filedes); error = getvnode(td, uap->filedes, cap_rights_init_one(&rights, CAP_ACL_CHECK), &fp); if (error == 0) { error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); } return (error); } struct acl * acl_alloc(int flags) { struct acl *aclp; aclp = malloc(sizeof(*aclp), M_ACL, flags); if (aclp == NULL) return (NULL); aclp->acl_maxcnt = ACL_MAX_ENTRIES; return (aclp); } void acl_free(struct acl *aclp) { free(aclp, M_ACL); } diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index 82fa7990937a..32a31ec80cfb 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -1,3164 +1,3164 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. John S. Dyson's name may not be used to endorse or promote products * derived from this software without specific prior written permission. * * DISCLAIMER: This code isn't warranted to do anything useful. Anything * bad that happens because of using this software isn't the responsibility * of the author. This software is distributed AS-IS. */ /* * This file contains support for the POSIX 1003.1B AIO/LIO facility. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Counter for allocating reference ids to new jobs. Wrapped to 1 on * overflow. (XXX will be removed soon.) */ static u_long jobrefid; /* * Counter for aio_fsync. */ static uint64_t jobseqno; #ifndef MAX_AIO_PER_PROC #define MAX_AIO_PER_PROC 32 #endif #ifndef MAX_AIO_QUEUE_PER_PROC #define MAX_AIO_QUEUE_PER_PROC 256 #endif #ifndef MAX_AIO_QUEUE #define MAX_AIO_QUEUE 1024 /* Bigger than MAX_AIO_QUEUE_PER_PROC */ #endif #ifndef MAX_BUF_AIO #define MAX_BUF_AIO 16 #endif FEATURE(aio, "Asynchronous I/O"); SYSCTL_DECL(_p1003_1b); static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list"); static MALLOC_DEFINE(M_AIOS, "aios", "aio_suspend aio control block list"); static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Async IO management"); static int enable_aio_unsafe = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0, "Permit asynchronous IO on all file types, not just known-safe types"); static unsigned int unsafe_warningcnt = 1; SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW, &unsafe_warningcnt, 0, "Warnings that will be triggered upon failed IO requests on unsafe files"); static int max_aio_procs = MAX_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0, "Maximum number of kernel processes to use for handling async IO "); static int num_aio_procs = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0, "Number of presently active kernel processes for async IO"); /* * The code will adjust the actual number of AIO processes towards this * number when it gets a chance. */ static int target_aio_procs = TARGET_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, 0, "Preferred number of ready kernel processes for async IO"); static int max_queue_count = MAX_AIO_QUEUE; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, "Maximum number of aio requests to queue, globally"); static int num_queue_count = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, "Number of queued aio requests"); static int num_buf_aio = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0, "Number of aio requests presently handled by the buf subsystem"); static int num_unmapped_aio = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_unmapped_aio, CTLFLAG_RD, &num_unmapped_aio, 0, "Number of aio requests presently handled by unmapped I/O buffers"); /* Number of async I/O processes in the process of being started */ /* XXX This should be local to aio_aqueue() */ static int num_aio_resv_start = 0; static int aiod_lifetime; SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, "Maximum lifetime for idle aiod"); static int max_aio_per_proc = MAX_AIO_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, 0, "Maximum active aio requests per process"); static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, &max_aio_queue_per_proc, 0, "Maximum queued aio requests per process"); static int max_buf_aio = MAX_BUF_AIO; SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, "Maximum buf aio requests per process"); /* * Though redundant with vfs.aio.max_aio_queue_per_proc, POSIX requires * sysconf(3) to support AIO_LISTIO_MAX, and we implement that with * vfs.aio.aio_listio_max. */ SYSCTL_INT(_p1003_1b, CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max, CTLFLAG_RD | CTLFLAG_CAPRD, &max_aio_queue_per_proc, 0, "Maximum aio requests for a single lio_listio call"); #ifdef COMPAT_FREEBSD6 typedef struct oaiocb { int aio_fildes; /* File descriptor */ off_t aio_offset; /* File offset for I/O */ volatile void *aio_buf; /* I/O buffer in process space */ size_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private _aiocb_private; } oaiocb_t; #endif /* * Below is a key of locks used to protect each member of struct kaiocb * aioliojob and kaioinfo and any backends. * * * - need not protected * a - locked by kaioinfo lock * b - locked by backend lock, the backend lock can be null in some cases, * for example, BIO belongs to this type, in this case, proc lock is * reused. * c - locked by aio_job_mtx, the lock for the generic file I/O backend. */ /* * If the routine that services an AIO request blocks while running in an * AIO kernel process it can starve other I/O requests. BIO requests * queued via aio_qbio() complete asynchronously and do not use AIO kernel * processes at all. Socket I/O requests use a separate pool of * kprocs and also force non-blocking I/O. Other file I/O requests * use the generic fo_read/fo_write operations which can block. The * fsync and mlock operations can also block while executing. Ideally * none of these requests would block while executing. * * Note that the service routines cannot toggle O_NONBLOCK in the file * structure directly while handling a request due to races with * userland threads. */ /* jobflags */ #define KAIOCB_QUEUEING 0x01 #define KAIOCB_CANCELLED 0x02 #define KAIOCB_CANCELLING 0x04 #define KAIOCB_CHECKSYNC 0x08 #define KAIOCB_CLEARED 0x10 #define KAIOCB_FINISHED 0x20 /* * AIO process info */ #define AIOP_FREE 0x1 /* proc on free queue */ struct aioproc { int aioprocflags; /* (c) AIO proc flags */ TAILQ_ENTRY(aioproc) list; /* (c) list of processes */ struct proc *aioproc; /* (*) the AIO proc */ }; /* * data-structure for lio signal management */ struct aioliojob { int lioj_flags; /* (a) listio flags */ int lioj_count; /* (a) count of jobs */ int lioj_finished_count; /* (a) count of finished jobs */ struct sigevent lioj_signal; /* (a) signal on all I/O done */ TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */ struct knlist klist; /* (a) list of knotes */ ksiginfo_t lioj_ksi; /* (a) Realtime signal info */ }; #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ #define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */ /* * per process aio data structure */ struct kaioinfo { struct mtx kaio_mtx; /* the lock to protect this struct */ int kaio_flags; /* (a) per process kaio flags */ int kaio_active_count; /* (c) number of currently used AIOs */ int kaio_count; /* (a) size of AIO queue */ int kaio_buffer_count; /* (a) number of bio buffers */ TAILQ_HEAD(,kaiocb) kaio_all; /* (a) all AIOs in a process */ TAILQ_HEAD(,kaiocb) kaio_done; /* (a) done queue for process */ TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */ TAILQ_HEAD(,kaiocb) kaio_jobqueue; /* (a) job queue for process */ TAILQ_HEAD(,kaiocb) kaio_syncqueue; /* (a) queue for aio_fsync */ TAILQ_HEAD(,kaiocb) kaio_syncready; /* (a) second q for aio_fsync */ struct task kaio_task; /* (*) task to kick aio processes */ struct task kaio_sync_task; /* (*) task to schedule fsync jobs */ }; #define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx) #define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx) #define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f)) #define AIO_MTX(ki) (&(ki)->kaio_mtx) #define KAIO_RUNDOWN 0x1 /* process is being run down */ #define KAIO_WAKEUP 0x2 /* wakeup process when AIO completes */ /* * Operations used to interact with userland aio control blocks. * Different ABIs provide their own operations. */ struct aiocb_ops { int (*aio_copyin)(struct aiocb *ujob, struct kaiocb *kjob, int ty); long (*fetch_status)(struct aiocb *ujob); long (*fetch_error)(struct aiocb *ujob); int (*store_status)(struct aiocb *ujob, long status); int (*store_error)(struct aiocb *ujob, long error); int (*store_kernelinfo)(struct aiocb *ujob, long jobref); int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob); }; static TAILQ_HEAD(,aioproc) aio_freeproc; /* (c) Idle daemons */ static struct sema aio_newproc_sem; static struct mtx aio_job_mtx; static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */ static struct unrhdr *aiod_unr; static void aio_biocleanup(struct bio *bp); void aio_init_aioinfo(struct proc *p); static int aio_onceonly(void); static int aio_free_entry(struct kaiocb *job); static void aio_process_rw(struct kaiocb *job); static void aio_process_sync(struct kaiocb *job); static void aio_process_mlock(struct kaiocb *job); static void aio_schedule_fsync(void *context, int pending); static int aio_newproc(int *); int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lio, int type, struct aiocb_ops *ops); static int aio_queue_file(struct file *fp, struct kaiocb *job); static void aio_biowakeup(struct bio *bp); static void aio_proc_rundown(void *arg, struct proc *p); static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp); static int aio_qbio(struct proc *p, struct kaiocb *job); static void aio_daemon(void *param); static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job); static bool aio_clear_cancel_function_locked(struct kaiocb *job); static int aio_kick(struct proc *userp); static void aio_kick_nowait(struct proc *userp); static void aio_kick_helper(void *context, int pending); static int filt_aioattach(struct knote *kn); static void filt_aiodetach(struct knote *kn); static int filt_aio(struct knote *kn, long hint); static int filt_lioattach(struct knote *kn); static void filt_liodetach(struct knote *kn); static int filt_lio(struct knote *kn, long hint); /* * Zones for: * kaio Per process async io info * aiop async io process data * aiocb async io jobs * aiolio list io jobs */ static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiolio_zone; /* kqueue filters for aio */ static struct filterops aio_filtops = { .f_isfd = 0, .f_attach = filt_aioattach, .f_detach = filt_aiodetach, .f_event = filt_aio, }; static struct filterops lio_filtops = { .f_isfd = 0, .f_attach = filt_lioattach, .f_detach = filt_liodetach, .f_event = filt_lio }; static eventhandler_tag exit_tag, exec_tag; TASKQUEUE_DEFINE_THREAD(aiod_kick); /* * Main operations function for use as a kernel module. */ static int aio_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: aio_onceonly(); break; case MOD_SHUTDOWN: break; default: error = EOPNOTSUPP; break; } return (error); } static moduledata_t aio_mod = { "aio", &aio_modload, NULL }; DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY); MODULE_VERSION(aio, 1); /* * Startup initialization */ static int aio_onceonly(void) { exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL, EVENTHANDLER_PRI_ANY); exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL, EVENTHANDLER_PRI_ANY); kqueue_add_filteropts(EVFILT_AIO, &aio_filtops); kqueue_add_filteropts(EVFILT_LIO, &lio_filtops); TAILQ_INIT(&aio_freeproc); sema_init(&aio_newproc_sem, 0, "aio_new_proc"); mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF); TAILQ_INIT(&aio_jobs); aiod_unr = new_unrhdr(1, INT_MAX, NULL); kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiod_lifetime = AIOD_LIFETIME_DEFAULT; jobrefid = 1; p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO); p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE); p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0); return (0); } /* * Init the per-process aioinfo structure. The aioinfo limits are set * per-process for user limit (resource) management. */ void aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; ki = uma_zalloc(kaio_zone, M_WAITOK); mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW); ki->kaio_flags = 0; ki->kaio_active_count = 0; ki->kaio_count = 0; ki->kaio_buffer_count = 0; TAILQ_INIT(&ki->kaio_all); TAILQ_INIT(&ki->kaio_done); TAILQ_INIT(&ki->kaio_jobqueue); TAILQ_INIT(&ki->kaio_liojoblist); TAILQ_INIT(&ki->kaio_syncqueue); TAILQ_INIT(&ki->kaio_syncready); TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p); TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki); PROC_LOCK(p); if (p->p_aioinfo == NULL) { p->p_aioinfo = ki; PROC_UNLOCK(p); } else { PROC_UNLOCK(p); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); } while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) aio_newproc(NULL); } static int aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi, bool ext) { struct thread *td; int error; error = sigev_findtd(p, sigev, &td); if (error) return (error); if (!KSI_ONQ(ksi)) { ksiginfo_set_sigev(ksi, sigev); ksi->ksi_code = SI_ASYNCIO; ksi->ksi_flags |= ext ? (KSI_EXT | KSI_INS) : 0; tdsendsignal(p, td, ksi->ksi_signo, ksi); } PROC_UNLOCK(p); return (error); } /* * Free a job entry. Wait for completion if it is currently active, but don't * delay forever. If we delay, we return a flag that says that we have to * restart the queue scan. */ static int aio_free_entry(struct kaiocb *job) { struct kaioinfo *ki; struct aioliojob *lj; struct proc *p; p = job->userproc; MPASS(curproc == p); ki = p->p_aioinfo; MPASS(ki != NULL); AIO_LOCK_ASSERT(ki, MA_OWNED); MPASS(job->jobflags & KAIOCB_FINISHED); atomic_subtract_int(&num_queue_count, 1); ki->kaio_count--; MPASS(ki->kaio_count >= 0); TAILQ_REMOVE(&ki->kaio_done, job, plist); TAILQ_REMOVE(&ki->kaio_all, job, allist); lj = job->lio; if (lj) { lj->lioj_count--; lj->lioj_finished_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); /* lio is going away, we need to destroy any knotes */ knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } } /* job is going away, we need to destroy any knotes */ knlist_delete(&job->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&job->ksi); PROC_UNLOCK(p); AIO_UNLOCK(ki); /* * The thread argument here is used to find the owning process * and is also passed to fo_close() which may pass it to various * places such as devsw close() routines. Because of that, we * need a thread pointer from the process owning the job that is * persistent and won't disappear out from under us or move to * another process. * * Currently, all the callers of this function call it to remove * a kaiocb from the current process' job list either via a * syscall or due to the current process calling exit() or * execve(). Thus, we know that p == curproc. We also know that * curthread can't exit since we are curthread. * * Therefore, we use curthread as the thread to pass to * knlist_delete(). This does mean that it is possible for the * thread pointer at close time to differ from the thread pointer * at open time, but this is already true of file descriptors in * a multithreaded process. */ if (job->fd_file) fdrop(job->fd_file, curthread); crfree(job->cred); if (job->uiop != &job->uio) free(job->uiop, M_IOV); uma_zfree(aiocb_zone, job); AIO_LOCK(ki); return (0); } static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused) { aio_proc_rundown(arg, p); } static int aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job) { aio_cancel_fn_t *func; int cancelled; AIO_LOCK_ASSERT(ki, MA_OWNED); if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED)) return (0); MPASS((job->jobflags & KAIOCB_CANCELLING) == 0); job->jobflags |= KAIOCB_CANCELLED; func = job->cancel_fn; /* * If there is no cancel routine, just leave the job marked as * cancelled. The job should be in active use by a caller who * should complete it normally or when it fails to install a * cancel routine. */ if (func == NULL) return (0); /* * Set the CANCELLING flag so that aio_complete() will defer * completions of this job. This prevents the job from being * freed out from under the cancel callback. After the * callback any deferred completion (whether from the callback * or any other source) will be completed. */ job->jobflags |= KAIOCB_CANCELLING; AIO_UNLOCK(ki); func(job); AIO_LOCK(ki); job->jobflags &= ~KAIOCB_CANCELLING; if (job->jobflags & KAIOCB_FINISHED) { cancelled = job->uaiocb._aiocb_private.error == ECANCELED; TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); aio_bio_done_notify(p, job); } else { /* * The cancel callback might have scheduled an * operation to cancel this request, but it is * only counted as cancelled if the request is * cancelled when the callback returns. */ cancelled = 0; } return (cancelled); } /* * Rundown the jobs for a given process. */ static void aio_proc_rundown(void *arg, struct proc *p) { struct kaioinfo *ki; struct aioliojob *lj; struct kaiocb *job, *jobn; KASSERT(curthread->td_proc == p, ("%s: called on non-curproc", __func__)); ki = p->p_aioinfo; if (ki == NULL) return; AIO_LOCK(ki); ki->kaio_flags |= KAIO_RUNDOWN; restart: /* * Try to cancel all pending requests. This code simulates * aio_cancel on all pending I/O requests. */ TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { aio_cancel_job(p, ki, job); } /* Wait for all running I/O to be finished */ if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) { ki->kaio_flags |= KAIO_WAKEUP; msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz); goto restart; } /* Free all completed I/O requests. */ while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL) aio_free_entry(job); while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) { if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } else { panic("LIO job not cleaned up: C:%d, FC:%d\n", lj->lioj_count, lj->lioj_finished_count); } } AIO_UNLOCK(ki); taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task); taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); p->p_aioinfo = NULL; } /* * Select a job to run (called by an AIO daemon). */ static struct kaiocb * aio_selectjob(struct aioproc *aiop) { struct kaiocb *job; struct kaioinfo *ki; struct proc *userp; mtx_assert(&aio_job_mtx, MA_OWNED); restart: TAILQ_FOREACH(job, &aio_jobs, list) { userp = job->userproc; ki = userp->p_aioinfo; if (ki->kaio_active_count < max_aio_per_proc) { TAILQ_REMOVE(&aio_jobs, job, list); if (!aio_clear_cancel_function(job)) goto restart; /* Account for currently active jobs. */ ki->kaio_active_count++; break; } } return (job); } /* * Move all data to a permanent storage device. This code * simulates the fsync and fdatasync syscalls. */ static int aio_fsync_vnode(struct thread *td, struct vnode *vp, int op) { struct mount *mp; vm_object_t obj; int error; for (;;) { error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) break; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); obj = vp->v_object; if (obj != NULL) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, 0); VM_OBJECT_WUNLOCK(obj); } if (op == LIO_DSYNC) error = VOP_FDATASYNC(vp, td); else error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp); vn_finished_write(mp); if (error != ERELOOKUP) break; } return (error); } /* * The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that * does the I/O request for the non-bio version of the operations. The normal * vn operations are used, and this code should work in all instances for every * type of file, including pipes, sockets, fifos, and regular files. * * XXX I don't think it works well for socket, pipe, and fifo. */ static void aio_process_rw(struct kaiocb *job) { struct ucred *td_savedcred; struct thread *td; struct aiocb *cb; struct file *fp; ssize_t cnt; long msgsnd_st, msgsnd_end; long msgrcv_st, msgrcv_end; long oublock_st, oublock_end; long inblock_st, inblock_end; int error, opcode; KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ || job->uaiocb.aio_lio_opcode == LIO_READV || job->uaiocb.aio_lio_opcode == LIO_WRITE || job->uaiocb.aio_lio_opcode == LIO_WRITEV, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); aio_switch_vmspace(job); td = curthread; td_savedcred = td->td_ucred; td->td_ucred = job->cred; job->uiop->uio_td = td; cb = &job->uaiocb; fp = job->fd_file; opcode = job->uaiocb.aio_lio_opcode; cnt = job->uiop->uio_resid; msgrcv_st = td->td_ru.ru_msgrcv; msgsnd_st = td->td_ru.ru_msgsnd; inblock_st = td->td_ru.ru_inblock; oublock_st = td->td_ru.ru_oublock; /* * aio_aqueue() acquires a reference to the file that is * released in aio_free_entry(). */ if (opcode == LIO_READ || opcode == LIO_READV) { if (job->uiop->uio_resid == 0) error = 0; else error = fo_read(fp, job->uiop, fp->f_cred, FOF_OFFSET, td); } else { if (fp->f_type == DTYPE_VNODE) bwillwrite(); error = fo_write(fp, job->uiop, fp->f_cred, FOF_OFFSET, td); } msgrcv_end = td->td_ru.ru_msgrcv; msgsnd_end = td->td_ru.ru_msgsnd; inblock_end = td->td_ru.ru_inblock; oublock_end = td->td_ru.ru_oublock; job->msgrcv = msgrcv_end - msgrcv_st; job->msgsnd = msgsnd_end - msgsnd_st; job->inblock = inblock_end - inblock_st; job->outblock = oublock_end - oublock_st; if (error != 0 && job->uiop->uio_resid != cnt) { if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) error = 0; if (error == EPIPE && (opcode & LIO_WRITE)) { PROC_LOCK(job->userproc); kern_psignal(job->userproc, SIGPIPE); PROC_UNLOCK(job->userproc); } } cnt -= job->uiop->uio_resid; td->td_ucred = td_savedcred; if (error) aio_complete(job, -1, error); else aio_complete(job, cnt, 0); } static void aio_process_sync(struct kaiocb *job) { struct thread *td = curthread; struct ucred *td_savedcred = td->td_ucred; struct file *fp = job->fd_file; int error = 0; KASSERT(job->uaiocb.aio_lio_opcode & LIO_SYNC, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); td->td_ucred = job->cred; if (fp->f_vnode != NULL) { error = aio_fsync_vnode(td, fp->f_vnode, job->uaiocb.aio_lio_opcode); } td->td_ucred = td_savedcred; if (error) aio_complete(job, -1, error); else aio_complete(job, 0, 0); } static void aio_process_mlock(struct kaiocb *job) { struct aiocb *cb = &job->uaiocb; int error; KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); aio_switch_vmspace(job); error = kern_mlock(job->userproc, job->cred, __DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes); aio_complete(job, error != 0 ? -1 : 0, error); } static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job) { struct aioliojob *lj; struct kaioinfo *ki; struct kaiocb *sjob, *sjobn; int lj_done; bool schedule_fsync; ki = userp->p_aioinfo; AIO_LOCK_ASSERT(ki, MA_OWNED); lj = job->lio; lj_done = 0; if (lj) { lj->lioj_finished_count++; if (lj->lioj_count == lj->lioj_finished_count) lj_done = 1; } TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist); MPASS(job->jobflags & KAIOCB_FINISHED); if (ki->kaio_flags & KAIO_RUNDOWN) goto notification_done; if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi, true); KNOTE_LOCKED(&job->klist, 1); if (lj_done) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL | LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi, true); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } notification_done: if (job->jobflags & KAIOCB_CHECKSYNC) { schedule_fsync = false; TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) { if (job->fd_file != sjob->fd_file || job->seqno >= sjob->seqno) continue; if (--sjob->pending > 0) continue; TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list); if (!aio_clear_cancel_function_locked(sjob)) continue; TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list); schedule_fsync = true; } if (schedule_fsync) taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_sync_task); } if (ki->kaio_flags & KAIO_WAKEUP) { ki->kaio_flags &= ~KAIO_WAKEUP; wakeup(&userp->p_aioinfo); } } static void aio_schedule_fsync(void *context, int pending) { struct kaioinfo *ki; struct kaiocb *job; ki = context; AIO_LOCK(ki); while (!TAILQ_EMPTY(&ki->kaio_syncready)) { job = TAILQ_FIRST(&ki->kaio_syncready); TAILQ_REMOVE(&ki->kaio_syncready, job, list); AIO_UNLOCK(ki); aio_schedule(job, aio_process_sync); AIO_LOCK(ki); } AIO_UNLOCK(ki); } bool aio_cancel_cleared(struct kaiocb *job) { /* * The caller should hold the same queue lock held when * aio_clear_cancel_function() was called and set this flag * ensuring this check sees an up-to-date value. However, * there is no way to assert that. */ return ((job->jobflags & KAIOCB_CLEARED) != 0); } static bool aio_clear_cancel_function_locked(struct kaiocb *job) { AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); MPASS(job->cancel_fn != NULL); if (job->jobflags & KAIOCB_CANCELLING) { job->jobflags |= KAIOCB_CLEARED; return (false); } job->cancel_fn = NULL; return (true); } bool aio_clear_cancel_function(struct kaiocb *job) { struct kaioinfo *ki; bool ret; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); ret = aio_clear_cancel_function_locked(job); AIO_UNLOCK(ki); return (ret); } static bool aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func) { AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); if (job->jobflags & KAIOCB_CANCELLED) return (false); job->cancel_fn = func; return (true); } bool aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func) { struct kaioinfo *ki; bool ret; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); ret = aio_set_cancel_function_locked(job, func); AIO_UNLOCK(ki); return (ret); } void aio_complete(struct kaiocb *job, long status, int error) { struct kaioinfo *ki; struct proc *userp; job->uaiocb._aiocb_private.error = error; job->uaiocb._aiocb_private.status = status; userp = job->userproc; ki = userp->p_aioinfo; AIO_LOCK(ki); KASSERT(!(job->jobflags & KAIOCB_FINISHED), ("duplicate aio_complete")); job->jobflags |= KAIOCB_FINISHED; if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) { TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); aio_bio_done_notify(userp, job); } AIO_UNLOCK(ki); } void aio_cancel(struct kaiocb *job) { aio_complete(job, -1, ECANCELED); } void aio_switch_vmspace(struct kaiocb *job) { vmspace_switch_aio(job->userproc->p_vmspace); } /* * The AIO daemon, most of the actual work is done in aio_process_*, * but the setup (and address space mgmt) is done in this routine. */ static void aio_daemon(void *_id) { struct kaiocb *job; struct aioproc *aiop; struct kaioinfo *ki; struct proc *p; struct vmspace *myvm; struct thread *td = curthread; int id = (intptr_t)_id; /* * Grab an extra reference on the daemon's vmspace so that it * doesn't get freed by jobs that switch to a different * vmspace. */ p = td->td_proc; myvm = vmspace_acquire_ref(p); KASSERT(p->p_textvp == NULL, ("kthread has a textvp")); /* * Allocate and ready the aio control info. There is one aiop structure * per daemon. */ aiop = uma_zalloc(aiop_zone, M_WAITOK); aiop->aioproc = p; aiop->aioprocflags = 0; /* * Wakeup parent process. (Parent sleeps to keep from blasting away * and creating too many daemons.) */ sema_post(&aio_newproc_sem); mtx_lock(&aio_job_mtx); for (;;) { /* * Take daemon off of free queue */ if (aiop->aioprocflags & AIOP_FREE) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; } /* * Check for jobs. */ while ((job = aio_selectjob(aiop)) != NULL) { mtx_unlock(&aio_job_mtx); ki = job->userproc->p_aioinfo; job->handle_fn(job); mtx_lock(&aio_job_mtx); /* Decrement the active job count. */ ki->kaio_active_count--; } /* * Disconnect from user address space. */ if (p->p_vmspace != myvm) { mtx_unlock(&aio_job_mtx); vmspace_switch_aio(myvm); mtx_lock(&aio_job_mtx); /* * We have to restart to avoid race, we only sleep if * no job can be selected. */ continue; } mtx_assert(&aio_job_mtx, MA_OWNED); TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); aiop->aioprocflags |= AIOP_FREE; /* * If daemon is inactive for a long time, allow it to exit, * thereby freeing resources. */ if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy", aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) && (aiop->aioprocflags & AIOP_FREE) && num_aio_procs > target_aio_procs) break; } TAILQ_REMOVE(&aio_freeproc, aiop, list); num_aio_procs--; mtx_unlock(&aio_job_mtx); uma_zfree(aiop_zone, aiop); free_unr(aiod_unr, id); vmspace_free(myvm); KASSERT(p->p_vmspace == myvm, ("AIOD: bad vmspace for exiting daemon")); KASSERT(refcount_load(&myvm->vm_refcnt) > 1, ("AIOD: bad vm refcnt for exiting daemon: %d", refcount_load(&myvm->vm_refcnt))); kproc_exit(0); } /* * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The * AIO daemon modifies its environment itself. */ static int aio_newproc(int *start) { int error; struct proc *p; int id; id = alloc_unr(aiod_unr); error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p, RFNOWAIT, 0, "aiod%d", id); if (error == 0) { /* * Wait until daemon is started. */ sema_wait(&aio_newproc_sem); mtx_lock(&aio_job_mtx); num_aio_procs++; if (start != NULL) (*start)--; mtx_unlock(&aio_job_mtx); } else { free_unr(aiod_unr, id); } return (error); } /* * Try the high-performance, low-overhead bio method for eligible * VCHR devices. This method doesn't use an aio helper thread, and * thus has very low overhead. * * Assumes that the caller, aio_aqueue(), has incremented the file * structure's reference count, preventing its deallocation for the * duration of this call. */ static int aio_qbio(struct proc *p, struct kaiocb *job) { struct aiocb *cb; struct file *fp; struct buf *pbuf; struct vnode *vp; struct cdevsw *csw; struct cdev *dev; struct kaioinfo *ki; struct bio **bios = NULL; off_t offset; int bio_cmd, error, i, iovcnt, opcode, poff, ref; vm_prot_t prot; bool use_unmapped; cb = &job->uaiocb; fp = job->fd_file; opcode = cb->aio_lio_opcode; if (!(opcode == LIO_WRITE || opcode == LIO_WRITEV || opcode == LIO_READ || opcode == LIO_READV)) return (-1); if (fp == NULL || fp->f_type != DTYPE_VNODE) return (-1); vp = fp->f_vnode; if (vp->v_type != VCHR) return (-1); if (vp->v_bufobj.bo_bsize == 0) return (-1); bio_cmd = (opcode & LIO_WRITE) ? BIO_WRITE : BIO_READ; iovcnt = job->uiop->uio_iovcnt; if (iovcnt > max_buf_aio) return (-1); for (i = 0; i < iovcnt; i++) { if (job->uiop->uio_iov[i].iov_len % vp->v_bufobj.bo_bsize != 0) return (-1); if (job->uiop->uio_iov[i].iov_len > maxphys) { error = -1; return (-1); } } offset = cb->aio_offset; ref = 0; csw = devvn_refthread(vp, &dev, &ref); if (csw == NULL) return (ENXIO); if ((csw->d_flags & D_DISK) == 0) { error = -1; goto unref; } if (job->uiop->uio_resid > dev->si_iosize_max) { error = -1; goto unref; } ki = p->p_aioinfo; job->error = 0; use_unmapped = (dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed; if (!use_unmapped) { AIO_LOCK(ki); if (ki->kaio_buffer_count + iovcnt > max_buf_aio) { AIO_UNLOCK(ki); error = EAGAIN; goto unref; } ki->kaio_buffer_count += iovcnt; AIO_UNLOCK(ki); } bios = malloc(sizeof(struct bio *) * iovcnt, M_TEMP, M_WAITOK); atomic_store_int(&job->nbio, iovcnt); for (i = 0; i < iovcnt; i++) { struct vm_page** pages; struct bio *bp; void *buf; size_t nbytes; int npages; buf = job->uiop->uio_iov[i].iov_base; nbytes = job->uiop->uio_iov[i].iov_len; bios[i] = g_alloc_bio(); bp = bios[i]; poff = (vm_offset_t)buf & PAGE_MASK; if (use_unmapped) { pbuf = NULL; pages = malloc(sizeof(vm_page_t) * (atop(round_page( nbytes)) + 1), M_TEMP, M_WAITOK | M_ZERO); } else { pbuf = uma_zalloc(pbuf_zone, M_WAITOK); BUF_KERNPROC(pbuf); pages = pbuf->b_pages; } bp->bio_length = nbytes; bp->bio_bcount = nbytes; bp->bio_done = aio_biowakeup; bp->bio_offset = offset; bp->bio_cmd = bio_cmd; bp->bio_dev = dev; bp->bio_caller1 = job; bp->bio_caller2 = pbuf; prot = VM_PROT_READ; if (opcode == LIO_READ || opcode == LIO_READV) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)buf, bp->bio_length, prot, pages, atop(maxphys) + 1); if (npages < 0) { if (pbuf != NULL) uma_zfree(pbuf_zone, pbuf); else free(pages, M_TEMP); error = EFAULT; g_destroy_bio(bp); i--; goto destroy_bios; } if (pbuf != NULL) { pmap_qenter((vm_offset_t)pbuf->b_data, pages, npages); bp->bio_data = pbuf->b_data + poff; pbuf->b_npages = npages; atomic_add_int(&num_buf_aio, 1); } else { bp->bio_ma = pages; bp->bio_ma_n = npages; bp->bio_ma_offset = poff; bp->bio_data = unmapped_buf; bp->bio_flags |= BIO_UNMAPPED; atomic_add_int(&num_unmapped_aio, 1); } offset += nbytes; } /* Perform transfer. */ for (i = 0; i < iovcnt; i++) csw->d_strategy(bios[i]); free(bios, M_TEMP); dev_relthread(dev, ref); return (0); destroy_bios: for (; i >= 0; i--) aio_biocleanup(bios[i]); free(bios, M_TEMP); unref: dev_relthread(dev, ref); return (error); } #ifdef COMPAT_FREEBSD6 static int convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig) { /* * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are * supported by AIO with the old sigevent structure. */ nsig->sigev_notify = osig->sigev_notify; switch (nsig->sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: nsig->sigev_signo = osig->__sigev_u.__sigev_signo; break; case SIGEV_KEVENT: nsig->sigev_notify_kqueue = osig->__sigev_u.__sigev_notify_kqueue; nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr; break; default: return (EINVAL); } return (0); } static int aiocb_copyin_old_sigevent(struct aiocb *ujob, struct kaiocb *kjob, int type __unused) { struct oaiocb *ojob; struct aiocb *kcb = &kjob->uaiocb; int error; bzero(kcb, sizeof(struct aiocb)); error = copyin(ujob, kcb, sizeof(struct oaiocb)); if (error) return (error); /* No need to copyin aio_iov, because it did not exist in FreeBSD 6 */ ojob = (struct oaiocb *)kcb; return (convert_old_sigevent(&ojob->aio_sigevent, &kcb->aio_sigevent)); } #endif static int aiocb_copyin(struct aiocb *ujob, struct kaiocb *kjob, int type) { struct aiocb *kcb = &kjob->uaiocb; int error; error = copyin(ujob, kcb, sizeof(struct aiocb)); if (error) return (error); if (type & LIO_VECTORED) { /* malloc a uio and copy in the iovec */ error = copyinuio(__DEVOLATILE(struct iovec*, kcb->aio_iov), kcb->aio_iovcnt, &kjob->uiop); } return (error); } static long aiocb_fetch_status(struct aiocb *ujob) { return (fuword(&ujob->_aiocb_private.status)); } static long aiocb_fetch_error(struct aiocb *ujob) { return (fuword(&ujob->_aiocb_private.error)); } static int aiocb_store_status(struct aiocb *ujob, long status) { return (suword(&ujob->_aiocb_private.status, status)); } static int aiocb_store_error(struct aiocb *ujob, long error) { return (suword(&ujob->_aiocb_private.error, error)); } static int aiocb_store_kernelinfo(struct aiocb *ujob, long jobref) { return (suword(&ujob->_aiocb_private.kernelinfo, jobref)); } static int aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) { return (suword(ujobp, (long)ujob)); } static struct aiocb_ops aiocb_ops = { .aio_copyin = aiocb_copyin, .fetch_status = aiocb_fetch_status, .fetch_error = aiocb_fetch_error, .store_status = aiocb_store_status, .store_error = aiocb_store_error, .store_kernelinfo = aiocb_store_kernelinfo, .store_aiocb = aiocb_store_aiocb, }; #ifdef COMPAT_FREEBSD6 static struct aiocb_ops aiocb_ops_osigevent = { .aio_copyin = aiocb_copyin_old_sigevent, .fetch_status = aiocb_fetch_status, .fetch_error = aiocb_fetch_error, .store_status = aiocb_store_status, .store_error = aiocb_store_error, .store_kernelinfo = aiocb_store_kernelinfo, .store_aiocb = aiocb_store_aiocb, }; #endif /* * Queue a new AIO request. Choosing either the threaded or direct bio VCHR * technique is done in this code. */ int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj, int type, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct file *fp = NULL; struct kaiocb *job; struct kaioinfo *ki; struct kevent kev; int opcode; int error; int fd, kqfd; int jid; u_short evflags; if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; ops->store_status(ujob, -1); ops->store_error(ujob, 0); ops->store_kernelinfo(ujob, -1); if (num_queue_count >= max_queue_count || ki->kaio_count >= max_aio_queue_per_proc) { error = EAGAIN; goto err1; } job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO); knlist_init_mtx(&job->klist, AIO_MTX(ki)); error = ops->aio_copyin(ujob, job, type); if (error) goto err2; if (job->uaiocb.aio_nbytes > IOSIZE_MAX) { error = EINVAL; goto err2; } if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) { error = EINVAL; goto err2; } if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) { error = EINVAL; goto err2; } /* Get the opcode. */ if (type == LIO_NOP) { switch (job->uaiocb.aio_lio_opcode) { case LIO_WRITE: case LIO_NOP: case LIO_READ: opcode = job->uaiocb.aio_lio_opcode; break; default: error = EINVAL; goto err2; } } else opcode = job->uaiocb.aio_lio_opcode = type; ksiginfo_init(&job->ksi); /* Save userspace address of the job info. */ job->ujob = ujob; /* * Validate the opcode and fetch the file object for the specified * file descriptor. * * XXXRW: Moved the opcode validation up here so that we don't * retrieve a file descriptor without knowing what the capabiltity * should be. */ fd = job->uaiocb.aio_fildes; switch (opcode) { case LIO_WRITE: case LIO_WRITEV: error = fget_write(td, fd, &cap_pwrite_rights, &fp); break; case LIO_READ: case LIO_READV: error = fget_read(td, fd, &cap_pread_rights, &fp); break; case LIO_SYNC: case LIO_DSYNC: error = fget(td, fd, &cap_fsync_rights, &fp); break; case LIO_MLOCK: break; case LIO_NOP: error = fget(td, fd, &cap_no_rights, &fp); break; default: error = EINVAL; } if (error) goto err3; if ((opcode & LIO_SYNC) && fp->f_vnode == NULL) { error = EINVAL; goto err3; } if ((opcode == LIO_READ || opcode == LIO_READV || opcode == LIO_WRITE || opcode == LIO_WRITEV) && job->uaiocb.aio_offset < 0 && (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) { error = EINVAL; goto err3; } if (fp != NULL && fp->f_ops == &path_fileops) { error = EBADF; goto err3; } job->fd_file = fp; mtx_lock(&aio_job_mtx); jid = jobrefid++; job->seqno = jobseqno++; mtx_unlock(&aio_job_mtx); error = ops->store_kernelinfo(ujob, jid); if (error) { error = EINVAL; goto err3; } job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid; if (opcode == LIO_NOP) { fdrop(fp, td); MPASS(job->uiop == &job->uio || job->uiop == NULL); uma_zfree(aiocb_zone, job); return (0); } if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT) goto no_kqueue; evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags; if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) { error = EINVAL; goto err3; } kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue; memset(&kev, 0, sizeof(kev)); kev.ident = (uintptr_t)job->ujob; kev.filter = EVFILT_AIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags; kev.data = (intptr_t)job; kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr; error = kqfd_register(kqfd, &kev, td, M_WAITOK); if (error) goto err3; no_kqueue: ops->store_error(ujob, EINPROGRESS); job->uaiocb._aiocb_private.error = EINPROGRESS; job->userproc = p; job->cred = crhold(td->td_ucred); job->jobflags = KAIOCB_QUEUEING; job->lio = lj; if (opcode & LIO_VECTORED) { /* Use the uio copied in by aio_copyin */ MPASS(job->uiop != &job->uio && job->uiop != NULL); } else { /* Setup the inline uio */ job->iov[0].iov_base = (void *)(uintptr_t)job->uaiocb.aio_buf; job->iov[0].iov_len = job->uaiocb.aio_nbytes; job->uio.uio_iov = job->iov; job->uio.uio_iovcnt = 1; job->uio.uio_resid = job->uaiocb.aio_nbytes; job->uio.uio_segflg = UIO_USERSPACE; job->uiop = &job->uio; } switch (opcode & (LIO_READ | LIO_WRITE)) { case LIO_READ: job->uiop->uio_rw = UIO_READ; break; case LIO_WRITE: job->uiop->uio_rw = UIO_WRITE; break; } job->uiop->uio_offset = job->uaiocb.aio_offset; job->uiop->uio_td = td; if (opcode == LIO_MLOCK) { aio_schedule(job, aio_process_mlock); error = 0; } else if (fp->f_ops->fo_aio_queue == NULL) error = aio_queue_file(fp, job); else error = fo_aio_queue(fp, job); if (error) goto err4; AIO_LOCK(ki); job->jobflags &= ~KAIOCB_QUEUEING; TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist); ki->kaio_count++; if (lj) lj->lioj_count++; atomic_add_int(&num_queue_count, 1); if (job->jobflags & KAIOCB_FINISHED) { /* * The queue callback completed the request synchronously. * The bulk of the completion is deferred in that case * until this point. */ aio_bio_done_notify(p, job); } else TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist); AIO_UNLOCK(ki); return (0); err4: crfree(job->cred); err3: if (fp) fdrop(fp, td); knlist_delete(&job->klist, curthread, 0); err2: if (job->uiop != &job->uio) free(job->uiop, M_IOV); uma_zfree(aiocb_zone, job); err1: ops->store_error(ujob, error); return (error); } static void aio_cancel_daemon_job(struct kaiocb *job) { mtx_lock(&aio_job_mtx); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&aio_jobs, job, list); mtx_unlock(&aio_job_mtx); aio_cancel(job); } void aio_schedule(struct kaiocb *job, aio_handle_fn_t *func) { mtx_lock(&aio_job_mtx); if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) { mtx_unlock(&aio_job_mtx); aio_cancel(job); return; } job->handle_fn = func; TAILQ_INSERT_TAIL(&aio_jobs, job, list); aio_kick_nowait(job->userproc); mtx_unlock(&aio_job_mtx); } static void aio_cancel_sync(struct kaiocb *job) { struct kaioinfo *ki; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&ki->kaio_syncqueue, job, list); AIO_UNLOCK(ki); aio_cancel(job); } int aio_queue_file(struct file *fp, struct kaiocb *job) { struct kaioinfo *ki; struct kaiocb *job2; struct vnode *vp; struct mount *mp; int error; bool safe; ki = job->userproc->p_aioinfo; error = aio_qbio(job->userproc, job); if (error >= 0) return (error); safe = false; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VREG || vp->v_type == VDIR) { mp = fp->f_vnode->v_mount; if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0) safe = true; } } if (!(safe || enable_aio_unsafe)) { counted_warning(&unsafe_warningcnt, "is attempting to use unsafe AIO requests"); return (EOPNOTSUPP); } if (job->uaiocb.aio_lio_opcode & (LIO_WRITE | LIO_READ)) { aio_schedule(job, aio_process_rw); error = 0; } else if (job->uaiocb.aio_lio_opcode & LIO_SYNC) { AIO_LOCK(ki); TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) { if (job2->fd_file == job->fd_file && ((job2->uaiocb.aio_lio_opcode & LIO_SYNC) == 0) && job2->seqno < job->seqno) { job2->jobflags |= KAIOCB_CHECKSYNC; job->pending++; } } if (job->pending != 0) { if (!aio_set_cancel_function_locked(job, aio_cancel_sync)) { AIO_UNLOCK(ki); aio_cancel(job); return (0); } TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list); AIO_UNLOCK(ki); return (0); } AIO_UNLOCK(ki); aio_schedule(job, aio_process_sync); error = 0; } else { error = EINVAL; } return (error); } static void aio_kick_nowait(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aioproc *aiop; mtx_assert(&aio_job_mtx, MA_OWNED); if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; wakeup(aiop->aioproc); } else if (num_aio_resv_start + num_aio_procs < max_aio_procs && ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) { taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task); } } static int aio_kick(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aioproc *aiop; int error, ret = 0; mtx_assert(&aio_job_mtx, MA_OWNED); retryproc: if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; wakeup(aiop->aioproc); } else if (num_aio_resv_start + num_aio_procs < max_aio_procs && ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) { num_aio_resv_start++; mtx_unlock(&aio_job_mtx); error = aio_newproc(&num_aio_resv_start); mtx_lock(&aio_job_mtx); if (error) { num_aio_resv_start--; goto retryproc; } } else { ret = -1; } return (ret); } static void aio_kick_helper(void *context, int pending) { struct proc *userp = context; mtx_lock(&aio_job_mtx); while (--pending >= 0) { if (aio_kick(userp)) break; } mtx_unlock(&aio_job_mtx); } /* * Support the aio_return system call, as a side-effect, kernel resources are * released. */ static int kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct kaiocb *job; struct kaioinfo *ki; long status, error; ki = p->p_aioinfo; if (ki == NULL) return (EINVAL); AIO_LOCK(ki); TAILQ_FOREACH(job, &ki->kaio_done, plist) { if (job->ujob == ujob) break; } if (job != NULL) { MPASS(job->jobflags & KAIOCB_FINISHED); status = job->uaiocb._aiocb_private.status; error = job->uaiocb._aiocb_private.error; td->td_retval[0] = status; td->td_ru.ru_oublock += job->outblock; td->td_ru.ru_inblock += job->inblock; td->td_ru.ru_msgsnd += job->msgsnd; td->td_ru.ru_msgrcv += job->msgrcv; aio_free_entry(job); AIO_UNLOCK(ki); ops->store_error(ujob, error); ops->store_status(ujob, status); } else { error = EINVAL; AIO_UNLOCK(ki); } return (error); } int sys_aio_return(struct thread *td, struct aio_return_args *uap) { return (kern_aio_return(td, uap->aiocbp, &aiocb_ops)); } /* * Allow a process to wakeup when any of the I/O requests are completed. */ static int kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist, struct timespec *ts) { struct proc *p = td->td_proc; struct timeval atv; struct kaioinfo *ki; struct kaiocb *firstjob, *job; int error, i, timo; timo = 0; if (ts) { if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } ki = p->p_aioinfo; if (ki == NULL) return (EAGAIN); if (njoblist == 0) return (0); AIO_LOCK(ki); for (;;) { firstjob = NULL; error = 0; TAILQ_FOREACH(job, &ki->kaio_all, allist) { for (i = 0; i < njoblist; i++) { if (job->ujob == ujoblist[i]) { if (firstjob == NULL) firstjob = job; if (job->jobflags & KAIOCB_FINISHED) goto RETURN; } } } /* All tasks were finished. */ if (firstjob == NULL) break; ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", timo); if (error == ERESTART) error = EINTR; if (error) break; } RETURN: AIO_UNLOCK(ki); return (error); } int sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap) { struct timespec ts, *tsp; struct aiocb **ujoblist; int error; if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc) return (EINVAL); if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) return (error); tsp = &ts; } else tsp = NULL; ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK); error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0])); if (error == 0) error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); free(ujoblist, M_AIOS); return (error); } /* * aio_cancel cancels any non-bio aio operations not currently in progress. */ int sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap) { struct proc *p = td->td_proc; struct kaioinfo *ki; struct kaiocb *job, *jobn; struct file *fp; int error; int cancelled = 0; int notcancelled = 0; struct vnode *vp; /* Lookup file object. */ error = fget(td, uap->fd, &cap_no_rights, &fp); if (error) return (error); ki = p->p_aioinfo; if (ki == NULL) goto done; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vn_isdisk(vp)) { fdrop(fp, td); td->td_retval[0] = AIO_NOTCANCELED; return (0); } } AIO_LOCK(ki); TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { if ((uap->fd == job->uaiocb.aio_fildes) && ((uap->aiocbp == NULL) || (uap->aiocbp == job->ujob))) { if (aio_cancel_job(p, ki, job)) { cancelled++; } else { notcancelled++; } if (uap->aiocbp != NULL) break; } } AIO_UNLOCK(ki); done: fdrop(fp, td); if (uap->aiocbp != NULL) { if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } } if (notcancelled) { td->td_retval[0] = AIO_NOTCANCELED; return (0); } if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } td->td_retval[0] = AIO_ALLDONE; return (0); } /* * aio_error is implemented in the kernel level for compatibility purposes * only. For a user mode async implementation, it would be best to do it in * a userland subroutine. */ static int kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct kaiocb *job; struct kaioinfo *ki; int status; ki = p->p_aioinfo; if (ki == NULL) { td->td_retval[0] = EINVAL; return (0); } AIO_LOCK(ki); TAILQ_FOREACH(job, &ki->kaio_all, allist) { if (job->ujob == ujob) { if (job->jobflags & KAIOCB_FINISHED) td->td_retval[0] = job->uaiocb._aiocb_private.error; else td->td_retval[0] = EINPROGRESS; AIO_UNLOCK(ki); return (0); } } AIO_UNLOCK(ki); /* * Hack for failure of aio_aqueue. */ status = ops->fetch_status(ujob); if (status == -1) { td->td_retval[0] = ops->fetch_error(ujob); return (0); } td->td_retval[0] = EINVAL; return (0); } int sys_aio_error(struct thread *td, struct aio_error_args *uap) { return (kern_aio_error(td, uap->aiocbp, &aiocb_ops)); } /* syscall - asynchronous read from a file (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb_ops_osigevent)); } #endif int sys_aio_read(struct thread *td, struct aio_read_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops)); } int sys_aio_readv(struct thread *td, struct aio_readv_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READV, &aiocb_ops)); } /* syscall - asynchronous write to a file (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops_osigevent)); } #endif int sys_aio_write(struct thread *td, struct aio_write_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops)); } int sys_aio_writev(struct thread *td, struct aio_writev_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITEV, &aiocb_ops)); } int sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops)); } static int kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list, struct aiocb **acb_list, int nent, struct sigevent *sig, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct aiocb *job; struct kaioinfo *ki; struct aioliojob *lj; struct kevent kev; int error; int nagain, nerror; int i; if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT)) return (EINVAL); if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; lj = uma_zalloc(aiolio_zone, M_WAITOK); lj->lioj_flags = 0; lj->lioj_count = 0; lj->lioj_finished_count = 0; lj->lioj_signal.sigev_notify = SIGEV_NONE; knlist_init_mtx(&lj->klist, AIO_MTX(ki)); ksiginfo_init(&lj->lioj_ksi); /* * Setup signal. */ if (sig && (mode == LIO_NOWAIT)) { bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal)); if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { /* Assume only new style KEVENT */ memset(&kev, 0, sizeof(kev)); kev.filter = EVFILT_LIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; kev.ident = (uintptr_t)uacb_list; /* something unique */ kev.data = (intptr_t)lj; /* pass user defined sigval data */ kev.udata = lj->lioj_signal.sigev_value.sival_ptr; error = kqfd_register( lj->lioj_signal.sigev_notify_kqueue, &kev, td, M_WAITOK); if (error) { uma_zfree(aiolio_zone, lj); return (error); } } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) { ; } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) { if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { uma_zfree(aiolio_zone, lj); return EINVAL; } lj->lioj_flags |= LIOJ_SIGNAL; } else { uma_zfree(aiolio_zone, lj); return EINVAL; } } AIO_LOCK(ki); TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); /* * Add extra aiocb count to avoid the lio to be freed * by other threads doing aio_waitcomplete or aio_return, * and prevent event from being sent until we have queued * all tasks. */ lj->lioj_count = 1; AIO_UNLOCK(ki); /* * Get pointers to the list of I/O requests. */ nagain = 0; nerror = 0; for (i = 0; i < nent; i++) { job = acb_list[i]; if (job != NULL) { error = aio_aqueue(td, job, lj, LIO_NOP, ops); if (error == EAGAIN) nagain++; else if (error != 0) nerror++; } } error = 0; AIO_LOCK(ki); if (mode == LIO_WAIT) { while (lj->lioj_count - 1 != lj->lioj_finished_count) { ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", 0); if (error == ERESTART) error = EINTR; if (error) break; } } else { if (lj->lioj_count - 1 == lj->lioj_finished_count) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL | LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(p, &lj->lioj_signal, &lj->lioj_ksi, lj->lioj_count != 1); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } } lj->lioj_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); AIO_UNLOCK(ki); uma_zfree(aiolio_zone, lj); } else AIO_UNLOCK(ki); if (nerror) return (EIO); else if (nagain) return (EAGAIN); else return (error); } /* syscall - list directed I/O (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct osigevent osig; int error, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &osig, sizeof(osig)); if (error) return (error); error = convert_old_sigevent(&osig, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); if (error == 0) error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb_ops_osigevent); free(acb_list, M_LIO); return (error); } #endif /* syscall - list directed I/O (REALTIME) */ int sys_lio_listio(struct thread *td, struct lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; int error, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &sig, sizeof(sig)); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); if (error == 0) error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list, nent, sigp, &aiocb_ops); free(acb_list, M_LIO); return (error); } static void aio_biocleanup(struct bio *bp) { struct kaiocb *job = (struct kaiocb *)bp->bio_caller1; struct kaioinfo *ki; struct buf *pbuf = (struct buf *)bp->bio_caller2; /* Release mapping into kernel space. */ if (pbuf != NULL) { MPASS(pbuf->b_npages <= atop(maxphys) + 1); pmap_qremove((vm_offset_t)pbuf->b_data, pbuf->b_npages); vm_page_unhold_pages(pbuf->b_pages, pbuf->b_npages); uma_zfree(pbuf_zone, pbuf); atomic_subtract_int(&num_buf_aio, 1); ki = job->userproc->p_aioinfo; AIO_LOCK(ki); ki->kaio_buffer_count--; AIO_UNLOCK(ki); } else { MPASS(bp->bio_ma_n <= atop(maxphys) + 1); vm_page_unhold_pages(bp->bio_ma, bp->bio_ma_n); free(bp->bio_ma, M_TEMP); atomic_subtract_int(&num_unmapped_aio, 1); } g_destroy_bio(bp); } static void aio_biowakeup(struct bio *bp) { struct kaiocb *job = (struct kaiocb *)bp->bio_caller1; size_t nbytes; long bcount = bp->bio_bcount; long resid = bp->bio_resid; int error, opcode, nblks; int bio_error = bp->bio_error; uint16_t flags = bp->bio_flags; opcode = job->uaiocb.aio_lio_opcode; aio_biocleanup(bp); nbytes =bcount - resid; atomic_add_acq_long(&job->nbytes, nbytes); nblks = btodb(nbytes); error = 0; /* * If multiple bios experienced an error, the job will reflect the * error of whichever failed bio completed last. */ if (flags & BIO_ERROR) atomic_set_int(&job->error, bio_error); if (opcode & LIO_WRITE) atomic_add_int(&job->outblock, nblks); else atomic_add_int(&job->inblock, nblks); atomic_subtract_int(&job->nbio, 1); if (atomic_load_int(&job->nbio) == 0) { if (atomic_load_int(&job->error)) aio_complete(job, -1, job->error); else aio_complete(job, atomic_load_long(&job->nbytes), 0); } } /* syscall - wait for the next completion of an aio request */ static int kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp, struct timespec *ts, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct timeval atv; struct kaioinfo *ki; struct kaiocb *job; struct aiocb *ujob; long error, status; int timo; ops->store_aiocb(ujobp, NULL); if (ts == NULL) { timo = 0; } else if (ts->tv_sec == 0 && ts->tv_nsec == 0) { timo = -1; } else { if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000)) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; error = 0; job = NULL; AIO_LOCK(ki); while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) { if (timo == -1) { error = EWOULDBLOCK; break; } ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiowc", timo); if (timo && error == ERESTART) error = EINTR; if (error) break; } if (job != NULL) { MPASS(job->jobflags & KAIOCB_FINISHED); ujob = job->ujob; status = job->uaiocb._aiocb_private.status; error = job->uaiocb._aiocb_private.error; td->td_retval[0] = status; td->td_ru.ru_oublock += job->outblock; td->td_ru.ru_inblock += job->inblock; td->td_ru.ru_msgsnd += job->msgsnd; td->td_ru.ru_msgrcv += job->msgrcv; aio_free_entry(job); AIO_UNLOCK(ki); ops->store_aiocb(ujobp, ujob); ops->store_error(ujob, error); ops->store_status(ujob, status); } else AIO_UNLOCK(ki); return (error); } int sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) { struct timespec ts, *tsp; int error; if (uap->timeout) { /* Get timespec struct. */ error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops)); } static int kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob, struct aiocb_ops *ops) { int listop; switch (op) { case O_SYNC: listop = LIO_SYNC; break; case O_DSYNC: listop = LIO_DSYNC; break; default: return (EINVAL); } return (aio_aqueue(td, ujob, NULL, listop, ops)); } int sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap) { return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops)); } /* kqueue attach function */ static int filt_aioattach(struct knote *kn) { struct kaiocb *job; job = (struct kaiocb *)(uintptr_t)kn->kn_sdata; /* * The job pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_ptr.p_aio = job; kn->kn_flags &= ~EV_FLAG1; knlist_add(&job->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_aiodetach(struct knote *kn) { struct knlist *knl; knl = &kn->kn_ptr.p_aio->klist; knl->kl_lock(knl->kl_lockarg); if (!knlist_empty(knl)) knlist_remove(knl, kn, 1); knl->kl_unlock(knl->kl_lockarg); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_aio(struct knote *kn, long hint) { struct kaiocb *job = kn->kn_ptr.p_aio; kn->kn_data = job->uaiocb._aiocb_private.error; if (!(job->jobflags & KAIOCB_FINISHED)) return (0); kn->kn_flags |= EV_EOF; return (1); } /* kqueue attach function */ static int filt_lioattach(struct knote *kn) { struct aioliojob *lj; lj = (struct aioliojob *)(uintptr_t)kn->kn_sdata; /* * The aioliojob pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_ptr.p_lio = lj; kn->kn_flags &= ~EV_FLAG1; knlist_add(&lj->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_liodetach(struct knote *kn) { struct knlist *knl; knl = &kn->kn_ptr.p_lio->klist; knl->kl_lock(knl->kl_lockarg); if (!knlist_empty(knl)) knlist_remove(knl, kn, 1); knl->kl_unlock(knl->kl_lockarg); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_lio(struct knote *kn, long hint) { struct aioliojob * lj = kn->kn_ptr.p_lio; return (lj->lioj_flags & LIOJ_KEVENT_POSTED); } #ifdef COMPAT_FREEBSD32 #include #include +#include #include #include #include #include #include struct __aiocb_private32 { int32_t status; int32_t error; uint32_t kernelinfo; }; #ifdef COMPAT_FREEBSD6 typedef struct oaiocb32 { int aio_fildes; /* File descriptor */ uint64_t aio_offset __packed; /* File offset for I/O */ uint32_t aio_buf; /* I/O buffer in process space */ uint32_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent32 aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private32 _aiocb_private; } oaiocb32_t; #endif typedef struct aiocb32 { int32_t aio_fildes; /* File descriptor */ uint64_t aio_offset __packed; /* File offset for I/O */ uint32_t aio_buf; /* I/O buffer in process space */ uint32_t aio_nbytes; /* Number of bytes for I/O */ int __spare__[2]; uint32_t __spare2__; int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private32 _aiocb_private; struct sigevent32 aio_sigevent; /* Signal to deliver */ } aiocb32_t; #ifdef COMPAT_FREEBSD6 static int convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig) { /* * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are * supported by AIO with the old sigevent structure. */ CP(*osig, *nsig, sigev_notify); switch (nsig->sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: nsig->sigev_signo = osig->__sigev_u.__sigev_signo; break; case SIGEV_KEVENT: nsig->sigev_notify_kqueue = osig->__sigev_u.__sigev_notify_kqueue; PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr); break; default: return (EINVAL); } return (0); } static int aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct kaiocb *kjob, int type __unused) { struct oaiocb32 job32; struct aiocb *kcb = &kjob->uaiocb; int error; bzero(kcb, sizeof(struct aiocb)); error = copyin(ujob, &job32, sizeof(job32)); if (error) return (error); /* No need to copyin aio_iov, because it did not exist in FreeBSD 6 */ CP(job32, *kcb, aio_fildes); CP(job32, *kcb, aio_offset); PTRIN_CP(job32, *kcb, aio_buf); CP(job32, *kcb, aio_nbytes); CP(job32, *kcb, aio_lio_opcode); CP(job32, *kcb, aio_reqprio); CP(job32, *kcb, _aiocb_private.status); CP(job32, *kcb, _aiocb_private.error); PTRIN_CP(job32, *kcb, _aiocb_private.kernelinfo); return (convert_old_sigevent32(&job32.aio_sigevent, &kcb->aio_sigevent)); } #endif static int aiocb32_copyin(struct aiocb *ujob, struct kaiocb *kjob, int type) { struct aiocb32 job32; struct aiocb *kcb = &kjob->uaiocb; struct iovec32 *iov32; int error; error = copyin(ujob, &job32, sizeof(job32)); if (error) return (error); CP(job32, *kcb, aio_fildes); CP(job32, *kcb, aio_offset); CP(job32, *kcb, aio_lio_opcode); if (type & LIO_VECTORED) { iov32 = PTRIN(job32.aio_iov); CP(job32, *kcb, aio_iovcnt); /* malloc a uio and copy in the iovec */ error = freebsd32_copyinuio(iov32, kcb->aio_iovcnt, &kjob->uiop); if (error) return (error); } else { PTRIN_CP(job32, *kcb, aio_buf); CP(job32, *kcb, aio_nbytes); } CP(job32, *kcb, aio_reqprio); CP(job32, *kcb, _aiocb_private.status); CP(job32, *kcb, _aiocb_private.error); PTRIN_CP(job32, *kcb, _aiocb_private.kernelinfo); error = convert_sigevent32(&job32.aio_sigevent, &kcb->aio_sigevent); return (error); } static long aiocb32_fetch_status(struct aiocb *ujob) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (fuword32(&ujob32->_aiocb_private.status)); } static long aiocb32_fetch_error(struct aiocb *ujob) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (fuword32(&ujob32->_aiocb_private.error)); } static int aiocb32_store_status(struct aiocb *ujob, long status) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.status, status)); } static int aiocb32_store_error(struct aiocb *ujob, long error) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.error, error)); } static int aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref)); } static int aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) { return (suword32(ujobp, (long)ujob)); } static struct aiocb_ops aiocb32_ops = { .aio_copyin = aiocb32_copyin, .fetch_status = aiocb32_fetch_status, .fetch_error = aiocb32_fetch_error, .store_status = aiocb32_store_status, .store_error = aiocb32_store_error, .store_kernelinfo = aiocb32_store_kernelinfo, .store_aiocb = aiocb32_store_aiocb, }; #ifdef COMPAT_FREEBSD6 static struct aiocb_ops aiocb32_ops_osigevent = { .aio_copyin = aiocb32_copyin_old_sigevent, .fetch_status = aiocb32_fetch_status, .fetch_error = aiocb32_fetch_error, .store_status = aiocb32_store_status, .store_error = aiocb32_store_error, .store_kernelinfo = aiocb32_store_kernelinfo, .store_aiocb = aiocb32_store_aiocb, }; #endif int freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap) { return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } int freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap) { struct timespec32 ts32; struct timespec ts, *tsp; struct aiocb **ujoblist; uint32_t *ujoblist32; int error, i; if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc) return (EINVAL); if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0) return (error); CP(ts32, ts, tv_sec); CP(ts32, ts, tv_nsec); tsp = &ts; } else tsp = NULL; ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK); ujoblist32 = (uint32_t *)ujoblist; error = copyin(uap->aiocbp, ujoblist32, uap->nent * sizeof(ujoblist32[0])); if (error == 0) { for (i = uap->nent - 1; i >= 0; i--) ujoblist[i] = PTRIN(ujoblist32[i]); error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); } free(ujoblist, M_AIOS); return (error); } int freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap) { return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_aio_read(struct thread *td, struct freebsd6_freebsd32_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb32_ops_osigevent)); } #endif int freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb32_ops)); } int freebsd32_aio_readv(struct thread *td, struct freebsd32_aio_readv_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READV, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_aio_write(struct thread *td, struct freebsd6_freebsd32_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb32_ops_osigevent)); } #endif int freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb32_ops)); } int freebsd32_aio_writev(struct thread *td, struct freebsd32_aio_writev_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITEV, &aiocb32_ops)); } int freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK, &aiocb32_ops)); } int freebsd32_aio_waitcomplete(struct thread *td, struct freebsd32_aio_waitcomplete_args *uap) { struct timespec32 ts32; struct timespec ts, *tsp; int error; if (uap->timeout) { /* Get timespec struct. */ error = copyin(uap->timeout, &ts32, sizeof(ts32)); if (error) return (error); CP(ts32, ts, tv_sec); CP(ts32, ts, tv_nsec); tsp = &ts; } else tsp = NULL; return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp, &aiocb32_ops)); } int freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap) { return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_lio_listio(struct thread *td, struct freebsd6_freebsd32_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct osigevent32 osig; uint32_t *acb_list32; int error, i, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &osig, sizeof(osig)); if (error) return (error); error = convert_old_sigevent32(&osig, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); if (error) { free(acb_list32, M_LIO); return (error); } acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); for (i = 0; i < nent; i++) acb_list[i] = PTRIN(acb_list32[i]); free(acb_list32, M_LIO); error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb32_ops_osigevent); free(acb_list, M_LIO); return (error); } #endif int freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct sigevent32 sig32; uint32_t *acb_list32; int error, i, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &sig32, sizeof(sig32)); if (error) return (error); error = convert_sigevent32(&sig32, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); if (error) { free(acb_list32, M_LIO); return (error); } acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); for (i = 0; i < nent; i++) acb_list[i] = PTRIN(acb_list32[i]); free(acb_list32, M_LIO); error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb32_ops); free(acb_list, M_LIO); return (error); } #endif diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c index 44c9800ac493..4181502bdea8 100644 --- a/sys/kern/vfs_mount.c +++ b/sys/kern/vfs_mount.c @@ -1,2718 +1,2717 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #define VFS_MOUNTARG_SIZE_MAX (1024 * 64) static int vfs_domount(struct thread *td, const char *fstype, char *fspath, uint64_t fsflags, struct vfsoptlist **optlist); static void free_mntarg(struct mntarg *ma); static int usermount = 0; SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "Unprivileged users may mount and unmount file systems"); static bool default_autoro = false; SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0, "Retry failed r/w mount as r/o if no explicit ro/rw option is specified"); MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure"); static uma_zone_t mount_zone; /* List of mounted filesystems. */ struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* For any iteration/modification of mountlist */ struct mtx_padalign __exclusive_cache_line mountlist_mtx; MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF); EVENTHANDLER_LIST_DEFINE(vfs_mounted); EVENTHANDLER_LIST_DEFINE(vfs_unmounted); static void mount_devctl_event(const char *type, struct mount *mp, bool donew); /* * Global opts, taken by all filesystems */ static const char *global_opts[] = { "errmsg", "fstype", "fspath", "ro", "rw", "nosuid", "noexec", NULL }; static int mount_init(void *mem, int size, int flags) { struct mount *mp; mp = (struct mount *)mem; mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF); mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF); lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0); mp->mnt_pcpu = uma_zalloc_pcpu(pcpu_zone_16, M_WAITOK | M_ZERO); mp->mnt_ref = 0; mp->mnt_vfs_ops = 1; mp->mnt_rootvnode = NULL; return (0); } static void mount_fini(void *mem, int size) { struct mount *mp; mp = (struct mount *)mem; uma_zfree_pcpu(pcpu_zone_16, mp->mnt_pcpu); lockdestroy(&mp->mnt_explock); mtx_destroy(&mp->mnt_listmtx); mtx_destroy(&mp->mnt_mtx); } static void vfs_mount_init(void *dummy __unused) { mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL, NULL, mount_init, mount_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE); } SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL); /* * --------------------------------------------------------------------- * Functions for building and sanitizing the mount options */ /* Remove one mount option. */ static void vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt) { TAILQ_REMOVE(opts, opt, link); free(opt->name, M_MOUNT); if (opt->value != NULL) free(opt->value, M_MOUNT); free(opt, M_MOUNT); } /* Release all resources related to the mount options. */ void vfs_freeopts(struct vfsoptlist *opts) { struct vfsopt *opt; while (!TAILQ_EMPTY(opts)) { opt = TAILQ_FIRST(opts); vfs_freeopt(opts, opt); } free(opts, M_MOUNT); } void vfs_deleteopt(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt, *temp; if (opts == NULL) return; TAILQ_FOREACH_SAFE(opt, opts, link, temp) { if (strcmp(opt->name, name) == 0) vfs_freeopt(opts, opt); } } static int vfs_isopt_ro(const char *opt) { if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 || strcmp(opt, "norw") == 0) return (1); return (0); } static int vfs_isopt_rw(const char *opt) { if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0) return (1); return (0); } /* * Check if options are equal (with or without the "no" prefix). */ static int vfs_equalopts(const char *opt1, const char *opt2) { char *p; /* "opt" vs. "opt" or "noopt" vs. "noopt" */ if (strcmp(opt1, opt2) == 0) return (1); /* "noopt" vs. "opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "opt" vs. "noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); while ((p = strchr(opt1, '.')) != NULL && !strncmp(opt1, opt2, ++p - opt1)) { opt2 += p - opt1; opt1 = p; /* "foo.noopt" vs. "foo.opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "foo.opt" vs. "foo.noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); } /* "ro" / "rdonly" / "norw" / "rw" / "noro" */ if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) && (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2))) return (1); return (0); } /* * If a mount option is specified several times, * (with or without the "no" prefix) only keep * the last occurrence of it. */ static void vfs_sanitizeopts(struct vfsoptlist *opts) { struct vfsopt *opt, *opt2, *tmp; TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) { opt2 = TAILQ_PREV(opt, vfsoptlist, link); while (opt2 != NULL) { if (vfs_equalopts(opt->name, opt2->name)) { tmp = TAILQ_PREV(opt2, vfsoptlist, link); vfs_freeopt(opts, opt2); opt2 = tmp; } else { opt2 = TAILQ_PREV(opt2, vfsoptlist, link); } } } } /* * Build a linked list of mount options from a struct uio. */ int vfs_buildopts(struct uio *auio, struct vfsoptlist **options) { struct vfsoptlist *opts; struct vfsopt *opt; size_t memused, namelen, optlen; unsigned int i, iovcnt; int error; opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); memused = 0; iovcnt = auio->uio_iovcnt; for (i = 0; i < iovcnt; i += 2) { namelen = auio->uio_iov[i].iov_len; optlen = auio->uio_iov[i + 1].iov_len; memused += sizeof(struct vfsopt) + optlen + namelen; /* * Avoid consuming too much memory, and attempts to overflow * memused. */ if (memused > VFS_MOUNTARG_SIZE_MAX || optlen > VFS_MOUNTARG_SIZE_MAX || namelen > VFS_MOUNTARG_SIZE_MAX) { error = EINVAL; goto bad; } opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); opt->name = malloc(namelen, M_MOUNT, M_WAITOK); opt->value = NULL; opt->len = 0; opt->pos = i / 2; opt->seen = 0; /* * Do this early, so jumps to "bad" will free the current * option. */ TAILQ_INSERT_TAIL(opts, opt, link); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); } else { error = copyin(auio->uio_iov[i].iov_base, opt->name, namelen); if (error) goto bad; } /* Ensure names are null-terminated strings. */ if (namelen == 0 || opt->name[namelen - 1] != '\0') { error = EINVAL; goto bad; } if (optlen != 0) { opt->len = optlen; opt->value = malloc(optlen, M_MOUNT, M_WAITOK); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i + 1].iov_base, opt->value, optlen); } else { error = copyin(auio->uio_iov[i + 1].iov_base, opt->value, optlen); if (error) goto bad; } } } vfs_sanitizeopts(opts); *options = opts; return (0); bad: vfs_freeopts(opts); return (error); } /* * Merge the old mount options with the new ones passed * in the MNT_UPDATE case. * * XXX: This function will keep a "nofoo" option in the new * options. E.g, if the option's canonical name is "foo", * "nofoo" ends up in the mount point's active options. */ static void vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts) { struct vfsopt *opt, *new; TAILQ_FOREACH(opt, oldopts, link) { new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); new->name = strdup(opt->name, M_MOUNT); if (opt->len != 0) { new->value = malloc(opt->len, M_MOUNT, M_WAITOK); bcopy(opt->value, new->value, opt->len); } else new->value = NULL; new->len = opt->len; new->seen = opt->seen; TAILQ_INSERT_HEAD(toopts, new, link); } vfs_sanitizeopts(toopts); } /* * Mount a filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct nmount_args { struct iovec *iovp; unsigned int iovcnt; int flags; }; #endif int sys_nmount(struct thread *td, struct nmount_args *uap) { struct uio *auio; int error; u_int iovcnt; uint64_t flags; /* * Mount flags are now 64-bits. On 32-bit archtectures only * 32-bits are passed in, but from here on everything handles * 64-bit flags correctly. */ flags = uap->flags; AUDIT_ARG_FFLAGS(flags); CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__, uap->iovp, uap->iovcnt, flags); /* * Filter out MNT_ROOTFS. We do not want clients of nmount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set by the kernel when mounting its * root file system. */ flags &= ~MNT_ROOTFS; iovcnt = uap->iovcnt; /* * Check that we have an even number of iovec's * and that we have at least two options. */ if ((iovcnt & 1) || (iovcnt < 4)) { CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__, uap->iovcnt); return (EINVAL); } error = copyinuio(uap->iovp, iovcnt, &auio); if (error) { CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno", __func__, error); return (error); } error = vfs_donmount(td, flags, auio); free(auio, M_IOV); return (error); } /* * --------------------------------------------------------------------- * Various utility functions */ /* * Get a reference on a mount point from a vnode. * * The vnode is allowed to be passed unlocked and race against dooming. Note in * such case there are no guarantees the referenced mount point will still be * associated with it after the function returns. */ struct mount * vfs_ref_from_vp(struct vnode *vp) { struct mount *mp; struct mount_pcpu *mpcpu; mp = atomic_load_ptr(&vp->v_mount); if (__predict_false(mp == NULL)) { return (mp); } if (vfs_op_thread_enter(mp, mpcpu)) { if (__predict_true(mp == vp->v_mount)) { vfs_mp_count_add_pcpu(mpcpu, ref, 1); vfs_op_thread_exit(mp, mpcpu); } else { vfs_op_thread_exit(mp, mpcpu); mp = NULL; } } else { MNT_ILOCK(mp); if (mp == vp->v_mount) { MNT_REF(mp); MNT_IUNLOCK(mp); } else { MNT_IUNLOCK(mp); mp = NULL; } } return (mp); } void vfs_ref(struct mount *mp) { struct mount_pcpu *mpcpu; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp, mpcpu)) { vfs_mp_count_add_pcpu(mpcpu, ref, 1); vfs_op_thread_exit(mp, mpcpu); return; } MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); } void vfs_rel(struct mount *mp) { struct mount_pcpu *mpcpu; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp, mpcpu)) { vfs_mp_count_sub_pcpu(mpcpu, ref, 1); vfs_op_thread_exit(mp, mpcpu); return; } MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); } /* * Allocate and initialize the mount point struct. */ struct mount * vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, struct ucred *cred) { struct mount *mp; mp = uma_zalloc(mount_zone, M_WAITOK); bzero(&mp->mnt_startzero, __rangeof(struct mount, mnt_startzero, mnt_endzero)); mp->mnt_kern_flag = 0; mp->mnt_flag = 0; mp->mnt_rootvnode = NULL; mp->mnt_vnodecovered = NULL; mp->mnt_op = NULL; mp->mnt_vfc = NULL; TAILQ_INIT(&mp->mnt_nvnodelist); mp->mnt_nvnodelistsize = 0; TAILQ_INIT(&mp->mnt_lazyvnodelist); mp->mnt_lazyvnodelistsize = 0; if (mp->mnt_ref != 0 || mp->mnt_lockref != 0 || mp->mnt_writeopcount != 0) panic("%s: non-zero counters on new mp %p\n", __func__, mp); if (mp->mnt_vfs_ops != 1) panic("%s: vfs_ops should be 1 but %d found\n", __func__, mp->mnt_vfs_ops); (void) vfs_busy(mp, MBF_NOWAIT); atomic_add_acq_int(&vfsp->vfc_refcount, 1); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_gen++; strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_vnodecovered = vp; mp->mnt_cred = crdup(cred); mp->mnt_stat.f_owner = cred->cr_uid; strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); mp->mnt_iosize_max = DFLTPHYS; #ifdef MAC mac_mount_init(mp); mac_mount_create(cred, mp); #endif arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0); TAILQ_INIT(&mp->mnt_uppers); return (mp); } /* * Destroy the mount struct previously allocated by vfs_mount_alloc(). */ void vfs_mount_destroy(struct mount *mp) { if (mp->mnt_vfs_ops == 0) panic("%s: entered with zero vfs_ops\n", __func__); vfs_assert_mount_counters(mp); MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_REFEXPIRE; if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } while (mp->mnt_ref) msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0); KASSERT(mp->mnt_ref == 0, ("%s: invalid refcount in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); if (mp->mnt_writeopcount != 0) panic("vfs_mount_destroy: nonzero writeopcount"); if (mp->mnt_secondary_writes != 0) panic("vfs_mount_destroy: nonzero secondary_writes"); atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1); if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) { struct vnode *vp; TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) vn_printf(vp, "dangling vnode "); panic("unmount: dangling vnode"); } KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers")); if (mp->mnt_nvnodelistsize != 0) panic("vfs_mount_destroy: nonzero nvnodelistsize"); if (mp->mnt_lazyvnodelistsize != 0) panic("vfs_mount_destroy: nonzero lazyvnodelistsize"); if (mp->mnt_lockref != 0) panic("vfs_mount_destroy: nonzero lock refcount"); MNT_IUNLOCK(mp); if (mp->mnt_vfs_ops != 1) panic("%s: vfs_ops should be 1 but %d found\n", __func__, mp->mnt_vfs_ops); if (mp->mnt_rootvnode != NULL) panic("%s: mount point still has a root vnode %p\n", __func__, mp->mnt_rootvnode); if (mp->mnt_vnodecovered != NULL) vrele(mp->mnt_vnodecovered); #ifdef MAC mac_mount_destroy(mp); #endif if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); crfree(mp->mnt_cred); uma_zfree(mount_zone, mp); } static bool vfs_should_downgrade_to_ro_mount(uint64_t fsflags, int error) { /* This is an upgrade of an exisiting mount. */ if ((fsflags & MNT_UPDATE) != 0) return (false); /* This is already an R/O mount. */ if ((fsflags & MNT_RDONLY) != 0) return (false); switch (error) { case ENODEV: /* generic, geom, ... */ case EACCES: /* cam/scsi, ... */ case EROFS: /* md, mmcsd, ... */ /* * These errors can be returned by the storage layer to signal * that the media is read-only. No harm in the R/O mount * attempt if the error was returned for some other reason. */ return (true); default: return (false); } } int vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions) { struct vfsoptlist *optlist; struct vfsopt *opt, *tmp_opt; char *fstype, *fspath, *errmsg; int error, fstypelen, fspathlen, errmsg_len, errmsg_pos; bool autoro; errmsg = fspath = NULL; errmsg_len = fspathlen = 0; errmsg_pos = -1; autoro = default_autoro; error = vfs_buildopts(fsoptions, &optlist); if (error) return (error); if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0) errmsg_pos = vfs_getopt_pos(optlist, "errmsg"); /* * We need these two options before the others, * and they are mandatory for any filesystem. * Ensure they are NUL terminated as well. */ fstypelen = 0; error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); if (error || fstypelen <= 0 || fstype[fstypelen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fstype", errmsg_len); goto bail; } fspathlen = 0; error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); if (error || fspathlen <= 0 || fspath[fspathlen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fspath", errmsg_len); goto bail; } /* * We need to see if we have the "update" option * before we call vfs_domount(), since vfs_domount() has special * logic based on MNT_UPDATE. This is very important * when we want to update the root filesystem. */ TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) { int do_freeopt = 0; if (strcmp(opt->name, "update") == 0) { fsflags |= MNT_UPDATE; do_freeopt = 1; } else if (strcmp(opt->name, "async") == 0) fsflags |= MNT_ASYNC; else if (strcmp(opt->name, "force") == 0) { fsflags |= MNT_FORCE; do_freeopt = 1; } else if (strcmp(opt->name, "reload") == 0) { fsflags |= MNT_RELOAD; do_freeopt = 1; } else if (strcmp(opt->name, "multilabel") == 0) fsflags |= MNT_MULTILABEL; else if (strcmp(opt->name, "noasync") == 0) fsflags &= ~MNT_ASYNC; else if (strcmp(opt->name, "noatime") == 0) fsflags |= MNT_NOATIME; else if (strcmp(opt->name, "atime") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoatime", M_MOUNT); } else if (strcmp(opt->name, "noclusterr") == 0) fsflags |= MNT_NOCLUSTERR; else if (strcmp(opt->name, "clusterr") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterr", M_MOUNT); } else if (strcmp(opt->name, "noclusterw") == 0) fsflags |= MNT_NOCLUSTERW; else if (strcmp(opt->name, "clusterw") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterw", M_MOUNT); } else if (strcmp(opt->name, "noexec") == 0) fsflags |= MNT_NOEXEC; else if (strcmp(opt->name, "exec") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoexec", M_MOUNT); } else if (strcmp(opt->name, "nosuid") == 0) fsflags |= MNT_NOSUID; else if (strcmp(opt->name, "suid") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosuid", M_MOUNT); } else if (strcmp(opt->name, "nosymfollow") == 0) fsflags |= MNT_NOSYMFOLLOW; else if (strcmp(opt->name, "symfollow") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosymfollow", M_MOUNT); } else if (strcmp(opt->name, "noro") == 0) { fsflags &= ~MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "rw") == 0) { fsflags &= ~MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "ro") == 0) { fsflags |= MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "rdonly") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("ro", M_MOUNT); fsflags |= MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "autoro") == 0) { do_freeopt = 1; autoro = true; } else if (strcmp(opt->name, "suiddir") == 0) fsflags |= MNT_SUIDDIR; else if (strcmp(opt->name, "sync") == 0) fsflags |= MNT_SYNCHRONOUS; else if (strcmp(opt->name, "union") == 0) fsflags |= MNT_UNION; else if (strcmp(opt->name, "automounted") == 0) { fsflags |= MNT_AUTOMOUNTED; do_freeopt = 1; } else if (strcmp(opt->name, "nocover") == 0) { fsflags |= MNT_NOCOVER; do_freeopt = 1; } else if (strcmp(opt->name, "cover") == 0) { fsflags &= ~MNT_NOCOVER; do_freeopt = 1; } else if (strcmp(opt->name, "emptydir") == 0) { fsflags |= MNT_EMPTYDIR; do_freeopt = 1; } else if (strcmp(opt->name, "noemptydir") == 0) { fsflags &= ~MNT_EMPTYDIR; do_freeopt = 1; } if (do_freeopt) vfs_freeopt(optlist, opt); } /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) { error = ENAMETOOLONG; goto bail; } error = vfs_domount(td, fstype, fspath, fsflags, &optlist); if (error == ENOENT) { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fstype", errmsg_len); goto bail; } /* * See if we can mount in the read-only mode if the error code suggests * that it could be possible and the mount options allow for that. * Never try it if "[no]{ro|rw}" has been explicitly requested and not * overridden by "autoro". */ if (autoro && vfs_should_downgrade_to_ro_mount(fsflags, error)) { printf("%s: R/W mount failed, possibly R/O media," " trying R/O mount\n", __func__); fsflags |= MNT_RDONLY; error = vfs_domount(td, fstype, fspath, fsflags, &optlist); } bail: /* copyout the errmsg */ if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt) && errmsg_len > 0 && errmsg != NULL) { if (fsoptions->uio_segflg == UIO_SYSSPACE) { bcopy(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } else { copyout(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } } if (optlist != NULL) vfs_freeopts(optlist); return (error); } /* * Old mount API. */ #ifndef _SYS_SYSPROTO_H_ struct mount_args { char *type; char *path; int flags; caddr_t data; }; #endif /* ARGSUSED */ int sys_mount(struct thread *td, struct mount_args *uap) { char *fstype; struct vfsconf *vfsp = NULL; struct mntarg *ma = NULL; uint64_t flags; int error; /* * Mount flags are now 64-bits. On 32-bit architectures only * 32-bits are passed in, but from here on everything handles * 64-bit flags correctly. */ flags = uap->flags; AUDIT_ARG_FFLAGS(flags); /* * Filter out MNT_ROOTFS. We do not want clients of mount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set by the kernel when mounting its * root file system. */ flags &= ~MNT_ROOTFS; fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL); if (error) { free(fstype, M_TEMP); return (error); } AUDIT_ARG_TEXT(fstype); vfsp = vfs_byname_kld(fstype, td, &error); free(fstype, M_TEMP); if (vfsp == NULL) return (ENOENT); if (((vfsp->vfc_flags & VFCF_SBDRY) != 0 && vfsp->vfc_vfsops_sd->vfs_cmount == NULL) || ((vfsp->vfc_flags & VFCF_SBDRY) == 0 && vfsp->vfc_vfsops->vfs_cmount == NULL)) return (EOPNOTSUPP); ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN); ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN); ma = mount_argb(ma, flags & MNT_RDONLY, "noro"); ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid"); ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec"); if ((vfsp->vfc_flags & VFCF_SBDRY) != 0) return (vfsp->vfc_vfsops_sd->vfs_cmount(ma, uap->data, flags)); return (vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags)); } /* * vfs_domount_first(): first file system mount (not update) */ static int vfs_domount_first( struct thread *td, /* Calling thread. */ struct vfsconf *vfsp, /* File system type. */ char *fspath, /* Mount path. */ struct vnode *vp, /* Vnode to be covered. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct vattr va; struct mount *mp; struct vnode *newdp, *rootvp; int error, error1; bool unmounted; ASSERT_VOP_ELOCKED(vp, __func__); KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here")); /* * If the jail of the calling thread lacks permission for this type of * file system, or is trying to cover its own root, deny immediately. */ if (jailed(td->td_ucred) && (!prison_allow(td->td_ucred, vfsp->vfc_prison_flag) || vp == td->td_ucred->cr_prison->pr_root)) { vput(vp); return (EPERM); } /* * If the user is not root, ensure that they own the directory * onto which we are attempting to mount. */ error = VOP_GETATTR(vp, &va, td->td_ucred); if (error == 0 && va.va_uid != td->td_ucred->cr_uid) error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN); if (error == 0) error = vinvalbuf(vp, V_SAVE, 0, 0); if (error == 0 && vp->v_type != VDIR) error = ENOTDIR; if (error == 0 && (fsflags & MNT_EMPTYDIR) != 0) error = vfs_emptydir(vp); if (error == 0) { VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL) vp->v_iflag |= VI_MOUNT; else error = EBUSY; VI_UNLOCK(vp); } if (error != 0) { vput(vp); return (error); } vn_seqc_write_begin(vp); VOP_UNLOCK(vp); /* Allocate and initialize the filesystem. */ mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred); /* XXXMAC: pass to vfs_mount_alloc? */ mp->mnt_optnew = *optlist; /* Set the mount level flags. */ mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY)); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error1 = 0; unmounted = true; if ((error = VFS_MOUNT(mp)) != 0 || (error1 = VFS_STATFS(mp, &mp->mnt_stat)) != 0 || (error1 = VFS_ROOT(mp, LK_EXCLUSIVE, &newdp)) != 0) { rootvp = NULL; if (error1 != 0) { MPASS(error == 0); rootvp = vfs_cache_root_clear(mp); if (rootvp != NULL) { vhold(rootvp); vrele(rootvp); } (void)vn_start_write(NULL, &mp, V_WAIT); MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_UNMOUNTF; MNT_IUNLOCK(mp); VFS_PURGE(mp); error = VFS_UNMOUNT(mp, 0); vn_finished_write(mp); if (error != 0) { printf( "failed post-mount (%d): rollback unmount returned %d\n", error1, error); unmounted = false; } error = error1; } vfs_unbusy(mp); mp->mnt_vnodecovered = NULL; if (unmounted) { /* XXXKIB wait for mnt_lockref drain? */ vfs_mount_destroy(mp); } VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); if (rootvp != NULL) { vn_seqc_write_end(rootvp); vdrop(rootvp); } vn_seqc_write_end(vp); vrele(vp); return (error); } vn_seqc_write_begin(newdp); VOP_UNLOCK(newdp); if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; *optlist = NULL; /* * Prevent external consumers of mount options from reading mnt_optnew. */ mp->mnt_optnew = NULL; MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); cache_purge(vp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; vn_irflag_set_locked(vp, VIRF_MOUNTPOINT); vp->v_mountedhere = mp; VI_UNLOCK(vp); /* Place the new filesystem at the end of the mount list. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_event_signal(NULL, VQ_MOUNT, 0); vn_lock(newdp, LK_EXCLUSIVE | LK_RETRY); VOP_UNLOCK(vp); EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td); VOP_UNLOCK(newdp); mount_devctl_event("MOUNT", mp, false); mountcheckdirs(vp, newdp); vn_seqc_write_end(vp); vn_seqc_write_end(newdp); vrele(newdp); if ((mp->mnt_flag & MNT_RDONLY) == 0) vfs_allocate_syncvnode(mp); vfs_op_exit(mp); vfs_unbusy(mp); return (0); } /* * vfs_domount_update(): update of mounted file system */ static int vfs_domount_update( struct thread *td, /* Calling thread. */ struct vnode *vp, /* Mount point vnode. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct export_args export; struct o2export_args o2export; struct vnode *rootvp; void *bufp; struct mount *mp; int error, export_error, i, len; uint64_t flag; gid_t *grps; ASSERT_VOP_ELOCKED(vp, __func__); KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here")); mp = vp->v_mount; if ((vp->v_vflag & VV_ROOT) == 0) { if (vfs_copyopt(*optlist, "export", &export, sizeof(export)) == 0) error = EXDEV; else error = EINVAL; vput(vp); return (error); } /* * We only allow the filesystem to be reloaded if it * is currently mounted read-only. */ flag = mp->mnt_flag; if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) { vput(vp); return (EOPNOTSUPP); /* Needs translation */ } /* * Only privileged root, or (if MNT_USER is set) the user that * did the original mount is permitted to update it. */ error = vfs_suser(mp, td); if (error != 0) { vput(vp); return (error); } if (vfs_busy(mp, MBF_NOWAIT)) { vput(vp); return (EBUSY); } VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); vfs_unbusy(mp); vput(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); VOP_UNLOCK(vp); vfs_op_enter(mp); vn_seqc_write_begin(vp); rootvp = NULL; MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { MNT_IUNLOCK(mp); error = EBUSY; goto end; } mp->mnt_flag &= ~MNT_UPDATEMASK; mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY); if ((mp->mnt_flag & MNT_ASYNC) == 0) mp->mnt_kern_flag &= ~MNTK_ASYNC; rootvp = vfs_cache_root_clear(mp); MNT_IUNLOCK(mp); mp->mnt_optnew = *optlist; vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp); export_error = 0; /* Process the export option. */ if (error == 0 && vfs_getopt(mp->mnt_optnew, "export", &bufp, &len) == 0) { /* Assume that there is only 1 ABI for each length. */ switch (len) { case (sizeof(struct oexport_args)): bzero(&o2export, sizeof(o2export)); /* FALLTHROUGH */ case (sizeof(o2export)): bcopy(bufp, &o2export, len); export.ex_flags = (uint64_t)o2export.ex_flags; export.ex_root = o2export.ex_root; export.ex_uid = o2export.ex_anon.cr_uid; export.ex_groups = NULL; export.ex_ngroups = o2export.ex_anon.cr_ngroups; if (export.ex_ngroups > 0) { if (export.ex_ngroups <= XU_NGROUPS) { export.ex_groups = malloc( export.ex_ngroups * sizeof(gid_t), M_TEMP, M_WAITOK); for (i = 0; i < export.ex_ngroups; i++) export.ex_groups[i] = o2export.ex_anon.cr_groups[i]; } else export_error = EINVAL; } else if (export.ex_ngroups < 0) export_error = EINVAL; export.ex_addr = o2export.ex_addr; export.ex_addrlen = o2export.ex_addrlen; export.ex_mask = o2export.ex_mask; export.ex_masklen = o2export.ex_masklen; export.ex_indexfile = o2export.ex_indexfile; export.ex_numsecflavors = o2export.ex_numsecflavors; if (export.ex_numsecflavors < MAXSECFLAVORS) { for (i = 0; i < export.ex_numsecflavors; i++) export.ex_secflavors[i] = o2export.ex_secflavors[i]; } else export_error = EINVAL; if (export_error == 0) export_error = vfs_export(mp, &export); free(export.ex_groups, M_TEMP); break; case (sizeof(export)): bcopy(bufp, &export, len); grps = NULL; if (export.ex_ngroups > 0) { if (export.ex_ngroups <= NGROUPS_MAX) { grps = malloc(export.ex_ngroups * sizeof(gid_t), M_TEMP, M_WAITOK); export_error = copyin(export.ex_groups, grps, export.ex_ngroups * sizeof(gid_t)); if (export_error == 0) export.ex_groups = grps; } else export_error = EINVAL; } else if (export.ex_ngroups == 0) export.ex_groups = NULL; else export_error = EINVAL; if (export_error == 0) export_error = vfs_export(mp, &export); free(grps, M_TEMP); break; default: export_error = EINVAL; break; } } MNT_ILOCK(mp); if (error == 0) { mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); } else { /* * If we fail, restore old mount flags. MNT_QUOTA is special, * because it is not part of MNT_UPDATEMASK, but it could have * changed in the meantime if quotactl(2) was called. * All in all we want current value of MNT_QUOTA, not the old * one. */ mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); } if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); if (error != 0) goto end; mount_devctl_event("REMOUNT", mp, true); if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; *optlist = NULL; (void)VFS_STATFS(mp, &mp->mnt_stat); /* * Prevent external consumers of mount options from reading * mnt_optnew. */ mp->mnt_optnew = NULL; if ((mp->mnt_flag & MNT_RDONLY) == 0) vfs_allocate_syncvnode(mp); else vfs_deallocate_syncvnode(mp); end: vfs_op_exit(mp); if (rootvp != NULL) { vn_seqc_write_end(rootvp); vrele(rootvp); } vn_seqc_write_end(vp); vfs_unbusy(mp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vrele(vp); return (error != 0 ? error : export_error); } /* * vfs_domount(): actually attempt a filesystem mount. */ static int vfs_domount( struct thread *td, /* Calling thread. */ const char *fstype, /* Filesystem type. */ char *fspath, /* Mount path. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct vfsconf *vfsp; struct nameidata nd; struct vnode *vp; char *pathbuf; int error; /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) return (ENAMETOOLONG); if (jailed(td->td_ucred) || usermount == 0) { if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0) return (error); } /* * Do not allow NFS export or MNT_SUIDDIR by unprivileged users. */ if (fsflags & MNT_EXPORTED) { error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED); if (error) return (error); } if (fsflags & MNT_SUIDDIR) { error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR); if (error) return (error); } /* * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users. */ if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) { if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0) fsflags |= MNT_NOSUID | MNT_USER; } /* Load KLDs before we lock the covered vnode to avoid reversals. */ vfsp = NULL; if ((fsflags & MNT_UPDATE) == 0) { /* Don't try to load KLDs if we're mounting the root. */ if (fsflags & MNT_ROOTFS) { if ((vfsp = vfs_byname(fstype)) == NULL) return (ENODEV); } else { if ((vfsp = vfs_byname_kld(fstype, td, &error)) == NULL) return (error); } } /* * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, fspath, td); error = namei(&nd); if (error != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if ((fsflags & MNT_UPDATE) == 0) { if ((vp->v_vflag & VV_ROOT) != 0 && (fsflags & MNT_NOCOVER) != 0) { vput(vp); return (EBUSY); } pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); strcpy(pathbuf, fspath); error = vn_path_to_global_path(td, vp, pathbuf, MNAMELEN); if (error == 0) { error = vfs_domount_first(td, vfsp, pathbuf, vp, fsflags, optlist); } free(pathbuf, M_TEMP); } else error = vfs_domount_update(td, vp, fsflags, optlist); return (error); } /* * Unmount a filesystem. * * Note: unmount takes a path to the vnode mounted on as argument, not * special file (as before). */ #ifndef _SYS_SYSPROTO_H_ struct unmount_args { char *path; int flags; }; #endif /* ARGSUSED */ int sys_unmount(struct thread *td, struct unmount_args *uap) { return (kern_unmount(td, uap->path, uap->flags)); } int kern_unmount(struct thread *td, const char *path, int flags) { struct nameidata nd; struct mount *mp; char *pathbuf; int error, id0, id1; AUDIT_ARG_VALUE(flags); if (jailed(td->td_ucred) || usermount == 0) { error = priv_check(td, PRIV_VFS_UNMOUNT); if (error) return (error); } pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(path, pathbuf, MNAMELEN, NULL); if (error) { free(pathbuf, M_TEMP); return (error); } if (flags & MNT_BYFSID) { AUDIT_ARG_TEXT(pathbuf); /* Decode the filesystem ID. */ if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) { free(pathbuf, M_TEMP); return (EINVAL); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == id0 && mp->mnt_stat.f_fsid.val[1] == id1) { vfs_ref(mp); break; } } mtx_unlock(&mountlist_mtx); } else { /* * Try to find global path for path argument. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, pathbuf, td); if (namei(&nd) == 0) { NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_path_to_global_path(td, nd.ni_vp, pathbuf, MNAMELEN); if (error == 0) vput(nd.ni_vp); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) { vfs_ref(mp); break; } } mtx_unlock(&mountlist_mtx); } free(pathbuf, M_TEMP); if (mp == NULL) { /* * Previously we returned ENOENT for a nonexistent path and * EINVAL for a non-mountpoint. We cannot tell these apart * now, so in the !MNT_BYFSID case return the more likely * EINVAL for compatibility. */ return ((flags & MNT_BYFSID) ? ENOENT : EINVAL); } /* * Don't allow unmounting the root filesystem. */ if (mp->mnt_flag & MNT_ROOTFS) { vfs_rel(mp); return (EINVAL); } error = dounmount(mp, flags, td); return (error); } /* * Return error if any of the vnodes, ignoring the root vnode * and the syncer vnode, have non-zero usecount. * * This function is purely advisory - it can return false positives * and negatives. */ static int vfs_check_usecounts(struct mount *mp) { struct vnode *vp, *mvp; MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if ((vp->v_vflag & VV_ROOT) == 0 && vp->v_type != VNON && vp->v_usecount != 0) { VI_UNLOCK(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (EBUSY); } VI_UNLOCK(vp); } return (0); } static void dounmount_cleanup(struct mount *mp, struct vnode *coveredvp, int mntkflags) { mtx_assert(MNT_MTX(mp), MA_OWNED); mp->mnt_kern_flag &= ~mntkflags; if ((mp->mnt_kern_flag & MNTK_MWAIT) != 0) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); if (coveredvp != NULL) { VOP_UNLOCK(coveredvp); vdrop(coveredvp); } vn_finished_write(mp); } /* * There are various reference counters associated with the mount point. * Normally it is permitted to modify them without taking the mnt ilock, * but this behavior can be temporarily disabled if stable value is needed * or callers are expected to block (e.g. to not allow new users during * forced unmount). */ void vfs_op_enter(struct mount *mp) { struct mount_pcpu *mpcpu; int cpu; MNT_ILOCK(mp); mp->mnt_vfs_ops++; if (mp->mnt_vfs_ops > 1) { MNT_IUNLOCK(mp); return; } vfs_op_barrier_wait(mp); CPU_FOREACH(cpu) { mpcpu = vfs_mount_pcpu_remote(mp, cpu); mp->mnt_ref += mpcpu->mntp_ref; mpcpu->mntp_ref = 0; mp->mnt_lockref += mpcpu->mntp_lockref; mpcpu->mntp_lockref = 0; mp->mnt_writeopcount += mpcpu->mntp_writeopcount; mpcpu->mntp_writeopcount = 0; } if (mp->mnt_ref <= 0 || mp->mnt_lockref < 0 || mp->mnt_writeopcount < 0) panic("%s: invalid count(s) on mp %p: ref %d lockref %d writeopcount %d\n", __func__, mp, mp->mnt_ref, mp->mnt_lockref, mp->mnt_writeopcount); MNT_IUNLOCK(mp); vfs_assert_mount_counters(mp); } void vfs_op_exit_locked(struct mount *mp) { mtx_assert(MNT_MTX(mp), MA_OWNED); if (mp->mnt_vfs_ops <= 0) panic("%s: invalid vfs_ops count %d for mp %p\n", __func__, mp->mnt_vfs_ops, mp); mp->mnt_vfs_ops--; } void vfs_op_exit(struct mount *mp) { MNT_ILOCK(mp); vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); } struct vfs_op_barrier_ipi { struct mount *mp; struct smp_rendezvous_cpus_retry_arg srcra; }; static void vfs_op_action_func(void *arg) { struct vfs_op_barrier_ipi *vfsopipi; struct mount *mp; vfsopipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra); mp = vfsopipi->mp; if (!vfs_op_thread_entered(mp)) smp_rendezvous_cpus_done(arg); } static void vfs_op_wait_func(void *arg, int cpu) { struct vfs_op_barrier_ipi *vfsopipi; struct mount *mp; struct mount_pcpu *mpcpu; vfsopipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra); mp = vfsopipi->mp; mpcpu = vfs_mount_pcpu_remote(mp, cpu); while (atomic_load_int(&mpcpu->mntp_thread_in_ops)) cpu_spinwait(); } void vfs_op_barrier_wait(struct mount *mp) { struct vfs_op_barrier_ipi vfsopipi; vfsopipi.mp = mp; smp_rendezvous_cpus_retry(all_cpus, smp_no_rendezvous_barrier, vfs_op_action_func, smp_no_rendezvous_barrier, vfs_op_wait_func, &vfsopipi.srcra); } #ifdef DIAGNOSTIC void vfs_assert_mount_counters(struct mount *mp) { struct mount_pcpu *mpcpu; int cpu; if (mp->mnt_vfs_ops == 0) return; CPU_FOREACH(cpu) { mpcpu = vfs_mount_pcpu_remote(mp, cpu); if (mpcpu->mntp_ref != 0 || mpcpu->mntp_lockref != 0 || mpcpu->mntp_writeopcount != 0) vfs_dump_mount_counters(mp); } } void vfs_dump_mount_counters(struct mount *mp) { struct mount_pcpu *mpcpu; int ref, lockref, writeopcount; int cpu; printf("%s: mp %p vfs_ops %d\n", __func__, mp, mp->mnt_vfs_ops); printf(" ref : "); ref = mp->mnt_ref; CPU_FOREACH(cpu) { mpcpu = vfs_mount_pcpu_remote(mp, cpu); printf("%d ", mpcpu->mntp_ref); ref += mpcpu->mntp_ref; } printf("\n"); printf(" lockref : "); lockref = mp->mnt_lockref; CPU_FOREACH(cpu) { mpcpu = vfs_mount_pcpu_remote(mp, cpu); printf("%d ", mpcpu->mntp_lockref); lockref += mpcpu->mntp_lockref; } printf("\n"); printf("writeopcount: "); writeopcount = mp->mnt_writeopcount; CPU_FOREACH(cpu) { mpcpu = vfs_mount_pcpu_remote(mp, cpu); printf("%d ", mpcpu->mntp_writeopcount); writeopcount += mpcpu->mntp_writeopcount; } printf("\n"); printf("counter struct total\n"); printf("ref %-5d %-5d\n", mp->mnt_ref, ref); printf("lockref %-5d %-5d\n", mp->mnt_lockref, lockref); printf("writeopcount %-5d %-5d\n", mp->mnt_writeopcount, writeopcount); panic("invalid counts on struct mount"); } #endif int vfs_mount_fetch_counter(struct mount *mp, enum mount_counter which) { struct mount_pcpu *mpcpu; int cpu, sum; switch (which) { case MNT_COUNT_REF: sum = mp->mnt_ref; break; case MNT_COUNT_LOCKREF: sum = mp->mnt_lockref; break; case MNT_COUNT_WRITEOPCOUNT: sum = mp->mnt_writeopcount; break; } CPU_FOREACH(cpu) { mpcpu = vfs_mount_pcpu_remote(mp, cpu); switch (which) { case MNT_COUNT_REF: sum += mpcpu->mntp_ref; break; case MNT_COUNT_LOCKREF: sum += mpcpu->mntp_lockref; break; case MNT_COUNT_WRITEOPCOUNT: sum += mpcpu->mntp_writeopcount; break; } } return (sum); } /* * Do the actual filesystem unmount. */ int dounmount(struct mount *mp, int flags, struct thread *td) { struct vnode *coveredvp, *rootvp; int error; uint64_t async_flag; int mnt_gen_r; if ((coveredvp = mp->mnt_vnodecovered) != NULL) { mnt_gen_r = mp->mnt_gen; VI_LOCK(coveredvp); vholdl(coveredvp); vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY); /* * Check for mp being unmounted while waiting for the * covered vnode lock. */ if (coveredvp->v_mountedhere != mp || coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) { VOP_UNLOCK(coveredvp); vdrop(coveredvp); vfs_rel(mp); return (EBUSY); } } /* * Only privileged root, or (if MNT_USER is set) the user that did the * original mount is permitted to unmount this filesystem. */ error = vfs_suser(mp, td); if (error != 0) { if (coveredvp != NULL) { VOP_UNLOCK(coveredvp); vdrop(coveredvp); } vfs_rel(mp); return (error); } vfs_op_enter(mp); vn_start_write(NULL, &mp, V_WAIT | V_MNTREF); MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 || (mp->mnt_flag & MNT_UPDATE) != 0 || !TAILQ_EMPTY(&mp->mnt_uppers)) { dounmount_cleanup(mp, coveredvp, 0); return (EBUSY); } mp->mnt_kern_flag |= MNTK_UNMOUNT; rootvp = vfs_cache_root_clear(mp); if (coveredvp != NULL) vn_seqc_write_begin(coveredvp); if (flags & MNT_NONBUSY) { MNT_IUNLOCK(mp); error = vfs_check_usecounts(mp); MNT_ILOCK(mp); if (error != 0) { vn_seqc_write_end(coveredvp); dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT); if (rootvp != NULL) { vn_seqc_write_end(rootvp); vrele(rootvp); } return (error); } } /* Allow filesystems to detect that a forced unmount is in progress. */ if (flags & MNT_FORCE) { mp->mnt_kern_flag |= MNTK_UNMOUNTF; MNT_IUNLOCK(mp); /* * Must be done after setting MNTK_UNMOUNTF and before * waiting for mnt_lockref to become 0. */ VFS_PURGE(mp); MNT_ILOCK(mp); } error = 0; if (mp->mnt_lockref) { mp->mnt_kern_flag |= MNTK_DRAINING; error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS, "mount drain", 0); } MNT_IUNLOCK(mp); KASSERT(mp->mnt_lockref == 0, ("%s: invalid lock refcount in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); KASSERT(error == 0, ("%s: invalid return value for msleep in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); /* * We want to keep the vnode around so that we can vn_seqc_write_end * after we are done with unmount. Downgrade our reference to a mere * hold count so that we don't interefere with anything. */ if (rootvp != NULL) { vhold(rootvp); vrele(rootvp); } if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); vfs_periodic(mp, MNT_WAIT); MNT_ILOCK(mp); async_flag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vfs_deallocate_syncvnode(mp); error = VFS_UNMOUNT(mp, flags); vn_finished_write(mp); /* * If we failed to flush the dirty blocks for this mount point, * undo all the cdir/rdir and rootvnode changes we made above. * Unless we failed to do so because the device is reporting that * it doesn't exist anymore. */ if (error && error != ENXIO) { MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0) { MNT_IUNLOCK(mp); vfs_allocate_syncvnode(mp); MNT_ILOCK(mp); } mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); mp->mnt_flag |= async_flag; if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); if (coveredvp) { vn_seqc_write_end(coveredvp); VOP_UNLOCK(coveredvp); vdrop(coveredvp); } if (rootvp != NULL) { vn_seqc_write_end(rootvp); vdrop(rootvp); } return (error); } mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td); if (coveredvp != NULL) { VI_LOCK(coveredvp); vn_irflag_unset_locked(coveredvp, VIRF_MOUNTPOINT); coveredvp->v_mountedhere = NULL; vn_seqc_write_end_locked(coveredvp); VI_UNLOCK(coveredvp); VOP_UNLOCK(coveredvp); vdrop(coveredvp); } mount_devctl_event("UNMOUNT", mp, false); if (rootvp != NULL) { vn_seqc_write_end(rootvp); vdrop(rootvp); } vfs_event_signal(NULL, VQ_UNMOUNT, 0); if (rootvnode != NULL && mp == rootvnode->v_mount) { vrele(rootvnode); rootvnode = NULL; } if (mp == rootdevmp) rootdevmp = NULL; vfs_mount_destroy(mp); return (0); } /* * Report errors during filesystem mounting. */ void vfs_mount_error(struct mount *mp, const char *fmt, ...) { struct vfsoptlist *moptlist = mp->mnt_optnew; va_list ap; int error, len; char *errmsg; error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } void vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...) { va_list ap; int error, len; char *errmsg; error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } /* * --------------------------------------------------------------------- * Functions for querying mount options/arguments from filesystems. */ /* * Check that no unknown options are given */ int vfs_filteropt(struct vfsoptlist *opts, const char **legal) { struct vfsopt *opt; char errmsg[255]; const char **t, *p, *q; int ret = 0; TAILQ_FOREACH(opt, opts, link) { p = opt->name; q = NULL; if (p[0] == 'n' && p[1] == 'o') q = p + 2; for(t = global_opts; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; for(t = legal; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; snprintf(errmsg, sizeof(errmsg), "mount option <%s> is unknown", p); ret = EINVAL; } if (ret != 0) { TAILQ_FOREACH(opt, opts, link) { if (strcmp(opt->name, "errmsg") == 0) { strncpy((char *)opt->value, errmsg, opt->len); break; } } if (opt == NULL) printf("%s\n", errmsg); } return (ret); } /* * Get a mount option by its name. * * Return 0 if the option was found, ENOENT otherwise. * If len is non-NULL it will be filled with the length * of the option. If buf is non-NULL, it will be filled * with the address of the option. */ int vfs_getopt(struct vfsoptlist *opts, const char *name, void **buf, int *len) { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (len != NULL) *len = opt->len; if (buf != NULL) *buf = opt->value; return (0); } } return (ENOENT); } int vfs_getopt_pos(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt; if (opts == NULL) return (-1); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; return (opt->pos); } } return (-1); } int vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value) { char *opt_value, *vtp; quad_t iv; int error, opt_len; error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len); if (error != 0) return (error); if (opt_len == 0 || opt_value == NULL) return (EINVAL); if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0') return (EINVAL); iv = strtoq(opt_value, &vtp, 0); if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0')) return (EINVAL); if (iv < 0) return (EINVAL); switch (vtp[0]) { case 't': case 'T': iv *= 1024; /* FALLTHROUGH */ case 'g': case 'G': iv *= 1024; /* FALLTHROUGH */ case 'm': case 'M': iv *= 1024; /* FALLTHROUGH */ case 'k': case 'K': iv *= 1024; case '\0': break; default: return (EINVAL); } *value = iv; return (0); } char * vfs_getopts(struct vfsoptlist *opts, const char *name, int *error) { struct vfsopt *opt; *error = 0; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->len == 0 || ((char *)opt->value)[opt->len - 1] != '\0') { *error = EINVAL; return (NULL); } return (opt->value); } *error = ENOENT; return (NULL); } int vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w, uint64_t val) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (w != NULL) *w |= val; return (1); } } if (w != NULL) *w &= ~val; return (0); } int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...) { va_list ap; struct vfsopt *opt; int ret; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->len == 0 || opt->value == NULL) return (0); if (((char *)opt->value)[opt->len - 1] != '\0') return (0); va_start(ap, fmt); ret = vsscanf(opt->value, fmt, ap); va_end(ap); return (ret); } return (0); } int vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = len; else { if (opt->len != len) return (EINVAL); bcopy(value, opt->value, len); } return (0); } return (ENOENT); } int vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = len; else { if (opt->len < len) return (EINVAL); opt->len = len; bcopy(value, opt->value, len); } return (0); } return (ENOENT); } int vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = strlen(value) + 1; else if (strlcpy(opt->value, value, opt->len) >= opt->len) return (EINVAL); return (0); } return (ENOENT); } /* * Find and copy a mount option. * * The size of the buffer has to be specified * in len, if it is not the same length as the * mount option, EINVAL is returned. * Returns ENOENT if the option is not found. */ int vfs_copyopt(struct vfsoptlist *opts, const char *name, void *dest, int len) { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (len != opt->len) return (EINVAL); bcopy(opt->value, dest, opt->len); return (0); } } return (ENOENT); } int __vfs_statfs(struct mount *mp, struct statfs *sbp) { /* * Filesystems only fill in part of the structure for updates, we * have to read the entirety first to get all content. */ if (sbp != &mp->mnt_stat) memcpy(sbp, &mp->mnt_stat, sizeof(*sbp)); /* * Set these in case the underlying filesystem fails to do so. */ sbp->f_version = STATFS_VERSION; sbp->f_namemax = NAME_MAX; sbp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; return (mp->mnt_op->vfs_statfs(mp, sbp)); } void vfs_mountedfrom(struct mount *mp, const char *from) { bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname); strlcpy(mp->mnt_stat.f_mntfromname, from, sizeof mp->mnt_stat.f_mntfromname); } /* * --------------------------------------------------------------------- * This is the api for building mount args and mounting filesystems from * inside the kernel. * * The API works by accumulation of individual args. First error is * latched. * * XXX: should be documented in new manpage kernel_mount(9) */ /* A memory allocation which must be freed when we are done */ struct mntaarg { SLIST_ENTRY(mntaarg) next; }; /* The header for the mount arguments */ struct mntarg { struct iovec *v; int len; int error; SLIST_HEAD(, mntaarg) list; }; /* * Add a boolean argument. * * flag is the boolean value. * name must start with "no". */ struct mntarg * mount_argb(struct mntarg *ma, int flag, const char *name) { KASSERT(name[0] == 'n' && name[1] == 'o', ("mount_argb(...,%s): name must start with 'no'", name)); return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0)); } /* * Add an argument printf style */ struct mntarg * mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...) { va_list ap; struct mntaarg *maa; struct sbuf *sb; int len; if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); len = sbuf_len(sb) + 1; maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); bcopy(sbuf_data(sb), maa + 1, len); sbuf_delete(sb); ma->v[ma->len].iov_base = maa + 1; ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Add an argument which is a userland string. */ struct mntarg * mount_argsu(struct mntarg *ma, const char *name, const void *val, int len) { struct mntaarg *maa; char *tbuf; if (val == NULL) return (ma); if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); tbuf = (void *)(maa + 1); ma->error = copyinstr(val, tbuf, len, NULL); return (mount_arg(ma, name, tbuf, -1)); } /* * Plain argument. * * If length is -1, treat value as a C string. */ struct mntarg * mount_arg(struct mntarg *ma, const char *name, const void *val, int len) { if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; ma->v[ma->len].iov_base = (void *)(uintptr_t)val; if (len < 0) ma->v[ma->len].iov_len = strlen(val) + 1; else ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Free a mntarg structure */ static void free_mntarg(struct mntarg *ma) { struct mntaarg *maa; while (!SLIST_EMPTY(&ma->list)) { maa = SLIST_FIRST(&ma->list); SLIST_REMOVE_HEAD(&ma->list, next); free(maa, M_MOUNT); } free(ma->v, M_MOUNT); free(ma, M_MOUNT); } /* * Mount a filesystem */ int kernel_mount(struct mntarg *ma, uint64_t flags) { struct uio auio; int error; KASSERT(ma != NULL, ("kernel_mount NULL ma")); KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v")); KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len)); auio.uio_iov = ma->v; auio.uio_iovcnt = ma->len; auio.uio_segflg = UIO_SYSSPACE; error = ma->error; if (!error) error = vfs_donmount(curthread, flags, &auio); free_mntarg(ma); return (error); } /* * A printflike function to mount a filesystem. */ int kernel_vmount(int flags, ...) { struct mntarg *ma = NULL; va_list ap; const char *cp; const void *vp; int error; va_start(ap, flags); for (;;) { cp = va_arg(ap, const char *); if (cp == NULL) break; vp = va_arg(ap, const void *); ma = mount_arg(ma, cp, vp, (vp != NULL ? -1 : 0)); } va_end(ap); error = kernel_mount(ma, flags); return (error); } /* Map from mount options to printable formats. */ static struct mntoptnames optnames[] = { MNTOPT_NAMES }; #define DEVCTL_LEN 1024 static void mount_devctl_event(const char *type, struct mount *mp, bool donew) { const uint8_t *cp; struct mntoptnames *fp; struct sbuf sb; struct statfs *sfp = &mp->mnt_stat; char *buf; buf = malloc(DEVCTL_LEN, M_MOUNT, M_NOWAIT); if (buf == NULL) return; sbuf_new(&sb, buf, DEVCTL_LEN, SBUF_FIXEDLEN); sbuf_cpy(&sb, "mount-point=\""); devctl_safe_quote_sb(&sb, sfp->f_mntonname); sbuf_cat(&sb, "\" mount-dev=\""); devctl_safe_quote_sb(&sb, sfp->f_mntfromname); sbuf_cat(&sb, "\" mount-type=\""); devctl_safe_quote_sb(&sb, sfp->f_fstypename); sbuf_cat(&sb, "\" fsid=0x"); cp = (const uint8_t *)&sfp->f_fsid.val[0]; for (int i = 0; i < sizeof(sfp->f_fsid); i++) sbuf_printf(&sb, "%02x", cp[i]); sbuf_printf(&sb, " owner=%u flags=\"", sfp->f_owner); for (fp = optnames; fp->o_opt != 0; fp++) { if ((mp->mnt_flag & fp->o_opt) != 0) { sbuf_cat(&sb, fp->o_name); sbuf_putc(&sb, ';'); } } sbuf_putc(&sb, '"'); sbuf_finish(&sb); /* * Options are not published because the form of the options depends on * the file system and may include binary data. In addition, they don't * necessarily provide enough useful information to be actionable when * devd processes them. */ if (sbuf_error(&sb) == 0) devctl_notify("VFS", "FS", type, sbuf_data(&sb)); sbuf_delete(&sb); free(buf, M_MOUNT); } /* * Force remount specified mount point to read-only. The argument * must be busied to avoid parallel unmount attempts. * * Intended use is to prevent further writes if some metadata * inconsistency is detected. Note that the function still flushes * all cached metadata and data for the mount point, which might be * not always suitable. */ int vfs_remount_ro(struct mount *mp) { struct vfsoptlist *opts; struct vfsopt *opt; struct vnode *vp_covered, *rootvp; int error; KASSERT(mp->mnt_lockref > 0, ("vfs_remount_ro: mp %p is not busied", mp)); KASSERT((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0, ("vfs_remount_ro: mp %p is being unmounted (and busy?)", mp)); rootvp = NULL; vp_covered = mp->mnt_vnodecovered; error = vget(vp_covered, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) return (error); VI_LOCK(vp_covered); if ((vp_covered->v_iflag & VI_MOUNT) != 0) { VI_UNLOCK(vp_covered); vput(vp_covered); return (EBUSY); } vp_covered->v_iflag |= VI_MOUNT; VI_UNLOCK(vp_covered); vfs_op_enter(mp); vn_seqc_write_begin(vp_covered); MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_RDONLY) != 0) { MNT_IUNLOCK(mp); error = EBUSY; goto out; } mp->mnt_flag |= MNT_UPDATE | MNT_FORCE | MNT_RDONLY; rootvp = vfs_cache_root_clear(mp); MNT_IUNLOCK(mp); opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK | M_ZERO); TAILQ_INIT(opts); opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK | M_ZERO); opt->name = strdup("ro", M_MOUNT); opt->value = NULL; TAILQ_INSERT_TAIL(opts, opt, link); vfs_mergeopts(opts, mp->mnt_opt); mp->mnt_optnew = opts; error = VFS_MOUNT(mp); if (error == 0) { MNT_ILOCK(mp); mp->mnt_flag &= ~(MNT_UPDATE | MNT_FORCE); MNT_IUNLOCK(mp); vfs_deallocate_syncvnode(mp); if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; } else { MNT_ILOCK(mp); mp->mnt_flag &= ~(MNT_UPDATE | MNT_FORCE | MNT_RDONLY); MNT_IUNLOCK(mp); vfs_freeopts(mp->mnt_optnew); } mp->mnt_optnew = NULL; out: vfs_op_exit(mp); VI_LOCK(vp_covered); vp_covered->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp_covered); vput(vp_covered); vn_seqc_write_end(vp_covered); if (rootvp != NULL) { vn_seqc_write_end(rootvp); vrele(rootvp); } return (error); } /* * Suspend write operations on all local writeable filesystems. Does * full sync of them in the process. * * Iterate over the mount points in reverse order, suspending most * recently mounted filesystems first. It handles a case where a * filesystem mounted from a md(4) vnode-backed device should be * suspended before the filesystem that owns the vnode. */ void suspend_all_fs(void) { struct mount *mp; int error; mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { error = vfs_busy(mp, MBF_MNTLSTLOCK | MBF_NOWAIT); if (error != 0) continue; if ((mp->mnt_flag & (MNT_RDONLY | MNT_LOCAL)) != MNT_LOCAL || (mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { mtx_lock(&mountlist_mtx); vfs_unbusy(mp); continue; } error = vfs_write_suspend(mp, 0); if (error == 0) { MNT_ILOCK(mp); MPASS((mp->mnt_kern_flag & MNTK_SUSPEND_ALL) == 0); mp->mnt_kern_flag |= MNTK_SUSPEND_ALL; MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); } else { printf("suspend of %s failed, error %d\n", mp->mnt_stat.f_mntonname, error); mtx_lock(&mountlist_mtx); vfs_unbusy(mp); } } mtx_unlock(&mountlist_mtx); } void resume_all_fs(void) { struct mount *mp; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if ((mp->mnt_kern_flag & MNTK_SUSPEND_ALL) == 0) continue; mtx_unlock(&mountlist_mtx); MNT_ILOCK(mp); MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) != 0); mp->mnt_kern_flag &= ~MNTK_SUSPEND_ALL; MNT_IUNLOCK(mp); vfs_write_resume(mp, 0); mtx_lock(&mountlist_mtx); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); } diff --git a/sys/kern/vfs_mountroot.c b/sys/kern/vfs_mountroot.c index 7e88a58fdb90..1886d3b7aca3 100644 --- a/sys/kern/vfs_mountroot.c +++ b/sys/kern/vfs_mountroot.c @@ -1,1169 +1,1168 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2010 Marcel Moolenaar * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_rootdevname.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include /* * The root filesystem is detailed in the kernel environment variable * vfs.root.mountfrom, which is expected to be in the general format * * :[][ :[] ...] * vfsname := the name of a VFS known to the kernel and capable * of being mounted as root * path := disk device name or other data used by the filesystem * to locate its physical store * * If the environment variable vfs.root.mountfrom is a space separated list, * each list element is tried in turn and the root filesystem will be mounted * from the first one that succeeds. * * The environment variable vfs.root.mountfrom.options is a comma delimited * set of string mount options. These mount options must be parseable * by nmount() in the kernel. */ static int parse_mount(char **); static struct mntarg *parse_mountroot_options(struct mntarg *, const char *); static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS); static void vfs_mountroot_wait(void); static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev); /* * The vnode of the system's root (/ in the filesystem, without chroot * active.) */ struct vnode *rootvnode; /* * Mount of the system's /dev. */ struct mount *rootdevmp; char *rootdevnames[2] = {NULL, NULL}; struct mtx root_holds_mtx; MTX_SYSINIT(root_holds, &root_holds_mtx, "root_holds", MTX_DEF); static TAILQ_HEAD(, root_hold_token) root_holds = TAILQ_HEAD_INITIALIZER(root_holds); enum action { A_CONTINUE, A_PANIC, A_REBOOT, A_RETRY }; enum rh_flags { RH_FREE, RH_ALLOC, RH_ARG, }; static enum action root_mount_onfail = A_CONTINUE; static int root_mount_mddev; static int root_mount_complete; /* By default wait up to 3 seconds for devices to appear. */ static int root_mount_timeout = 3; TUNABLE_INT("vfs.mountroot.timeout", &root_mount_timeout); static int root_mount_always_wait = 0; SYSCTL_INT(_vfs, OID_AUTO, root_mount_always_wait, CTLFLAG_RDTUN, &root_mount_always_wait, 0, "Wait for root mount holds even if the root device already exists"); SYSCTL_PROC(_vfs, OID_AUTO, root_mount_hold, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_root_mount_hold, "A", "List of root mount hold tokens"); static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct root_hold_token *h; int error; sbuf_new(&sb, NULL, 256, SBUF_AUTOEXTEND | SBUF_INCLUDENUL); mtx_lock(&root_holds_mtx); TAILQ_FOREACH(h, &root_holds, list) { if (h != TAILQ_FIRST(&root_holds)) sbuf_putc(&sb, ' '); sbuf_printf(&sb, "%s", h->who); } mtx_unlock(&root_holds_mtx); error = sbuf_finish(&sb); if (error == 0) error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb)); sbuf_delete(&sb); return (error); } struct root_hold_token * root_mount_hold(const char *identifier) { struct root_hold_token *h; h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK); h->flags = RH_ALLOC; h->who = identifier; mtx_lock(&root_holds_mtx); TSHOLD("root mount"); TAILQ_INSERT_TAIL(&root_holds, h, list); mtx_unlock(&root_holds_mtx); return (h); } void root_mount_hold_token(const char *identifier, struct root_hold_token *h) { #ifdef INVARIANTS struct root_hold_token *t; #endif h->flags = RH_ARG; h->who = identifier; mtx_lock(&root_holds_mtx); #ifdef INVARIANTS TAILQ_FOREACH(t, &root_holds, list) { if (t == h) { panic("Duplicate mount hold by '%s' on %p", identifier, h); } } #endif TSHOLD("root mount"); TAILQ_INSERT_TAIL(&root_holds, h, list); mtx_unlock(&root_holds_mtx); } void root_mount_rel(struct root_hold_token *h) { if (h == NULL || h->flags == RH_FREE) return; mtx_lock(&root_holds_mtx); TAILQ_REMOVE(&root_holds, h, list); TSRELEASE("root mount"); wakeup(&root_holds); mtx_unlock(&root_holds_mtx); if (h->flags == RH_ALLOC) { free(h, M_DEVBUF); } else h->flags = RH_FREE; } int root_mounted(void) { /* No mutex is acquired here because int stores are atomic. */ return (root_mount_complete); } static void set_rootvnode(void) { if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode)) panic("set_rootvnode: Cannot find root vnode"); VOP_UNLOCK(rootvnode); pwd_set_rootvnode(); } static int vfs_mountroot_devfs(struct thread *td, struct mount **mpp) { struct vfsoptlist *opts; struct vfsconf *vfsp; struct mount *mp; int error; *mpp = NULL; if (rootdevmp != NULL) { /* * Already have /dev; this happens during rerooting. */ error = vfs_busy(rootdevmp, 0); if (error != 0) return (error); *mpp = rootdevmp; } else { vfsp = vfs_byname("devfs"); KASSERT(vfsp != NULL, ("Could not find devfs by name")); if (vfsp == NULL) return (ENOENT); mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred); error = VFS_MOUNT(mp); KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error)); if (error) return (error); error = VFS_STATFS(mp, &mp->mnt_stat); KASSERT(error == 0, ("VFS_STATFS(devfs) failed %d", error)); if (error) return (error); opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); mp->mnt_opt = opts; mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); *mpp = mp; rootdevmp = mp; vfs_op_exit(mp); } set_rootvnode(); error = kern_symlinkat(td, "/", AT_FDCWD, "dev", UIO_SYSSPACE); if (error) printf("kern_symlink /dev -> / returns %d\n", error); return (error); } static void vfs_mountroot_shuffle(struct thread *td, struct mount *mpdevfs) { struct nameidata nd; struct mount *mporoot, *mpnroot; struct vnode *vp, *vporoot, *vpdevfs; char *fspath; int error; mpnroot = TAILQ_NEXT(mpdevfs, mnt_list); /* Shuffle the mountlist. */ mtx_lock(&mountlist_mtx); mporoot = TAILQ_FIRST(&mountlist); TAILQ_REMOVE(&mountlist, mpdevfs, mnt_list); if (mporoot != mpdevfs) { TAILQ_REMOVE(&mountlist, mpnroot, mnt_list); TAILQ_INSERT_HEAD(&mountlist, mpnroot, mnt_list); } TAILQ_INSERT_TAIL(&mountlist, mpdevfs, mnt_list); mtx_unlock(&mountlist_mtx); cache_purgevfs(mporoot); if (mporoot != mpdevfs) cache_purgevfs(mpdevfs); if (VFS_ROOT(mporoot, LK_EXCLUSIVE, &vporoot)) panic("vfs_mountroot_shuffle: Cannot find root vnode"); VI_LOCK(vporoot); vporoot->v_iflag &= ~VI_MOUNT; vn_irflag_unset_locked(vporoot, VIRF_MOUNTPOINT); vporoot->v_mountedhere = NULL; VI_UNLOCK(vporoot); mporoot->mnt_flag &= ~MNT_ROOTFS; mporoot->mnt_vnodecovered = NULL; vput(vporoot); /* Set up the new rootvnode, and purge the cache */ mpnroot->mnt_vnodecovered = NULL; set_rootvnode(); cache_purgevfs(rootvnode->v_mount); if (mporoot != mpdevfs) { /* Remount old root under /.mount or /mnt */ fspath = "/.mount"; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); error = namei(&nd); if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); fspath = "/mnt"; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); error = namei(&nd); } if (!error) { vp = nd.ni_vp; error = (vp->v_type == VDIR) ? 0 : ENOTDIR; if (!error) error = vinvalbuf(vp, V_SAVE, 0, 0); if (!error) { cache_purge(vp); VI_LOCK(vp); mporoot->mnt_vnodecovered = vp; vn_irflag_set_locked(vp, VIRF_MOUNTPOINT); vp->v_mountedhere = mporoot; strlcpy(mporoot->mnt_stat.f_mntonname, fspath, MNAMELEN); VI_UNLOCK(vp); VOP_UNLOCK(vp); } else vput(vp); } NDFREE(&nd, NDF_ONLY_PNBUF); if (error) printf("mountroot: unable to remount previous root " "under /.mount or /mnt (error %d)\n", error); } /* Remount devfs under /dev */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td); error = namei(&nd); if (!error) { vp = nd.ni_vp; error = (vp->v_type == VDIR) ? 0 : ENOTDIR; if (!error) error = vinvalbuf(vp, V_SAVE, 0, 0); if (!error) { vpdevfs = mpdevfs->mnt_vnodecovered; if (vpdevfs != NULL) { cache_purge(vpdevfs); VI_LOCK(vpdevfs); vn_irflag_unset_locked(vpdevfs, VIRF_MOUNTPOINT); vpdevfs->v_mountedhere = NULL; VI_UNLOCK(vpdevfs); vrele(vpdevfs); } VI_LOCK(vp); mpdevfs->mnt_vnodecovered = vp; vn_irflag_set_locked(vp, VIRF_MOUNTPOINT); vp->v_mountedhere = mpdevfs; VI_UNLOCK(vp); VOP_UNLOCK(vp); } else vput(vp); } if (error) printf("mountroot: unable to remount devfs under /dev " "(error %d)\n", error); NDFREE(&nd, NDF_ONLY_PNBUF); if (mporoot == mpdevfs) { vfs_unbusy(mpdevfs); /* Unlink the no longer needed /dev/dev -> / symlink */ error = kern_funlinkat(td, AT_FDCWD, "/dev/dev", FD_NONE, UIO_SYSSPACE, 0, 0); if (error) printf("mountroot: unable to unlink /dev/dev " "(error %d)\n", error); } } /* * Configuration parser. */ /* Parser character classes. */ #define CC_WHITESPACE -1 #define CC_NONWHITESPACE -2 /* Parse errors. */ #define PE_EOF -1 #define PE_EOL -2 static __inline int parse_peek(char **conf) { return (**conf); } static __inline void parse_poke(char **conf, int c) { **conf = c; } static __inline void parse_advance(char **conf) { (*conf)++; } static int parse_skipto(char **conf, int mc) { int c, match; while (1) { c = parse_peek(conf); if (c == 0) return (PE_EOF); switch (mc) { case CC_WHITESPACE: match = (c == ' ' || c == '\t' || c == '\n') ? 1 : 0; break; case CC_NONWHITESPACE: if (c == '\n') return (PE_EOL); match = (c != ' ' && c != '\t') ? 1 : 0; break; default: match = (c == mc) ? 1 : 0; break; } if (match) break; parse_advance(conf); } return (0); } static int parse_token(char **conf, char **tok) { char *p; size_t len; int error; *tok = NULL; error = parse_skipto(conf, CC_NONWHITESPACE); if (error) return (error); p = *conf; error = parse_skipto(conf, CC_WHITESPACE); len = *conf - p; *tok = malloc(len + 1, M_TEMP, M_WAITOK | M_ZERO); bcopy(p, *tok, len); return (0); } static void parse_dir_ask_printenv(const char *var) { char *val; val = kern_getenv(var); if (val != NULL) { printf(" %s=%s\n", var, val); freeenv(val); } } static int parse_dir_ask(char **conf) { char name[80]; char *mnt; int error; vfs_mountroot_wait(); printf("\nLoader variables:\n"); parse_dir_ask_printenv("vfs.root.mountfrom"); parse_dir_ask_printenv("vfs.root.mountfrom.options"); printf("\nManual root filesystem specification:\n"); printf(" : [options]\n"); printf(" Mount using filesystem \n"); printf(" and with the specified (optional) option list.\n"); printf("\n"); printf(" eg. ufs:/dev/da0s1a\n"); printf(" zfs:zroot/ROOT/default\n"); printf(" cd9660:/dev/cd0 ro\n"); printf(" (which is equivalent to: "); printf("mount -t cd9660 -o ro /dev/cd0 /)\n"); printf("\n"); printf(" ? List valid disk boot devices\n"); printf(" . Yield 1 second (for background tasks)\n"); printf(" Abort manual input\n"); do { error = EINVAL; printf("\nmountroot> "); cngets(name, sizeof(name), GETS_ECHO); if (name[0] == '\0') break; if (name[0] == '?' && name[1] == '\0') { printf("\nList of GEOM managed disk devices:\n "); g_dev_print(); continue; } if (name[0] == '.' && name[1] == '\0') { pause("rmask", hz); continue; } mnt = name; error = parse_mount(&mnt); if (error == -1) printf("Invalid file system specification.\n"); } while (error != 0); return (error); } static int parse_dir_md(char **conf) { struct stat sb; struct thread *td; struct md_ioctl *mdio; char *path, *tok; int error, fd, len; td = curthread; fd = -1; error = parse_token(conf, &tok); if (error) return (error); len = strlen(tok); mdio = malloc(sizeof(*mdio) + len + 1, M_TEMP, M_WAITOK | M_ZERO); path = (void *)(mdio + 1); bcopy(tok, path, len); free(tok, M_TEMP); /* Get file status. */ error = kern_statat(td, 0, AT_FDCWD, path, UIO_SYSSPACE, &sb, NULL); if (error) goto out; /* Open /dev/mdctl so that we can attach/detach. */ error = kern_openat(td, AT_FDCWD, "/dev/" MDCTL_NAME, UIO_SYSSPACE, O_RDWR, 0); if (error) goto out; fd = td->td_retval[0]; mdio->md_version = MDIOVERSION; mdio->md_type = MD_VNODE; if (root_mount_mddev != -1) { mdio->md_unit = root_mount_mddev; (void)kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio); /* Ignore errors. We don't care. */ root_mount_mddev = -1; } mdio->md_file = (void *)(mdio + 1); mdio->md_options = MD_AUTOUNIT | MD_READONLY; mdio->md_mediasize = sb.st_size; mdio->md_unit = 0; error = kern_ioctl(td, fd, MDIOCATTACH, (void *)mdio); if (error) goto out; if (mdio->md_unit > 9) { printf("rootmount: too many md units\n"); mdio->md_file = NULL; mdio->md_options = 0; mdio->md_mediasize = 0; error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio); /* Ignore errors. We don't care. */ error = ERANGE; goto out; } root_mount_mddev = mdio->md_unit; printf(MD_NAME "%u attached to %s\n", root_mount_mddev, mdio->md_file); out: if (fd >= 0) (void)kern_close(td, fd); free(mdio, M_TEMP); return (error); } static int parse_dir_onfail(char **conf) { char *action; int error; error = parse_token(conf, &action); if (error) return (error); if (!strcmp(action, "continue")) root_mount_onfail = A_CONTINUE; else if (!strcmp(action, "panic")) root_mount_onfail = A_PANIC; else if (!strcmp(action, "reboot")) root_mount_onfail = A_REBOOT; else if (!strcmp(action, "retry")) root_mount_onfail = A_RETRY; else { printf("rootmount: %s: unknown action\n", action); error = EINVAL; } free(action, M_TEMP); return (0); } static int parse_dir_timeout(char **conf) { char *tok, *endtok; long secs; int error; error = parse_token(conf, &tok); if (error) return (error); secs = strtol(tok, &endtok, 0); error = (secs < 0 || *endtok != '\0') ? EINVAL : 0; if (!error) root_mount_timeout = secs; free(tok, M_TEMP); return (error); } static int parse_directive(char **conf) { char *dir; int error; error = parse_token(conf, &dir); if (error) return (error); if (strcmp(dir, ".ask") == 0) error = parse_dir_ask(conf); else if (strcmp(dir, ".md") == 0) error = parse_dir_md(conf); else if (strcmp(dir, ".onfail") == 0) error = parse_dir_onfail(conf); else if (strcmp(dir, ".timeout") == 0) error = parse_dir_timeout(conf); else { printf("mountroot: invalid directive `%s'\n", dir); /* Ignore the rest of the line. */ (void)parse_skipto(conf, '\n'); error = EINVAL; } free(dir, M_TEMP); return (error); } static int parse_mount_dev_present(const char *dev) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, dev, curthread); error = namei(&nd); if (!error) vput(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); return (error != 0) ? 0 : 1; } #define ERRMSGL 255 static int parse_mount(char **conf) { char *errmsg; struct mntarg *ma; char *dev, *fs, *opts, *tok; int delay, error, timeout; error = parse_token(conf, &tok); if (error) return (error); fs = tok; error = parse_skipto(&tok, ':'); if (error) { free(fs, M_TEMP); return (error); } parse_poke(&tok, '\0'); parse_advance(&tok); dev = tok; if (root_mount_mddev != -1) { /* Handle substitution for the md unit number. */ tok = strstr(dev, "md#"); if (tok != NULL) tok[2] = '0' + root_mount_mddev; } /* Parse options. */ error = parse_token(conf, &tok); opts = (error == 0) ? tok : NULL; printf("Trying to mount root from %s:%s [%s]...\n", fs, dev, (opts != NULL) ? opts : ""); errmsg = malloc(ERRMSGL, M_TEMP, M_WAITOK | M_ZERO); if (vfs_byname(fs) == NULL) { strlcpy(errmsg, "unknown file system", ERRMSGL); error = ENOENT; goto out; } error = vfs_mountroot_wait_if_neccessary(fs, dev); if (error != 0) goto out; delay = hz / 10; timeout = root_mount_timeout * hz; for (;;) { ma = NULL; ma = mount_arg(ma, "fstype", fs, -1); ma = mount_arg(ma, "fspath", "/", -1); ma = mount_arg(ma, "from", dev, -1); ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL); ma = mount_arg(ma, "ro", NULL, 0); ma = parse_mountroot_options(ma, opts); error = kernel_mount(ma, MNT_ROOTFS); if (error == 0 || timeout <= 0) break; if (root_mount_timeout * hz == timeout || (bootverbose && timeout % hz == 0)) { printf("Mounting from %s:%s failed with error %d; " "retrying for %d more second%s\n", fs, dev, error, timeout / hz, (timeout / hz > 1) ? "s" : ""); } pause("rmretry", delay); timeout -= delay; } out: if (error) { printf("Mounting from %s:%s failed with error %d", fs, dev, error); if (errmsg[0] != '\0') printf(": %s", errmsg); printf(".\n"); } free(fs, M_TEMP); free(errmsg, M_TEMP); if (opts != NULL) free(opts, M_TEMP); /* kernel_mount can return -1 on error. */ return ((error < 0) ? EDOOFUS : error); } #undef ERRMSGL static int vfs_mountroot_parse(struct sbuf *sb, struct mount *mpdevfs) { struct mount *mp; char *conf; int error; root_mount_mddev = -1; retry: conf = sbuf_data(sb); mp = TAILQ_NEXT(mpdevfs, mnt_list); error = (mp == NULL) ? 0 : EDOOFUS; root_mount_onfail = A_CONTINUE; while (mp == NULL) { error = parse_skipto(&conf, CC_NONWHITESPACE); if (error == PE_EOL) { parse_advance(&conf); continue; } if (error < 0) break; switch (parse_peek(&conf)) { case '#': error = parse_skipto(&conf, '\n'); break; case '.': error = parse_directive(&conf); break; default: error = parse_mount(&conf); if (error == -1) { printf("mountroot: invalid file system " "specification.\n"); error = 0; } break; } if (error < 0) break; /* Ignore any trailing garbage on the line. */ if (parse_peek(&conf) != '\n') { printf("mountroot: advancing to next directive...\n"); (void)parse_skipto(&conf, '\n'); } mp = TAILQ_NEXT(mpdevfs, mnt_list); } if (mp != NULL) return (0); /* * We failed to mount (a new) root. */ switch (root_mount_onfail) { case A_CONTINUE: break; case A_PANIC: panic("mountroot: unable to (re-)mount root."); /* NOTREACHED */ case A_RETRY: goto retry; case A_REBOOT: kern_reboot(RB_NOSYNC); /* NOTREACHED */ } return (error); } static void vfs_mountroot_conf0(struct sbuf *sb) { char *s, *tok, *mnt, *opt; int error; sbuf_printf(sb, ".onfail panic\n"); sbuf_printf(sb, ".timeout %d\n", root_mount_timeout); if (boothowto & RB_ASKNAME) sbuf_printf(sb, ".ask\n"); #ifdef ROOTDEVNAME if (boothowto & RB_DFLTROOT) sbuf_printf(sb, "%s\n", ROOTDEVNAME); #endif if (boothowto & RB_CDROM) { sbuf_printf(sb, "cd9660:/dev/cd0 ro\n"); sbuf_printf(sb, ".timeout 0\n"); sbuf_printf(sb, "cd9660:/dev/cd1 ro\n"); sbuf_printf(sb, ".timeout %d\n", root_mount_timeout); } s = kern_getenv("vfs.root.mountfrom"); if (s != NULL) { opt = kern_getenv("vfs.root.mountfrom.options"); tok = s; error = parse_token(&tok, &mnt); while (!error) { sbuf_printf(sb, "%s %s\n", mnt, (opt != NULL) ? opt : ""); free(mnt, M_TEMP); error = parse_token(&tok, &mnt); } if (opt != NULL) freeenv(opt); freeenv(s); } if (rootdevnames[0] != NULL) sbuf_printf(sb, "%s\n", rootdevnames[0]); if (rootdevnames[1] != NULL) sbuf_printf(sb, "%s\n", rootdevnames[1]); #ifdef ROOTDEVNAME if (!(boothowto & RB_DFLTROOT)) sbuf_printf(sb, "%s\n", ROOTDEVNAME); #endif if (!(boothowto & RB_ASKNAME)) sbuf_printf(sb, ".ask\n"); } static int vfs_mountroot_readconf(struct thread *td, struct sbuf *sb) { static char buf[128]; struct nameidata nd; off_t ofs; ssize_t resid; int error, flags, len; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/.mount.conf", td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); ofs = 0; len = sizeof(buf) - 1; while (1) { error = vn_rdwr(UIO_READ, nd.ni_vp, buf, len, ofs, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) break; if (resid == len) break; buf[len - resid] = 0; sbuf_printf(sb, "%s", buf); ofs += len - resid; } VOP_UNLOCK(nd.ni_vp); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); return (error); } static void vfs_mountroot_wait(void) { struct root_hold_token *h; struct timeval lastfail; int curfail; TSENTER(); curfail = 0; lastfail.tv_sec = 0; ppsratecheck(&lastfail, &curfail, 1); while (1) { g_waitidle(); mtx_lock(&root_holds_mtx); if (TAILQ_EMPTY(&root_holds)) { mtx_unlock(&root_holds_mtx); break; } if (ppsratecheck(&lastfail, &curfail, 1)) { printf("Root mount waiting for:"); TAILQ_FOREACH(h, &root_holds, list) printf(" %s", h->who); printf("\n"); } TSWAIT("root mount"); msleep(&root_holds, &root_holds_mtx, PZERO | PDROP, "roothold", hz); TSUNWAIT("root mount"); } g_waitidle(); TSEXIT(); } static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev) { int delay, timeout; /* * In case of ZFS and NFS we don't have a way to wait for * specific device. Also do the wait if the user forced that * behaviour by setting vfs.root_mount_always_wait=1. */ if (strcmp(fs, "zfs") == 0 || strstr(fs, "nfs") != NULL || dev[0] == '\0' || root_mount_always_wait != 0) { vfs_mountroot_wait(); return (0); } /* * Otherwise, no point in waiting if the device is already there. * Note that we must wait for GEOM to finish reconfiguring itself, * eg for geom_part(4) to finish tasting. */ g_waitidle(); if (parse_mount_dev_present(dev)) return (0); /* * No luck. Let's wait. This code looks weird, but it's that way * to behave exactly as it used to work before. */ vfs_mountroot_wait(); if (parse_mount_dev_present(dev)) return (0); printf("mountroot: waiting for device %s...\n", dev); delay = hz / 10; timeout = root_mount_timeout * hz; do { pause("rmdev", delay); timeout -= delay; } while (timeout > 0 && !parse_mount_dev_present(dev)); if (timeout <= 0) return (ENODEV); return (0); } void vfs_mountroot(void) { struct mount *mp; struct sbuf *sb; struct thread *td; time_t timebase; int error; mtx_assert(&Giant, MA_NOTOWNED); TSENTER(); td = curthread; sb = sbuf_new_auto(); vfs_mountroot_conf0(sb); sbuf_finish(sb); error = vfs_mountroot_devfs(td, &mp); while (!error) { error = vfs_mountroot_parse(sb, mp); if (!error) { vfs_mountroot_shuffle(td, mp); sbuf_clear(sb); error = vfs_mountroot_readconf(td, sb); sbuf_finish(sb); } } sbuf_delete(sb); /* * Iterate over all currently mounted file systems and use * the time stamp found to check and/or initialize the RTC. * Call inittodr() only once and pass it the largest of the * timestamps we encounter. */ timebase = 0; mtx_lock(&mountlist_mtx); mp = TAILQ_FIRST(&mountlist); while (mp != NULL) { if (mp->mnt_time > timebase) timebase = mp->mnt_time; mp = TAILQ_NEXT(mp, mnt_list); } mtx_unlock(&mountlist_mtx); inittodr(timebase); /* Keep prison0's root in sync with the global rootvnode. */ mtx_lock(&prison0.pr_mtx); prison0.pr_root = rootvnode; vref(prison0.pr_root); mtx_unlock(&prison0.pr_mtx); mtx_lock(&root_holds_mtx); atomic_store_rel_int(&root_mount_complete, 1); wakeup(&root_mount_complete); mtx_unlock(&root_holds_mtx); EVENTHANDLER_INVOKE(mountroot); TSEXIT(); } static struct mntarg * parse_mountroot_options(struct mntarg *ma, const char *options) { char *p; char *name, *name_arg; char *val, *val_arg; char *opts; if (options == NULL || options[0] == '\0') return (ma); p = opts = strdup(options, M_MOUNT); if (opts == NULL) { return (ma); } while((name = strsep(&p, ",")) != NULL) { if (name[0] == '\0') break; val = strchr(name, '='); if (val != NULL) { *val = '\0'; ++val; } if( strcmp(name, "rw") == 0 || strcmp(name, "noro") == 0) { /* * The first time we mount the root file system, * we need to mount 'ro', so We need to ignore * 'rw' and 'noro' mount options. */ continue; } name_arg = strdup(name, M_MOUNT); val_arg = NULL; if (val != NULL) val_arg = strdup(val, M_MOUNT); ma = mount_arg(ma, name_arg, val_arg, (val_arg != NULL ? -1 : 0)); } free(opts, M_MOUNT); return (ma); } diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index b8f2bf4f695c..64ff9b55c57e 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -1,5016 +1,5015 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information"); static int kern_chflagsat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, u_long flags, int atflag); static int setfflags(struct thread *td, struct vnode *, u_long); static int getutimes(const struct timeval *, enum uio_seg, struct timespec *); static int getutimens(const struct timespec *, enum uio_seg, struct timespec *, int *); static int setutimes(struct thread *td, struct vnode *, const struct timespec *, int, int); static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td); static int kern_fhlinkat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, fhandle_t *fhp); static int kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, size_t count, struct thread *td); static int kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, const char *path, enum uio_seg segflag); static uint64_t at2cnpflags(u_int at_flags, u_int mask) { u_int64_t res; MPASS((at_flags & (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW)) != (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW)); res = 0; at_flags &= mask; if ((at_flags & AT_RESOLVE_BENEATH) != 0) res |= RBENEATH; if ((at_flags & AT_SYMLINK_FOLLOW) != 0) res |= FOLLOW; /* NOFOLLOW is pseudo flag */ if ((mask & AT_SYMLINK_NOFOLLOW) != 0) { res |= (at_flags & AT_SYMLINK_NOFOLLOW) != 0 ? NOFOLLOW : FOLLOW; } if ((mask & AT_EMPTY_PATH) != 0 && (at_flags & AT_EMPTY_PATH) != 0) res |= EMPTYPATH; return (res); } int kern_sync(struct thread *td) { struct mount *mp, *nmp; int save; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if ((mp->mnt_flag & MNT_RDONLY) == 0 && vn_start_write(NULL, &mp, V_NOWAIT) == 0) { save = curthread_pflags_set(TDP_SYNCIO); vfs_periodic(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT); curthread_pflags_restore(save); vn_finished_write(mp); } mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); return (0); } /* * Sync each mounted filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct sync_args { int dummy; }; #endif /* ARGSUSED */ int sys_sync(struct thread *td, struct sync_args *uap) { return (kern_sync(td)); } /* * Change filesystem quotas. */ #ifndef _SYS_SYSPROTO_H_ struct quotactl_args { char *path; int cmd; int uid; caddr_t arg; }; #endif int sys_quotactl(struct thread *td, struct quotactl_args *uap) { struct mount *mp; struct nameidata nd; int error; AUDIT_ARG_CMD(uap->cmd); AUDIT_ARG_UID(uap->uid); if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS)) return (EPERM); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); mp = nd.ni_vp->v_mount; vfs_ref(mp); vput(nd.ni_vp); error = vfs_busy(mp, 0); if (error != 0) { vfs_rel(mp); return (error); } error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg); /* * Since quota on operation typically needs to open quota * file, the Q_QUOTAON handler needs to unbusy the mount point * before calling into namei. Otherwise, unmount might be * started between two vfs_busy() invocations (first is our, * second is from mount point cross-walk code in lookup()), * causing deadlock. * * Require that Q_QUOTAON handles the vfs_busy() reference on * its own, always returning with ubusied mount point. */ if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON && (uap->cmd >> SUBCMDSHIFT) != Q_QUOTAOFF) vfs_unbusy(mp); vfs_rel(mp); return (error); } /* * Used by statfs conversion routines to scale the block size up if * necessary so that all of the block counts are <= 'max_size'. Note * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero * value of 'n'. */ void statfs_scale_blocks(struct statfs *sf, long max_size) { uint64_t count; int shift; KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__)); /* * Attempt to scale the block counts to give a more accurate * overview to userland of the ratio of free space to used * space. To do this, find the largest block count and compute * a divisor that lets it fit into a signed integer <= max_size. */ if (sf->f_bavail < 0) count = -sf->f_bavail; else count = sf->f_bavail; count = MAX(sf->f_blocks, MAX(sf->f_bfree, count)); if (count <= max_size) return; count >>= flsl(max_size); shift = 0; while (count > 0) { shift++; count >>=1; } sf->f_bsize <<= shift; sf->f_blocks >>= shift; sf->f_bfree >>= shift; sf->f_bavail >>= shift; } static int kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf) { int error; if (mp == NULL) return (EBADF); error = vfs_busy(mp, 0); vfs_rel(mp); if (error != 0) return (error); #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error != 0) goto out; #endif error = VFS_STATFS(mp, buf); if (error != 0) goto out; if (priv_check_cred_vfs_generation(td->td_ucred)) { buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, buf); } out: vfs_unbusy(mp); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct statfs_args { char *path; struct statfs *buf; }; #endif int sys_statfs(struct thread *td, struct statfs_args *uap) { struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); if (error == 0) error = copyout(sfp, uap->buf, sizeof(struct statfs)); free(sfp, M_STATFS); return (error); } int kern_statfs(struct thread *td, const char *path, enum uio_seg pathseg, struct statfs *buf) { struct mount *mp; struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td); error = namei(&nd); if (error != 0) return (error); mp = vfs_ref_from_vp(nd.ni_vp); NDFREE_NOTHING(&nd); vrele(nd.ni_vp); return (kern_do_statfs(td, mp, buf)); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct fstatfs_args { int fd; struct statfs *buf; }; #endif int sys_fstatfs(struct thread *td, struct fstatfs_args *uap) { struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, uap->fd, sfp); if (error == 0) error = copyout(sfp, uap->buf, sizeof(struct statfs)); free(sfp, M_STATFS); return (error); } int kern_fstatfs(struct thread *td, int fd, struct statfs *buf) { struct file *fp; struct mount *mp; struct vnode *vp; int error; AUDIT_ARG_FD(fd); error = getvnode_path(td, fd, &cap_fstatfs_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; #ifdef AUDIT if (AUDITING_TD(td)) { vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); VOP_UNLOCK(vp); } #endif mp = vfs_ref_from_vp(vp); fdrop(fp, td); return (kern_do_statfs(td, mp, buf)); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct getfsstat_args { struct statfs *buf; long bufsize; int mode; }; #endif int sys_getfsstat(struct thread *td, struct getfsstat_args *uap) { size_t count; int error; if (uap->bufsize < 0 || uap->bufsize > SIZE_MAX) return (EINVAL); error = kern_getfsstat(td, &uap->buf, uap->bufsize, &count, UIO_USERSPACE, uap->mode); if (error == 0) td->td_retval[0] = count; return (error); } /* * If (bufsize > 0 && bufseg == UIO_SYSSPACE) * The caller is responsible for freeing memory which will be allocated * in '*buf'. */ int kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize, size_t *countp, enum uio_seg bufseg, int mode) { struct mount *mp, *nmp; struct statfs *sfsp, *sp, *sptmp, *tofree; size_t count, maxcount; int error; switch (mode) { case MNT_WAIT: case MNT_NOWAIT: break; default: if (bufseg == UIO_SYSSPACE) *buf = NULL; return (EINVAL); } restart: maxcount = bufsize / sizeof(struct statfs); if (bufsize == 0) { sfsp = NULL; tofree = NULL; } else if (bufseg == UIO_USERSPACE) { sfsp = *buf; tofree = NULL; } else /* if (bufseg == UIO_SYSSPACE) */ { count = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { count++; } mtx_unlock(&mountlist_mtx); if (maxcount > count) maxcount = count; tofree = sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_STATFS, M_WAITOK); } count = 0; /* * If there is no target buffer they only want the count. * * This could be TAILQ_FOREACH but it is open-coded to match the original * code below. */ if (sfsp == NULL) { mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (prison_canseemount(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #ifdef MAC if (mac_mount_check_stat(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #endif count++; nmp = TAILQ_NEXT(mp, mnt_list); } mtx_unlock(&mountlist_mtx); *countp = count; return (0); } /* * They want the entire thing. * * Short-circuit the corner case of no room for anything, avoids * relocking below. */ if (maxcount < 1) { goto out; } mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (prison_canseemount(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #ifdef MAC if (mac_mount_check_stat(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #endif if (mode == MNT_WAIT) { if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) { /* * If vfs_busy() failed, and MBF_NOWAIT * wasn't passed, then the mp is gone. * Furthermore, because of MBF_MNTLSTLOCK, * the mountlist_mtx was dropped. We have * no other choice than to start over. */ mtx_unlock(&mountlist_mtx); free(tofree, M_STATFS); goto restart; } } else { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } } sp = &mp->mnt_stat; /* * If MNT_NOWAIT is specified, do not refresh * the fsstat cache. */ if (mode != MNT_NOWAIT) { error = VFS_STATFS(mp, sp); if (error != 0) { mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); continue; } } if (priv_check_cred_vfs_generation(td->td_ucred)) { sptmp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); *sptmp = *sp; sptmp->f_fsid.val[0] = sptmp->f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, sptmp); sp = sptmp; } else sptmp = NULL; if (bufseg == UIO_SYSSPACE) { bcopy(sp, sfsp, sizeof(*sp)); free(sptmp, M_STATFS); } else /* if (bufseg == UIO_USERSPACE) */ { error = copyout(sp, sfsp, sizeof(*sp)); free(sptmp, M_STATFS); if (error != 0) { vfs_unbusy(mp); return (error); } } sfsp++; count++; if (count == maxcount) { vfs_unbusy(mp); goto out; } mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); out: *countp = count; return (0); } #ifdef COMPAT_FREEBSD4 /* * Get old format filesystem statistics. */ static void freebsd4_cvtstatfs(struct statfs *, struct ostatfs *); #ifndef _SYS_SYSPROTO_H_ struct freebsd4_statfs_args { char *path; struct ostatfs *buf; }; #endif int freebsd4_statfs(struct thread *td, struct freebsd4_statfs_args *uap) { struct ostatfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); if (error == 0) { freebsd4_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fstatfs_args { int fd; struct ostatfs *buf; }; #endif int freebsd4_fstatfs(struct thread *td, struct freebsd4_fstatfs_args *uap) { struct ostatfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, uap->fd, sfp); if (error == 0) { freebsd4_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_getfsstat_args { struct ostatfs *buf; long bufsize; int mode; }; #endif int freebsd4_getfsstat(struct thread *td, struct freebsd4_getfsstat_args *uap) { struct statfs *buf, *sp; struct ostatfs osb; size_t count, size; int error; if (uap->bufsize < 0) return (EINVAL); count = uap->bufsize / sizeof(struct ostatfs); if (count > SIZE_MAX / sizeof(struct statfs)) return (EINVAL); size = count * sizeof(struct statfs); error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, uap->mode); if (error == 0) td->td_retval[0] = count; if (size != 0) { sp = buf; while (count != 0 && error == 0) { freebsd4_cvtstatfs(sp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); sp++; uap->buf++; count--; } free(buf, M_STATFS); } return (error); } /* * Implement fstatfs() for (NFS) file handles. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fhstatfs_args { struct fhandle *u_fhp; struct ostatfs *buf; }; #endif int freebsd4_fhstatfs(struct thread *td, struct freebsd4_fhstatfs_args *uap) { struct ostatfs osb; struct statfs *sfp; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error != 0) return (error); sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fhstatfs(td, fh, sfp); if (error == 0) { freebsd4_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Convert a new format statfs structure to an old format statfs structure. */ static void freebsd4_cvtstatfs(struct statfs *nsp, struct ostatfs *osp) { statfs_scale_blocks(nsp, LONG_MAX); bzero(osp, sizeof(*osp)); osp->f_bsize = nsp->f_bsize; osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX); osp->f_blocks = nsp->f_blocks; osp->f_bfree = nsp->f_bfree; osp->f_bavail = nsp->f_bavail; osp->f_files = MIN(nsp->f_files, LONG_MAX); osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX); osp->f_owner = nsp->f_owner; osp->f_type = nsp->f_type; osp->f_flags = nsp->f_flags; osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX); osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX); osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX); osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX); strlcpy(osp->f_fstypename, nsp->f_fstypename, MIN(MFSNAMELEN, OMFSNAMELEN)); strlcpy(osp->f_mntonname, nsp->f_mntonname, MIN(MNAMELEN, OMNAMELEN)); strlcpy(osp->f_mntfromname, nsp->f_mntfromname, MIN(MNAMELEN, OMNAMELEN)); osp->f_fsid = nsp->f_fsid; } #endif /* COMPAT_FREEBSD4 */ #if defined(COMPAT_FREEBSD11) /* * Get old format filesystem statistics. */ static void freebsd11_cvtstatfs(struct statfs *, struct freebsd11_statfs *); int freebsd11_statfs(struct thread *td, struct freebsd11_statfs_args *uap) { struct freebsd11_statfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); if (error == 0) { freebsd11_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get filesystem statistics. */ int freebsd11_fstatfs(struct thread *td, struct freebsd11_fstatfs_args *uap) { struct freebsd11_statfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, uap->fd, sfp); if (error == 0) { freebsd11_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get statistics on all filesystems. */ int freebsd11_getfsstat(struct thread *td, struct freebsd11_getfsstat_args *uap) { struct freebsd11_statfs osb; struct statfs *buf, *sp; size_t count, size; int error; count = uap->bufsize / sizeof(struct ostatfs); size = count * sizeof(struct statfs); error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, uap->mode); if (error == 0) td->td_retval[0] = count; if (size > 0) { sp = buf; while (count > 0 && error == 0) { freebsd11_cvtstatfs(sp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); sp++; uap->buf++; count--; } free(buf, M_STATFS); } return (error); } /* * Implement fstatfs() for (NFS) file handles. */ int freebsd11_fhstatfs(struct thread *td, struct freebsd11_fhstatfs_args *uap) { struct freebsd11_statfs osb; struct statfs *sfp; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fhstatfs(td, fh, sfp); if (error == 0) { freebsd11_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Convert a new format statfs structure to an old format statfs structure. */ static void freebsd11_cvtstatfs(struct statfs *nsp, struct freebsd11_statfs *osp) { bzero(osp, sizeof(*osp)); osp->f_version = FREEBSD11_STATFS_VERSION; osp->f_type = nsp->f_type; osp->f_flags = nsp->f_flags; osp->f_bsize = nsp->f_bsize; osp->f_iosize = nsp->f_iosize; osp->f_blocks = nsp->f_blocks; osp->f_bfree = nsp->f_bfree; osp->f_bavail = nsp->f_bavail; osp->f_files = nsp->f_files; osp->f_ffree = nsp->f_ffree; osp->f_syncwrites = nsp->f_syncwrites; osp->f_asyncwrites = nsp->f_asyncwrites; osp->f_syncreads = nsp->f_syncreads; osp->f_asyncreads = nsp->f_asyncreads; osp->f_namemax = nsp->f_namemax; osp->f_owner = nsp->f_owner; osp->f_fsid = nsp->f_fsid; strlcpy(osp->f_fstypename, nsp->f_fstypename, MIN(MFSNAMELEN, sizeof(osp->f_fstypename))); strlcpy(osp->f_mntonname, nsp->f_mntonname, MIN(MNAMELEN, sizeof(osp->f_mntonname))); strlcpy(osp->f_mntfromname, nsp->f_mntfromname, MIN(MNAMELEN, sizeof(osp->f_mntfromname))); } #endif /* COMPAT_FREEBSD11 */ /* * Change current working directory to a given file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchdir_args { int fd; }; #endif int sys_fchdir(struct thread *td, struct fchdir_args *uap) { struct vnode *vp, *tdp; struct mount *mp; struct file *fp; int error; AUDIT_ARG_FD(uap->fd); error = getvnode_path(td, uap->fd, &cap_fchdir_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; vrefact(vp); fdrop(fp, td); vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); error = change_dir(vp, td); while (!error && (mp = vp->v_mountedhere) != NULL) { if (vfs_busy(mp, 0)) continue; error = VFS_ROOT(mp, LK_SHARED, &tdp); vfs_unbusy(mp); if (error != 0) break; vput(vp); vp = tdp; } if (error != 0) { vput(vp); return (error); } VOP_UNLOCK(vp); pwd_chdir(td, vp); return (0); } /* * Change current working directory (``.''). */ #ifndef _SYS_SYSPROTO_H_ struct chdir_args { char *path; }; #endif int sys_chdir(struct thread *td, struct chdir_args *uap) { return (kern_chdir(td, uap->path, UIO_USERSPACE)); } int kern_chdir(struct thread *td, const char *path, enum uio_seg pathseg) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); if ((error = change_dir(nd.ni_vp, td)) != 0) { vput(nd.ni_vp); NDFREE_NOTHING(&nd); return (error); } VOP_UNLOCK(nd.ni_vp); NDFREE_NOTHING(&nd); pwd_chdir(td, nd.ni_vp); return (0); } static int unprivileged_chroot = 0; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_chroot, CTLFLAG_RW, &unprivileged_chroot, 0, "Unprivileged processes can use chroot(2)"); /* * Change notion of root (``/'') directory. */ #ifndef _SYS_SYSPROTO_H_ struct chroot_args { char *path; }; #endif int sys_chroot(struct thread *td, struct chroot_args *uap) { struct nameidata nd; struct proc *p; int error; error = priv_check(td, PRIV_VFS_CHROOT); if (error != 0) { p = td->td_proc; PROC_LOCK(p); if (unprivileged_chroot == 0 || (p->p_flag2 & P2_NO_NEW_PRIVS) == 0) { PROC_UNLOCK(p); return (error); } PROC_UNLOCK(p); } NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error != 0) goto error; error = change_dir(nd.ni_vp, td); if (error != 0) goto e_vunlock; #ifdef MAC error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp); if (error != 0) goto e_vunlock; #endif VOP_UNLOCK(nd.ni_vp); error = pwd_chroot(td, nd.ni_vp); vrele(nd.ni_vp); NDFREE_NOTHING(&nd); return (error); e_vunlock: vput(nd.ni_vp); error: NDFREE_NOTHING(&nd); return (error); } /* * Common routine for chroot and chdir. Callers must provide a locked vnode * instance. */ int change_dir(struct vnode *vp, struct thread *td) { #ifdef MAC int error; #endif ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked"); if (vp->v_type != VDIR) return (ENOTDIR); #ifdef MAC error = mac_vnode_check_chdir(td->td_ucred, vp); if (error != 0) return (error); #endif return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td)); } static __inline void flags_to_rights(int flags, cap_rights_t *rightsp) { if (flags & O_EXEC) { cap_rights_set_one(rightsp, CAP_FEXECVE); if (flags & O_PATH) return; } else { switch ((flags & O_ACCMODE)) { case O_RDONLY: cap_rights_set_one(rightsp, CAP_READ); break; case O_RDWR: cap_rights_set_one(rightsp, CAP_READ); /* FALLTHROUGH */ case O_WRONLY: cap_rights_set_one(rightsp, CAP_WRITE); if (!(flags & (O_APPEND | O_TRUNC))) cap_rights_set_one(rightsp, CAP_SEEK); break; } } if (flags & O_CREAT) cap_rights_set_one(rightsp, CAP_CREATE); if (flags & O_TRUNC) cap_rights_set_one(rightsp, CAP_FTRUNCATE); if (flags & (O_SYNC | O_FSYNC)) cap_rights_set_one(rightsp, CAP_FSYNC); if (flags & (O_EXLOCK | O_SHLOCK)) cap_rights_set_one(rightsp, CAP_FLOCK); } /* * Check permissions, allocate an open file structure, and call the device * open routine if any. */ #ifndef _SYS_SYSPROTO_H_ struct open_args { char *path; int flags; int mode; }; #endif int sys_open(struct thread *td, struct open_args *uap) { return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->flags, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct openat_args { int fd; char *path; int flag; int mode; }; #endif int sys_openat(struct thread *td, struct openat_args *uap) { AUDIT_ARG_FD(uap->fd); return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag, uap->mode)); } int kern_openat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int flags, int mode) { struct proc *p = td->td_proc; struct filedesc *fdp; struct pwddesc *pdp; struct file *fp; struct vnode *vp; struct nameidata nd; cap_rights_t rights; int cmode, error, indx; indx = -1; fdp = p->p_fd; pdp = p->p_pd; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); cap_rights_init_one(&rights, CAP_LOOKUP); flags_to_rights(flags, &rights); /* * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags * may be specified. On the other hand, for O_PATH any mode * except O_EXEC is ignored. */ if ((flags & O_PATH) != 0) { flags &= ~(O_CREAT | O_ACCMODE); } else if ((flags & O_EXEC) != 0) { if (flags & O_ACCMODE) return (EINVAL); } else if ((flags & O_ACCMODE) == O_ACCMODE) { return (EINVAL); } else { flags = FFLAGS(flags); } /* * Allocate a file structure. The descriptor to reference it * is allocated and used by finstall_refed() below. */ error = falloc_noinstall(td, &fp); if (error != 0) return (error); /* Set the flags early so the finit in devfs can pick them up. */ fp->f_flag = flags & FMASK; cmode = ((mode & ~pdp->pd_cmask) & ALLPERMS) & ~S_ISTXT; NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | WANTIOCTLCAPS, pathseg, path, fd, &rights, td); td->td_dupfd = -1; /* XXX check for fdopen */ error = vn_open_cred(&nd, &flags, cmode, VN_OPEN_WANTIOCTLCAPS, td->td_ucred, fp); if (error != 0) { /* * If the vn_open replaced the method vector, something * wonderous happened deep below and we just pass it up * pretending we know what we do. */ if (error == ENXIO && fp->f_ops != &badfileops) { MPASS((flags & O_PATH) == 0); goto success; } /* * Handle special fdopen() case. bleh. * * Don't do this for relative (capability) lookups; we don't * understand exactly what would happen, and we don't think * that it ever should. */ if ((nd.ni_resflags & NIRES_STRICTREL) == 0 && (error == ENODEV || error == ENXIO) && td->td_dupfd >= 0) { error = dupfdopen(td, fdp, td->td_dupfd, flags, error, &indx); if (error == 0) goto success; } goto bad; } td->td_dupfd = 0; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; /* * Store the vnode, for any f_type. Typically, the vnode use * count is decremented by direct call to vn_closefile() for * files that switched type in the cdevsw fdopen() method. */ fp->f_vnode = vp; /* * If the file wasn't claimed by devfs bind it to the normal * vnode operations here. */ if (fp->f_ops == &badfileops) { KASSERT(vp->v_type != VFIFO || (flags & O_PATH) != 0, ("Unexpected fifo fp %p vp %p", fp, vp)); if ((flags & O_PATH) != 0) { finit(fp, (flags & FMASK) | (fp->f_flag & FKQALLOWED), DTYPE_VNODE, NULL, &path_fileops); } else { finit_vnode(fp, flags, NULL, &vnops); } } VOP_UNLOCK(vp); if (flags & O_TRUNC) { error = fo_truncate(fp, 0, td->td_ucred, td); if (error != 0) goto bad; } success: /* * If we haven't already installed the FD (for dupfdopen), do so now. */ if (indx == -1) { struct filecaps *fcaps; #ifdef CAPABILITIES if ((nd.ni_resflags & NIRES_STRICTREL) != 0) fcaps = &nd.ni_filecaps; else #endif fcaps = NULL; error = finstall_refed(td, fp, &indx, flags, fcaps); /* On success finstall_refed() consumes fcaps. */ if (error != 0) { goto bad; } } else { NDFREE_IOCTLCAPS(&nd); falloc_abort(td, fp); } td->td_retval[0] = indx; return (0); bad: KASSERT(indx == -1, ("indx=%d, should be -1", indx)); NDFREE_IOCTLCAPS(&nd); falloc_abort(td, fp); return (error); } #ifdef COMPAT_43 /* * Create a file. */ #ifndef _SYS_SYSPROTO_H_ struct ocreat_args { char *path; int mode; }; #endif int ocreat(struct thread *td, struct ocreat_args *uap) { return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE, O_WRONLY | O_CREAT | O_TRUNC, uap->mode)); } #endif /* COMPAT_43 */ /* * Create a special file. */ #ifndef _SYS_SYSPROTO_H_ struct mknodat_args { int fd; char *path; mode_t mode; dev_t dev; }; #endif int sys_mknodat(struct thread *td, struct mknodat_args *uap) { return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } #if defined(COMPAT_FREEBSD11) int freebsd11_mknod(struct thread *td, struct freebsd11_mknod_args *uap) { return (kern_mknodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } int freebsd11_mknodat(struct thread *td, struct freebsd11_mknodat_args *uap) { return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } #endif /* COMPAT_FREEBSD11 */ int kern_mknodat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int mode, dev_t dev) { struct vnode *vp; struct mount *mp; struct vattr vattr; struct nameidata nd; int error, whiteout = 0; AUDIT_ARG_MODE(mode); AUDIT_ARG_DEV(dev); switch (mode & S_IFMT) { case S_IFCHR: case S_IFBLK: error = priv_check(td, PRIV_VFS_MKNOD_DEV); if (error == 0 && dev == VNOVAL) error = EINVAL; break; case S_IFWHT: error = priv_check(td, PRIV_VFS_MKNOD_WHT); break; case S_IFIFO: if (dev == 0) return (kern_mkfifoat(td, fd, path, pathseg, mode)); /* FALLTHROUGH */ default: error = EINVAL; break; } if (error != 0) return (error); NDPREINIT(&nd); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | NOCACHE, pathseg, path, fd, &cap_mknodat_rights, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(vp); return (EEXIST); } else { VATTR_NULL(&vattr); vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_pd->pd_cmask; vattr.va_rdev = dev; whiteout = 0; switch (mode & S_IFMT) { case S_IFCHR: vattr.va_type = VCHR; break; case S_IFBLK: vattr.va_type = VBLK; break; case S_IFWHT: whiteout = 1; break; default: panic("kern_mknod: invalid mode"); } } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC if (error == 0 && !whiteout) error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); #endif if (error == 0) { if (whiteout) error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); else { error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); } } VOP_VPUT_PAIR(nd.ni_dvp, error == 0 && !whiteout ? &nd.ni_vp : NULL, true); vn_finished_write(mp); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == ERELOOKUP) goto restart; return (error); } /* * Create a named pipe. */ #ifndef _SYS_SYSPROTO_H_ struct mkfifo_args { char *path; int mode; }; #endif int sys_mkfifo(struct thread *td, struct mkfifo_args *uap) { return (kern_mkfifoat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct mkfifoat_args { int fd; char *path; mode_t mode; }; #endif int sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap) { return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkfifoat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int mode) { struct mount *mp; struct vattr vattr; struct nameidata nd; int error; AUDIT_ARG_MODE(mode); NDPREINIT(&nd); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | NOCACHE, pathseg, path, fd, &cap_mkfifoat_rights, td); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VFIFO; vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_pd->pd_cmask; #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error != 0) goto out; #endif error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); #ifdef MAC out: #endif VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true); vn_finished_write(mp); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == ERELOOKUP) goto restart; return (error); } /* * Make a hard file link. */ #ifndef _SYS_SYSPROTO_H_ struct link_args { char *path; char *link; }; #endif int sys_link(struct thread *td, struct link_args *uap) { return (kern_linkat(td, AT_FDCWD, AT_FDCWD, uap->path, uap->link, UIO_USERSPACE, AT_SYMLINK_FOLLOW)); } #ifndef _SYS_SYSPROTO_H_ struct linkat_args { int fd1; char *path1; int fd2; char *path2; int flag; }; #endif int sys_linkat(struct thread *td, struct linkat_args *uap) { int flag; flag = uap->flag; if ((flag & ~(AT_SYMLINK_FOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2, UIO_USERSPACE, flag)); } int hardlink_check_uid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW, &hardlink_check_uid, 0, "Unprivileged processes cannot create hard links to files owned by other " "users"); static int hardlink_check_gid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW, &hardlink_check_gid, 0, "Unprivileged processes cannot create hard links to files owned by other " "groups"); static int can_hardlink(struct vnode *vp, struct ucred *cred) { struct vattr va; int error; if (!hardlink_check_uid && !hardlink_check_gid) return (0); error = VOP_GETATTR(vp, &va, cred); if (error != 0) return (error); if (hardlink_check_uid && cred->cr_uid != va.va_uid) { error = priv_check_cred(cred, PRIV_VFS_LINK); if (error != 0) return (error); } if (hardlink_check_gid && !groupmember(va.va_gid, cred)) { error = priv_check_cred(cred, PRIV_VFS_LINK); if (error != 0) return (error); } return (0); } int kern_linkat(struct thread *td, int fd1, int fd2, const char *path1, const char *path2, enum uio_seg segflag, int flag) { struct nameidata nd; int error; NDPREINIT(&nd); do { bwillwrite(); NDINIT_ATRIGHTS(&nd, LOOKUP, AUDITVNODE1 | at2cnpflags(flag, AT_SYMLINK_FOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH), segflag, path1, fd1, &cap_linkat_source_rights, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); if ((nd.ni_resflags & NIRES_EMPTYPATH) != 0) { error = priv_check(td, PRIV_VFS_FHOPEN); if (error != 0) { vrele(nd.ni_vp); return (error); } } error = kern_linkat_vp(td, nd.ni_vp, fd2, path2, segflag); } while (error == EAGAIN || error == ERELOOKUP); return (error); } static int kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, const char *path, enum uio_seg segflag) { struct nameidata nd; struct mount *mp; int error; if (vp->v_type == VDIR) { vrele(vp); return (EPERM); /* POSIX */ } NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE2 | NOCACHE, segflag, path, fd, &cap_linkat_target_rights, td); if ((error = namei(&nd)) == 0) { if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); vrele(vp); return (EEXIST); } else if (nd.ni_dvp->v_mount != vp->v_mount) { /* * Cross-device link. No need to recheck * vp->v_type, since it cannot change, except * to VBAD. */ NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vrele(vp); return (EXDEV); } else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) { error = can_hardlink(vp, td->td_ucred); #ifdef MAC if (error == 0) error = mac_vnode_check_link(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); #endif if (error != 0) { vput(vp); vput(nd.ni_dvp); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } error = vn_start_write(vp, &mp, V_NOWAIT); if (error != 0) { vput(vp); vput(nd.ni_dvp); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); if (error != 0) return (error); return (EAGAIN); } error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); VOP_VPUT_PAIR(nd.ni_dvp, &vp, true); vn_finished_write(mp); NDFREE(&nd, NDF_ONLY_PNBUF); vp = NULL; } else { vput(nd.ni_dvp); NDFREE(&nd, NDF_ONLY_PNBUF); vrele(vp); return (EAGAIN); } } if (vp != NULL) vrele(vp); return (error); } /* * Make a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct symlink_args { char *path; char *link; }; #endif int sys_symlink(struct thread *td, struct symlink_args *uap) { return (kern_symlinkat(td, uap->path, AT_FDCWD, uap->link, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct symlinkat_args { char *path; int fd; char *path2; }; #endif int sys_symlinkat(struct thread *td, struct symlinkat_args *uap) { return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2, UIO_USERSPACE)); } int kern_symlinkat(struct thread *td, const char *path1, int fd, const char *path2, enum uio_seg segflg) { struct mount *mp; struct vattr vattr; const char *syspath; char *tmppath; struct nameidata nd; int error; if (segflg == UIO_SYSSPACE) { syspath = path1; } else { tmppath = uma_zalloc(namei_zone, M_WAITOK); if ((error = copyinstr(path1, tmppath, MAXPATHLEN, NULL)) != 0) goto out; syspath = tmppath; } AUDIT_ARG_TEXT(syspath); NDPREINIT(&nd); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | NOCACHE, segflg, path2, fd, &cap_symlinkat_rights, td); if ((error = namei(&nd)) != 0) goto out; if (nd.ni_vp) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); nd.ni_vp = NULL; error = EEXIST; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) goto out; goto restart; } VATTR_NULL(&vattr); vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_pd->pd_cmask; #ifdef MAC vattr.va_type = VLNK; error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error != 0) goto out2; #endif error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath); #ifdef MAC out2: #endif VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true); vn_finished_write(mp); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == ERELOOKUP) goto restart; out: if (segflg != UIO_SYSSPACE) uma_zfree(namei_zone, tmppath); return (error); } /* * Delete a whiteout from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct undelete_args { char *path; }; #endif int sys_undelete(struct thread *td, struct undelete_args *uap) { struct mount *mp; struct nameidata nd; int error; NDPREINIT(&nd); restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error != 0) return (error); if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (nd.ni_vp) vrele(nd.ni_vp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); if (error == ERELOOKUP) goto restart; return (error); } /* * Delete a name from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct unlink_args { char *path; }; #endif int sys_unlink(struct thread *td, struct unlink_args *uap) { return (kern_funlinkat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE, 0, 0)); } static int kern_funlinkat_ex(struct thread *td, int dfd, const char *path, int fd, int flag, enum uio_seg pathseg, ino_t oldinum) { if ((flag & ~(AT_REMOVEDIR | AT_RESOLVE_BENEATH)) != 0) return (EINVAL); if ((flag & AT_REMOVEDIR) != 0) return (kern_frmdirat(td, dfd, path, fd, UIO_USERSPACE, 0)); return (kern_funlinkat(td, dfd, path, fd, UIO_USERSPACE, 0, 0)); } #ifndef _SYS_SYSPROTO_H_ struct unlinkat_args { int fd; char *path; int flag; }; #endif int sys_unlinkat(struct thread *td, struct unlinkat_args *uap) { return (kern_funlinkat_ex(td, uap->fd, uap->path, FD_NONE, uap->flag, UIO_USERSPACE, 0)); } #ifndef _SYS_SYSPROTO_H_ struct funlinkat_args { int dfd; const char *path; int fd; int flag; }; #endif int sys_funlinkat(struct thread *td, struct funlinkat_args *uap) { return (kern_funlinkat_ex(td, uap->dfd, uap->path, uap->fd, uap->flag, UIO_USERSPACE, 0)); } int kern_funlinkat(struct thread *td, int dfd, const char *path, int fd, enum uio_seg pathseg, int flag, ino_t oldinum) { struct mount *mp; struct file *fp; struct vnode *vp; struct nameidata nd; struct stat sb; int error; fp = NULL; if (fd != FD_NONE) { error = getvnode_path(td, fd, &cap_no_rights, &fp); if (error != 0) return (error); } NDPREINIT(&nd); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 | at2cnpflags(flag, AT_RESOLVE_BENEATH), pathseg, path, dfd, &cap_unlinkat_rights, td); if ((error = namei(&nd)) != 0) { if (error == EINVAL) error = EPERM; goto fdout; } vp = nd.ni_vp; if (vp->v_type == VDIR && oldinum == 0) { error = EPERM; /* POSIX */ } else if (oldinum != 0 && ((error = VOP_STAT(vp, &sb, td->td_ucred, NOCRED, td)) == 0) && sb.st_ino != oldinum) { error = EIDRM; /* Identifier removed */ } else if (fp != NULL && fp->f_vnode != vp) { if (VN_IS_DOOMED(fp->f_vnode)) error = EBADF; else error = EDEADLK; } else { /* * The root of a mounted filesystem cannot be deleted. * * XXX: can this only be a VDIR case? */ if (vp->v_vflag & VV_ROOT) error = EBUSY; } if (error == 0) { if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) { goto fdout; } goto restart; } #ifdef MAC error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error != 0) goto out; #endif vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); #ifdef MAC out: #endif vn_finished_write(mp); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); if (error == ERELOOKUP) goto restart; fdout: if (fp != NULL) fdrop(fp, td); return (error); } /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct lseek_args { int fd; int pad; off_t offset; int whence; }; #endif int sys_lseek(struct thread *td, struct lseek_args *uap) { return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); } int kern_lseek(struct thread *td, int fd, off_t offset, int whence) { struct file *fp; int error; AUDIT_ARG_FD(fd); error = fget(td, fd, &cap_seek_rights, &fp); if (error != 0) return (error); error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ? fo_seek(fp, offset, whence, td) : ESPIPE; fdrop(fp, td); return (error); } #if defined(COMPAT_43) /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct olseek_args { int fd; long offset; int whence; }; #endif int olseek(struct thread *td, struct olseek_args *uap) { return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD6) /* Version with the 'pad' argument */ int freebsd6_lseek(struct thread *td, struct freebsd6_lseek_args *uap) { return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); } #endif /* * Check access permissions using passed credentials. */ static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td) { accmode_t accmode; int error; /* Flags == 0 means only check for existence. */ if (user_flags == 0) return (0); accmode = 0; if (user_flags & R_OK) accmode |= VREAD; if (user_flags & W_OK) accmode |= VWRITE; if (user_flags & X_OK) accmode |= VEXEC; #ifdef MAC error = mac_vnode_check_access(cred, vp, accmode); if (error != 0) return (error); #endif if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) error = VOP_ACCESS(vp, accmode, cred, td); return (error); } /* * Check access permissions using "real" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct access_args { char *path; int amode; }; #endif int sys_access(struct thread *td, struct access_args *uap) { return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE, 0, uap->amode)); } #ifndef _SYS_SYSPROTO_H_ struct faccessat_args { int dirfd; char *path; int amode; int flag; } #endif int sys_faccessat(struct thread *td, struct faccessat_args *uap) { return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag, uap->amode)); } int kern_accessat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int flag, int amode) { struct ucred *cred, *usecred; struct vnode *vp; struct nameidata nd; int error; if ((flag & ~(AT_EACCESS | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); if (amode != F_OK && (amode & ~(R_OK | W_OK | X_OK)) != 0) return (EINVAL); /* * Create and modify a temporary credential instead of one that * is potentially shared (if we need one). */ cred = td->td_ucred; if ((flag & AT_EACCESS) == 0 && ((cred->cr_uid != cred->cr_ruid || cred->cr_rgid != cred->cr_groups[0]))) { usecred = crdup(cred); usecred->cr_uid = cred->cr_ruid; usecred->cr_groups[0] = cred->cr_rgid; td->td_ucred = usecred; } else usecred = cred; AUDIT_ARG_VALUE(amode); NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | at2cnpflags(flag, AT_RESOLVE_BENEATH | AT_EMPTY_PATH), pathseg, path, fd, &cap_fstat_rights, td); if ((error = namei(&nd)) != 0) goto out; vp = nd.ni_vp; error = vn_access(vp, amode, usecred, td); NDFREE_NOTHING(&nd); vput(vp); out: if (usecred != cred) { td->td_ucred = cred; crfree(usecred); } return (error); } /* * Check access permissions using "effective" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct eaccess_args { char *path; int amode; }; #endif int sys_eaccess(struct thread *td, struct eaccess_args *uap) { return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE, AT_EACCESS, uap->amode)); } #if defined(COMPAT_43) /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct ostat_args { char *path; struct ostat *ub; }; #endif int ostat(struct thread *td, struct ostat_args *uap) { struct stat sb; struct ostat osb; int error; error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); cvtstat(&sb, &osb); return (copyout(&osb, uap->ub, sizeof (osb))); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct olstat_args { char *path; struct ostat *ub; }; #endif int olstat(struct thread *td, struct olstat_args *uap) { struct stat sb; struct ostat osb; int error; error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); cvtstat(&sb, &osb); return (copyout(&osb, uap->ub, sizeof (osb))); } /* * Convert from an old to a new stat structure. * XXX: many values are blindly truncated. */ void cvtstat(struct stat *st, struct ostat *ost) { bzero(ost, sizeof(*ost)); ost->st_dev = st->st_dev; ost->st_ino = st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; ost->st_size = MIN(st->st_size, INT32_MAX); ost->st_atim = st->st_atim; ost->st_mtim = st->st_mtim; ost->st_ctim = st->st_ctim; ost->st_blksize = st->st_blksize; ost->st_blocks = st->st_blocks; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } #endif /* COMPAT_43 */ #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11) int ino64_trunc_error; SYSCTL_INT(_vfs, OID_AUTO, ino64_trunc_error, CTLFLAG_RW, &ino64_trunc_error, 0, "Error on truncation of device, file or inode number, or link count"); int freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost) { ost->st_dev = st->st_dev; if (ost->st_dev != st->st_dev) { switch (ino64_trunc_error) { default: /* * Since dev_t is almost raw, don't clamp to the * maximum for case 2, but ignore the error. */ break; case 1: return (EOVERFLOW); } } ost->st_ino = st->st_ino; if (ost->st_ino != st->st_ino) { switch (ino64_trunc_error) { default: case 0: break; case 1: return (EOVERFLOW); case 2: ost->st_ino = UINT32_MAX; break; } } ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; if (ost->st_nlink != st->st_nlink) { switch (ino64_trunc_error) { default: case 0: break; case 1: return (EOVERFLOW); case 2: ost->st_nlink = UINT16_MAX; break; } } ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; if (ost->st_rdev != st->st_rdev) { switch (ino64_trunc_error) { default: break; case 1: return (EOVERFLOW); } } ost->st_atim = st->st_atim; ost->st_mtim = st->st_mtim; ost->st_ctim = st->st_ctim; ost->st_size = st->st_size; ost->st_blocks = st->st_blocks; ost->st_blksize = st->st_blksize; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; ost->st_lspare = 0; ost->st_birthtim = st->st_birthtim; bzero((char *)&ost->st_birthtim + sizeof(ost->st_birthtim), sizeof(*ost) - offsetof(struct freebsd11_stat, st_birthtim) - sizeof(ost->st_birthtim)); return (0); } int freebsd11_stat(struct thread *td, struct freebsd11_stat_args* uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->ub, sizeof(osb)); return (error); } int freebsd11_lstat(struct thread *td, struct freebsd11_lstat_args* uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->ub, sizeof(osb)); return (error); } int freebsd11_fhstat(struct thread *td, struct freebsd11_fhstat_args* uap) { struct fhandle fh; struct stat sb; struct freebsd11_stat osb; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error != 0) return (error); error = kern_fhstat(td, fh, &sb); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->sb, sizeof(osb)); return (error); } int freebsd11_fstatat(struct thread *td, struct freebsd11_fstatat_args* uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->buf, sizeof(osb)); return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Get file status */ #ifndef _SYS_SYSPROTO_H_ struct fstatat_args { int fd; char *path; struct stat *buf; int flag; } #endif int sys_fstatat(struct thread *td, struct fstatat_args *uap) { struct stat sb; int error; error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE, &sb, NULL); if (error == 0) error = copyout(&sb, uap->buf, sizeof (sb)); return (error); } int kern_statat(struct thread *td, int flag, int fd, const char *path, enum uio_seg pathseg, struct stat *sbp, void (*hook)(struct vnode *vp, struct stat *sbp)) { struct nameidata nd; int error; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_RESOLVE_BENEATH | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd, &cap_fstat_rights, td); if ((error = namei(&nd)) != 0) { if (error == ENOTDIR && (nd.ni_resflags & NIRES_EMPTYPATH) != 0) error = kern_fstat(td, fd, sbp); return (error); } error = VOP_STAT(nd.ni_vp, sbp, td->td_ucred, NOCRED, td); if (error == 0) { if (__predict_false(hook != NULL)) hook(nd.ni_vp, sbp); } NDFREE_NOTHING(&nd); vput(nd.ni_vp); #ifdef __STAT_TIME_T_EXT sbp->st_atim_ext = 0; sbp->st_mtim_ext = 0; sbp->st_ctim_ext = 0; sbp->st_btim_ext = 0; #endif #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrstat_error(sbp, error); #endif return (error); } #if defined(COMPAT_FREEBSD11) /* * Implementation of the NetBSD [l]stat() functions. */ void freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb) { bzero(nsb, sizeof(*nsb)); nsb->st_dev = sb->st_dev; nsb->st_ino = sb->st_ino; nsb->st_mode = sb->st_mode; nsb->st_nlink = sb->st_nlink; nsb->st_uid = sb->st_uid; nsb->st_gid = sb->st_gid; nsb->st_rdev = sb->st_rdev; nsb->st_atim = sb->st_atim; nsb->st_mtim = sb->st_mtim; nsb->st_ctim = sb->st_ctim; nsb->st_size = sb->st_size; nsb->st_blocks = sb->st_blocks; nsb->st_blksize = sb->st_blksize; nsb->st_flags = sb->st_flags; nsb->st_gen = sb->st_gen; nsb->st_birthtim = sb->st_birthtim; } #ifndef _SYS_SYSPROTO_H_ struct freebsd11_nstat_args { char *path; struct nstat *ub; }; #endif int freebsd11_nstat(struct thread *td, struct freebsd11_nstat_args *uap) { struct stat sb; struct nstat nsb; int error; error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); freebsd11_cvtnstat(&sb, &nsb); return (copyout(&nsb, uap->ub, sizeof (nsb))); } /* * NetBSD lstat. Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd11_nlstat_args { char *path; struct nstat *ub; }; #endif int freebsd11_nlstat(struct thread *td, struct freebsd11_nlstat_args *uap) { struct stat sb; struct nstat nsb; int error; error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); freebsd11_cvtnstat(&sb, &nsb); return (copyout(&nsb, uap->ub, sizeof (nsb))); } #endif /* COMPAT_FREEBSD11 */ /* * Get configurable pathname variables. */ #ifndef _SYS_SYSPROTO_H_ struct pathconf_args { char *path; int name; }; #endif int sys_pathconf(struct thread *td, struct pathconf_args *uap) { long value; int error; error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW, &value); if (error == 0) td->td_retval[0] = value; return (error); } #ifndef _SYS_SYSPROTO_H_ struct lpathconf_args { char *path; int name; }; #endif int sys_lpathconf(struct thread *td, struct lpathconf_args *uap) { long value; int error; error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, NOFOLLOW, &value); if (error == 0) td->td_retval[0] = value; return (error); } int kern_pathconf(struct thread *td, const char *path, enum uio_seg pathseg, int name, u_long flags, long *valuep) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); error = VOP_PATHCONF(nd.ni_vp, name, valuep); vput(nd.ni_vp); return (error); } /* * Return target name of a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct readlink_args { char *path; char *buf; size_t count; }; #endif int sys_readlink(struct thread *td, struct readlink_args *uap) { return (kern_readlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->buf, UIO_USERSPACE, uap->count)); } #ifndef _SYS_SYSPROTO_H_ struct readlinkat_args { int fd; char *path; char *buf; size_t bufsize; }; #endif int sys_readlinkat(struct thread *td, struct readlinkat_args *uap) { return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE, uap->buf, UIO_USERSPACE, uap->bufsize)); } int kern_readlinkat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, char *buf, enum uio_seg bufseg, size_t count) { struct vnode *vp; struct nameidata nd; int error; if (count > IOSIZE_MAX) return (EINVAL); NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | EMPTYPATH, pathseg, path, fd, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); vp = nd.ni_vp; error = kern_readlink_vp(vp, buf, bufseg, count, td); vput(vp); return (error); } /* * Helper function to readlink from a vnode */ static int kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, size_t count, struct thread *td) { struct iovec aiov; struct uio auio; int error; ASSERT_VOP_LOCKED(vp, "kern_readlink_vp(): vp not locked"); #ifdef MAC error = mac_vnode_check_readlink(td->td_ucred, vp); if (error != 0) return (error); #endif if (vp->v_type != VLNK && (vp->v_vflag & VV_READLINK) == 0) return (EINVAL); aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = bufseg; auio.uio_td = td; auio.uio_resid = count; error = VOP_READLINK(vp, &auio, td->td_ucred); td->td_retval[0] = count - auio.uio_resid; return (error); } /* * Common implementation code for chflags() and fchflags(). */ static int setfflags(struct thread *td, struct vnode *vp, u_long flags) { struct mount *mp; struct vattr vattr; int error; /* We can't support the value matching VNOVAL. */ if (flags == VNOVAL) return (EOPNOTSUPP); /* * Prevent non-root users from setting flags on devices. When * a device is reused, users can retain ownership of the device * if they are allowed to set flags and programs assume that * chown can't fail when done as root. */ if (vp->v_type == VCHR || vp->v_type == VBLK) { error = priv_check(td, PRIV_VFS_CHFLAGS_DEV); if (error != 0) return (error); } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VATTR_NULL(&vattr); vattr.va_flags = flags; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Change flags of a file given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chflags_args { const char *path; u_long flags; }; #endif int sys_chflags(struct thread *td, struct chflags_args *uap) { return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->flags, 0)); } #ifndef _SYS_SYSPROTO_H_ struct chflagsat_args { int fd; const char *path; u_long flags; int atflag; } #endif int sys_chflagsat(struct thread *td, struct chflagsat_args *uap) { if ((uap->atflag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); return (kern_chflagsat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flags, uap->atflag)); } /* * Same as chflags() but doesn't follow symlinks. */ #ifndef _SYS_SYSPROTO_H_ struct lchflags_args { const char *path; u_long flags; }; #endif int sys_lchflags(struct thread *td, struct lchflags_args *uap) { return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->flags, AT_SYMLINK_NOFOLLOW)); } static int kern_chflagsat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, u_long flags, int atflag) { struct nameidata nd; int error; AUDIT_ARG_FFLAGS(flags); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(atflag, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path, fd, &cap_fchflags_rights, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); error = setfflags(td, nd.ni_vp, flags); vrele(nd.ni_vp); return (error); } /* * Change flags of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchflags_args { int fd; u_long flags; }; #endif int sys_fchflags(struct thread *td, struct fchflags_args *uap) { struct file *fp; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_FFLAGS(uap->flags); error = getvnode(td, uap->fd, &cap_fchflags_rights, &fp); if (error != 0) return (error); #ifdef AUDIT if (AUDITING_TD(td)) { vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(fp->f_vnode); VOP_UNLOCK(fp->f_vnode); } #endif error = setfflags(td, fp->f_vnode, uap->flags); fdrop(fp, td); return (error); } /* * Common implementation code for chmod(), lchmod() and fchmod(). */ int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode) { struct mount *mp; struct vattr vattr; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VATTR_NULL(&vattr); vattr.va_mode = mode & ALLPERMS; #ifdef MAC error = mac_vnode_check_setmode(cred, vp, vattr.va_mode); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Change mode of a file given path name. */ #ifndef _SYS_SYSPROTO_H_ struct chmod_args { char *path; int mode; }; #endif int sys_chmod(struct thread *td, struct chmod_args *uap) { return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode, 0)); } #ifndef _SYS_SYSPROTO_H_ struct fchmodat_args { int dirfd; char *path; mode_t mode; int flag; } #endif int sys_fchmodat(struct thread *td, struct fchmodat_args *uap) { if ((uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); return (kern_fchmodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, uap->flag)); } /* * Change mode of a file given path name (don't follow links.) */ #ifndef _SYS_SYSPROTO_H_ struct lchmod_args { char *path; int mode; }; #endif int sys_lchmod(struct thread *td, struct lchmod_args *uap) { return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode, AT_SYMLINK_NOFOLLOW)); } int kern_fchmodat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, mode_t mode, int flag) { struct nameidata nd; int error; AUDIT_ARG_MODE(mode); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path, fd, &cap_fchmod_rights, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); error = setfmode(td, td->td_ucred, nd.ni_vp, mode); vrele(nd.ni_vp); return (error); } /* * Change mode of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchmod_args { int fd; int mode; }; #endif int sys_fchmod(struct thread *td, struct fchmod_args *uap) { struct file *fp; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_MODE(uap->mode); error = fget(td, uap->fd, &cap_fchmod_rights, &fp); if (error != 0) return (error); error = fo_chmod(fp, uap->mode, td->td_ucred, td); fdrop(fp, td); return (error); } /* * Common implementation for chown(), lchown(), and fchown() */ int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid, gid_t gid) { struct mount *mp; struct vattr vattr; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VATTR_NULL(&vattr); vattr.va_uid = uid; vattr.va_gid = gid; #ifdef MAC error = mac_vnode_check_setowner(cred, vp, vattr.va_uid, vattr.va_gid); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Set ownership given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chown_args { char *path; int uid; int gid; }; #endif int sys_chown(struct thread *td, struct chown_args *uap) { return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid, uap->gid, 0)); } #ifndef _SYS_SYSPROTO_H_ struct fchownat_args { int fd; const char * path; uid_t uid; gid_t gid; int flag; }; #endif int sys_fchownat(struct thread *td, struct fchownat_args *uap) { if ((uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid, uap->gid, uap->flag)); } int kern_fchownat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int uid, int gid, int flag) { struct nameidata nd; int error; AUDIT_ARG_OWNER(uid, gid); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path, fd, &cap_fchown_rights, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid); vrele(nd.ni_vp); return (error); } /* * Set ownership given a path name, do not cross symlinks. */ #ifndef _SYS_SYSPROTO_H_ struct lchown_args { char *path; int uid; int gid; }; #endif int sys_lchown(struct thread *td, struct lchown_args *uap) { return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid, uap->gid, AT_SYMLINK_NOFOLLOW)); } /* * Set ownership given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchown_args { int fd; int uid; int gid; }; #endif int sys_fchown(struct thread *td, struct fchown_args *uap) { struct file *fp; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_OWNER(uap->uid, uap->gid); error = fget(td, uap->fd, &cap_fchown_rights, &fp); if (error != 0) return (error); error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td); fdrop(fp, td); return (error); } /* * Common implementation code for utimes(), lutimes(), and futimes(). */ static int getutimes(const struct timeval *usrtvp, enum uio_seg tvpseg, struct timespec *tsp) { struct timeval tv[2]; const struct timeval *tvp; int error; if (usrtvp == NULL) { vfs_timestamp(&tsp[0]); tsp[1] = tsp[0]; } else { if (tvpseg == UIO_SYSSPACE) { tvp = usrtvp; } else { if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0) return (error); tvp = tv; } if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 || tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000) return (EINVAL); TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]); TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]); } return (0); } /* * Common implementation code for futimens(), utimensat(). */ #define UTIMENS_NULL 0x1 #define UTIMENS_EXIT 0x2 static int getutimens(const struct timespec *usrtsp, enum uio_seg tspseg, struct timespec *tsp, int *retflags) { struct timespec tsnow; int error; vfs_timestamp(&tsnow); *retflags = 0; if (usrtsp == NULL) { tsp[0] = tsnow; tsp[1] = tsnow; *retflags |= UTIMENS_NULL; return (0); } if (tspseg == UIO_SYSSPACE) { tsp[0] = usrtsp[0]; tsp[1] = usrtsp[1]; } else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0) return (error); if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT) *retflags |= UTIMENS_EXIT; if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW) *retflags |= UTIMENS_NULL; if (tsp[0].tv_nsec == UTIME_OMIT) tsp[0].tv_sec = VNOVAL; else if (tsp[0].tv_nsec == UTIME_NOW) tsp[0] = tsnow; else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L) return (EINVAL); if (tsp[1].tv_nsec == UTIME_OMIT) tsp[1].tv_sec = VNOVAL; else if (tsp[1].tv_nsec == UTIME_NOW) tsp[1] = tsnow; else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L) return (EINVAL); return (0); } /* * Common implementation code for utimes(), lutimes(), futimes(), futimens(), * and utimensat(). */ static int setutimes(struct thread *td, struct vnode *vp, const struct timespec *ts, int numtimes, int nullflag) { struct mount *mp; struct vattr vattr; int error; bool setbirthtime; setbirthtime = false; vattr.va_birthtime.tv_sec = VNOVAL; vattr.va_birthtime.tv_nsec = 0; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (numtimes < 3 && VOP_GETATTR(vp, &vattr, td->td_ucred) == 0 && timespeccmp(&ts[1], &vattr.va_birthtime, < )) setbirthtime = true; VATTR_NULL(&vattr); vattr.va_atime = ts[0]; vattr.va_mtime = ts[1]; if (setbirthtime) vattr.va_birthtime = ts[1]; if (numtimes > 2) vattr.va_birthtime = ts[2]; if (nullflag) vattr.va_vaflags |= VA_UTIMES_NULL; #ifdef MAC error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime, vattr.va_mtime); #endif if (error == 0) error = VOP_SETATTR(vp, &vattr, td->td_ucred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct utimes_args { char *path; struct timeval *tptr; }; #endif int sys_utimes(struct thread *td, struct utimes_args *uap) { return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct futimesat_args { int fd; const char * path; const struct timeval * times; }; #endif int sys_futimesat(struct thread *td, struct futimesat_args *uap) { return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE, uap->times, UIO_USERSPACE)); } int kern_utimesat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct nameidata nd; struct timespec ts[2]; int error; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd, &cap_futimes_rights, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct lutimes_args { char *path; struct timeval *tptr; }; #endif int sys_lutimes(struct thread *td, struct lutimes_args *uap) { return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } int kern_lutimes(struct thread *td, const char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct nameidata nd; int error; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct futimes_args { int fd; struct timeval *tptr; }; #endif int sys_futimes(struct thread *td, struct futimes_args *uap) { return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE)); } int kern_futimes(struct thread *td, int fd, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct file *fp; int error; AUDIT_ARG_FD(fd); error = getutimes(tptr, tptrseg, ts); if (error != 0) return (error); error = getvnode(td, fd, &cap_futimes_rights, &fp); if (error != 0) return (error); #ifdef AUDIT if (AUDITING_TD(td)) { vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(fp->f_vnode); VOP_UNLOCK(fp->f_vnode); } #endif error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL); fdrop(fp, td); return (error); } int sys_futimens(struct thread *td, struct futimens_args *uap) { return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE)); } int kern_futimens(struct thread *td, int fd, struct timespec *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct file *fp; int error, flags; AUDIT_ARG_FD(fd); error = getutimens(tptr, tptrseg, ts, &flags); if (error != 0) return (error); if (flags & UTIMENS_EXIT) return (0); error = getvnode(td, fd, &cap_futimes_rights, &fp); if (error != 0) return (error); #ifdef AUDIT if (AUDITING_TD(td)) { vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(fp->f_vnode); VOP_UNLOCK(fp->f_vnode); } #endif error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL); fdrop(fp, td); return (error); } int sys_utimensat(struct thread *td, struct utimensat_args *uap) { return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE, uap->times, UIO_USERSPACE, uap->flag)); } int kern_utimensat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, struct timespec *tptr, enum uio_seg tptrseg, int flag) { struct nameidata nd; struct timespec ts[2]; int error, flags; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0) return (error); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path, fd, &cap_futimes_rights, td); if ((error = namei(&nd)) != 0) return (error); /* * We are allowed to call namei() regardless of 2xUTIME_OMIT. * POSIX states: * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected." * "Search permission is denied by a component of the path prefix." */ NDFREE_NOTHING(&nd); if ((flags & UTIMENS_EXIT) == 0) error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL); vrele(nd.ni_vp); return (error); } /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct truncate_args { char *path; int pad; off_t length; }; #endif int sys_truncate(struct thread *td, struct truncate_args *uap) { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } int kern_truncate(struct thread *td, const char *path, enum uio_seg pathseg, off_t length) { struct mount *mp; struct vnode *vp; void *rl_cookie; struct vattr vattr; struct nameidata nd; int error; if (length < 0) return (EINVAL); NDPREINIT(&nd); retry: NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vn_rangelock_unlock(vp, rl_cookie); vrele(vp); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type == VDIR) error = EISDIR; #ifdef MAC else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) { } #endif else if ((error = vn_writechk(vp)) == 0 && (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) { VATTR_NULL(&vattr); vattr.va_size = length; error = VOP_SETATTR(vp, &vattr, td->td_ucred); } VOP_UNLOCK(vp); vn_finished_write(mp); vn_rangelock_unlock(vp, rl_cookie); vrele(vp); if (error == ERELOOKUP) goto retry; return (error); } #if defined(COMPAT_43) /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct otruncate_args { char *path; long length; }; #endif int otruncate(struct thread *td, struct otruncate_args *uap) { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD6) /* Versions with the pad argument */ int freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap) { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } int freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap) { return (kern_ftruncate(td, uap->fd, uap->length)); } #endif int kern_fsync(struct thread *td, int fd, bool fullsync) { struct vnode *vp; struct mount *mp; struct file *fp; int error; AUDIT_ARG_FD(fd); error = getvnode(td, fd, &cap_fsync_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; #if 0 if (!fullsync) /* XXXKIB: compete outstanding aio writes */; #endif retry: error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) goto drop; vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY); AUDIT_ARG_VNODE1(vp); if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td); VOP_UNLOCK(vp); vn_finished_write(mp); if (error == ERELOOKUP) goto retry; drop: fdrop(fp, td); return (error); } /* * Sync an open file. */ #ifndef _SYS_SYSPROTO_H_ struct fsync_args { int fd; }; #endif int sys_fsync(struct thread *td, struct fsync_args *uap) { return (kern_fsync(td, uap->fd, true)); } int sys_fdatasync(struct thread *td, struct fdatasync_args *uap) { return (kern_fsync(td, uap->fd, false)); } /* * Rename files. Source and destination must either both be directories, or * both not be directories. If target is a directory, it must be empty. */ #ifndef _SYS_SYSPROTO_H_ struct rename_args { char *from; char *to; }; #endif int sys_rename(struct thread *td, struct rename_args *uap) { return (kern_renameat(td, AT_FDCWD, uap->from, AT_FDCWD, uap->to, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct renameat_args { int oldfd; char *old; int newfd; char *new; }; #endif int sys_renameat(struct thread *td, struct renameat_args *uap) { return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new, UIO_USERSPACE)); } #ifdef MAC static int kern_renameat_mac(struct thread *td, int oldfd, const char *old, int newfd, const char *new, enum uio_seg pathseg, struct nameidata *fromnd) { int error; NDINIT_ATRIGHTS(fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | AUDITVNODE1, pathseg, old, oldfd, &cap_renameat_source_rights, td); if ((error = namei(fromnd)) != 0) return (error); error = mac_vnode_check_rename_from(td->td_ucred, fromnd->ni_dvp, fromnd->ni_vp, &fromnd->ni_cnd); VOP_UNLOCK(fromnd->ni_dvp); if (fromnd->ni_dvp != fromnd->ni_vp) VOP_UNLOCK(fromnd->ni_vp); if (error != 0) { NDFREE(fromnd, NDF_ONLY_PNBUF); vrele(fromnd->ni_dvp); vrele(fromnd->ni_vp); if (fromnd->ni_startdir) vrele(fromnd->ni_startdir); } return (error); } #endif int kern_renameat(struct thread *td, int oldfd, const char *old, int newfd, const char *new, enum uio_seg pathseg) { struct mount *mp = NULL; struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; u_int64_t tondflags; int error; again: bwillwrite(); #ifdef MAC if (mac_vnode_check_rename_from_enabled()) { error = kern_renameat_mac(td, oldfd, old, newfd, new, pathseg, &fromnd); if (error != 0) return (error); } else { #endif NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1, pathseg, old, oldfd, &cap_renameat_source_rights, td); if ((error = namei(&fromnd)) != 0) return (error); #ifdef MAC } #endif fvp = fromnd.ni_vp; tondflags = LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNODE2; if (fromnd.ni_vp->v_type == VDIR) tondflags |= WILLBEDIR; NDINIT_ATRIGHTS(&tond, RENAME, tondflags, pathseg, new, newfd, &cap_renameat_target_rights, td); if ((error = namei(&tond)) != 0) { /* Translate error code for rename("dir1", "dir2/."). */ if (error == EISDIR && fvp->v_type == VDIR) error = EINVAL; NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } tdvp = tond.ni_dvp; tvp = tond.ni_vp; error = vn_start_write(fvp, &mp, V_NOWAIT); if (error != 0) { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tvp != NULL) vput(tvp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); vrele(fromnd.ni_dvp); vrele(fvp); vrele(tond.ni_startdir); if (fromnd.ni_startdir != NULL) vrele(fromnd.ni_startdir); error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); if (error != 0) return (error); goto again; } if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { error = ENOTDIR; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { error = EISDIR; goto out; } #ifdef CAPABILITIES if (newfd != AT_FDCWD && (tond.ni_resflags & NIRES_ABS) == 0) { /* * If the target already exists we require CAP_UNLINKAT * from 'newfd', when newfd was used for the lookup. */ error = cap_check(&tond.ni_filecaps.fc_rights, &cap_unlinkat_rights); if (error != 0) goto out; } #endif } if (fvp == tdvp) { error = EINVAL; goto out; } /* * If the source is the same as the destination (that is, if they * are links to the same vnode), then there is nothing to do. */ if (fvp == tvp) error = ERESTART; #ifdef MAC else error = mac_vnode_check_rename_to(td->td_ucred, tdvp, tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd); #endif out: if (error == 0) { error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); } else { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tvp != NULL) vput(tvp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); vrele(fromnd.ni_dvp); vrele(fvp); } vrele(tond.ni_startdir); vn_finished_write(mp); out1: if (fromnd.ni_startdir) vrele(fromnd.ni_startdir); if (error == ERESTART) return (0); if (error == ERELOOKUP) goto again; return (error); } /* * Make a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct mkdir_args { char *path; int mode; }; #endif int sys_mkdir(struct thread *td, struct mkdir_args *uap) { return (kern_mkdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct mkdirat_args { int fd; char *path; mode_t mode; }; #endif int sys_mkdirat(struct thread *td, struct mkdirat_args *uap) { return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkdirat(struct thread *td, int fd, const char *path, enum uio_seg segflg, int mode) { struct mount *mp; struct vattr vattr; struct nameidata nd; int error; AUDIT_ARG_MODE(mode); NDPREINIT(&nd); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | NC_NOMAKEENTRY | NC_KEEPPOSENTRY | FAILIFEXISTS | WILLBEDIR, segflg, path, fd, &cap_mkdirat_rights, td); if ((error = namei(&nd)) != 0) return (error); if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VDIR; vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_pd->pd_cmask; #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error != 0) goto out; #endif error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); #ifdef MAC out: #endif NDFREE(&nd, NDF_ONLY_PNBUF); VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true); vn_finished_write(mp); if (error == ERELOOKUP) goto restart; return (error); } /* * Remove a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct rmdir_args { char *path; }; #endif int sys_rmdir(struct thread *td, struct rmdir_args *uap) { return (kern_frmdirat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE, 0)); } int kern_frmdirat(struct thread *td, int dfd, const char *path, int fd, enum uio_seg pathseg, int flag) { struct mount *mp; struct vnode *vp; struct file *fp; struct nameidata nd; cap_rights_t rights; int error; fp = NULL; if (fd != FD_NONE) { error = getvnode(td, fd, cap_rights_init_one(&rights, CAP_LOOKUP), &fp); if (error != 0) return (error); } NDPREINIT(&nd); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 | at2cnpflags(flag, AT_RESOLVE_BENEATH), pathseg, path, dfd, &cap_unlinkat_rights, td); if ((error = namei(&nd)) != 0) goto fdout; vp = nd.ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. */ if (nd.ni_dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. */ if (vp->v_vflag & VV_ROOT) { error = EBUSY; goto out; } if (fp != NULL && fp->f_vnode != vp) { if (VN_IS_DOOMED(fp->f_vnode)) error = EBADF; else error = EDEADLK; goto out; } #ifdef MAC error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error != 0) goto out; #endif if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) goto fdout; goto restart; } vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vn_finished_write(mp); out: NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (error == ERELOOKUP) goto restart; fdout: if (fp != NULL) fdrop(fp, td); return (error); } #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11) int freebsd11_kern_getdirentries(struct thread *td, int fd, char *ubuf, u_int count, long *basep, void (*func)(struct freebsd11_dirent *)) { struct freebsd11_dirent dstdp; struct dirent *dp, *edp; char *dirbuf; off_t base; ssize_t resid, ucount; int error; /* XXX arbitrary sanity limit on `count'. */ count = min(count, 64 * 1024); dirbuf = malloc(count, M_TEMP, M_WAITOK); error = kern_getdirentries(td, fd, dirbuf, count, &base, &resid, UIO_SYSSPACE); if (error != 0) goto done; if (basep != NULL) *basep = base; ucount = 0; for (dp = (struct dirent *)dirbuf, edp = (struct dirent *)&dirbuf[count - resid]; ucount < count && dp < edp; ) { if (dp->d_reclen == 0) break; MPASS(dp->d_reclen >= _GENERIC_DIRLEN(0)); if (dp->d_namlen >= sizeof(dstdp.d_name)) continue; dstdp.d_type = dp->d_type; dstdp.d_namlen = dp->d_namlen; dstdp.d_fileno = dp->d_fileno; /* truncate */ if (dstdp.d_fileno != dp->d_fileno) { switch (ino64_trunc_error) { default: case 0: break; case 1: error = EOVERFLOW; goto done; case 2: dstdp.d_fileno = UINT32_MAX; break; } } dstdp.d_reclen = sizeof(dstdp) - sizeof(dstdp.d_name) + ((dp->d_namlen + 1 + 3) &~ 3); bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen); bzero(dstdp.d_name + dstdp.d_namlen, dstdp.d_reclen - offsetof(struct freebsd11_dirent, d_name) - dstdp.d_namlen); MPASS(dstdp.d_reclen <= dp->d_reclen); MPASS(ucount + dstdp.d_reclen <= count); if (func != NULL) func(&dstdp); error = copyout(&dstdp, ubuf + ucount, dstdp.d_reclen); if (error != 0) break; dp = (struct dirent *)((char *)dp + dp->d_reclen); ucount += dstdp.d_reclen; } done: free(dirbuf, M_TEMP); if (error == 0) td->td_retval[0] = ucount; return (error); } #endif /* COMPAT */ #ifdef COMPAT_43 static void ogetdirentries_cvt(struct freebsd11_dirent *dp) { #if (BYTE_ORDER == LITTLE_ENDIAN) /* * The expected low byte of dp->d_namlen is our dp->d_type. * The high MBZ byte of dp->d_namlen is our dp->d_namlen. */ dp->d_type = dp->d_namlen; dp->d_namlen = 0; #else /* * The dp->d_type is the high byte of the expected dp->d_namlen, * so must be zero'ed. */ dp->d_type = 0; #endif } /* * Read a block of directory entries in a filesystem independent format. */ #ifndef _SYS_SYSPROTO_H_ struct ogetdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int ogetdirentries(struct thread *td, struct ogetdirentries_args *uap) { long loff; int error; error = kern_ogetdirentries(td, uap, &loff); if (error == 0) error = copyout(&loff, uap->basep, sizeof(long)); return (error); } int kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap, long *ploff) { long base; int error; /* XXX arbitrary sanity limit on `count'. */ if (uap->count > 64 * 1024) return (EINVAL); error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, ogetdirentries_cvt); if (error == 0 && uap->basep != NULL) error = copyout(&base, uap->basep, sizeof(long)); return (error); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD11) #ifndef _SYS_SYSPROTO_H_ struct freebsd11_getdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int freebsd11_getdirentries(struct thread *td, struct freebsd11_getdirentries_args *uap) { long base; int error; error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, NULL); if (error == 0 && uap->basep != NULL) error = copyout(&base, uap->basep, sizeof(long)); return (error); } int freebsd11_getdents(struct thread *td, struct freebsd11_getdents_args *uap) { struct freebsd11_getdirentries_args ap; ap.fd = uap->fd; ap.buf = uap->buf; ap.count = uap->count; ap.basep = NULL; return (freebsd11_getdirentries(td, &ap)); } #endif /* COMPAT_FREEBSD11 */ /* * Read a block of directory entries in a filesystem independent format. */ int sys_getdirentries(struct thread *td, struct getdirentries_args *uap) { off_t base; int error; error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, NULL, UIO_USERSPACE); if (error != 0) return (error); if (uap->basep != NULL) error = copyout(&base, uap->basep, sizeof(off_t)); return (error); } int kern_getdirentries(struct thread *td, int fd, char *buf, size_t count, off_t *basep, ssize_t *residp, enum uio_seg bufseg) { struct vnode *vp; struct file *fp; struct uio auio; struct iovec aiov; off_t loff; int error, eofflag; off_t foffset; AUDIT_ARG_FD(fd); if (count > IOSIZE_MAX) return (EINVAL); auio.uio_resid = count; error = getvnode(td, fd, &cap_read_rights, &fp); if (error != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; foffset = foffset_lock(fp, 0); unionread: if (vp->v_type != VDIR) { error = EINVAL; goto fail; } if (__predict_false((vp->v_vflag & VV_UNLINKED) != 0)) { error = ENOENT; goto fail; } aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = bufseg; auio.uio_td = td; vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); loff = auio.uio_offset = foffset; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error == 0) #endif error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); foffset = auio.uio_offset; if (error != 0) { VOP_UNLOCK(vp); goto fail; } if (count == auio.uio_resid && (vp->v_vflag & VV_ROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_vnode = vp; foffset = 0; vput(tvp); goto unionread; } VOP_UNLOCK(vp); *basep = loff; if (residp != NULL) *residp = auio.uio_resid; td->td_retval[0] = count - auio.uio_resid; fail: foffset_unlock(fp, foffset, 0); fdrop(fp, td); return (error); } /* * Set the mode mask for creation of filesystem nodes. */ #ifndef _SYS_SYSPROTO_H_ struct umask_args { int newmask; }; #endif int sys_umask(struct thread *td, struct umask_args *uap) { struct pwddesc *pdp; pdp = td->td_proc->p_pd; PWDDESC_XLOCK(pdp); td->td_retval[0] = pdp->pd_cmask; pdp->pd_cmask = uap->newmask & ALLPERMS; PWDDESC_XUNLOCK(pdp); return (0); } /* * Void all references to file by ripping underlying filesystem away from * vnode. */ #ifndef _SYS_SYSPROTO_H_ struct revoke_args { char *path; }; #endif int sys_revoke(struct thread *td, struct revoke_args *uap) { struct vnode *vp; struct vattr vattr; struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE_NOTHING(&nd); if (vp->v_type != VCHR || vp->v_rdev == NULL) { error = EINVAL; goto out; } #ifdef MAC error = mac_vnode_check_revoke(td->td_ucred, vp); if (error != 0) goto out; #endif error = VOP_GETATTR(vp, &vattr, td->td_ucred); if (error != 0) goto out; if (td->td_ucred->cr_uid != vattr.va_uid) { error = priv_check(td, PRIV_VFS_ADMIN); if (error != 0) goto out; } if (devfs_usecount(vp) > 0) VOP_REVOKE(vp, REVOKEALL); out: vput(vp); return (error); } /* * This variant of getvnode() allows O_PATH files. Caller should * ensure that returned file and vnode are only used for compatible * semantics. */ int getvnode_path(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { struct file *fp; int error; error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp); if (error != 0) return (error); /* * The file could be not of the vnode type, or it may be not * yet fully initialized, in which case the f_vnode pointer * may be set, but f_ops is still badfileops. E.g., * devfs_open() transiently create such situation to * facilitate csw d_fdopen(). * * Dupfdopen() handling in kern_openat() installs the * half-baked file into the process descriptor table, allowing * other thread to dereference it. Guard against the race by * checking f_ops. */ if (fp->f_vnode == NULL || fp->f_ops == &badfileops) { fdrop(fp, td); *fpp = NULL; return (EINVAL); } *fpp = fp; return (0); } /* * Convert a user file descriptor to a kernel file entry and check * that, if it is a capability, the correct rights are present. * A reference on the file entry is held upon returning. */ int getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { int error; error = getvnode_path(td, fd, rightsp, fpp); /* * Filter out O_PATH file descriptors, most getvnode() callers * do not call fo_ methods. */ if (error == 0 && (*fpp)->f_ops == &path_fileops) { fdrop(*fpp, td); *fpp = NULL; error = EBADF; } return (error); } /* * Get an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct lgetfh_args { char *fname; fhandle_t *fhp; }; #endif int sys_lgetfh(struct thread *td, struct lgetfh_args *uap) { return (kern_getfhat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->fname, UIO_USERSPACE, uap->fhp, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct getfh_args { char *fname; fhandle_t *fhp; }; #endif int sys_getfh(struct thread *td, struct getfh_args *uap) { return (kern_getfhat(td, 0, AT_FDCWD, uap->fname, UIO_USERSPACE, uap->fhp, UIO_USERSPACE)); } /* * syscall for the rpc.lockd to use to translate an open descriptor into * a NFS file handle. * * warning: do not remove the priv_check() call or this becomes one giant * security hole. */ #ifndef _SYS_SYSPROTO_H_ struct getfhat_args { int fd; char *path; fhandle_t *fhp; int flags; }; #endif int sys_getfhat(struct thread *td, struct getfhat_args *uap) { if ((uap->flags & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH)) != 0) return (EINVAL); return (kern_getfhat(td, uap->flags, uap->fd, uap->path, UIO_USERSPACE, uap->fhp, UIO_USERSPACE)); } int kern_getfhat(struct thread *td, int flags, int fd, const char *path, enum uio_seg pathseg, fhandle_t *fhp, enum uio_seg fhseg) { struct nameidata nd; fhandle_t fh; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error != 0) return (error); NDINIT_AT(&nd, LOOKUP, at2cnpflags(flags, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH) | LOCKLEAF | AUDITVNODE1, pathseg, path, fd, td); error = namei(&nd); if (error != 0) return (error); NDFREE_NOTHING(&nd); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(vp, &fh.fh_fid); vput(vp); if (error == 0) { if (fhseg == UIO_USERSPACE) error = copyout(&fh, fhp, sizeof (fh)); else memcpy(fhp, &fh, sizeof(fh)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct fhlink_args { fhandle_t *fhp; const char *to; }; #endif int sys_fhlink(struct thread *td, struct fhlink_args *uap) { return (kern_fhlinkat(td, AT_FDCWD, uap->to, UIO_USERSPACE, uap->fhp)); } #ifndef _SYS_SYSPROTO_H_ struct fhlinkat_args { fhandle_t *fhp; int tofd; const char *to; }; #endif int sys_fhlinkat(struct thread *td, struct fhlinkat_args *uap) { return (kern_fhlinkat(td, uap->tofd, uap->to, UIO_USERSPACE, uap->fhp)); } static int kern_fhlinkat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, fhandle_t *fhp) { fhandle_t fh; struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error != 0) return (error); error = copyin(fhp, &fh, sizeof(fh)); if (error != 0) return (error); do { bwillwrite(); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp); vfs_unbusy(mp); if (error != 0) return (error); VOP_UNLOCK(vp); error = kern_linkat_vp(td, vp, fd, path, pathseg); } while (error == EAGAIN || error == ERELOOKUP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fhreadlink_args { fhandle_t *fhp; char *buf; size_t bufsize; }; #endif int sys_fhreadlink(struct thread *td, struct fhreadlink_args *uap) { fhandle_t fh; struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error != 0) return (error); if (uap->bufsize > IOSIZE_MAX) return (EINVAL); error = copyin(uap->fhp, &fh, sizeof(fh)); if (error != 0) return (error); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp); vfs_unbusy(mp); if (error != 0) return (error); error = kern_readlink_vp(vp, uap->buf, UIO_USERSPACE, uap->bufsize, td); vput(vp); return (error); } /* * syscall for the rpc.lockd to use to translate a NFS file handle into an * open descriptor. * * warning: do not remove the priv_check() call or this becomes one giant * security hole. */ #ifndef _SYS_SYSPROTO_H_ struct fhopen_args { const struct fhandle *u_fhp; int flags; }; #endif int sys_fhopen(struct thread *td, struct fhopen_args *uap) { return (kern_fhopen(td, uap->u_fhp, uap->flags)); } int kern_fhopen(struct thread *td, const struct fhandle *u_fhp, int flags) { struct mount *mp; struct vnode *vp; struct fhandle fhp; struct file *fp; int fmode, error; int indx; error = priv_check(td, PRIV_VFS_FHOPEN); if (error != 0) return (error); indx = -1; fmode = FFLAGS(flags); /* why not allow a non-read/write open for our lockd? */ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) return (EINVAL); error = copyin(u_fhp, &fhp, sizeof(fhp)); if (error != 0) return(error); /* find the mount point */ mp = vfs_busyfs(&fhp.fh_fsid); if (mp == NULL) return (ESTALE); /* now give me my vnode, it gets returned to me locked */ error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp); vfs_unbusy(mp); if (error != 0) return (error); error = falloc_noinstall(td, &fp); if (error != 0) { vput(vp); return (error); } /* * An extra reference on `fp' has been held for us by * falloc_noinstall(). */ #ifdef INVARIANTS td->td_dupfd = -1; #endif error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp); if (error != 0) { KASSERT(fp->f_ops == &badfileops, ("VOP_OPEN in fhopen() set f_ops")); KASSERT(td->td_dupfd < 0, ("fhopen() encountered fdopen()")); vput(vp); goto bad; } #ifdef INVARIANTS td->td_dupfd = 0; #endif fp->f_vnode = vp; finit_vnode(fp, fmode, NULL, &vnops); VOP_UNLOCK(vp); if ((fmode & O_TRUNC) != 0) { error = fo_truncate(fp, 0, td->td_ucred, td); if (error != 0) goto bad; } error = finstall(td, fp, &indx, fmode, NULL); bad: fdrop(fp, td); td->td_retval[0] = indx; return (error); } /* * Stat an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct fhstat_args { struct fhandle *u_fhp; struct stat *sb; }; #endif int sys_fhstat(struct thread *td, struct fhstat_args *uap) { struct stat sb; struct fhandle fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fh)); if (error != 0) return (error); error = kern_fhstat(td, fh, &sb); if (error == 0) error = copyout(&sb, uap->sb, sizeof(sb)); return (error); } int kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb) { struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_FHSTAT); if (error != 0) return (error); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp); vfs_unbusy(mp); if (error != 0) return (error); error = VOP_STAT(vp, sb, td->td_ucred, NOCRED, td); vput(vp); return (error); } /* * Implement fstatfs() for (NFS) file handles. */ #ifndef _SYS_SYSPROTO_H_ struct fhstatfs_args { struct fhandle *u_fhp; struct statfs *buf; }; #endif int sys_fhstatfs(struct thread *td, struct fhstatfs_args *uap) { struct statfs *sfp; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error != 0) return (error); sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fhstatfs(td, fh, sfp); if (error == 0) error = copyout(sfp, uap->buf, sizeof(*sfp)); free(sfp, M_STATFS); return (error); } int kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf) { struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_FHSTATFS); if (error != 0) return (error); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp); if (error != 0) { vfs_unbusy(mp); return (error); } vput(vp); error = prison_canseemount(td->td_ucred, mp); if (error != 0) goto out; #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error != 0) goto out; #endif error = VFS_STATFS(mp, buf); out: vfs_unbusy(mp); return (error); } /* * Unlike madvise(2), we do not make a best effort to remember every * possible caching hint. Instead, we remember the last setting with * the exception that we will allow POSIX_FADV_NORMAL to adjust the * region of any current setting. */ int kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len, int advice) { struct fadvise_info *fa, *new; struct file *fp; struct vnode *vp; off_t end; int error; if (offset < 0 || len < 0 || offset > OFF_MAX - len) return (EINVAL); AUDIT_ARG_VALUE(advice); switch (advice) { case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_RANDOM: case POSIX_FADV_NOREUSE: new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK); break; case POSIX_FADV_NORMAL: case POSIX_FADV_WILLNEED: case POSIX_FADV_DONTNEED: new = NULL; break; default: return (EINVAL); } /* XXX: CAP_POSIX_FADVISE? */ AUDIT_ARG_FD(fd); error = fget(td, fd, &cap_no_rights, &fp); if (error != 0) goto out; AUDIT_ARG_FILE(td->td_proc, fp); if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) { error = ESPIPE; goto out; } if (fp->f_type != DTYPE_VNODE) { error = ENODEV; goto out; } vp = fp->f_vnode; if (vp->v_type != VREG) { error = ENODEV; goto out; } if (len == 0) end = OFF_MAX; else end = offset + len - 1; switch (advice) { case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_RANDOM: case POSIX_FADV_NOREUSE: /* * Try to merge any existing non-standard region with * this new region if possible, otherwise create a new * non-standard region for this request. */ mtx_pool_lock(mtxpool_sleep, fp); fa = fp->f_advice; if (fa != NULL && fa->fa_advice == advice && ((fa->fa_start <= end && fa->fa_end >= offset) || (end != OFF_MAX && fa->fa_start == end + 1) || (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) { if (offset < fa->fa_start) fa->fa_start = offset; if (end > fa->fa_end) fa->fa_end = end; } else { new->fa_advice = advice; new->fa_start = offset; new->fa_end = end; fp->f_advice = new; new = fa; } mtx_pool_unlock(mtxpool_sleep, fp); break; case POSIX_FADV_NORMAL: /* * If a the "normal" region overlaps with an existing * non-standard region, trim or remove the * non-standard region. */ mtx_pool_lock(mtxpool_sleep, fp); fa = fp->f_advice; if (fa != NULL) { if (offset <= fa->fa_start && end >= fa->fa_end) { new = fa; fp->f_advice = NULL; } else if (offset <= fa->fa_start && end >= fa->fa_start) fa->fa_start = end + 1; else if (offset <= fa->fa_end && end >= fa->fa_end) fa->fa_end = offset - 1; else if (offset >= fa->fa_start && end <= fa->fa_end) { /* * If the "normal" region is a middle * portion of the existing * non-standard region, just remove * the whole thing rather than picking * one side or the other to * preserve. */ new = fa; fp->f_advice = NULL; } } mtx_pool_unlock(mtxpool_sleep, fp); break; case POSIX_FADV_WILLNEED: case POSIX_FADV_DONTNEED: error = VOP_ADVISE(vp, offset, end, advice); break; } out: if (fp != NULL) fdrop(fp, td); free(new, M_FADVISE); return (error); } int sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap) { int error; error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len, uap->advice); return (kern_posix_error(td, error)); } int kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd, off_t *outoffp, size_t len, unsigned int flags) { struct file *infp, *outfp; struct vnode *invp, *outvp; int error; size_t retlen; void *rl_rcookie, *rl_wcookie; off_t savinoff, savoutoff; infp = outfp = NULL; rl_rcookie = rl_wcookie = NULL; savinoff = -1; error = 0; retlen = 0; if (flags != 0) { error = EINVAL; goto out; } if (len > SSIZE_MAX) /* * Although the len argument is size_t, the return argument * is ssize_t (which is signed). Therefore a size that won't * fit in ssize_t can't be returned. */ len = SSIZE_MAX; /* Get the file structures for the file descriptors. */ error = fget_read(td, infd, &cap_read_rights, &infp); if (error != 0) goto out; if (infp->f_ops == &badfileops) { error = EBADF; goto out; } if (infp->f_vnode == NULL) { error = EINVAL; goto out; } error = fget_write(td, outfd, &cap_write_rights, &outfp); if (error != 0) goto out; if (outfp->f_ops == &badfileops) { error = EBADF; goto out; } if (outfp->f_vnode == NULL) { error = EINVAL; goto out; } /* Set the offset pointers to the correct place. */ if (inoffp == NULL) inoffp = &infp->f_offset; if (outoffp == NULL) outoffp = &outfp->f_offset; savinoff = *inoffp; savoutoff = *outoffp; invp = infp->f_vnode; outvp = outfp->f_vnode; /* Sanity check the f_flag bits. */ if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE || (infp->f_flag & FREAD) == 0) { error = EBADF; goto out; } /* If len == 0, just return 0. */ if (len == 0) goto out; /* * If infp and outfp refer to the same file, the byte ranges cannot * overlap. */ if (invp == outvp && ((savinoff <= savoutoff && savinoff + len > savoutoff) || (savinoff > savoutoff && savoutoff + len > savinoff))) { error = EINVAL; goto out; } /* Range lock the byte ranges for both invp and outvp. */ for (;;) { rl_wcookie = vn_rangelock_wlock(outvp, *outoffp, *outoffp + len); rl_rcookie = vn_rangelock_tryrlock(invp, *inoffp, *inoffp + len); if (rl_rcookie != NULL) break; vn_rangelock_unlock(outvp, rl_wcookie); rl_rcookie = vn_rangelock_rlock(invp, *inoffp, *inoffp + len); vn_rangelock_unlock(invp, rl_rcookie); } retlen = len; error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen, flags, infp->f_cred, outfp->f_cred, td); out: if (rl_rcookie != NULL) vn_rangelock_unlock(invp, rl_rcookie); if (rl_wcookie != NULL) vn_rangelock_unlock(outvp, rl_wcookie); if (savinoff != -1 && (error == EINTR || error == ERESTART)) { *inoffp = savinoff; *outoffp = savoutoff; } if (outfp != NULL) fdrop(outfp, td); if (infp != NULL) fdrop(infp, td); td->td_retval[0] = retlen; return (error); } int sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap) { off_t inoff, outoff, *inoffp, *outoffp; int error; inoffp = outoffp = NULL; if (uap->inoffp != NULL) { error = copyin(uap->inoffp, &inoff, sizeof(off_t)); if (error != 0) return (error); inoffp = &inoff; } if (uap->outoffp != NULL) { error = copyin(uap->outoffp, &outoff, sizeof(off_t)); if (error != 0) return (error); outoffp = &outoff; } error = kern_copy_file_range(td, uap->infd, inoffp, uap->outfd, outoffp, uap->len, uap->flags); if (error == 0 && uap->inoffp != NULL) error = copyout(inoffp, uap->inoffp, sizeof(off_t)); if (error == 0 && uap->outoffp != NULL) error = copyout(outoffp, uap->outoffp, sizeof(off_t)); return (error); } diff --git a/sys/netinet/in_jail.c b/sys/netinet/in_jail.c index 11891b56ecbe..3231ca214775 100644 --- a/sys/netinet/in_jail.c +++ b/sys/netinet/in_jail.c @@ -1,431 +1,430 @@ /*- * Copyright (c) 1999 Poul-Henning Kamp. * Copyright (c) 2008 Bjoern A. Zeeb. * Copyright (c) 2009 James Gritton. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include int prison_qcmp_v4(const void *ip1, const void *ip2) { in_addr_t iaa, iab; /* * We need to compare in HBO here to get the list sorted as expected * by the result of the code. Sorting NBO addresses gives you * interesting results. If you do not understand, do not try. */ iaa = ntohl(((const struct in_addr *)ip1)->s_addr); iab = ntohl(((const struct in_addr *)ip2)->s_addr); /* * Do not simply return the difference of the two numbers, the int is * not wide enough. */ if (iaa > iab) return (1); else if (iaa < iab) return (-1); else return (0); } /* * Restrict a prison's IP address list with its parent's, possibly replacing * it. Return true if the replacement buffer was used (or would have been). */ int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4) { int ii, ij, used; struct prison *ppr; ppr = pr->pr_parent; if (!(pr->pr_flags & PR_IP4_USER)) { /* This has no user settings, so just copy the parent's list. */ if (pr->pr_ip4s < ppr->pr_ip4s) { /* * There's no room for the parent's list. Use the * new list buffer, which is assumed to be big enough * (if it was passed). If there's no buffer, try to * allocate one. */ used = 1; if (newip4 == NULL) { newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4), M_PRISON, M_NOWAIT); if (newip4 != NULL) used = 0; } if (newip4 != NULL) { bcopy(ppr->pr_ip4, newip4, ppr->pr_ip4s * sizeof(*newip4)); free(pr->pr_ip4, M_PRISON); pr->pr_ip4 = newip4; pr->pr_ip4s = ppr->pr_ip4s; } return (used); } pr->pr_ip4s = ppr->pr_ip4s; if (pr->pr_ip4s > 0) bcopy(ppr->pr_ip4, pr->pr_ip4, pr->pr_ip4s * sizeof(*newip4)); else if (pr->pr_ip4 != NULL) { free(pr->pr_ip4, M_PRISON); pr->pr_ip4 = NULL; } } else if (pr->pr_ip4s > 0) { /* Remove addresses that aren't in the parent. */ for (ij = 0; ij < ppr->pr_ip4s; ij++) if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij < ppr->pr_ip4s) ii = 1; else { bcopy(pr->pr_ip4 + 1, pr->pr_ip4, --pr->pr_ip4s * sizeof(*pr->pr_ip4)); ii = 0; } for (ij = 1; ii < pr->pr_ip4s; ) { if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) { ii++; continue; } switch (ij >= ppr->pr_ip4s ? -1 : prison_qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) { case -1: bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii, (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4)); break; case 0: ii++; ij++; break; case 1: ij++; break; } } if (pr->pr_ip4s == 0) { free(pr->pr_ip4, M_PRISON); pr->pr_ip4 = NULL; } } return (0); } /* * Pass back primary IPv4 address of this jail. * * If not restricted return success but do not alter the address. Caller has * to make sure to initialize it correctly (e.g. INADDR_ANY). * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4. * Address returned in NBO. */ int prison_get_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } ia->s_addr = pr->pr_ip4[0].s_addr; mtx_unlock(&pr->pr_mtx); return (0); } /* * Return 1 if we should do proper source address selection or are not jailed. * We will return 0 if we should bypass source address selection in favour * of the primary jail IPv4 address. Only in this case *ia will be updated and * returned in NBO. * Return EAFNOSUPPORT, in case this jail does not allow IPv4. */ int prison_saddrsel_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; struct in_addr lia; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); if (!jailed(cred)) return (1); pr = cred->cr_prison; if (pr->pr_flags & PR_IP4_SADDRSEL) return (1); lia.s_addr = INADDR_ANY; error = prison_get_ip4(cred, &lia); if (error) return (error); if (lia.s_addr == INADDR_ANY) return (1); ia->s_addr = lia.s_addr; return (0); } /* * Return true if pr1 and pr2 have the same IPv4 address restrictions. */ int prison_equal_ip4(struct prison *pr1, struct prison *pr2) { if (pr1 == pr2) return (1); /* * No need to lock since the PR_IP4_USER flag can't be altered for * existing prisons. */ while (pr1 != &prison0 && #ifdef VIMAGE !(pr1->pr_flags & PR_VNET) && #endif !(pr1->pr_flags & PR_IP4_USER)) pr1 = pr1->pr_parent; while (pr2 != &prison0 && #ifdef VIMAGE !(pr2->pr_flags & PR_VNET) && #endif !(pr2->pr_flags & PR_IP4_USER)) pr2 = pr2->pr_parent; return (pr1 == pr2); } /* * Make sure our (source) address is set to something meaningful to this * jail. * * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail * doesn't allow IPv4. Address passed in in NBO and returned in NBO. */ int prison_local_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; struct in_addr ia0; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } ia0.s_addr = ntohl(ia->s_addr); if (ia0.s_addr == INADDR_ANY) { /* * In case there is only 1 IPv4 address, bind directly. */ if (pr->pr_ip4s == 1) ia->s_addr = pr->pr_ip4[0].s_addr; mtx_unlock(&pr->pr_mtx); return (0); } error = prison_check_ip4_locked(pr, ia); if (error == EADDRNOTAVAIL && ia0.s_addr == INADDR_LOOPBACK) { ia->s_addr = pr->pr_ip4[0].s_addr; error = 0; } mtx_unlock(&pr->pr_mtx); return (error); } /* * Rewrite destination address in case we will connect to loopback address. * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4. * Address passed in in NBO and returned in NBO. */ int prison_remote_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } if (ntohl(ia->s_addr) == INADDR_LOOPBACK && prison_check_ip4_locked(pr, ia) == EADDRNOTAVAIL) { ia->s_addr = pr->pr_ip4[0].s_addr; mtx_unlock(&pr->pr_mtx); return (0); } /* * Return success because nothing had to be changed. */ mtx_unlock(&pr->pr_mtx); return (0); } /* * Check if given address belongs to the jail referenced by cred/prison. * * Returns 0 if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong to the jail. */ int prison_check_ip4_locked(const struct prison *pr, const struct in_addr *ia) { int i, a, z, d; /* * Check the primary IP. */ if (pr->pr_ip4[0].s_addr == ia->s_addr) return (0); /* * All the other IPs are sorted so we can do a binary search. */ a = 0; z = pr->pr_ip4s - 2; while (a <= z) { i = (a + z) / 2; d = prison_qcmp_v4(&pr->pr_ip4[i+1], ia); if (d > 0) z = i - 1; else if (d < 0) a = i + 1; else return (0); } return (EADDRNOTAVAIL); } int prison_check_ip4(const struct ucred *cred, const struct in_addr *ia) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } error = prison_check_ip4_locked(pr, ia); mtx_unlock(&pr->pr_mtx); return (error); } diff --git a/sys/netinet6/in6_jail.c b/sys/netinet6/in6_jail.c index 65b397f69bf2..0f83344cd668 100644 --- a/sys/netinet6/in6_jail.c +++ b/sys/netinet6/in6_jail.c @@ -1,417 +1,416 @@ /*- * Copyright (c) 1999 Poul-Henning Kamp. * Copyright (c) 2008 Bjoern A. Zeeb. * Copyright (c) 2009 James Gritton. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include int prison_qcmp_v6(const void *ip1, const void *ip2) { const struct in6_addr *ia6a, *ia6b; int i, rc; ia6a = (const struct in6_addr *)ip1; ia6b = (const struct in6_addr *)ip2; rc = 0; for (i = 0; rc == 0 && i < sizeof(struct in6_addr); i++) { if (ia6a->s6_addr[i] > ia6b->s6_addr[i]) rc = 1; else if (ia6a->s6_addr[i] < ia6b->s6_addr[i]) rc = -1; } return (rc); } int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6) { int ii, ij, used; struct prison *ppr; ppr = pr->pr_parent; if (!(pr->pr_flags & PR_IP6_USER)) { /* This has no user settings, so just copy the parent's list. */ if (pr->pr_ip6s < ppr->pr_ip6s) { /* * There's no room for the parent's list. Use the * new list buffer, which is assumed to be big enough * (if it was passed). If there's no buffer, try to * allocate one. */ used = 1; if (newip6 == NULL) { newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6), M_PRISON, M_NOWAIT); if (newip6 != NULL) used = 0; } if (newip6 != NULL) { bcopy(ppr->pr_ip6, newip6, ppr->pr_ip6s * sizeof(*newip6)); free(pr->pr_ip6, M_PRISON); pr->pr_ip6 = newip6; pr->pr_ip6s = ppr->pr_ip6s; } return (used); } pr->pr_ip6s = ppr->pr_ip6s; if (pr->pr_ip6s > 0) bcopy(ppr->pr_ip6, pr->pr_ip6, pr->pr_ip6s * sizeof(*newip6)); else if (pr->pr_ip6 != NULL) { free(pr->pr_ip6, M_PRISON); pr->pr_ip6 = NULL; } } else if (pr->pr_ip6s > 0) { /* Remove addresses that aren't in the parent. */ for (ij = 0; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], &ppr->pr_ip6[ij])) break; if (ij < ppr->pr_ip6s) ii = 1; else { bcopy(pr->pr_ip6 + 1, pr->pr_ip6, --pr->pr_ip6s * sizeof(*pr->pr_ip6)); ii = 0; } for (ij = 1; ii < pr->pr_ip6s; ) { if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii], &ppr->pr_ip6[0])) { ii++; continue; } switch (ij >= ppr->pr_ip6s ? -1 : prison_qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) { case -1: bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii, (--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6)); break; case 0: ii++; ij++; break; case 1: ij++; break; } } if (pr->pr_ip6s == 0) { free(pr->pr_ip6, M_PRISON); pr->pr_ip6 = NULL; } } return 0; } /* * Pass back primary IPv6 address for this jail. * * If not restricted return success but do not alter the address. Caller has * to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT). * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6. */ int prison_get_ip6(struct ucred *cred, struct in6_addr *ia6) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); mtx_unlock(&pr->pr_mtx); return (0); } /* * Return 1 if we should do proper source address selection or are not jailed. * We will return 0 if we should bypass source address selection in favour * of the primary jail IPv6 address. Only in this case *ia will be updated and * returned in NBO. * Return EAFNOSUPPORT, in case this jail does not allow IPv6. */ int prison_saddrsel_ip6(struct ucred *cred, struct in6_addr *ia6) { struct prison *pr; struct in6_addr lia6; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); if (!jailed(cred)) return (1); pr = cred->cr_prison; if (pr->pr_flags & PR_IP6_SADDRSEL) return (1); lia6 = in6addr_any; error = prison_get_ip6(cred, &lia6); if (error) return (error); if (IN6_IS_ADDR_UNSPECIFIED(&lia6)) return (1); bcopy(&lia6, ia6, sizeof(struct in6_addr)); return (0); } /* * Return true if pr1 and pr2 have the same IPv6 address restrictions. */ int prison_equal_ip6(struct prison *pr1, struct prison *pr2) { if (pr1 == pr2) return (1); while (pr1 != &prison0 && #ifdef VIMAGE !(pr1->pr_flags & PR_VNET) && #endif !(pr1->pr_flags & PR_IP6_USER)) pr1 = pr1->pr_parent; while (pr2 != &prison0 && #ifdef VIMAGE !(pr2->pr_flags & PR_VNET) && #endif !(pr2->pr_flags & PR_IP6_USER)) pr2 = pr2->pr_parent; return (pr1 == pr2); } /* * Make sure our (source) address is set to something meaningful to this jail. * * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0) * when needed while binding. * * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail * doesn't allow IPv6. */ int prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } if (IN6_IS_ADDR_UNSPECIFIED(ia6)) { /* * In case there is only 1 IPv6 address, and v6only is true, * then bind directly. */ if (v6only != 0 && pr->pr_ip6s == 1) bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); mtx_unlock(&pr->pr_mtx); return (0); } error = prison_check_ip6_locked(pr, ia6); if (error == EADDRNOTAVAIL && IN6_IS_ADDR_LOOPBACK(ia6)) { bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); error = 0; } mtx_unlock(&pr->pr_mtx); return (error); } /* * Rewrite destination address in case we will connect to loopback address. * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6. */ int prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } if (IN6_IS_ADDR_LOOPBACK(ia6) && prison_check_ip6_locked(pr, ia6) == EADDRNOTAVAIL) { bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); mtx_unlock(&pr->pr_mtx); return (0); } /* * Return success because nothing had to be changed. */ mtx_unlock(&pr->pr_mtx); return (0); } /* * Check if given address belongs to the jail referenced by cred/prison. * * Returns 0 if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong to the jail. */ int prison_check_ip6_locked(const struct prison *pr, const struct in6_addr *ia6) { int i, a, z, d; /* * Check the primary IP. */ if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6)) return (0); /* * All the other IPs are sorted so we can do a binary search. */ a = 0; z = pr->pr_ip6s - 2; while (a <= z) { i = (a + z) / 2; d = prison_qcmp_v6(&pr->pr_ip6[i+1], ia6); if (d > 0) z = i - 1; else if (d < 0) a = i + 1; else return (0); } return (EADDRNOTAVAIL); } int prison_check_ip6(const struct ucred *cred, const struct in6_addr *ia6) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } error = prison_check_ip6_locked(pr, ia6); mtx_unlock(&pr->pr_mtx); return (error); } diff --git a/sys/security/mac/mac_syscalls.c b/sys/security/mac/mac_syscalls.c index e7d71b2e22da..949421ecb0ab 100644 --- a/sys/security/mac/mac_syscalls.c +++ b/sys/security/mac/mac_syscalls.c @@ -1,663 +1,662 @@ /*- * Copyright (c) 1999-2002, 2006, 2009 Robert N. M. Watson * Copyright (c) 2001 Ilmar S. Habibulin * Copyright (c) 2001-2005 Networks Associates Technology, Inc. * Copyright (c) 2005-2006 SPARTA, Inc. * Copyright (c) 2008 Apple Inc. * All rights reserved. * * This software was developed by Robert Watson and Ilmar Habibulin for the * TrustedBSD Project. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * This software was enhanced by SPARTA ISSO under SPAWAR contract * N66001-04-C-6019 ("SEFOS"). * * This software was developed at the University of Cambridge Computer * Laboratory with support from a grant from Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #ifdef MAC FEATURE(security_mac, "Mandatory Access Control Framework support"); static int kern___mac_get_path(struct thread *td, const char *path_p, struct mac *mac_p, int follow); static int kern___mac_set_path(struct thread *td, const char *path_p, struct mac *mac_p, int follow); int sys___mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap) { char *elements, *buffer; struct mac mac; struct proc *tproc; struct ucred *tcred; int error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); tproc = pfind(uap->pid); if (tproc == NULL) return (ESRCH); tcred = NULL; /* Satisfy gcc. */ error = p_cansee(td, tproc); if (error == 0) tcred = crhold(tproc->p_ucred); PROC_UNLOCK(tproc); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); crfree(tcred); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); error = mac_cred_externalize_label(tcred->cr_label, elements, buffer, mac.m_buflen); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); free(buffer, M_MACTEMP); free(elements, M_MACTEMP); crfree(tcred); return (error); } int sys___mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap) { char *elements, *buffer; struct mac mac; int error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); error = mac_cred_externalize_label(td->td_ucred->cr_label, elements, buffer, mac.m_buflen); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); free(buffer, M_MACTEMP); free(elements, M_MACTEMP); return (error); } int sys___mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap) { struct ucred *newcred, *oldcred; struct label *intlabel; struct proc *p; struct mac mac; char *buffer; int error; if (!(mac_labeled & MPC_OBJECT_CRED)) return (EINVAL); error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL); if (error) { free(buffer, M_MACTEMP); return (error); } intlabel = mac_cred_label_alloc(); error = mac_cred_internalize_label(intlabel, buffer); free(buffer, M_MACTEMP); if (error) goto out; newcred = crget(); p = td->td_proc; PROC_LOCK(p); oldcred = p->p_ucred; error = mac_cred_check_relabel(oldcred, intlabel); if (error) { PROC_UNLOCK(p); crfree(newcred); goto out; } setsugid(p); crcopy(newcred, oldcred); mac_cred_relabel(newcred, intlabel); proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); mac_proc_vm_revoke(td); out: mac_cred_label_free(intlabel); return (error); } int sys___mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap) { char *elements, *buffer; struct label *intlabel; struct file *fp; struct mac mac; struct vnode *vp; struct pipe *pipe; struct socket *so; cap_rights_t rights; int error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); error = fget(td, uap->fd, cap_rights_init_one(&rights, CAP_MAC_GET), &fp); if (error) goto out; switch (fp->f_type) { case DTYPE_FIFO: case DTYPE_VNODE: if (!(mac_labeled & MPC_OBJECT_VNODE)) { error = EINVAL; goto out_fdrop; } vp = fp->f_vnode; intlabel = mac_vnode_label_alloc(); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); mac_vnode_copy_label(vp->v_label, intlabel); VOP_UNLOCK(vp); error = mac_vnode_externalize_label(intlabel, elements, buffer, mac.m_buflen); mac_vnode_label_free(intlabel); break; case DTYPE_PIPE: if (!(mac_labeled & MPC_OBJECT_PIPE)) { error = EINVAL; goto out_fdrop; } pipe = fp->f_data; intlabel = mac_pipe_label_alloc(); PIPE_LOCK(pipe); mac_pipe_copy_label(pipe->pipe_pair->pp_label, intlabel); PIPE_UNLOCK(pipe); error = mac_pipe_externalize_label(intlabel, elements, buffer, mac.m_buflen); mac_pipe_label_free(intlabel); break; case DTYPE_SOCKET: if (!(mac_labeled & MPC_OBJECT_SOCKET)) { error = EINVAL; goto out_fdrop; } so = fp->f_data; intlabel = mac_socket_label_alloc(M_WAITOK); SOCK_LOCK(so); mac_socket_copy_label(so->so_label, intlabel); SOCK_UNLOCK(so); error = mac_socket_externalize_label(intlabel, elements, buffer, mac.m_buflen); mac_socket_label_free(intlabel); break; default: error = EINVAL; } if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); out_fdrop: fdrop(fp, td); out: free(buffer, M_MACTEMP); free(elements, M_MACTEMP); return (error); } int sys___mac_get_file(struct thread *td, struct __mac_get_file_args *uap) { return (kern___mac_get_path(td, uap->path_p, uap->mac_p, FOLLOW)); } int sys___mac_get_link(struct thread *td, struct __mac_get_link_args *uap) { return (kern___mac_get_path(td, uap->path_p, uap->mac_p, NOFOLLOW)); } static int kern___mac_get_path(struct thread *td, const char *path_p, struct mac *mac_p, int follow) { char *elements, *buffer; struct nameidata nd; struct label *intlabel; struct mac mac; int error; if (!(mac_labeled & MPC_OBJECT_VNODE)) return (EINVAL); error = copyin(mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); NDINIT(&nd, LOOKUP, LOCKLEAF | follow, UIO_USERSPACE, path_p, td); error = namei(&nd); if (error) goto out; intlabel = mac_vnode_label_alloc(); mac_vnode_copy_label(nd.ni_vp->v_label, intlabel); error = mac_vnode_externalize_label(intlabel, elements, buffer, mac.m_buflen); NDFREE(&nd, 0); mac_vnode_label_free(intlabel); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); out: free(buffer, M_MACTEMP); free(elements, M_MACTEMP); return (error); } int sys___mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap) { struct label *intlabel; struct pipe *pipe; struct socket *so; struct file *fp; struct mount *mp; struct vnode *vp; struct mac mac; cap_rights_t rights; char *buffer; int error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL); if (error) { free(buffer, M_MACTEMP); return (error); } error = fget(td, uap->fd, cap_rights_init_one(&rights, CAP_MAC_SET), &fp); if (error) goto out; switch (fp->f_type) { case DTYPE_FIFO: case DTYPE_VNODE: if (!(mac_labeled & MPC_OBJECT_VNODE)) { error = EINVAL; goto out_fdrop; } intlabel = mac_vnode_label_alloc(); error = mac_vnode_internalize_label(intlabel, buffer); if (error) { mac_vnode_label_free(intlabel); break; } vp = fp->f_vnode; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) { mac_vnode_label_free(intlabel); break; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = vn_setlabel(vp, intlabel, td->td_ucred); VOP_UNLOCK(vp); vn_finished_write(mp); mac_vnode_label_free(intlabel); break; case DTYPE_PIPE: if (!(mac_labeled & MPC_OBJECT_PIPE)) { error = EINVAL; goto out_fdrop; } intlabel = mac_pipe_label_alloc(); error = mac_pipe_internalize_label(intlabel, buffer); if (error == 0) { pipe = fp->f_data; PIPE_LOCK(pipe); error = mac_pipe_label_set(td->td_ucred, pipe->pipe_pair, intlabel); PIPE_UNLOCK(pipe); } mac_pipe_label_free(intlabel); break; case DTYPE_SOCKET: if (!(mac_labeled & MPC_OBJECT_SOCKET)) { error = EINVAL; goto out_fdrop; } intlabel = mac_socket_label_alloc(M_WAITOK); error = mac_socket_internalize_label(intlabel, buffer); if (error == 0) { so = fp->f_data; error = mac_socket_label_set(td->td_ucred, so, intlabel); } mac_socket_label_free(intlabel); break; default: error = EINVAL; } out_fdrop: fdrop(fp, td); out: free(buffer, M_MACTEMP); return (error); } int sys___mac_set_file(struct thread *td, struct __mac_set_file_args *uap) { return (kern___mac_set_path(td, uap->path_p, uap->mac_p, FOLLOW)); } int sys___mac_set_link(struct thread *td, struct __mac_set_link_args *uap) { return (kern___mac_set_path(td, uap->path_p, uap->mac_p, NOFOLLOW)); } static int kern___mac_set_path(struct thread *td, const char *path_p, struct mac *mac_p, int follow) { struct label *intlabel; struct nameidata nd; struct mount *mp; struct mac mac; char *buffer; int error; if (!(mac_labeled & MPC_OBJECT_VNODE)) return (EINVAL); error = copyin(mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL); if (error) { free(buffer, M_MACTEMP); return (error); } intlabel = mac_vnode_label_alloc(); error = mac_vnode_internalize_label(intlabel, buffer); free(buffer, M_MACTEMP); if (error) goto out; NDINIT(&nd, LOOKUP, LOCKLEAF | follow, UIO_USERSPACE, path_p, td); error = namei(&nd); if (error == 0) { error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); if (error == 0) { error = vn_setlabel(nd.ni_vp, intlabel, td->td_ucred); vn_finished_write(mp); } } NDFREE(&nd, 0); out: mac_vnode_label_free(intlabel); return (error); } int sys_mac_syscall(struct thread *td, struct mac_syscall_args *uap) { struct mac_policy_conf *mpc; char target[MAC_MAX_POLICY_NAME]; int error; error = copyinstr(uap->policy, target, sizeof(target), NULL); if (error) return (error); error = ENOSYS; LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) { if (strcmp(mpc->mpc_name, target) == 0 && mpc->mpc_ops->mpo_syscall != NULL) { error = mpc->mpc_ops->mpo_syscall(td, uap->call, uap->arg); goto out; } } if (!LIST_EMPTY(&mac_policy_list)) { mac_policy_slock_sleep(); LIST_FOREACH(mpc, &mac_policy_list, mpc_list) { if (strcmp(mpc->mpc_name, target) == 0 && mpc->mpc_ops->mpo_syscall != NULL) { error = mpc->mpc_ops->mpo_syscall(td, uap->call, uap->arg); break; } } mac_policy_sunlock_sleep(); } out: return (error); } #else /* !MAC */ int sys___mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap) { return (ENOSYS); } int sys___mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap) { return (ENOSYS); } int sys___mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap) { return (ENOSYS); } int sys___mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap) { return (ENOSYS); } int sys___mac_get_file(struct thread *td, struct __mac_get_file_args *uap) { return (ENOSYS); } int sys___mac_get_link(struct thread *td, struct __mac_get_link_args *uap) { return (ENOSYS); } int sys___mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap) { return (ENOSYS); } int sys___mac_set_file(struct thread *td, struct __mac_set_file_args *uap) { return (ENOSYS); } int sys___mac_set_link(struct thread *td, struct __mac_set_link_args *uap) { return (ENOSYS); } int sys_mac_syscall(struct thread *td, struct mac_syscall_args *uap) { return (ENOSYS); } #endif /* !MAC */