Index: head/sys/cam/ctl/ctl_backend_block.c =================================================================== --- head/sys/cam/ctl/ctl_backend_block.c (revision 292383) +++ head/sys/cam/ctl/ctl_backend_block.c (revision 292384) @@ -1,2880 +1,2880 @@ /*- * Copyright (c) 2003 Silicon Graphics International Corp. * Copyright (c) 2009-2011 Spectra Logic Corporation * Copyright (c) 2012 The FreeBSD Foundation * Copyright (c) 2014-2015 Alexander Motin * All rights reserved. * * Portions of this software were developed by Edward Tomasz Napierala * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_backend_block.c#5 $ */ /* * CAM Target Layer driver backend for block devices. * * Author: Ken Merry */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The idea here is that we'll allocate enough S/G space to hold a 1MB * I/O. If we get an I/O larger than that, we'll split it. */ #define CTLBLK_HALF_IO_SIZE (512 * 1024) #define CTLBLK_MAX_IO_SIZE (CTLBLK_HALF_IO_SIZE * 2) #define CTLBLK_MAX_SEG MAXPHYS #define CTLBLK_HALF_SEGS MAX(CTLBLK_HALF_IO_SIZE / CTLBLK_MAX_SEG, 1) #define CTLBLK_MAX_SEGS (CTLBLK_HALF_SEGS * 2) #ifdef CTLBLK_DEBUG #define DPRINTF(fmt, args...) \ printf("cbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) #else #define DPRINTF(fmt, args...) do {} while(0) #endif #define PRIV(io) \ ((struct ctl_ptr_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_BACKEND]) #define ARGS(io) \ ((struct ctl_lba_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]) SDT_PROVIDER_DEFINE(cbb); typedef enum { CTL_BE_BLOCK_LUN_UNCONFIGURED = 0x01, CTL_BE_BLOCK_LUN_CONFIG_ERR = 0x02, CTL_BE_BLOCK_LUN_WAITING = 0x04, } ctl_be_block_lun_flags; typedef enum { CTL_BE_BLOCK_NONE, CTL_BE_BLOCK_DEV, CTL_BE_BLOCK_FILE } ctl_be_block_type; struct ctl_be_block_filedata { struct ucred *cred; }; union ctl_be_block_bedata { struct ctl_be_block_filedata file; }; struct ctl_be_block_io; struct ctl_be_block_lun; typedef void (*cbb_dispatch_t)(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); typedef uint64_t (*cbb_getattr_t)(struct ctl_be_block_lun *be_lun, const char *attrname); /* * Backend LUN structure. There is a 1:1 mapping between a block device * and a backend block LUN, and between a backend block LUN and a CTL LUN. */ struct ctl_be_block_lun { struct ctl_lun_create_params params; char lunname[32]; char *dev_path; ctl_be_block_type dev_type; struct vnode *vn; union ctl_be_block_bedata backend; cbb_dispatch_t dispatch; cbb_dispatch_t lun_flush; cbb_dispatch_t unmap; cbb_dispatch_t get_lba_status; cbb_getattr_t getattr; uma_zone_t lun_zone; uint64_t size_blocks; uint64_t size_bytes; struct ctl_be_block_softc *softc; struct devstat *disk_stats; ctl_be_block_lun_flags flags; STAILQ_ENTRY(ctl_be_block_lun) links; struct ctl_be_lun cbe_lun; struct taskqueue *io_taskqueue; struct task io_task; int num_threads; STAILQ_HEAD(, ctl_io_hdr) input_queue; STAILQ_HEAD(, ctl_io_hdr) config_read_queue; STAILQ_HEAD(, ctl_io_hdr) config_write_queue; STAILQ_HEAD(, ctl_io_hdr) datamove_queue; struct mtx_padalign io_lock; struct mtx_padalign queue_lock; }; /* * Overall softc structure for the block backend module. */ struct ctl_be_block_softc { struct mtx lock; int num_luns; STAILQ_HEAD(, ctl_be_block_lun) lun_list; }; static struct ctl_be_block_softc backend_block_softc; /* * Per-I/O information. */ struct ctl_be_block_io { union ctl_io *io; struct ctl_sg_entry sg_segs[CTLBLK_MAX_SEGS]; struct iovec xiovecs[CTLBLK_MAX_SEGS]; int bio_cmd; int num_segs; int num_bios_sent; int num_bios_done; int send_complete; int num_errors; struct bintime ds_t0; devstat_tag_type ds_tag_type; devstat_trans_flags ds_trans_type; uint64_t io_len; uint64_t io_offset; int io_arg; struct ctl_be_block_softc *softc; struct ctl_be_block_lun *lun; void (*beio_cont)(struct ctl_be_block_io *beio); /* to continue processing */ }; extern struct ctl_softc *control_softc; static int cbb_num_threads = 14; SYSCTL_NODE(_kern_cam_ctl, OID_AUTO, block, CTLFLAG_RD, 0, "CAM Target Layer Block Backend"); SYSCTL_INT(_kern_cam_ctl_block, OID_AUTO, num_threads, CTLFLAG_RWTUN, &cbb_num_threads, 0, "Number of threads per backing file"); static struct ctl_be_block_io *ctl_alloc_beio(struct ctl_be_block_softc *softc); static void ctl_free_beio(struct ctl_be_block_io *beio); static void ctl_complete_beio(struct ctl_be_block_io *beio); static int ctl_be_block_move_done(union ctl_io *io); static void ctl_be_block_biodone(struct bio *bio); static void ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); static void ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); static void ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); static uint64_t ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun, const char *attrname); static void ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); static void ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); static void ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); static uint64_t ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun, const char *attrname); static void ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun, union ctl_io *io); static void ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun, union ctl_io *io); static void ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun, union ctl_io *io); static void ctl_be_block_worker(void *context, int pending); static int ctl_be_block_submit(union ctl_io *io); static int ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td); static int ctl_be_block_open_file(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req); static int ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req); static int ctl_be_block_close(struct ctl_be_block_lun *be_lun); static int ctl_be_block_open(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req); static int ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req); static int ctl_be_block_rm(struct ctl_be_block_softc *softc, struct ctl_lun_req *req); static int ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req); static void ctl_be_block_lun_shutdown(void *be_lun); static void ctl_be_block_lun_config_status(void *be_lun, ctl_lun_config_status status); static int ctl_be_block_config_write(union ctl_io *io); static int ctl_be_block_config_read(union ctl_io *io); static int ctl_be_block_lun_info(void *be_lun, struct sbuf *sb); static uint64_t ctl_be_block_lun_attr(void *be_lun, const char *attrname); int ctl_be_block_init(void); static struct ctl_backend_driver ctl_be_block_driver = { .name = "block", .flags = CTL_BE_FLAG_HAS_CONFIG, .init = ctl_be_block_init, .data_submit = ctl_be_block_submit, .data_move_done = ctl_be_block_move_done, .config_read = ctl_be_block_config_read, .config_write = ctl_be_block_config_write, .ioctl = ctl_be_block_ioctl, .lun_info = ctl_be_block_lun_info, .lun_attr = ctl_be_block_lun_attr }; MALLOC_DEFINE(M_CTLBLK, "ctlblk", "Memory used for CTL block backend"); CTL_BACKEND_DECLARE(cbb, ctl_be_block_driver); static uma_zone_t beio_zone; static struct ctl_be_block_io * ctl_alloc_beio(struct ctl_be_block_softc *softc) { struct ctl_be_block_io *beio; beio = uma_zalloc(beio_zone, M_WAITOK | M_ZERO); beio->softc = softc; return (beio); } static void ctl_free_beio(struct ctl_be_block_io *beio) { int duplicate_free; int i; duplicate_free = 0; for (i = 0; i < beio->num_segs; i++) { if (beio->sg_segs[i].addr == NULL) duplicate_free++; uma_zfree(beio->lun->lun_zone, beio->sg_segs[i].addr); beio->sg_segs[i].addr = NULL; /* For compare we had two equal S/G lists. */ if (ARGS(beio->io)->flags & CTL_LLF_COMPARE) { uma_zfree(beio->lun->lun_zone, beio->sg_segs[i + CTLBLK_HALF_SEGS].addr); beio->sg_segs[i + CTLBLK_HALF_SEGS].addr = NULL; } } if (duplicate_free > 0) { printf("%s: %d duplicate frees out of %d segments\n", __func__, duplicate_free, beio->num_segs); } uma_zfree(beio_zone, beio); } static void ctl_complete_beio(struct ctl_be_block_io *beio) { union ctl_io *io = beio->io; if (beio->beio_cont != NULL) { beio->beio_cont(beio); } else { ctl_free_beio(beio); ctl_data_submit_done(io); } } static size_t cmp(uint8_t *a, uint8_t *b, size_t size) { size_t i; for (i = 0; i < size; i++) { if (a[i] != b[i]) break; } return (i); } static void ctl_be_block_compare(union ctl_io *io) { struct ctl_be_block_io *beio; uint64_t off, res; int i; uint8_t info[8]; beio = (struct ctl_be_block_io *)PRIV(io)->ptr; off = 0; for (i = 0; i < beio->num_segs; i++) { res = cmp(beio->sg_segs[i].addr, beio->sg_segs[i + CTLBLK_HALF_SEGS].addr, beio->sg_segs[i].len); off += res; if (res < beio->sg_segs[i].len) break; } if (i < beio->num_segs) { scsi_u64to8b(off, info); ctl_set_sense(&io->scsiio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_MISCOMPARE, /*asc*/ 0x1D, /*ascq*/ 0x00, /*type*/ SSD_ELEM_INFO, /*size*/ sizeof(info), /*data*/ &info, /*type*/ SSD_ELEM_NONE); } else ctl_set_success(&io->scsiio); } static int ctl_be_block_move_done(union ctl_io *io) { struct ctl_be_block_io *beio; struct ctl_be_block_lun *be_lun; struct ctl_lba_len_flags *lbalen; #ifdef CTL_TIME_IO struct bintime cur_bt; #endif beio = (struct ctl_be_block_io *)PRIV(io)->ptr; be_lun = beio->lun; DPRINTF("entered\n"); #ifdef CTL_TIME_IO getbinuptime(&cur_bt); bintime_sub(&cur_bt, &io->io_hdr.dma_start_bt); bintime_add(&io->io_hdr.dma_bt, &cur_bt); #endif io->io_hdr.num_dmas++; io->scsiio.kern_rel_offset += io->scsiio.kern_data_len; /* * We set status at this point for read commands, and write * commands with errors. */ if (io->io_hdr.flags & CTL_FLAG_ABORT) { ; } else if ((io->io_hdr.port_status == 0) && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE)) { lbalen = ARGS(beio->io); if (lbalen->flags & CTL_LLF_READ) { ctl_set_success(&io->scsiio); } else if (lbalen->flags & CTL_LLF_COMPARE) { /* We have two data blocks ready for comparison. */ ctl_be_block_compare(io); } } else if ((io->io_hdr.port_status != 0) && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE || (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) { /* * For hardware error sense keys, the sense key * specific value is defined to be a retry count, * but we use it to pass back an internal FETD * error code. XXX KDM Hopefully the FETD is only * using 16 bits for an error code, since that's * all the space we have in the sks field. */ ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1, /*retry_count*/ io->io_hdr.port_status); } /* * If this is a read, or a write with errors, it is done. */ if ((beio->bio_cmd == BIO_READ) || ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0) || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE)) { ctl_complete_beio(beio); return (0); } /* * At this point, we have a write and the DMA completed * successfully. We now have to queue it to the task queue to * execute the backend I/O. That is because we do blocking * memory allocations, and in the file backing case, blocking I/O. * This move done routine is generally called in the SIM's * interrupt context, and therefore we cannot block. */ mtx_lock(&be_lun->queue_lock); STAILQ_INSERT_TAIL(&be_lun->datamove_queue, &io->io_hdr, links); mtx_unlock(&be_lun->queue_lock); taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task); return (0); } static void ctl_be_block_biodone(struct bio *bio) { struct ctl_be_block_io *beio; struct ctl_be_block_lun *be_lun; union ctl_io *io; int error; beio = bio->bio_caller1; be_lun = beio->lun; io = beio->io; DPRINTF("entered\n"); error = bio->bio_error; mtx_lock(&be_lun->io_lock); if (error != 0) beio->num_errors++; beio->num_bios_done++; /* * XXX KDM will this cause WITNESS to complain? Holding a lock * during the free might cause it to complain. */ g_destroy_bio(bio); /* * If the send complete bit isn't set, or we aren't the last I/O to * complete, then we're done. */ if ((beio->send_complete == 0) || (beio->num_bios_done < beio->num_bios_sent)) { mtx_unlock(&be_lun->io_lock); return; } /* * At this point, we've verified that we are the last I/O to * complete, so it's safe to drop the lock. */ devstat_end_transaction(beio->lun->disk_stats, beio->io_len, beio->ds_tag_type, beio->ds_trans_type, /*now*/ NULL, /*then*/&beio->ds_t0); mtx_unlock(&be_lun->io_lock); /* * If there are any errors from the backing device, we fail the * entire I/O with a medium error. */ if (beio->num_errors > 0) { if (error == EOPNOTSUPP) { ctl_set_invalid_opcode(&io->scsiio); } else if (error == ENOSPC || error == EDQUOT) { ctl_set_space_alloc_fail(&io->scsiio); } else if (error == EROFS || error == EACCES) { ctl_set_hw_write_protected(&io->scsiio); } else if (beio->bio_cmd == BIO_FLUSH) { /* XXX KDM is there is a better error here? */ ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1, /*retry_count*/ 0xbad2); } else { ctl_set_medium_error(&io->scsiio, beio->bio_cmd == BIO_READ); } ctl_complete_beio(beio); return; } /* * If this is a write, a flush, a delete or verify, we're all done. * If this is a read, we can now send the data to the user. */ if ((beio->bio_cmd == BIO_WRITE) || (beio->bio_cmd == BIO_FLUSH) || (beio->bio_cmd == BIO_DELETE) || (ARGS(io)->flags & CTL_LLF_VERIFY)) { ctl_set_success(&io->scsiio); ctl_complete_beio(beio); } else { if ((ARGS(io)->flags & CTL_LLF_READ) && beio->beio_cont == NULL) { ctl_set_success(&io->scsiio); ctl_serseq_done(io); } #ifdef CTL_TIME_IO getbinuptime(&io->io_hdr.dma_start_bt); #endif ctl_datamove(io); } } static void ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { union ctl_io *io = beio->io; struct mount *mountpoint; int error, lock_flags; DPRINTF("entered\n"); binuptime(&beio->ds_t0); mtx_lock(&be_lun->io_lock); devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0); mtx_unlock(&be_lun->io_lock); (void) vn_start_write(be_lun->vn, &mountpoint, V_WAIT); if (MNT_SHARED_WRITES(mountpoint) || ((mountpoint == NULL) && MNT_SHARED_WRITES(be_lun->vn->v_mount))) lock_flags = LK_SHARED; else lock_flags = LK_EXCLUSIVE; vn_lock(be_lun->vn, lock_flags | LK_RETRY); error = VOP_FSYNC(be_lun->vn, beio->io_arg ? MNT_NOWAIT : MNT_WAIT, curthread); VOP_UNLOCK(be_lun->vn, 0); vn_finished_write(mountpoint); mtx_lock(&be_lun->io_lock); devstat_end_transaction(beio->lun->disk_stats, beio->io_len, beio->ds_tag_type, beio->ds_trans_type, /*now*/ NULL, /*then*/&beio->ds_t0); mtx_unlock(&be_lun->io_lock); if (error == 0) ctl_set_success(&io->scsiio); else { /* XXX KDM is there is a better error here? */ ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1, /*retry_count*/ 0xbad1); } ctl_complete_beio(beio); } -SDT_PROBE_DEFINE1(cbb, kernel, read, file_start, "uint64_t"); -SDT_PROBE_DEFINE1(cbb, kernel, write, file_start, "uint64_t"); -SDT_PROBE_DEFINE1(cbb, kernel, read, file_done,"uint64_t"); -SDT_PROBE_DEFINE1(cbb, kernel, write, file_done, "uint64_t"); +SDT_PROBE_DEFINE1(cbb, , read, file_start, "uint64_t"); +SDT_PROBE_DEFINE1(cbb, , write, file_start, "uint64_t"); +SDT_PROBE_DEFINE1(cbb, , read, file_done,"uint64_t"); +SDT_PROBE_DEFINE1(cbb, , write, file_done, "uint64_t"); static void ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { struct ctl_be_block_filedata *file_data; union ctl_io *io; struct uio xuio; struct iovec *xiovec; size_t s; int error, flags, i; DPRINTF("entered\n"); file_data = &be_lun->backend.file; io = beio->io; flags = 0; if (ARGS(io)->flags & CTL_LLF_DPO) flags |= IO_DIRECT; if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA) flags |= IO_SYNC; bzero(&xuio, sizeof(xuio)); if (beio->bio_cmd == BIO_READ) { - SDT_PROBE(cbb, kernel, read, file_start, 0, 0, 0, 0, 0); + SDT_PROBE0(cbb, , read, file_start); xuio.uio_rw = UIO_READ; } else { - SDT_PROBE(cbb, kernel, write, file_start, 0, 0, 0, 0, 0); + SDT_PROBE0(cbb, , write, file_start); xuio.uio_rw = UIO_WRITE; } xuio.uio_offset = beio->io_offset; xuio.uio_resid = beio->io_len; xuio.uio_segflg = UIO_SYSSPACE; xuio.uio_iov = beio->xiovecs; xuio.uio_iovcnt = beio->num_segs; xuio.uio_td = curthread; for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) { xiovec->iov_base = beio->sg_segs[i].addr; xiovec->iov_len = beio->sg_segs[i].len; } binuptime(&beio->ds_t0); mtx_lock(&be_lun->io_lock); devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0); mtx_unlock(&be_lun->io_lock); if (beio->bio_cmd == BIO_READ) { vn_lock(be_lun->vn, LK_SHARED | LK_RETRY); /* * UFS pays attention to IO_DIRECT for reads. If the * DIRECTIO option is configured into the kernel, it calls * ffs_rawread(). But that only works for single-segment * uios with user space addresses. In our case, with a * kernel uio, it still reads into the buffer cache, but it * will just try to release the buffer from the cache later * on in ffs_read(). * * ZFS does not pay attention to IO_DIRECT for reads. * * UFS does not pay attention to IO_SYNC for reads. * * ZFS pays attention to IO_SYNC (which translates into the * Solaris define FRSYNC for zfs_read()) for reads. It * attempts to sync the file before reading. */ error = VOP_READ(be_lun->vn, &xuio, flags, file_data->cred); VOP_UNLOCK(be_lun->vn, 0); - SDT_PROBE(cbb, kernel, read, file_done, 0, 0, 0, 0, 0); + SDT_PROBE0(cbb, , read, file_done); if (error == 0 && xuio.uio_resid > 0) { /* * If we red less then requested (EOF), then * we should clean the rest of the buffer. */ s = beio->io_len - xuio.uio_resid; for (i = 0; i < beio->num_segs; i++) { if (s >= beio->sg_segs[i].len) { s -= beio->sg_segs[i].len; continue; } bzero((uint8_t *)beio->sg_segs[i].addr + s, beio->sg_segs[i].len - s); s = 0; } } } else { struct mount *mountpoint; int lock_flags; (void)vn_start_write(be_lun->vn, &mountpoint, V_WAIT); if (MNT_SHARED_WRITES(mountpoint) || ((mountpoint == NULL) && MNT_SHARED_WRITES(be_lun->vn->v_mount))) lock_flags = LK_SHARED; else lock_flags = LK_EXCLUSIVE; vn_lock(be_lun->vn, lock_flags | LK_RETRY); /* * UFS pays attention to IO_DIRECT for writes. The write * is done asynchronously. (Normally the write would just * get put into cache. * * UFS pays attention to IO_SYNC for writes. It will * attempt to write the buffer out synchronously if that * flag is set. * * ZFS does not pay attention to IO_DIRECT for writes. * * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) * for writes. It will flush the transaction from the * cache before returning. */ error = VOP_WRITE(be_lun->vn, &xuio, flags, file_data->cred); VOP_UNLOCK(be_lun->vn, 0); vn_finished_write(mountpoint); - SDT_PROBE(cbb, kernel, write, file_done, 0, 0, 0, 0, 0); + SDT_PROBE0(cbb, , write, file_done); } mtx_lock(&be_lun->io_lock); devstat_end_transaction(beio->lun->disk_stats, beio->io_len, beio->ds_tag_type, beio->ds_trans_type, /*now*/ NULL, /*then*/&beio->ds_t0); mtx_unlock(&be_lun->io_lock); /* * If we got an error, set the sense data to "MEDIUM ERROR" and * return the I/O to the user. */ if (error != 0) { if (error == ENOSPC || error == EDQUOT) { ctl_set_space_alloc_fail(&io->scsiio); } else if (error == EROFS || error == EACCES) { ctl_set_hw_write_protected(&io->scsiio); } else { ctl_set_medium_error(&io->scsiio, beio->bio_cmd == BIO_READ); } ctl_complete_beio(beio); return; } /* * If this is a write or a verify, we're all done. * If this is a read, we can now send the data to the user. */ if ((beio->bio_cmd == BIO_WRITE) || (ARGS(io)->flags & CTL_LLF_VERIFY)) { ctl_set_success(&io->scsiio); ctl_complete_beio(beio); } else { if ((ARGS(io)->flags & CTL_LLF_READ) && beio->beio_cont == NULL) { ctl_set_success(&io->scsiio); ctl_serseq_done(io); } #ifdef CTL_TIME_IO getbinuptime(&io->io_hdr.dma_start_bt); #endif ctl_datamove(io); } } static void ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { union ctl_io *io = beio->io; struct ctl_lba_len_flags *lbalen = ARGS(io); struct scsi_get_lba_status_data *data; off_t roff, off; int error, status; DPRINTF("entered\n"); off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize; vn_lock(be_lun->vn, LK_SHARED | LK_RETRY); error = VOP_IOCTL(be_lun->vn, FIOSEEKHOLE, &off, 0, curthread->td_ucred, curthread); if (error == 0 && off > roff) status = 0; /* mapped up to off */ else { error = VOP_IOCTL(be_lun->vn, FIOSEEKDATA, &off, 0, curthread->td_ucred, curthread); if (error == 0 && off > roff) status = 1; /* deallocated up to off */ else { status = 0; /* unknown up to the end */ off = be_lun->size_bytes; } } VOP_UNLOCK(be_lun->vn, 0); data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr; scsi_u64to8b(lbalen->lba, data->descr[0].addr); scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize - lbalen->lba), data->descr[0].length); data->descr[0].status = status; ctl_complete_beio(beio); } static uint64_t ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun, const char *attrname) { struct vattr vattr; struct statfs statfs; uint64_t val; int error; val = UINT64_MAX; if (be_lun->vn == NULL) return (val); vn_lock(be_lun->vn, LK_SHARED | LK_RETRY); if (strcmp(attrname, "blocksused") == 0) { error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred); if (error == 0) val = vattr.va_bytes / be_lun->cbe_lun.blocksize; } if (strcmp(attrname, "blocksavail") == 0 && (be_lun->vn->v_iflag & VI_DOOMED) == 0) { error = VFS_STATFS(be_lun->vn->v_mount, &statfs); if (error == 0) val = statfs.f_bavail * statfs.f_bsize / be_lun->cbe_lun.blocksize; } VOP_UNLOCK(be_lun->vn, 0); return (val); } static void ctl_be_block_dispatch_zvol(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { union ctl_io *io; struct cdevsw *csw; struct cdev *dev; struct uio xuio; struct iovec *xiovec; int error, flags, i, ref; DPRINTF("entered\n"); io = beio->io; flags = 0; if (ARGS(io)->flags & CTL_LLF_DPO) flags |= IO_DIRECT; if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA) flags |= IO_SYNC; bzero(&xuio, sizeof(xuio)); if (beio->bio_cmd == BIO_READ) { - SDT_PROBE(cbb, kernel, read, file_start, 0, 0, 0, 0, 0); + SDT_PROBE0(cbb, , read, file_start); xuio.uio_rw = UIO_READ; } else { - SDT_PROBE(cbb, kernel, write, file_start, 0, 0, 0, 0, 0); + SDT_PROBE0(cbb, , write, file_start); xuio.uio_rw = UIO_WRITE; } xuio.uio_offset = beio->io_offset; xuio.uio_resid = beio->io_len; xuio.uio_segflg = UIO_SYSSPACE; xuio.uio_iov = beio->xiovecs; xuio.uio_iovcnt = beio->num_segs; xuio.uio_td = curthread; for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) { xiovec->iov_base = beio->sg_segs[i].addr; xiovec->iov_len = beio->sg_segs[i].len; } binuptime(&beio->ds_t0); mtx_lock(&be_lun->io_lock); devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0); mtx_unlock(&be_lun->io_lock); csw = devvn_refthread(be_lun->vn, &dev, &ref); if (csw) { if (beio->bio_cmd == BIO_READ) error = csw->d_read(dev, &xuio, flags); else error = csw->d_write(dev, &xuio, flags); dev_relthread(dev, ref); } else error = ENXIO; if (beio->bio_cmd == BIO_READ) - SDT_PROBE(cbb, kernel, read, file_done, 0, 0, 0, 0, 0); + SDT_PROBE0(cbb, , read, file_done); else - SDT_PROBE(cbb, kernel, write, file_done, 0, 0, 0, 0, 0); + SDT_PROBE0(cbb, , write, file_done); mtx_lock(&be_lun->io_lock); devstat_end_transaction(beio->lun->disk_stats, beio->io_len, beio->ds_tag_type, beio->ds_trans_type, /*now*/ NULL, /*then*/&beio->ds_t0); mtx_unlock(&be_lun->io_lock); /* * If we got an error, set the sense data to "MEDIUM ERROR" and * return the I/O to the user. */ if (error != 0) { if (error == ENOSPC || error == EDQUOT) { ctl_set_space_alloc_fail(&io->scsiio); } else if (error == EROFS || error == EACCES) { ctl_set_hw_write_protected(&io->scsiio); } else { ctl_set_medium_error(&io->scsiio, beio->bio_cmd == BIO_READ); } ctl_complete_beio(beio); return; } /* * If this is a write or a verify, we're all done. * If this is a read, we can now send the data to the user. */ if ((beio->bio_cmd == BIO_WRITE) || (ARGS(io)->flags & CTL_LLF_VERIFY)) { ctl_set_success(&io->scsiio); ctl_complete_beio(beio); } else { if ((ARGS(io)->flags & CTL_LLF_READ) && beio->beio_cont == NULL) { ctl_set_success(&io->scsiio); ctl_serseq_done(io); } #ifdef CTL_TIME_IO getbinuptime(&io->io_hdr.dma_start_bt); #endif ctl_datamove(io); } } static void ctl_be_block_gls_zvol(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { union ctl_io *io = beio->io; struct cdevsw *csw; struct cdev *dev; struct ctl_lba_len_flags *lbalen = ARGS(io); struct scsi_get_lba_status_data *data; off_t roff, off; int error, ref, status; DPRINTF("entered\n"); csw = devvn_refthread(be_lun->vn, &dev, &ref); if (csw == NULL) { status = 0; /* unknown up to the end */ off = be_lun->size_bytes; goto done; } off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize; error = csw->d_ioctl(dev, FIOSEEKHOLE, (caddr_t)&off, FREAD, curthread); if (error == 0 && off > roff) status = 0; /* mapped up to off */ else { error = csw->d_ioctl(dev, FIOSEEKDATA, (caddr_t)&off, FREAD, curthread); if (error == 0 && off > roff) status = 1; /* deallocated up to off */ else { status = 0; /* unknown up to the end */ off = be_lun->size_bytes; } } dev_relthread(dev, ref); done: data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr; scsi_u64to8b(lbalen->lba, data->descr[0].addr); scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize - lbalen->lba), data->descr[0].length); data->descr[0].status = status; ctl_complete_beio(beio); } static void ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { struct bio *bio; struct cdevsw *csw; struct cdev *dev; int ref; DPRINTF("entered\n"); /* This can't fail, it's a blocking allocation. */ bio = g_alloc_bio(); bio->bio_cmd = BIO_FLUSH; bio->bio_offset = 0; bio->bio_data = 0; bio->bio_done = ctl_be_block_biodone; bio->bio_caller1 = beio; bio->bio_pblkno = 0; /* * We don't need to acquire the LUN lock here, because we are only * sending one bio, and so there is no other context to synchronize * with. */ beio->num_bios_sent = 1; beio->send_complete = 1; binuptime(&beio->ds_t0); mtx_lock(&be_lun->io_lock); devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0); mtx_unlock(&be_lun->io_lock); csw = devvn_refthread(be_lun->vn, &dev, &ref); if (csw) { bio->bio_dev = dev; csw->d_strategy(bio); dev_relthread(dev, ref); } else { bio->bio_error = ENXIO; ctl_be_block_biodone(bio); } } static void ctl_be_block_unmap_dev_range(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio, uint64_t off, uint64_t len, int last) { struct bio *bio; uint64_t maxlen; struct cdevsw *csw; struct cdev *dev; int ref; csw = devvn_refthread(be_lun->vn, &dev, &ref); maxlen = LONG_MAX - (LONG_MAX % be_lun->cbe_lun.blocksize); while (len > 0) { bio = g_alloc_bio(); bio->bio_cmd = BIO_DELETE; bio->bio_dev = dev; bio->bio_offset = off; bio->bio_length = MIN(len, maxlen); bio->bio_data = 0; bio->bio_done = ctl_be_block_biodone; bio->bio_caller1 = beio; bio->bio_pblkno = off / be_lun->cbe_lun.blocksize; off += bio->bio_length; len -= bio->bio_length; mtx_lock(&be_lun->io_lock); beio->num_bios_sent++; if (last && len == 0) beio->send_complete = 1; mtx_unlock(&be_lun->io_lock); if (csw) { csw->d_strategy(bio); } else { bio->bio_error = ENXIO; ctl_be_block_biodone(bio); } } if (csw) dev_relthread(dev, ref); } static void ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { union ctl_io *io; struct ctl_ptr_len_flags *ptrlen; struct scsi_unmap_desc *buf, *end; uint64_t len; io = beio->io; DPRINTF("entered\n"); binuptime(&beio->ds_t0); mtx_lock(&be_lun->io_lock); devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0); mtx_unlock(&be_lun->io_lock); if (beio->io_offset == -1) { beio->io_len = 0; ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; buf = (struct scsi_unmap_desc *)ptrlen->ptr; end = buf + ptrlen->len / sizeof(*buf); for (; buf < end; buf++) { len = (uint64_t)scsi_4btoul(buf->length) * be_lun->cbe_lun.blocksize; beio->io_len += len; ctl_be_block_unmap_dev_range(be_lun, beio, scsi_8btou64(buf->lba) * be_lun->cbe_lun.blocksize, len, (end - buf < 2) ? TRUE : FALSE); } } else ctl_be_block_unmap_dev_range(be_lun, beio, beio->io_offset, beio->io_len, TRUE); } static void ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue); struct bio *bio; struct cdevsw *csw; struct cdev *dev; off_t cur_offset; int i, max_iosize, ref; DPRINTF("entered\n"); csw = devvn_refthread(be_lun->vn, &dev, &ref); /* * We have to limit our I/O size to the maximum supported by the * backend device. Hopefully it is MAXPHYS. If the driver doesn't * set it properly, use DFLTPHYS. */ if (csw) { max_iosize = dev->si_iosize_max; if (max_iosize < PAGE_SIZE) max_iosize = DFLTPHYS; } else max_iosize = DFLTPHYS; cur_offset = beio->io_offset; for (i = 0; i < beio->num_segs; i++) { size_t cur_size; uint8_t *cur_ptr; cur_size = beio->sg_segs[i].len; cur_ptr = beio->sg_segs[i].addr; while (cur_size > 0) { /* This can't fail, it's a blocking allocation. */ bio = g_alloc_bio(); KASSERT(bio != NULL, ("g_alloc_bio() failed!\n")); bio->bio_cmd = beio->bio_cmd; bio->bio_dev = dev; bio->bio_caller1 = beio; bio->bio_length = min(cur_size, max_iosize); bio->bio_offset = cur_offset; bio->bio_data = cur_ptr; bio->bio_done = ctl_be_block_biodone; bio->bio_pblkno = cur_offset / be_lun->cbe_lun.blocksize; cur_offset += bio->bio_length; cur_ptr += bio->bio_length; cur_size -= bio->bio_length; TAILQ_INSERT_TAIL(&queue, bio, bio_queue); beio->num_bios_sent++; } } binuptime(&beio->ds_t0); mtx_lock(&be_lun->io_lock); devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0); beio->send_complete = 1; mtx_unlock(&be_lun->io_lock); /* * Fire off all allocated requests! */ while ((bio = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, bio, bio_queue); if (csw) csw->d_strategy(bio); else { bio->bio_error = ENXIO; ctl_be_block_biodone(bio); } } if (csw) dev_relthread(dev, ref); } static uint64_t ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun, const char *attrname) { struct diocgattr_arg arg; struct cdevsw *csw; struct cdev *dev; int error, ref; csw = devvn_refthread(be_lun->vn, &dev, &ref); if (csw == NULL) return (UINT64_MAX); strlcpy(arg.name, attrname, sizeof(arg.name)); arg.len = sizeof(arg.value.off); if (csw->d_ioctl) { error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD, curthread); } else error = ENODEV; dev_relthread(dev, ref); if (error != 0) return (UINT64_MAX); return (arg.value.off); } static void ctl_be_block_cw_dispatch_sync(struct ctl_be_block_lun *be_lun, union ctl_io *io) { struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; struct ctl_be_block_io *beio; struct ctl_lba_len_flags *lbalen; DPRINTF("entered\n"); beio = (struct ctl_be_block_io *)PRIV(io)->ptr; lbalen = (struct ctl_lba_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; beio->io_len = lbalen->len * cbe_lun->blocksize; beio->io_offset = lbalen->lba * cbe_lun->blocksize; beio->io_arg = (lbalen->flags & SSC_IMMED) != 0; beio->bio_cmd = BIO_FLUSH; beio->ds_trans_type = DEVSTAT_NO_DATA; DPRINTF("SYNC\n"); be_lun->lun_flush(be_lun, beio); } static void ctl_be_block_cw_done_ws(struct ctl_be_block_io *beio) { union ctl_io *io; io = beio->io; ctl_free_beio(beio); if ((io->io_hdr.flags & CTL_FLAG_ABORT) || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE && (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) { ctl_config_write_done(io); return; } ctl_be_block_config_write(io); } static void ctl_be_block_cw_dispatch_ws(struct ctl_be_block_lun *be_lun, union ctl_io *io) { struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; struct ctl_be_block_io *beio; struct ctl_lba_len_flags *lbalen; uint64_t len_left, lba; uint32_t pb, pbo, adj; int i, seglen; uint8_t *buf, *end; DPRINTF("entered\n"); beio = (struct ctl_be_block_io *)PRIV(io)->ptr; lbalen = ARGS(beio->io); if (lbalen->flags & ~(SWS_LBDATA | SWS_UNMAP | SWS_ANCHOR | SWS_NDOB) || (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR) && be_lun->unmap == NULL)) { ctl_free_beio(beio); ctl_set_invalid_field(&io->scsiio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 1, /*bit_valid*/ 0, /*bit*/ 0); ctl_config_write_done(io); return; } if (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR)) { beio->io_offset = lbalen->lba * cbe_lun->blocksize; beio->io_len = (uint64_t)lbalen->len * cbe_lun->blocksize; beio->bio_cmd = BIO_DELETE; beio->ds_trans_type = DEVSTAT_FREE; be_lun->unmap(be_lun, beio); return; } beio->bio_cmd = BIO_WRITE; beio->ds_trans_type = DEVSTAT_WRITE; DPRINTF("WRITE SAME at LBA %jx len %u\n", (uintmax_t)lbalen->lba, lbalen->len); pb = cbe_lun->blocksize << be_lun->cbe_lun.pblockexp; if (be_lun->cbe_lun.pblockoff > 0) pbo = pb - cbe_lun->blocksize * be_lun->cbe_lun.pblockoff; else pbo = 0; len_left = (uint64_t)lbalen->len * cbe_lun->blocksize; for (i = 0, lba = 0; i < CTLBLK_MAX_SEGS && len_left > 0; i++) { /* * Setup the S/G entry for this chunk. */ seglen = MIN(CTLBLK_MAX_SEG, len_left); if (pb > cbe_lun->blocksize) { adj = ((lbalen->lba + lba) * cbe_lun->blocksize + seglen - pbo) % pb; if (seglen > adj) seglen -= adj; else seglen -= seglen % cbe_lun->blocksize; } else seglen -= seglen % cbe_lun->blocksize; beio->sg_segs[i].len = seglen; beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK); DPRINTF("segment %d addr %p len %zd\n", i, beio->sg_segs[i].addr, beio->sg_segs[i].len); beio->num_segs++; len_left -= seglen; buf = beio->sg_segs[i].addr; end = buf + seglen; for (; buf < end; buf += cbe_lun->blocksize) { if (lbalen->flags & SWS_NDOB) { memset(buf, 0, cbe_lun->blocksize); } else { memcpy(buf, io->scsiio.kern_data_ptr, cbe_lun->blocksize); } if (lbalen->flags & SWS_LBDATA) scsi_ulto4b(lbalen->lba + lba, buf); lba++; } } beio->io_offset = lbalen->lba * cbe_lun->blocksize; beio->io_len = lba * cbe_lun->blocksize; /* We can not do all in one run. Correct and schedule rerun. */ if (len_left > 0) { lbalen->lba += lba; lbalen->len -= lba; beio->beio_cont = ctl_be_block_cw_done_ws; } be_lun->dispatch(be_lun, beio); } static void ctl_be_block_cw_dispatch_unmap(struct ctl_be_block_lun *be_lun, union ctl_io *io) { struct ctl_be_block_io *beio; struct ctl_ptr_len_flags *ptrlen; DPRINTF("entered\n"); beio = (struct ctl_be_block_io *)PRIV(io)->ptr; ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; if ((ptrlen->flags & ~SU_ANCHOR) != 0 || be_lun->unmap == NULL) { ctl_free_beio(beio); ctl_set_invalid_field(&io->scsiio, /*sks_valid*/ 0, /*command*/ 1, /*field*/ 0, /*bit_valid*/ 0, /*bit*/ 0); ctl_config_write_done(io); return; } beio->io_len = 0; beio->io_offset = -1; beio->bio_cmd = BIO_DELETE; beio->ds_trans_type = DEVSTAT_FREE; DPRINTF("UNMAP\n"); be_lun->unmap(be_lun, beio); } static void ctl_be_block_cr_done(struct ctl_be_block_io *beio) { union ctl_io *io; io = beio->io; ctl_free_beio(beio); ctl_config_read_done(io); } static void ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun, union ctl_io *io) { struct ctl_be_block_io *beio; struct ctl_be_block_softc *softc; DPRINTF("entered\n"); softc = be_lun->softc; beio = ctl_alloc_beio(softc); beio->io = io; beio->lun = be_lun; beio->beio_cont = ctl_be_block_cr_done; PRIV(io)->ptr = (void *)beio; switch (io->scsiio.cdb[0]) { case SERVICE_ACTION_IN: /* GET LBA STATUS */ beio->bio_cmd = -1; beio->ds_trans_type = DEVSTAT_NO_DATA; beio->ds_tag_type = DEVSTAT_TAG_ORDERED; beio->io_len = 0; if (be_lun->get_lba_status) be_lun->get_lba_status(be_lun, beio); else ctl_be_block_cr_done(beio); break; default: panic("Unhandled CDB type %#x", io->scsiio.cdb[0]); break; } } static void ctl_be_block_cw_done(struct ctl_be_block_io *beio) { union ctl_io *io; io = beio->io; ctl_free_beio(beio); ctl_config_write_done(io); } static void ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun, union ctl_io *io) { struct ctl_be_block_io *beio; struct ctl_be_block_softc *softc; DPRINTF("entered\n"); softc = be_lun->softc; beio = ctl_alloc_beio(softc); beio->io = io; beio->lun = be_lun; beio->beio_cont = ctl_be_block_cw_done; switch (io->scsiio.tag_type) { case CTL_TAG_ORDERED: beio->ds_tag_type = DEVSTAT_TAG_ORDERED; break; case CTL_TAG_HEAD_OF_QUEUE: beio->ds_tag_type = DEVSTAT_TAG_HEAD; break; case CTL_TAG_UNTAGGED: case CTL_TAG_SIMPLE: case CTL_TAG_ACA: default: beio->ds_tag_type = DEVSTAT_TAG_SIMPLE; break; } PRIV(io)->ptr = (void *)beio; switch (io->scsiio.cdb[0]) { case SYNCHRONIZE_CACHE: case SYNCHRONIZE_CACHE_16: ctl_be_block_cw_dispatch_sync(be_lun, io); break; case WRITE_SAME_10: case WRITE_SAME_16: ctl_be_block_cw_dispatch_ws(be_lun, io); break; case UNMAP: ctl_be_block_cw_dispatch_unmap(be_lun, io); break; default: panic("Unhandled CDB type %#x", io->scsiio.cdb[0]); break; } } -SDT_PROBE_DEFINE1(cbb, kernel, read, start, "uint64_t"); -SDT_PROBE_DEFINE1(cbb, kernel, write, start, "uint64_t"); -SDT_PROBE_DEFINE1(cbb, kernel, read, alloc_done, "uint64_t"); -SDT_PROBE_DEFINE1(cbb, kernel, write, alloc_done, "uint64_t"); +SDT_PROBE_DEFINE1(cbb, , read, start, "uint64_t"); +SDT_PROBE_DEFINE1(cbb, , write, start, "uint64_t"); +SDT_PROBE_DEFINE1(cbb, , read, alloc_done, "uint64_t"); +SDT_PROBE_DEFINE1(cbb, , write, alloc_done, "uint64_t"); static void ctl_be_block_next(struct ctl_be_block_io *beio) { struct ctl_be_block_lun *be_lun; union ctl_io *io; io = beio->io; be_lun = beio->lun; ctl_free_beio(beio); if ((io->io_hdr.flags & CTL_FLAG_ABORT) || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE && (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) { ctl_data_submit_done(io); return; } io->io_hdr.status &= ~CTL_STATUS_MASK; io->io_hdr.status |= CTL_STATUS_NONE; mtx_lock(&be_lun->queue_lock); STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links); mtx_unlock(&be_lun->queue_lock); taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task); } static void ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun, union ctl_io *io) { struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; struct ctl_be_block_io *beio; struct ctl_be_block_softc *softc; struct ctl_lba_len_flags *lbalen; struct ctl_ptr_len_flags *bptrlen; uint64_t len_left, lbas; int i; softc = be_lun->softc; DPRINTF("entered\n"); lbalen = ARGS(io); if (lbalen->flags & CTL_LLF_WRITE) { - SDT_PROBE(cbb, kernel, write, start, 0, 0, 0, 0, 0); + SDT_PROBE0(cbb, , write, start); } else { - SDT_PROBE(cbb, kernel, read, start, 0, 0, 0, 0, 0); + SDT_PROBE0(cbb, , read, start); } beio = ctl_alloc_beio(softc); beio->io = io; beio->lun = be_lun; bptrlen = PRIV(io); bptrlen->ptr = (void *)beio; switch (io->scsiio.tag_type) { case CTL_TAG_ORDERED: beio->ds_tag_type = DEVSTAT_TAG_ORDERED; break; case CTL_TAG_HEAD_OF_QUEUE: beio->ds_tag_type = DEVSTAT_TAG_HEAD; break; case CTL_TAG_UNTAGGED: case CTL_TAG_SIMPLE: case CTL_TAG_ACA: default: beio->ds_tag_type = DEVSTAT_TAG_SIMPLE; break; } if (lbalen->flags & CTL_LLF_WRITE) { beio->bio_cmd = BIO_WRITE; beio->ds_trans_type = DEVSTAT_WRITE; } else { beio->bio_cmd = BIO_READ; beio->ds_trans_type = DEVSTAT_READ; } DPRINTF("%s at LBA %jx len %u @%ju\n", (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE", (uintmax_t)lbalen->lba, lbalen->len, bptrlen->len); if (lbalen->flags & CTL_LLF_COMPARE) lbas = CTLBLK_HALF_IO_SIZE; else lbas = CTLBLK_MAX_IO_SIZE; lbas = MIN(lbalen->len - bptrlen->len, lbas / cbe_lun->blocksize); beio->io_offset = (lbalen->lba + bptrlen->len) * cbe_lun->blocksize; beio->io_len = lbas * cbe_lun->blocksize; bptrlen->len += lbas; for (i = 0, len_left = beio->io_len; len_left > 0; i++) { KASSERT(i < CTLBLK_MAX_SEGS, ("Too many segs (%d >= %d)", i, CTLBLK_MAX_SEGS)); /* * Setup the S/G entry for this chunk. */ beio->sg_segs[i].len = min(CTLBLK_MAX_SEG, len_left); beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK); DPRINTF("segment %d addr %p len %zd\n", i, beio->sg_segs[i].addr, beio->sg_segs[i].len); /* Set up second segment for compare operation. */ if (lbalen->flags & CTL_LLF_COMPARE) { beio->sg_segs[i + CTLBLK_HALF_SEGS].len = beio->sg_segs[i].len; beio->sg_segs[i + CTLBLK_HALF_SEGS].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK); } beio->num_segs++; len_left -= beio->sg_segs[i].len; } if (bptrlen->len < lbalen->len) beio->beio_cont = ctl_be_block_next; io->scsiio.be_move_done = ctl_be_block_move_done; /* For compare we have separate S/G lists for read and datamove. */ if (lbalen->flags & CTL_LLF_COMPARE) io->scsiio.kern_data_ptr = (uint8_t *)&beio->sg_segs[CTLBLK_HALF_SEGS]; else io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs; io->scsiio.kern_data_len = beio->io_len; io->scsiio.kern_data_resid = 0; io->scsiio.kern_sg_entries = beio->num_segs; io->io_hdr.flags |= CTL_FLAG_ALLOCATED; /* * For the read case, we need to read the data into our buffers and * then we can send it back to the user. For the write case, we * need to get the data from the user first. */ if (beio->bio_cmd == BIO_READ) { - SDT_PROBE(cbb, kernel, read, alloc_done, 0, 0, 0, 0, 0); + SDT_PROBE0(cbb, , read, alloc_done); be_lun->dispatch(be_lun, beio); } else { - SDT_PROBE(cbb, kernel, write, alloc_done, 0, 0, 0, 0, 0); + SDT_PROBE0(cbb, , write, alloc_done); #ifdef CTL_TIME_IO getbinuptime(&io->io_hdr.dma_start_bt); #endif ctl_datamove(io); } } static void ctl_be_block_worker(void *context, int pending) { struct ctl_be_block_lun *be_lun = (struct ctl_be_block_lun *)context; struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; union ctl_io *io; struct ctl_be_block_io *beio; DPRINTF("entered\n"); /* * Fetch and process I/Os from all queues. If we detect LUN * CTL_LUN_FLAG_NO_MEDIA status here -- it is result of a race, * so make response maximally opaque to not confuse initiator. */ for (;;) { mtx_lock(&be_lun->queue_lock); io = (union ctl_io *)STAILQ_FIRST(&be_lun->datamove_queue); if (io != NULL) { DPRINTF("datamove queue\n"); STAILQ_REMOVE(&be_lun->datamove_queue, &io->io_hdr, ctl_io_hdr, links); mtx_unlock(&be_lun->queue_lock); beio = (struct ctl_be_block_io *)PRIV(io)->ptr; if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) { ctl_set_busy(&io->scsiio); ctl_complete_beio(beio); return; } be_lun->dispatch(be_lun, beio); continue; } io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_write_queue); if (io != NULL) { DPRINTF("config write queue\n"); STAILQ_REMOVE(&be_lun->config_write_queue, &io->io_hdr, ctl_io_hdr, links); mtx_unlock(&be_lun->queue_lock); if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) { ctl_set_busy(&io->scsiio); ctl_config_write_done(io); return; } ctl_be_block_cw_dispatch(be_lun, io); continue; } io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_read_queue); if (io != NULL) { DPRINTF("config read queue\n"); STAILQ_REMOVE(&be_lun->config_read_queue, &io->io_hdr, ctl_io_hdr, links); mtx_unlock(&be_lun->queue_lock); if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) { ctl_set_busy(&io->scsiio); ctl_config_read_done(io); return; } ctl_be_block_cr_dispatch(be_lun, io); continue; } io = (union ctl_io *)STAILQ_FIRST(&be_lun->input_queue); if (io != NULL) { DPRINTF("input queue\n"); STAILQ_REMOVE(&be_lun->input_queue, &io->io_hdr, ctl_io_hdr, links); mtx_unlock(&be_lun->queue_lock); if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) { ctl_set_busy(&io->scsiio); ctl_data_submit_done(io); return; } ctl_be_block_dispatch(be_lun, io); continue; } /* * If we get here, there is no work left in the queues, so * just break out and let the task queue go to sleep. */ mtx_unlock(&be_lun->queue_lock); break; } } /* * Entry point from CTL to the backend for I/O. We queue everything to a * work thread, so this just puts the I/O on a queue and wakes up the * thread. */ static int ctl_be_block_submit(union ctl_io *io) { struct ctl_be_block_lun *be_lun; struct ctl_be_lun *cbe_lun; DPRINTF("entered\n"); cbe_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[ CTL_PRIV_BACKEND_LUN].ptr; be_lun = (struct ctl_be_block_lun *)cbe_lun->be_lun; /* * Make sure we only get SCSI I/O. */ KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, ("Non-SCSI I/O (type " "%#x) encountered", io->io_hdr.io_type)); PRIV(io)->len = 0; mtx_lock(&be_lun->queue_lock); STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links); mtx_unlock(&be_lun->queue_lock); taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task); return (CTL_RETVAL_COMPLETE); } static int ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { struct ctl_be_block_softc *softc; int error; softc = &backend_block_softc; error = 0; switch (cmd) { case CTL_LUN_REQ: { struct ctl_lun_req *lun_req; lun_req = (struct ctl_lun_req *)addr; switch (lun_req->reqtype) { case CTL_LUNREQ_CREATE: error = ctl_be_block_create(softc, lun_req); break; case CTL_LUNREQ_RM: error = ctl_be_block_rm(softc, lun_req); break; case CTL_LUNREQ_MODIFY: error = ctl_be_block_modify(softc, lun_req); break; default: lun_req->status = CTL_LUN_ERROR; snprintf(lun_req->error_str, sizeof(lun_req->error_str), "invalid LUN request type %d", lun_req->reqtype); break; } break; } default: error = ENOTTY; break; } return (error); } static int ctl_be_block_open_file(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req) { struct ctl_be_lun *cbe_lun; struct ctl_be_block_filedata *file_data; struct ctl_lun_create_params *params; char *value; struct vattr vattr; off_t ps, pss, po, pos, us, uss, uo, uos; int error; cbe_lun = &be_lun->cbe_lun; file_data = &be_lun->backend.file; params = &be_lun->params; be_lun->dev_type = CTL_BE_BLOCK_FILE; be_lun->dispatch = ctl_be_block_dispatch_file; be_lun->lun_flush = ctl_be_block_flush_file; be_lun->get_lba_status = ctl_be_block_gls_file; be_lun->getattr = ctl_be_block_getattr_file; be_lun->unmap = NULL; cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP; error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred); if (error != 0) { snprintf(req->error_str, sizeof(req->error_str), "error calling VOP_GETATTR() for file %s", be_lun->dev_path); return (error); } file_data->cred = crhold(curthread->td_ucred); if (params->lun_size_bytes != 0) be_lun->size_bytes = params->lun_size_bytes; else be_lun->size_bytes = vattr.va_size; /* * For files we can use any logical block size. Prefer 512 bytes * for compatibility reasons. If file's vattr.va_blocksize * (preferred I/O block size) is bigger and multiple to chosen * logical block size -- report it as physical block size. */ if (params->blocksize_bytes != 0) cbe_lun->blocksize = params->blocksize_bytes; else if (cbe_lun->lun_type == T_CDROM) cbe_lun->blocksize = 2048; else cbe_lun->blocksize = 512; be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize; cbe_lun->maxlba = (be_lun->size_blocks == 0) ? 0 : (be_lun->size_blocks - 1); us = ps = vattr.va_blocksize; uo = po = 0; value = ctl_get_opt(&cbe_lun->options, "pblocksize"); if (value != NULL) ctl_expand_number(value, &ps); value = ctl_get_opt(&cbe_lun->options, "pblockoffset"); if (value != NULL) ctl_expand_number(value, &po); pss = ps / cbe_lun->blocksize; pos = po / cbe_lun->blocksize; if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) && ((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) { cbe_lun->pblockexp = fls(pss) - 1; cbe_lun->pblockoff = (pss - pos) % pss; } value = ctl_get_opt(&cbe_lun->options, "ublocksize"); if (value != NULL) ctl_expand_number(value, &us); value = ctl_get_opt(&cbe_lun->options, "ublockoffset"); if (value != NULL) ctl_expand_number(value, &uo); uss = us / cbe_lun->blocksize; uos = uo / cbe_lun->blocksize; if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) && ((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) { cbe_lun->ublockexp = fls(uss) - 1; cbe_lun->ublockoff = (uss - uos) % uss; } /* * Sanity check. The media size has to be at least one * sector long. */ if (be_lun->size_bytes < cbe_lun->blocksize) { error = EINVAL; snprintf(req->error_str, sizeof(req->error_str), "file %s size %ju < block size %u", be_lun->dev_path, (uintmax_t)be_lun->size_bytes, cbe_lun->blocksize); } cbe_lun->opttxferlen = CTLBLK_MAX_IO_SIZE / cbe_lun->blocksize; return (error); } static int ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req) { struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; struct ctl_lun_create_params *params; struct cdevsw *csw; struct cdev *dev; char *value; int error, atomic, maxio, ref, unmap, tmp; off_t ps, pss, po, pos, us, uss, uo, uos, otmp; params = &be_lun->params; be_lun->dev_type = CTL_BE_BLOCK_DEV; csw = devvn_refthread(be_lun->vn, &dev, &ref); if (csw == NULL) return (ENXIO); if (strcmp(csw->d_name, "zvol") == 0) { be_lun->dispatch = ctl_be_block_dispatch_zvol; be_lun->get_lba_status = ctl_be_block_gls_zvol; atomic = maxio = CTLBLK_MAX_IO_SIZE; } else { be_lun->dispatch = ctl_be_block_dispatch_dev; be_lun->get_lba_status = NULL; atomic = 0; maxio = dev->si_iosize_max; if (maxio <= 0) maxio = DFLTPHYS; if (maxio > CTLBLK_MAX_IO_SIZE) maxio = CTLBLK_MAX_IO_SIZE; } be_lun->lun_flush = ctl_be_block_flush_dev; be_lun->getattr = ctl_be_block_getattr_dev; be_lun->unmap = ctl_be_block_unmap_dev; if (!csw->d_ioctl) { dev_relthread(dev, ref); snprintf(req->error_str, sizeof(req->error_str), "no d_ioctl for device %s!", be_lun->dev_path); return (ENODEV); } error = csw->d_ioctl(dev, DIOCGSECTORSIZE, (caddr_t)&tmp, FREAD, curthread); if (error) { dev_relthread(dev, ref); snprintf(req->error_str, sizeof(req->error_str), "error %d returned for DIOCGSECTORSIZE ioctl " "on %s!", error, be_lun->dev_path); return (error); } /* * If the user has asked for a blocksize that is greater than the * backing device's blocksize, we can do it only if the blocksize * the user is asking for is an even multiple of the underlying * device's blocksize. */ if ((params->blocksize_bytes != 0) && (params->blocksize_bytes >= tmp)) { if (params->blocksize_bytes % tmp == 0) { cbe_lun->blocksize = params->blocksize_bytes; } else { dev_relthread(dev, ref); snprintf(req->error_str, sizeof(req->error_str), "requested blocksize %u is not an even " "multiple of backing device blocksize %u", params->blocksize_bytes, tmp); return (EINVAL); } } else if (params->blocksize_bytes != 0) { dev_relthread(dev, ref); snprintf(req->error_str, sizeof(req->error_str), "requested blocksize %u < backing device " "blocksize %u", params->blocksize_bytes, tmp); return (EINVAL); } else if (cbe_lun->lun_type == T_CDROM) cbe_lun->blocksize = MAX(tmp, 2048); else cbe_lun->blocksize = tmp; error = csw->d_ioctl(dev, DIOCGMEDIASIZE, (caddr_t)&otmp, FREAD, curthread); if (error) { dev_relthread(dev, ref); snprintf(req->error_str, sizeof(req->error_str), "error %d returned for DIOCGMEDIASIZE " " ioctl on %s!", error, be_lun->dev_path); return (error); } if (params->lun_size_bytes != 0) { if (params->lun_size_bytes > otmp) { dev_relthread(dev, ref); snprintf(req->error_str, sizeof(req->error_str), "requested LUN size %ju > backing device " "size %ju", (uintmax_t)params->lun_size_bytes, (uintmax_t)otmp); return (EINVAL); } be_lun->size_bytes = params->lun_size_bytes; } else be_lun->size_bytes = otmp; be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize; cbe_lun->maxlba = (be_lun->size_blocks == 0) ? 0 : (be_lun->size_blocks - 1); error = csw->d_ioctl(dev, DIOCGSTRIPESIZE, (caddr_t)&ps, FREAD, curthread); if (error) ps = po = 0; else { error = csw->d_ioctl(dev, DIOCGSTRIPEOFFSET, (caddr_t)&po, FREAD, curthread); if (error) po = 0; } us = ps; uo = po; value = ctl_get_opt(&cbe_lun->options, "pblocksize"); if (value != NULL) ctl_expand_number(value, &ps); value = ctl_get_opt(&cbe_lun->options, "pblockoffset"); if (value != NULL) ctl_expand_number(value, &po); pss = ps / cbe_lun->blocksize; pos = po / cbe_lun->blocksize; if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) && ((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) { cbe_lun->pblockexp = fls(pss) - 1; cbe_lun->pblockoff = (pss - pos) % pss; } value = ctl_get_opt(&cbe_lun->options, "ublocksize"); if (value != NULL) ctl_expand_number(value, &us); value = ctl_get_opt(&cbe_lun->options, "ublockoffset"); if (value != NULL) ctl_expand_number(value, &uo); uss = us / cbe_lun->blocksize; uos = uo / cbe_lun->blocksize; if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) && ((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) { cbe_lun->ublockexp = fls(uss) - 1; cbe_lun->ublockoff = (uss - uos) % uss; } cbe_lun->atomicblock = atomic / cbe_lun->blocksize; cbe_lun->opttxferlen = maxio / cbe_lun->blocksize; if (be_lun->dispatch == ctl_be_block_dispatch_zvol) { unmap = 1; } else { struct diocgattr_arg arg; strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); arg.len = sizeof(arg.value.i); error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD, curthread); unmap = (error == 0) ? arg.value.i : 0; } value = ctl_get_opt(&cbe_lun->options, "unmap"); if (value != NULL) unmap = (strcmp(value, "on") == 0); if (unmap) cbe_lun->flags |= CTL_LUN_FLAG_UNMAP; else cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP; dev_relthread(dev, ref); return (0); } static int ctl_be_block_close(struct ctl_be_block_lun *be_lun) { struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; int flags; if (be_lun->vn) { flags = FREAD; if ((cbe_lun->flags & CTL_LUN_FLAG_READONLY) == 0) flags |= FWRITE; (void)vn_close(be_lun->vn, flags, NOCRED, curthread); be_lun->vn = NULL; switch (be_lun->dev_type) { case CTL_BE_BLOCK_DEV: break; case CTL_BE_BLOCK_FILE: if (be_lun->backend.file.cred != NULL) { crfree(be_lun->backend.file.cred); be_lun->backend.file.cred = NULL; } break; case CTL_BE_BLOCK_NONE: break; default: panic("Unexpected backend type %d", be_lun->dev_type); break; } be_lun->dev_type = CTL_BE_BLOCK_NONE; } return (0); } static int ctl_be_block_open(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req) { struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; struct nameidata nd; char *value; int error, flags; error = 0; if (rootvnode == NULL) { snprintf(req->error_str, sizeof(req->error_str), "Root filesystem is not mounted"); return (1); } pwd_ensure_dirs(); value = ctl_get_opt(&cbe_lun->options, "file"); if (value == NULL) { snprintf(req->error_str, sizeof(req->error_str), "no file argument specified"); return (1); } free(be_lun->dev_path, M_CTLBLK); be_lun->dev_path = strdup(value, M_CTLBLK); flags = FREAD; value = ctl_get_opt(&cbe_lun->options, "readonly"); if (value != NULL) { if (strcmp(value, "on") != 0) flags |= FWRITE; } else if (cbe_lun->lun_type == T_DIRECT) flags |= FWRITE; again: NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, be_lun->dev_path, curthread); error = vn_open(&nd, &flags, 0, NULL); if ((error == EROFS || error == EACCES) && (flags & FWRITE)) { flags &= ~FWRITE; goto again; } if (error) { /* * This is the only reasonable guess we can make as far as * path if the user doesn't give us a fully qualified path. * If they want to specify a file, they need to specify the * full path. */ if (be_lun->dev_path[0] != '/') { char *dev_name; asprintf(&dev_name, M_CTLBLK, "/dev/%s", be_lun->dev_path); free(be_lun->dev_path, M_CTLBLK); be_lun->dev_path = dev_name; goto again; } snprintf(req->error_str, sizeof(req->error_str), "error opening %s: %d", be_lun->dev_path, error); return (error); } if (flags & FWRITE) cbe_lun->flags &= ~CTL_LUN_FLAG_READONLY; else cbe_lun->flags |= CTL_LUN_FLAG_READONLY; NDFREE(&nd, NDF_ONLY_PNBUF); be_lun->vn = nd.ni_vp; /* We only support disks and files. */ if (vn_isdisk(be_lun->vn, &error)) { error = ctl_be_block_open_dev(be_lun, req); } else if (be_lun->vn->v_type == VREG) { error = ctl_be_block_open_file(be_lun, req); } else { error = EINVAL; snprintf(req->error_str, sizeof(req->error_str), "%s is not a disk or plain file", be_lun->dev_path); } VOP_UNLOCK(be_lun->vn, 0); if (error != 0) ctl_be_block_close(be_lun); cbe_lun->serseq = CTL_LUN_SERSEQ_OFF; if (be_lun->dispatch != ctl_be_block_dispatch_dev) cbe_lun->serseq = CTL_LUN_SERSEQ_READ; value = ctl_get_opt(&cbe_lun->options, "serseq"); if (value != NULL && strcmp(value, "on") == 0) cbe_lun->serseq = CTL_LUN_SERSEQ_ON; else if (value != NULL && strcmp(value, "read") == 0) cbe_lun->serseq = CTL_LUN_SERSEQ_READ; else if (value != NULL && strcmp(value, "off") == 0) cbe_lun->serseq = CTL_LUN_SERSEQ_OFF; return (0); } static int ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req) { struct ctl_be_lun *cbe_lun; struct ctl_be_block_lun *be_lun; struct ctl_lun_create_params *params; char num_thread_str[16]; char tmpstr[32]; char *value; int retval, num_threads; int tmp_num_threads; params = &req->reqdata.create; retval = 0; req->status = CTL_LUN_OK; be_lun = malloc(sizeof(*be_lun), M_CTLBLK, M_ZERO | M_WAITOK); cbe_lun = &be_lun->cbe_lun; cbe_lun->be_lun = be_lun; be_lun->params = req->reqdata.create; be_lun->softc = softc; STAILQ_INIT(&be_lun->input_queue); STAILQ_INIT(&be_lun->config_read_queue); STAILQ_INIT(&be_lun->config_write_queue); STAILQ_INIT(&be_lun->datamove_queue); sprintf(be_lun->lunname, "cblk%d", softc->num_luns); mtx_init(&be_lun->io_lock, "cblk io lock", NULL, MTX_DEF); mtx_init(&be_lun->queue_lock, "cblk queue lock", NULL, MTX_DEF); ctl_init_opts(&cbe_lun->options, req->num_be_args, req->kern_be_args); be_lun->lun_zone = uma_zcreate(be_lun->lunname, CTLBLK_MAX_SEG, NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/0); if (be_lun->lun_zone == NULL) { snprintf(req->error_str, sizeof(req->error_str), "error allocating UMA zone"); goto bailout_error; } if (params->flags & CTL_LUN_FLAG_DEV_TYPE) cbe_lun->lun_type = params->device_type; else cbe_lun->lun_type = T_DIRECT; be_lun->flags = CTL_BE_BLOCK_LUN_UNCONFIGURED; cbe_lun->flags = 0; value = ctl_get_opt(&cbe_lun->options, "ha_role"); if (value != NULL) { if (strcmp(value, "primary") == 0) cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY; } else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF) cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY; if (cbe_lun->lun_type == T_DIRECT || cbe_lun->lun_type == T_CDROM) { be_lun->size_bytes = params->lun_size_bytes; if (params->blocksize_bytes != 0) cbe_lun->blocksize = params->blocksize_bytes; else if (cbe_lun->lun_type == T_CDROM) cbe_lun->blocksize = 2048; else cbe_lun->blocksize = 512; be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize; cbe_lun->maxlba = (be_lun->size_blocks == 0) ? 0 : (be_lun->size_blocks - 1); if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) || control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) { retval = ctl_be_block_open(be_lun, req); if (retval != 0) { retval = 0; req->status = CTL_LUN_WARNING; } } num_threads = cbb_num_threads; } else { num_threads = 1; } value = ctl_get_opt(&cbe_lun->options, "num_threads"); if (value != NULL) { tmp_num_threads = strtol(value, NULL, 0); /* * We don't let the user specify less than one * thread, but hope he's clueful enough not to * specify 1000 threads. */ if (tmp_num_threads < 1) { snprintf(req->error_str, sizeof(req->error_str), "invalid number of threads %s", num_thread_str); goto bailout_error; } num_threads = tmp_num_threads; } if (be_lun->vn == NULL) cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA; /* Tell the user the blocksize we ended up using */ params->lun_size_bytes = be_lun->size_bytes; params->blocksize_bytes = cbe_lun->blocksize; if (params->flags & CTL_LUN_FLAG_ID_REQ) { cbe_lun->req_lun_id = params->req_lun_id; cbe_lun->flags |= CTL_LUN_FLAG_ID_REQ; } else cbe_lun->req_lun_id = 0; cbe_lun->lun_shutdown = ctl_be_block_lun_shutdown; cbe_lun->lun_config_status = ctl_be_block_lun_config_status; cbe_lun->be = &ctl_be_block_driver; if ((params->flags & CTL_LUN_FLAG_SERIAL_NUM) == 0) { snprintf(tmpstr, sizeof(tmpstr), "MYSERIAL%4d", softc->num_luns); strncpy((char *)cbe_lun->serial_num, tmpstr, MIN(sizeof(cbe_lun->serial_num), sizeof(tmpstr))); /* Tell the user what we used for a serial number */ strncpy((char *)params->serial_num, tmpstr, MIN(sizeof(params->serial_num), sizeof(tmpstr))); } else { strncpy((char *)cbe_lun->serial_num, params->serial_num, MIN(sizeof(cbe_lun->serial_num), sizeof(params->serial_num))); } if ((params->flags & CTL_LUN_FLAG_DEVID) == 0) { snprintf(tmpstr, sizeof(tmpstr), "MYDEVID%4d", softc->num_luns); strncpy((char *)cbe_lun->device_id, tmpstr, MIN(sizeof(cbe_lun->device_id), sizeof(tmpstr))); /* Tell the user what we used for a device ID */ strncpy((char *)params->device_id, tmpstr, MIN(sizeof(params->device_id), sizeof(tmpstr))); } else { strncpy((char *)cbe_lun->device_id, params->device_id, MIN(sizeof(cbe_lun->device_id), sizeof(params->device_id))); } TASK_INIT(&be_lun->io_task, /*priority*/0, ctl_be_block_worker, be_lun); be_lun->io_taskqueue = taskqueue_create(be_lun->lunname, M_WAITOK, taskqueue_thread_enqueue, /*context*/&be_lun->io_taskqueue); if (be_lun->io_taskqueue == NULL) { snprintf(req->error_str, sizeof(req->error_str), "unable to create taskqueue"); goto bailout_error; } /* * Note that we start the same number of threads by default for * both the file case and the block device case. For the file * case, we need multiple threads to allow concurrency, because the * vnode interface is designed to be a blocking interface. For the * block device case, ZFS zvols at least will block the caller's * context in many instances, and so we need multiple threads to * overcome that problem. Other block devices don't need as many * threads, but they shouldn't cause too many problems. * * If the user wants to just have a single thread for a block * device, he can specify that when the LUN is created, or change * the tunable/sysctl to alter the default number of threads. */ retval = taskqueue_start_threads(&be_lun->io_taskqueue, /*num threads*/num_threads, /*priority*/PWAIT, /*thread name*/ "%s taskq", be_lun->lunname); if (retval != 0) goto bailout_error; be_lun->num_threads = num_threads; mtx_lock(&softc->lock); softc->num_luns++; STAILQ_INSERT_TAIL(&softc->lun_list, be_lun, links); mtx_unlock(&softc->lock); retval = ctl_add_lun(&be_lun->cbe_lun); if (retval != 0) { mtx_lock(&softc->lock); STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun, links); softc->num_luns--; mtx_unlock(&softc->lock); snprintf(req->error_str, sizeof(req->error_str), "ctl_add_lun() returned error %d, see dmesg for " "details", retval); retval = 0; goto bailout_error; } mtx_lock(&softc->lock); /* * Tell the config_status routine that we're waiting so it won't * clean up the LUN in the event of an error. */ be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING; while (be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) { retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0); if (retval == EINTR) break; } be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING; if (be_lun->flags & CTL_BE_BLOCK_LUN_CONFIG_ERR) { snprintf(req->error_str, sizeof(req->error_str), "LUN configuration error, see dmesg for details"); STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun, links); softc->num_luns--; mtx_unlock(&softc->lock); goto bailout_error; } else { params->req_lun_id = cbe_lun->lun_id; } mtx_unlock(&softc->lock); be_lun->disk_stats = devstat_new_entry("cbb", params->req_lun_id, cbe_lun->blocksize, DEVSTAT_ALL_SUPPORTED, cbe_lun->lun_type | DEVSTAT_TYPE_IF_OTHER, DEVSTAT_PRIORITY_OTHER); return (retval); bailout_error: req->status = CTL_LUN_ERROR; if (be_lun->io_taskqueue != NULL) taskqueue_free(be_lun->io_taskqueue); ctl_be_block_close(be_lun); if (be_lun->dev_path != NULL) free(be_lun->dev_path, M_CTLBLK); if (be_lun->lun_zone != NULL) uma_zdestroy(be_lun->lun_zone); ctl_free_opts(&cbe_lun->options); mtx_destroy(&be_lun->queue_lock); mtx_destroy(&be_lun->io_lock); free(be_lun, M_CTLBLK); return (retval); } static int ctl_be_block_rm(struct ctl_be_block_softc *softc, struct ctl_lun_req *req) { struct ctl_lun_rm_params *params; struct ctl_be_block_lun *be_lun; struct ctl_be_lun *cbe_lun; int retval; params = &req->reqdata.rm; mtx_lock(&softc->lock); STAILQ_FOREACH(be_lun, &softc->lun_list, links) { if (be_lun->cbe_lun.lun_id == params->lun_id) break; } mtx_unlock(&softc->lock); if (be_lun == NULL) { snprintf(req->error_str, sizeof(req->error_str), "LUN %u is not managed by the block backend", params->lun_id); goto bailout_error; } cbe_lun = &be_lun->cbe_lun; retval = ctl_disable_lun(cbe_lun); if (retval != 0) { snprintf(req->error_str, sizeof(req->error_str), "error %d returned from ctl_disable_lun() for " "LUN %d", retval, params->lun_id); goto bailout_error; } if (be_lun->vn != NULL) { cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA; ctl_lun_no_media(cbe_lun); taskqueue_drain_all(be_lun->io_taskqueue); ctl_be_block_close(be_lun); } retval = ctl_invalidate_lun(cbe_lun); if (retval != 0) { snprintf(req->error_str, sizeof(req->error_str), "error %d returned from ctl_invalidate_lun() for " "LUN %d", retval, params->lun_id); goto bailout_error; } mtx_lock(&softc->lock); be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING; while ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) { retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0); if (retval == EINTR) break; } be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING; if ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) { snprintf(req->error_str, sizeof(req->error_str), "interrupted waiting for LUN to be freed"); mtx_unlock(&softc->lock); goto bailout_error; } STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun, links); softc->num_luns--; mtx_unlock(&softc->lock); taskqueue_drain_all(be_lun->io_taskqueue); taskqueue_free(be_lun->io_taskqueue); if (be_lun->disk_stats != NULL) devstat_remove_entry(be_lun->disk_stats); uma_zdestroy(be_lun->lun_zone); ctl_free_opts(&cbe_lun->options); free(be_lun->dev_path, M_CTLBLK); mtx_destroy(&be_lun->queue_lock); mtx_destroy(&be_lun->io_lock); free(be_lun, M_CTLBLK); req->status = CTL_LUN_OK; return (0); bailout_error: req->status = CTL_LUN_ERROR; return (0); } static int ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req) { struct ctl_lun_modify_params *params; struct ctl_be_block_lun *be_lun; struct ctl_be_lun *cbe_lun; char *value; uint64_t oldsize; int error, wasprim; params = &req->reqdata.modify; mtx_lock(&softc->lock); STAILQ_FOREACH(be_lun, &softc->lun_list, links) { if (be_lun->cbe_lun.lun_id == params->lun_id) break; } mtx_unlock(&softc->lock); if (be_lun == NULL) { snprintf(req->error_str, sizeof(req->error_str), "LUN %u is not managed by the block backend", params->lun_id); goto bailout_error; } cbe_lun = &be_lun->cbe_lun; if (params->lun_size_bytes != 0) be_lun->params.lun_size_bytes = params->lun_size_bytes; ctl_update_opts(&cbe_lun->options, req->num_be_args, req->kern_be_args); wasprim = (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY); value = ctl_get_opt(&cbe_lun->options, "ha_role"); if (value != NULL) { if (strcmp(value, "primary") == 0) cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY; else cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY; } else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF) cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY; else cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY; if (wasprim != (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY)) { if (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) ctl_lun_primary(cbe_lun); else ctl_lun_secondary(cbe_lun); } oldsize = be_lun->size_blocks; if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) || control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) { if (be_lun->vn == NULL) error = ctl_be_block_open(be_lun, req); else if (vn_isdisk(be_lun->vn, &error)) error = ctl_be_block_open_dev(be_lun, req); else if (be_lun->vn->v_type == VREG) { vn_lock(be_lun->vn, LK_SHARED | LK_RETRY); error = ctl_be_block_open_file(be_lun, req); VOP_UNLOCK(be_lun->vn, 0); } else error = EINVAL; if ((cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) && be_lun->vn != NULL) { cbe_lun->flags &= ~CTL_LUN_FLAG_NO_MEDIA; ctl_lun_has_media(cbe_lun); } else if ((cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) == 0 && be_lun->vn == NULL) { cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA; ctl_lun_no_media(cbe_lun); } cbe_lun->flags &= ~CTL_LUN_FLAG_EJECTED; } else { if (be_lun->vn != NULL) { cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA; ctl_lun_no_media(cbe_lun); taskqueue_drain_all(be_lun->io_taskqueue); error = ctl_be_block_close(be_lun); } else error = 0; } if (be_lun->size_blocks != oldsize) ctl_lun_capacity_changed(cbe_lun); /* Tell the user the exact size we ended up using */ params->lun_size_bytes = be_lun->size_bytes; req->status = error ? CTL_LUN_WARNING : CTL_LUN_OK; return (0); bailout_error: req->status = CTL_LUN_ERROR; return (0); } static void ctl_be_block_lun_shutdown(void *be_lun) { struct ctl_be_block_lun *lun; struct ctl_be_block_softc *softc; lun = (struct ctl_be_block_lun *)be_lun; softc = lun->softc; mtx_lock(&softc->lock); lun->flags |= CTL_BE_BLOCK_LUN_UNCONFIGURED; if (lun->flags & CTL_BE_BLOCK_LUN_WAITING) wakeup(lun); mtx_unlock(&softc->lock); } static void ctl_be_block_lun_config_status(void *be_lun, ctl_lun_config_status status) { struct ctl_be_block_lun *lun; struct ctl_be_block_softc *softc; lun = (struct ctl_be_block_lun *)be_lun; softc = lun->softc; if (status == CTL_LUN_CONFIG_OK) { mtx_lock(&softc->lock); lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED; if (lun->flags & CTL_BE_BLOCK_LUN_WAITING) wakeup(lun); mtx_unlock(&softc->lock); /* * We successfully added the LUN, attempt to enable it. */ if (ctl_enable_lun(&lun->cbe_lun) != 0) { printf("%s: ctl_enable_lun() failed!\n", __func__); if (ctl_invalidate_lun(&lun->cbe_lun) != 0) { printf("%s: ctl_invalidate_lun() failed!\n", __func__); } } return; } mtx_lock(&softc->lock); lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED; lun->flags |= CTL_BE_BLOCK_LUN_CONFIG_ERR; wakeup(lun); mtx_unlock(&softc->lock); } static int ctl_be_block_config_write(union ctl_io *io) { struct ctl_be_block_lun *be_lun; struct ctl_be_lun *cbe_lun; int retval; DPRINTF("entered\n"); cbe_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[ CTL_PRIV_BACKEND_LUN].ptr; be_lun = (struct ctl_be_block_lun *)cbe_lun->be_lun; retval = 0; switch (io->scsiio.cdb[0]) { case SYNCHRONIZE_CACHE: case SYNCHRONIZE_CACHE_16: case WRITE_SAME_10: case WRITE_SAME_16: case UNMAP: /* * The upper level CTL code will filter out any CDBs with * the immediate bit set and return the proper error. * * We don't really need to worry about what LBA range the * user asked to be synced out. When they issue a sync * cache command, we'll sync out the whole thing. */ mtx_lock(&be_lun->queue_lock); STAILQ_INSERT_TAIL(&be_lun->config_write_queue, &io->io_hdr, links); mtx_unlock(&be_lun->queue_lock); taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task); break; case START_STOP_UNIT: { struct scsi_start_stop_unit *cdb; struct ctl_lun_req req; cdb = (struct scsi_start_stop_unit *)io->scsiio.cdb; if ((cdb->how & SSS_PC_MASK) != 0) { ctl_set_success(&io->scsiio); ctl_config_write_done(io); break; } if (cdb->how & SSS_START) { if ((cdb->how & SSS_LOEJ) && be_lun->vn == NULL) { retval = ctl_be_block_open(be_lun, &req); cbe_lun->flags &= ~CTL_LUN_FLAG_EJECTED; if (retval == 0) { cbe_lun->flags &= ~CTL_LUN_FLAG_NO_MEDIA; ctl_lun_has_media(cbe_lun); } else { cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA; ctl_lun_no_media(cbe_lun); } } ctl_start_lun(cbe_lun); } else { ctl_stop_lun(cbe_lun); if (cdb->how & SSS_LOEJ) { cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA; cbe_lun->flags |= CTL_LUN_FLAG_EJECTED; ctl_lun_ejected(cbe_lun); if (be_lun->vn != NULL) ctl_be_block_close(be_lun); } } ctl_set_success(&io->scsiio); ctl_config_write_done(io); break; } case PREVENT_ALLOW: ctl_set_success(&io->scsiio); ctl_config_write_done(io); break; default: ctl_set_invalid_opcode(&io->scsiio); ctl_config_write_done(io); retval = CTL_RETVAL_COMPLETE; break; } return (retval); } static int ctl_be_block_config_read(union ctl_io *io) { struct ctl_be_block_lun *be_lun; struct ctl_be_lun *cbe_lun; int retval = 0; DPRINTF("entered\n"); cbe_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[ CTL_PRIV_BACKEND_LUN].ptr; be_lun = (struct ctl_be_block_lun *)cbe_lun->be_lun; switch (io->scsiio.cdb[0]) { case SERVICE_ACTION_IN: if (io->scsiio.cdb[1] == SGLS_SERVICE_ACTION) { mtx_lock(&be_lun->queue_lock); STAILQ_INSERT_TAIL(&be_lun->config_read_queue, &io->io_hdr, links); mtx_unlock(&be_lun->queue_lock); taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task); retval = CTL_RETVAL_QUEUED; break; } ctl_set_invalid_field(&io->scsiio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 1, /*bit_valid*/ 1, /*bit*/ 4); ctl_config_read_done(io); retval = CTL_RETVAL_COMPLETE; break; default: ctl_set_invalid_opcode(&io->scsiio); ctl_config_read_done(io); retval = CTL_RETVAL_COMPLETE; break; } return (retval); } static int ctl_be_block_lun_info(void *be_lun, struct sbuf *sb) { struct ctl_be_block_lun *lun; int retval; lun = (struct ctl_be_block_lun *)be_lun; retval = sbuf_printf(sb, "\t"); if (retval != 0) goto bailout; retval = sbuf_printf(sb, "%d", lun->num_threads); if (retval != 0) goto bailout; retval = sbuf_printf(sb, "\n"); bailout: return (retval); } static uint64_t ctl_be_block_lun_attr(void *be_lun, const char *attrname) { struct ctl_be_block_lun *lun = (struct ctl_be_block_lun *)be_lun; if (lun->getattr == NULL) return (UINT64_MAX); return (lun->getattr(lun, attrname)); } int ctl_be_block_init(void) { struct ctl_be_block_softc *softc; int retval; softc = &backend_block_softc; retval = 0; mtx_init(&softc->lock, "ctlblock", NULL, MTX_DEF); beio_zone = uma_zcreate("beio", sizeof(struct ctl_be_block_io), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); STAILQ_INIT(&softc->lun_list); return (retval); } Index: head/sys/compat/linux/linux_dtrace.h =================================================================== --- head/sys/compat/linux/linux_dtrace.h (revision 292383) +++ head/sys/compat/linux/linux_dtrace.h (revision 292384) @@ -1,90 +1,90 @@ /*- * Copyright (c) 2008-2012 Alexander Leidinger * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LINUX_DTRACE_H_ #define _LINUX_DTRACE_H_ /** * DTrace support macros for the linuxulator. * * Some wrapper macros to make it more easy to handle the linuxulator * providers and to allow to make the name depend upon the bitsize. * * Basically this is the same as the normal SDT macros in sys/sdt.h. The * difference is that the provider name is automatically inserted, and * we do not use a different name for the probe-description. */ #define LIN_SDT_PROVIDER_DEFINE(x) SDT_PROVIDER_DEFINE(x) #define LIN_SDT_PROVIDER_DECLARE(x) SDT_PROVIDER_DECLARE(x) #define _LIN_SDT_PROBE_DECLARE(a, b, c, d) SDT_PROBE_DECLARE(a, b, c, d) #define LIN_SDT_PROBE_DECLARE(a, b, c) _LIN_SDT_PROBE_DECLARE( \ LINUX_DTRACE, a, b, c) #define _LIN_SDT_PROBE_DEFINE0(a, b, c, d) SDT_PROBE_DEFINE(a, \ b, c, d) #define LIN_SDT_PROBE_DEFINE0(a, b, c) _LIN_SDT_PROBE_DEFINE0(\ LINUX_DTRACE, a, b, c) #define _LIN_SDT_PROBE_DEFINE1(a, b, c, d, e) SDT_PROBE_DEFINE1(a, \ b, c, d, e) #define LIN_SDT_PROBE_DEFINE1(a, b, c, d) _LIN_SDT_PROBE_DEFINE1(\ LINUX_DTRACE, a, b, c, d) #define _LIN_SDT_PROBE_DEFINE2(a, b, c, d, e, f) SDT_PROBE_DEFINE2(a, \ b, c, d, e, f) #define LIN_SDT_PROBE_DEFINE2(a, b, c, d, e) _LIN_SDT_PROBE_DEFINE2(\ LINUX_DTRACE, a, b, c, d, e) #define _LIN_SDT_PROBE_DEFINE3(a, b, c, d, e, f, g) SDT_PROBE_DEFINE3(a, \ b, c, d, e, f, g) #define LIN_SDT_PROBE_DEFINE3(a, b, c, d, e, f) _LIN_SDT_PROBE_DEFINE3(\ LINUX_DTRACE, a, b, c, d, e, f) #define _LIN_SDT_PROBE_DEFINE4(a, b, c, d, e, f, g, h) SDT_PROBE_DEFINE4(a, \ b, c, d, e, f, g, h) #define LIN_SDT_PROBE_DEFINE4(a, b, c, d, e, f, g) _LIN_SDT_PROBE_DEFINE4(\ LINUX_DTRACE, a, b, c, d, e, f, g) #define _LIN_SDT_PROBE_DEFINE5(a, b, c, d, e, f, g, h, i) \ SDT_PROBE_DEFINE5(a, b, c, d, e, f, g, h, i) #define LIN_SDT_PROBE_DEFINE5(a, b, c, d, e, f, g, h) _LIN_SDT_PROBE_DEFINE5(\ LINUX_DTRACE, a, b, c, d, e, f, g, h) -#define LIN_SDT_PROBE0(a, b, c) SDT_PROBE1(LINUX_DTRACE, a, b, \ - c, 0) +#define LIN_SDT_PROBE0(a, b, c) SDT_PROBE0(LINUX_DTRACE, a, b, \ + c) #define LIN_SDT_PROBE1(a, b, c, d) SDT_PROBE1(LINUX_DTRACE, a, b, \ c, d) #define LIN_SDT_PROBE2(a, b, c, d, e) SDT_PROBE2(LINUX_DTRACE, a, b, \ c, d, e) #define LIN_SDT_PROBE3(a, b, c, d, e, f) SDT_PROBE3(LINUX_DTRACE, a, b, \ c, d, e, f) #define LIN_SDT_PROBE4(a, b, c, d, e, f, g) SDT_PROBE4(LINUX_DTRACE, a, b, \ c, d, e, f, g) #define _LIN_SDT_PROBE5(a, b, c, d, e, f, g, h, i) SDT_PROBE5(a, b, c, d, \ e, f, g, h, i) #define LIN_SDT_PROBE5(a, b, c, d, e, f, g, h) _LIN_SDT_PROBE5(LINUX_DTRACE, \ a, b, c, d, e, f, g, h) #endif /* _LINUX_DTRACE_H_ */ Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 292383) +++ head/sys/kern/kern_exec.c (revision 292384) @@ -1,1618 +1,1618 @@ /*- * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exec; #endif SDT_PROVIDER_DECLARE(proc); -SDT_PROBE_DEFINE1(proc, kernel, , exec, "char *"); -SDT_PROBE_DEFINE1(proc, kernel, , exec__failure, "int"); -SDT_PROBE_DEFINE1(proc, kernel, , exec__success, "char *"); +SDT_PROBE_DEFINE1(proc, , , exec, "char *"); +SDT_PROBE_DEFINE1(proc, , , exec__failure, "int"); +SDT_PROBE_DEFINE1(proc, , , exec__success, "char *"); MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); int coredump_pack_fileinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN, &coredump_pack_fileinfo, 0, "Enable file path packing in 'procstat -f' coredump notes"); int coredump_pack_vmmapinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN, &coredump_pack_vmmapinfo, 0, "Enable file path packing in 'procstat -v' coredump notes"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD, NULL, 0, sysctl_kern_ps_strings, "LU", ""); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_CAPRD, NULL, 0, sysctl_kern_usrstack, "LU", ""); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, sysctl_kern_stackprot, "I", ""); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, ""); static int disallow_high_osrel; SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW, &disallow_high_osrel, 0, "Disallow execution of binaries built for higher version of the world"); static int map_at_zero = 0; SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RWTUN, &map_at_zero, 0, "Permit processes to map an object at virtual address 0."); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_psstrings; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, sizeof(p->p_sysent->sv_psstrings)); return error; } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_usrstack; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, sizeof(p->p_sysent->sv_usrstack)); return error; } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif int sys_execve(struct thread *td, struct execve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, NULL); post_execve(td, error, oldvmspace); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fexecve_args { int fd; char **argv; char **envv; } #endif int sys_fexecve(struct thread *td, struct fexecve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, NULL, UIO_SYSSPACE, uap->argv, uap->envv); if (error == 0) { args.fd = uap->fd; error = kern_execve(td, &args, NULL); } post_execve(td, error, oldvmspace); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif int sys___mac_execve(struct thread *td, struct __mac_execve_args *uap) { #ifdef MAC struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, uap->mac_p); post_execve(td, error, oldvmspace); return (error); #else return (ENOSYS); #endif } int pre_execve(struct thread *td, struct vmspace **oldvmspace) { struct proc *p; int error; KASSERT(td == curthread, ("non-current thread %p", td)); error = 0; p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); if (thread_single(p, SINGLE_BOUNDARY) != 0) error = ERESTART; PROC_UNLOCK(p); } KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0, ("nested execve")); *oldvmspace = p->p_vmspace; return (error); } void post_execve(struct thread *td, int error, struct vmspace *oldvmspace) { struct proc *p; KASSERT(td == curthread, ("non-current thread %p", td)); p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); /* * If success, we upgrade to SINGLE_EXIT state to * force other threads to suicide. */ if (error == 0) thread_single(p, SINGLE_EXIT); else thread_single_end(p, SINGLE_BOUNDARY); PROC_UNLOCK(p); } if ((td->td_pflags & TDP_EXECVMSPC) != 0) { KASSERT(p->p_vmspace != oldvmspace, ("oldvmspace still used")); vmspace_free(oldvmspace); td->td_pflags &= ~TDP_EXECVMSPC; } } /* * XXX: kern_execve has the astonishing property of not always returning to * the caller. If sufficiently bad things happen during the call to * do_execve(), it can end up calling exit1(); as a result, callers must * avoid doing anything which they might need to undo (e.g., allocating * memory). */ int kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p) { AUDIT_ARG_ARGV(args->begin_argv, args->argc, args->begin_envv - args->begin_argv); AUDIT_ARG_ENVV(args->begin_envv, args->envc, args->endp - args->begin_envv); return (do_execve(td, args, mac_p)); } /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. */ static int do_execve(td, args, mac_p) struct thread *td; struct image_args *args; struct mac *mac_p; { struct proc *p = td->td_proc; struct nameidata nd; struct ucred *newcred = NULL, *oldcred; struct uidinfo *euip = NULL; register_t *stack_base; int error, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts, *newsigacts; #ifdef KTRACE struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif struct vnode *oldtextvp = NULL, *newtextvp; cap_rights_t rights; int credential_changing; int textset; #ifdef MAC struct label *interpvplabel = NULL; int will_transition; #endif #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif static const char fexecv_proc_title[] = "(fexecv)"; imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ bzero(imgp, sizeof(*imgp)); imgp->proc = p; imgp->attr = &attr; imgp->args = args; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) goto exec_fail; #endif /* * Translate the file name. namei() returns a vnode pointer * in ni_vp amoung other things. * * XXXAUDIT: It would be desirable to also audit the name of the * interpreter if this is an interpreted binary. */ if (args->fname != NULL) { NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | AUDITVNODE1, UIO_SYSSPACE, args->fname, td); } - SDT_PROBE1(proc, kernel, , exec, args->fname); + SDT_PROBE1(proc, , , exec, args->fname); interpret: if (args->fname != NULL) { #ifdef CAPABILITY_MODE /* * While capability mode can't reach this point via direct * path arguments to execve(), we also don't allow * interpreters to be used in capability mode (for now). * Catch indirect lookups and return a permissions error. */ if (IN_CAPABILITY_MODE(td)) { error = ECAPMODE; goto exec_fail; } #endif error = namei(&nd); if (error) goto exec_fail; newtextvp = nd.ni_vp; imgp->vp = newtextvp; } else { AUDIT_ARG_FD(args->fd); /* * Descriptors opened only with O_EXEC or O_RDONLY are allowed. */ error = fgetvp_exec(td, args->fd, cap_rights_init(&rights, CAP_FEXECVE), &newtextvp); if (error) goto exec_fail; vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(newtextvp); imgp->vp = newtextvp; } /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; imgp->object = imgp->vp->v_object; if (imgp->object != NULL) vm_object_reference(imgp->object); /* * Set VV_TEXT now so no one can write to the executable while we're * activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ textset = VOP_IS_TEXT(imgp->vp); VOP_SET_TEXT(imgp->vp); error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; imgp->proc->p_osrel = 0; /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. */ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) { if (textset == 0) VOP_UNSET_TEXT(imgp->vp); error = ENOEXEC; } goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * VV_TEXT needs to be unset for scripts. There is a short * period before we determine that something is a script where * VV_TEXT will be set. The vnode lock is held over this * entire period so nothing should illegitimately be blocked. */ VOP_UNSET_TEXT(imgp->vp); /* free name buffer and old vnode */ if (args->fname != NULL) NDFREE(&nd, NDF_ONLY_PNBUF); #ifdef MAC mac_execve_interpreter_enter(newtextvp, &interpvplabel); #endif if (imgp->opened) { VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td); imgp->opened = 0; } vput(newtextvp); vm_object_deallocate(imgp->object); imgp->object = NULL; /* set new name to that of the interpreter */ NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, td); args->fname = imgp->interpreter_name; goto interpret; } /* * NB: We unlock the vnode here because it is believed that none * of the sv_copyout_strings/sv_fixup operations require the vnode. */ VOP_UNLOCK(imgp->vp, 0); /* * Do the best to calculate the full path to the image file. */ if (imgp->auxargs != NULL && ((args->fname != NULL && args->fname[0] == '/') || vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0)) imgp->execpath = args->fname; if (disallow_high_osrel && P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) { error = ENOEXEC; uprintf("Osrel %d for image %s too high\n", p->p_osrel, imgp->execpath != NULL ? imgp->execpath : ""); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* ABI enforces the use of Capsicum. Switch into capabilities mode. */ if (SV_PROC_FLAG(p, SV_CAPSICUM)) sys_cap_enter(td, NULL); /* * Copy out strings (args and env) and initialize stack base */ if (p->p_sysent->sv_copyout_strings) stack_base = (*p->p_sysent->sv_copyout_strings)(imgp); else stack_base = exec_copyout_strings(imgp); /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup != NULL) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->args->argc); if (args->fdp != NULL) { /* Install a brand new file descriptor table. */ fdinstall_remapped(td, args->fdp); args->fdp = NULL; } else { /* * Keep on using the existing file descriptor table. For * security and other reasons, the file descriptor table * cannot be shared after an exec. */ fdunshare(td); /* close files on exec */ fdcloseexec(td); } /* * Malloc things before we need locks. */ i = imgp->args->begin_envv - imgp->args->begin_argv; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { newargs = pargs_alloc(i); bcopy(imgp->args->begin_argv, newargs->ar_args, i); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. */ if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); } else { oldsigacts = NULL; newsigacts = NULL; /* satisfy gcc */ } PROC_LOCK(p); if (oldsigacts) p->p_sigacts = newsigacts; oldcred = p->p_ucred; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ bzero(p->p_comm, sizeof(p->p_comm)); if (args->fname) bcopy(nd.ni_cnd.cn_nameptr, p->p_comm, min(nd.ni_cnd.cn_namelen, MAXCOMLEN)); else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0) bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title)); bcopy(p->p_comm, td->td_name, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0) p->p_flag2 &= ~P2_NOTRACE; if (p->p_flag & P_PPWAIT) { p->p_flag &= ~(P_PPWAIT | P_PPTRACE); cv_broadcast(&p->p_pwait); } /* * Implement image setuid/setgid. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * We disable setuid/setgid/etc in compatibility mode on the basis * that most setugid applications are not written with that * environment in mind, and will therefore almost certainly operate * incorrectly. In principle there's no reason that setugid * applications might not be useful in capability mode, so we may want * to reconsider this conservative design choice in the future. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ credential_changing = 0; credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp, interpvplabel, imgp); credential_changing |= will_transition; #endif if (credential_changing && #ifdef CAPABILITY_MODE ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) && #endif (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); #ifdef KTRACE if (p->p_tracecred != NULL && priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED, 0)) ktrprocexec(p, &tracecred, &tracevp); #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * Both fdsetugidsafety() and fdcheckstd() may call functions * taking sleepable locks, so temporarily drop our locks. */ PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp, 0); fdsetugidsafety(td); error = fdcheckstd(td); if (error != 0) goto done1; newcred = crdup(oldcred); euip = uifind(attr.va_uid); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); PROC_LOCK(p); /* * Set the new credentials. */ if (attr.va_mode & S_ISUID) change_euid(newcred, euip); if (attr.va_mode & S_ISGID) change_egid(newcred, attr.va_gid); #ifdef MAC if (will_transition) { mac_vnode_execve_transition(oldcred, newcred, imgp->vp, interpvplabel, imgp); } #endif /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); proc_set_cred(p, newcred); } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp, 0); newcred = crdup(oldcred); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); PROC_LOCK(p); change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); proc_set_cred(p, newcred); } } /* * Store the vp for use in procfs. This vnode was referenced by namei * or fgetvp_exec. */ oldtextvp = p->p_textvp; p->p_textvp = newtextvp; #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exec if it * has declared an interest. */ if (dtrace_fasttrap_exec) dtrace_fasttrap_exec(p); #endif /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE_LOCKED(&p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* * Free any previous argument cache and replace it with * the new argument cache, if any. */ oldargs = p->p_args; p->p_args = newargs; newargs = NULL; #ifdef HWPMC_HOOKS /* * Check if system-wide sampling is in effect or if the * current process is using PMCs. If so, do exec() time * processing. This processing needs to happen AFTER the * P_INEXEC flag is cleared. * * The proc lock needs to be released before taking the PMC * SX. */ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp, 0); pe.pm_credentialschanged = credential_changing; pe.pm_entryaddr = imgp->entry_addr; PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } else PROC_UNLOCK(p); #else /* !HWPMC_HOOKS */ PROC_UNLOCK(p); #endif /* Set values passed into the program in registers. */ if (p->p_sysent->sv_setregs) (*p->p_sysent->sv_setregs)(td, imgp, (u_long)(uintptr_t)stack_base); else exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base); vfs_mark_atime(imgp->vp, td->td_ucred); - SDT_PROBE1(proc, kernel, , exec__success, args->fname); + SDT_PROBE1(proc, , , exec__success, args->fname); VOP_UNLOCK(imgp->vp, 0); done1: /* * Free any resources malloc'd earlier that we didn't use. */ if (euip != NULL) uifree(euip); if (newcred != NULL) crfree(oldcred); /* * Handle deferred decrement of ref counts. */ if (oldtextvp != NULL) vrele(oldtextvp); #ifdef KTRACE if (tracevp != NULL) vrele(tracevp); if (tracecred != NULL) crfree(tracecred); #endif vn_lock(imgp->vp, LK_SHARED | LK_RETRY); pargs_drop(oldargs); pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); exec_fail_dealloc: /* * free various allocated resources */ if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { if (args->fname) NDFREE(&nd, NDF_ONLY_PNBUF); if (imgp->opened) VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td); if (error != 0) vput(imgp->vp); else VOP_UNLOCK(imgp->vp, 0); } if (imgp->object != NULL) vm_object_deallocate(imgp->object); free(imgp->freepath, M_TEMP); if (error == 0) { PROC_LOCK(p); td->td_dbgflags |= TDB_EXEC; PROC_UNLOCK(p); /* * Stop the process here if its stop event mask has * the S_EXEC bit set. */ STOPEVENT(p, S_EXEC, 0); goto done2; } exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); - SDT_PROBE1(proc, kernel, , exec__failure, error); + SDT_PROBE1(proc, , , exec__failure, error); done2: #ifdef MAC mac_execve_exit(imgp); mac_execve_interpreter_exit(interpvplabel); #endif exec_free_args(args); if (error && imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exit1(td, 0, SIGABRT); /* NOT REACHED */ } #ifdef KTRACE if (error == 0) ktrprocctor(p); #endif return (error); } int exec_map_first_page(imgp) struct image_params *imgp; { int rv, i, after, initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); object = imgp->vp->v_object; if (object == NULL) return (EACCES); VM_OBJECT_WLOCK(object); #if VM_NRESERVLEVEL > 0 vm_object_color(object, 0); #endif ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL); if (ma[0]->valid != VM_PAGE_BITS_ALL) { if (!vm_pager_has_page(object, 0, NULL, &after)) { vm_page_lock(ma[0]); vm_page_free(ma[0]); vm_page_unlock(ma[0]); vm_page_xunbusy(ma[0]); VM_OBJECT_WUNLOCK(object); return (EIO); } initial_pagein = min(after, VM_INITIAL_PAGEIN); KASSERT(initial_pagein <= object->size, ("%s: initial_pagein %d object->size %ju", __func__, initial_pagein, (uintmax_t )object->size)); for (i = 1; i < initial_pagein; i++) { if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) { if (ma[i]->valid) break; if (vm_page_tryxbusy(ma[i])) break; } else { ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, NULL, NULL); if (rv != VM_PAGER_OK) { for (i = 0; i < initial_pagein; i++) { vm_page_lock(ma[i]); vm_page_free(ma[i]); vm_page_unlock(ma[i]); vm_page_xunbusy(ma[i]); } VM_OBJECT_WUNLOCK(object); return (EIO); } for (i = 1; i < initial_pagein; i++) vm_page_readahead_finish(ma[i]); } vm_page_xunbusy(ma[0]); vm_page_lock(ma[0]); vm_page_hold(ma[0]); vm_page_activate(ma[0]); vm_page_unlock(ma[0]); VM_OBJECT_WUNLOCK(object); imgp->firstpage = sf_buf_alloc(ma[0], 0); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); return (0); } void exec_unmap_first_page(imgp) struct image_params *imgp; { vm_page_t m; if (imgp->firstpage != NULL) { m = sf_buf_page(imgp->firstpage); sf_buf_free(imgp->firstpage); imgp->firstpage = NULL; vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); } } /* * Destroy old address space, and allocate a new stack * The new stack is only SGROWSIZ large because it is grown * automatically in trap.c. */ int exec_new_vmspace(imgp, sv) struct image_params *imgp; struct sysentvec *sv; { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; vm_object_t obj; struct rlimit rlim_stack; vm_offset_t sv_minuser, stack_addr; vm_map_t map; u_long ssiz; imgp->vmspace_destroyed = 1; imgp->sysent = sv; /* May be called with Giant held */ EVENTHANDLER_INVOKE(process_exec, p, imgp); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (map_at_zero) sv_minuser = sv->sv_minuser; else sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE); if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser && vm_map_max(map) == sv->sv_maxuser) { shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); } else { error = vmspace_exec(p, sv_minuser, sv->sv_maxuser); if (error) return (error); vmspace = p->p_vmspace; map = &vmspace->vm_map; } /* Map a shared page */ obj = sv->sv_shared_page_obj; if (obj != NULL) { vm_object_reference(obj); error = vm_map_fixed(map, obj, 0, sv->sv_shared_page_base, sv->sv_shared_page_len, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); if (error) { vm_object_deallocate(obj); return (error); } } /* Allocate a new stack */ if (imgp->stack_sz != 0) { ssiz = trunc_page(imgp->stack_sz); PROC_LOCK(p); lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack); PROC_UNLOCK(p); if (ssiz > rlim_stack.rlim_max) ssiz = rlim_stack.rlim_max; if (ssiz > rlim_stack.rlim_cur) { rlim_stack.rlim_cur = ssiz; kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack); } } else if (sv->sv_maxssiz != NULL) { ssiz = *sv->sv_maxssiz; } else { ssiz = maxssiz; } stack_addr = sv->sv_usrstack - ssiz; error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); if (error) return (error); /* * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they * are still used to enforce the stack rlimit on the process stack. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)stack_addr; return (0); } /* * Copy out argument and environment strings from the old process address * space into the temporary string buffer. */ int exec_copyin_args(struct image_args *args, char *fname, enum uio_seg segflg, char **argv, char **envv) { u_long argp, envp; int error; size_t length; bzero(args, sizeof(*args)); if (argv == NULL) return (EFAULT); /* * Allocate demand-paged memory for the file name, argument, and * environment strings. */ error = exec_alloc_args(args); if (error != 0) return (error); /* * Copy the file name. */ if (fname != NULL) { args->fname = args->buf; error = (segflg == UIO_SYSSPACE) ? copystr(fname, args->fname, PATH_MAX, &length) : copyinstr(fname, args->fname, PATH_MAX, &length); if (error != 0) goto err_exit; } else length = 0; args->begin_argv = args->buf + length; args->endp = args->begin_argv; args->stringspace = ARG_MAX; /* * extract arguments first */ for (;;) { error = fueword(argv++, &argp); if (error == -1) { error = EFAULT; goto err_exit; } if (argp == 0) break; error = copyinstr((void *)(uintptr_t)argp, args->endp, args->stringspace, &length); if (error != 0) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->argc++; } args->begin_envv = args->endp; /* * extract environment strings */ if (envv) { for (;;) { error = fueword(envv++, &envp); if (error == -1) { error = EFAULT; goto err_exit; } if (envp == 0) break; error = copyinstr((void *)(uintptr_t)envp, args->endp, args->stringspace, &length); if (error != 0) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->envc++; } } return (0); err_exit: exec_free_args(args); return (error); } int exec_copyin_data_fds(struct thread *td, struct image_args *args, const void *data, size_t datalen, const int *fds, size_t fdslen) { struct filedesc *ofdp; const char *p; int *kfds; int error; memset(args, '\0', sizeof(*args)); ofdp = td->td_proc->p_fd; if (datalen >= ARG_MAX || fdslen > ofdp->fd_lastfile + 1) return (E2BIG); error = exec_alloc_args(args); if (error != 0) return (error); args->begin_argv = args->buf; args->stringspace = ARG_MAX; if (datalen > 0) { /* * Argument buffer has been provided. Copy it into the * kernel as a single string and add a terminating null * byte. */ error = copyin(data, args->begin_argv, datalen); if (error != 0) goto err_exit; args->begin_argv[datalen] = '\0'; args->endp = args->begin_argv + datalen + 1; args->stringspace -= datalen + 1; /* * Traditional argument counting. Count the number of * null bytes. */ for (p = args->begin_argv; p < args->endp; ++p) if (*p == '\0') ++args->argc; } else { /* No argument buffer provided. */ args->endp = args->begin_argv; } /* There are no environment variables. */ args->begin_envv = args->endp; /* Create new file descriptor table. */ kfds = malloc(fdslen * sizeof(int), M_TEMP, M_WAITOK); error = copyin(fds, kfds, fdslen * sizeof(int)); if (error != 0) { free(kfds, M_TEMP); goto err_exit; } error = fdcopy_remapped(ofdp, kfds, fdslen, &args->fdp); free(kfds, M_TEMP); if (error != 0) goto err_exit; return (0); err_exit: exec_free_args(args); return (error); } /* * Allocate temporary demand-paged, zero-filled memory for the file name, * argument, and environment strings. Returns zero if the allocation succeeds * and ENOMEM otherwise. */ int exec_alloc_args(struct image_args *args) { args->buf = (char *)kmap_alloc_wait(exec_map, PATH_MAX + ARG_MAX); return (args->buf != NULL ? 0 : ENOMEM); } void exec_free_args(struct image_args *args) { if (args->buf != NULL) { kmap_free_wakeup(exec_map, (vm_offset_t)args->buf, PATH_MAX + ARG_MAX); args->buf = NULL; } if (args->fname_buf != NULL) { free(args->fname_buf, M_TEMP); args->fname_buf = NULL; } if (args->fdp != NULL) fdescfree_remapped(args->fdp); } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ register_t * exec_copyout_strings(imgp) struct image_params *imgp; { int argc, envc; char **vectp; char *stringp; uintptr_t destp; register_t *stack_base; struct ps_strings *arginfo; struct proc *p; size_t execpath_len; int szsigcode, szps; char canary[sizeof(long) * 8]; szps = sizeof(pagesizes[0]) * MAXPAGESIZES; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; p = imgp->proc; szsigcode = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; if (p->p_sysent->sv_sigcode_base == 0) { if (p->p_sysent->sv_szsigcode != NULL) szsigcode = *(p->p_sysent->sv_szsigcode); } destp = (uintptr_t)arginfo; /* * install sigcode */ if (szsigcode != 0) { destp -= szsigcode; destp = rounddown2(destp, sizeof(void *)); copyout(p->p_sysent->sv_sigcode, (void *)destp, szsigcode); } /* * Copy the image path for the rtld. */ if (execpath_len != 0) { destp -= execpath_len; imgp->execpathp = destp; copyout(imgp->execpath, (void *)destp, execpath_len); } /* * Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= sizeof(canary); imgp->canary = destp; copyout(canary, (void *)destp, sizeof(canary)); imgp->canarylen = sizeof(canary); /* * Prepare the pagesizes array. */ destp -= szps; destp = rounddown2(destp, sizeof(void *)); imgp->pagesizes = destp; copyout(pagesizes, (void *)destp, szps); imgp->pagesizeslen = szps; destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *)); } else { /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(char *)); } /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, (void *)destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(imgp) struct image_params *imgp; { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error, writecount; td = curthread; /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred); if (error) return (error); #ifdef MAC error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that * this file resides on. * 2) Ensure that at least one execute bit is on. Otherwise, a * privileged user will always succeed, and we don't want this * to happen unless the file really is executable. * 3) Ensure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. */ error = VOP_GET_WRITECOUNT(vp, &writecount); if (error != 0) return (error); if (writecount != 0) return (ETXTBSY); /* * Call filesystem specific open routine (which does nothing in the * general case). */ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error == 0) imgp->opened = 1; return (error); } /* * Exec handler registration */ int exec_register(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } Index: head/sys/kern/kern_exit.c =================================================================== --- head/sys/kern/kern_exit.c (revision 292383) +++ head/sys/kern/kern_exit.c (revision 292384) @@ -1,1379 +1,1379 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for acct_process() function prototype */ #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exit; #endif SDT_PROVIDER_DECLARE(proc); -SDT_PROBE_DEFINE1(proc, kernel, , exit, "int"); +SDT_PROBE_DEFINE1(proc, , , exit, "int"); /* Hook for NFS teardown procedure. */ void (*nlminfo_release_p)(struct proc *p); struct proc * proc_realparent(struct proc *child) { struct proc *p, *parent; sx_assert(&proctree_lock, SX_LOCKED); if ((child->p_treeflag & P_TREE_ORPHANED) == 0) { if (child->p_oppid == 0 || child->p_pptr->p_pid == child->p_oppid) parent = child->p_pptr; else parent = initproc; return (parent); } for (p = child; (p->p_treeflag & P_TREE_FIRST_ORPHAN) == 0;) { /* Cannot use LIST_PREV(), since the list head is not known. */ p = __containerof(p->p_orphan.le_prev, struct proc, p_orphan.le_next); KASSERT((p->p_treeflag & P_TREE_ORPHANED) != 0, ("missing P_ORPHAN %p", p)); } parent = __containerof(p->p_orphan.le_prev, struct proc, p_orphans.lh_first); return (parent); } void reaper_abandon_children(struct proc *p, bool exiting) { struct proc *p1, *p2, *ptmp; sx_assert(&proctree_lock, SX_LOCKED); KASSERT(p != initproc, ("reaper_abandon_children for initproc")); if ((p->p_treeflag & P_TREE_REAPER) == 0) return; p1 = p->p_reaper; LIST_FOREACH_SAFE(p2, &p->p_reaplist, p_reapsibling, ptmp) { LIST_REMOVE(p2, p_reapsibling); p2->p_reaper = p1; p2->p_reapsubtree = p->p_reapsubtree; LIST_INSERT_HEAD(&p1->p_reaplist, p2, p_reapsibling); if (exiting && p2->p_pptr == p) { PROC_LOCK(p2); proc_reparent(p2, p1); PROC_UNLOCK(p2); } } KASSERT(LIST_EMPTY(&p->p_reaplist), ("p_reaplist not empty")); p->p_treeflag &= ~P_TREE_REAPER; } static void clear_orphan(struct proc *p) { struct proc *p1; sx_assert(&proctree_lock, SA_XLOCKED); if ((p->p_treeflag & P_TREE_ORPHANED) == 0) return; if ((p->p_treeflag & P_TREE_FIRST_ORPHAN) != 0) { p1 = LIST_NEXT(p, p_orphan); if (p1 != NULL) p1->p_treeflag |= P_TREE_FIRST_ORPHAN; p->p_treeflag &= ~P_TREE_FIRST_ORPHAN; } LIST_REMOVE(p, p_orphan); p->p_treeflag &= ~P_TREE_ORPHANED; } /* * exit -- death of process. */ void sys_sys_exit(struct thread *td, struct sys_exit_args *uap) { exit1(td, uap->rval, 0); /* NOTREACHED */ } /* * Exit: deallocate address space and other resources, change proc state to * zombie, and unlink proc from allproc and parent's lists. Save exit status * and rusage for wait(). Check for child processes and orphan them. */ void exit1(struct thread *td, int rval, int signo) { struct proc *p, *nq, *q, *t; struct thread *tdt; struct vnode *ttyvp = NULL; mtx_assert(&Giant, MA_NOTOWNED); KASSERT(rval == 0 || signo == 0, ("exit1 rv %d sig %d", rval, signo)); p = td->td_proc; /* * XXX in case we're rebooting we just let init die in order to * work around an unsolved stack overflow seen very late during * shutdown on sparc64 when the gmirror worker process exists. */ if (p == initproc && rebooting == 0) { printf("init died (signal %d, exit %d)\n", signo, rval); panic("Going nowhere without my init!"); } /* * Deref SU mp, since the thread does not return to userspace. */ if (softdep_ast_cleanup != NULL) softdep_ast_cleanup(); /* * MUST abort all other threads before proceeding past here. */ PROC_LOCK(p); /* * First check if some other thread or external request got * here before us. If so, act appropriately: exit or suspend. * We must ensure that stop requests are handled before we set * P_WEXIT. */ thread_suspend_check(0); while (p->p_flag & P_HADTHREADS) { /* * Kill off the other threads. This requires * some co-operation from other parts of the kernel * so it may not be instantaneous. With this state set * any thread entering the kernel from userspace will * thread_exit() in trap(). Any thread attempting to * sleep will return immediately with EINTR or EWOULDBLOCK * which will hopefully force them to back out to userland * freeing resources as they go. Any thread attempting * to return to userland will thread_exit() from userret(). * thread_exit() will unsuspend us when the last of the * other threads exits. * If there is already a thread singler after resumption, * calling thread_single will fail; in that case, we just * re-check all suspension request, the thread should * either be suspended there or exit. */ if (!thread_single(p, SINGLE_EXIT)) /* * All other activity in this process is now * stopped. Threading support has been turned * off. */ break; /* * Recheck for new stop or suspend requests which * might appear while process lock was dropped in * thread_single(). */ thread_suspend_check(0); } KASSERT(p->p_numthreads == 1, ("exit1: proc %p exiting with %d threads", p, p->p_numthreads)); racct_sub(p, RACCT_NTHR, 1); /* Let event handler change exit status */ p->p_xexit = rval; p->p_xsig = signo; /* * Wakeup anyone in procfs' PIOCWAIT. They should have a hold * on our vmspace, so we should block below until they have * released their reference to us. Note that if they have * requested S_EXIT stops we will block here until they ack * via PIOCCONT. */ _STOPEVENT(p, S_EXIT, 0); /* * Ignore any pending request to stop due to a stop signal. * Once P_WEXIT is set, future requests will be ignored as * well. */ p->p_flag &= ~P_STOPPED_SIG; KASSERT(!P_SHOULDSTOP(p), ("exiting process is stopped")); /* * Note that we are exiting and do another wakeup of anyone in * PIOCWAIT in case they aren't listening for S_EXIT stops or * decided to wait again after we told them we are exiting. */ p->p_flag |= P_WEXIT; wakeup(&p->p_stype); /* * Wait for any processes that have a hold on our vmspace to * release their reference. */ while (p->p_lock > 0) msleep(&p->p_lock, &p->p_mtx, PWAIT, "exithold", 0); PROC_UNLOCK(p); /* Drain the limit callout while we don't have the proc locked */ callout_drain(&p->p_limco); #ifdef AUDIT /* * The Sun BSM exit token contains two components: an exit status as * passed to exit(), and a return value to indicate what sort of exit * it was. The exit status is WEXITSTATUS(rv), but it's not clear * what the return value is. */ AUDIT_ARG_EXIT(rval, 0); AUDIT_SYSCALL_EXIT(0, td); #endif /* Are we a task leader with peers? */ if (p->p_peers != NULL && p == p->p_leader) { mtx_lock(&ppeers_lock); q = p->p_peers; while (q != NULL) { PROC_LOCK(q); kern_psignal(q, SIGKILL); PROC_UNLOCK(q); q = q->p_peers; } while (p->p_peers != NULL) msleep(p, &ppeers_lock, PWAIT, "exit1", 0); mtx_unlock(&ppeers_lock); } /* * Check if any loadable modules need anything done at process exit. * E.g. SYSV IPC stuff. * Event handler could change exit status. * XXX what if one of these generates an error? */ EVENTHANDLER_INVOKE(process_exit, p); /* * If parent is waiting for us to exit or exec, * P_PPWAIT is set; we will wakeup the parent below. */ PROC_LOCK(p); stopprofclock(p); p->p_flag &= ~(P_TRACED | P_PPWAIT | P_PPTRACE); /* * Stop the real interval timer. If the handler is currently * executing, prevent it from rearming itself and let it finish. */ if (timevalisset(&p->p_realtimer.it_value) && callout_stop(&p->p_itcallout) == 0) { timevalclear(&p->p_realtimer.it_interval); msleep(&p->p_itcallout, &p->p_mtx, PWAIT, "ritwait", 0); KASSERT(!timevalisset(&p->p_realtimer.it_value), ("realtime timer is still armed")); } PROC_UNLOCK(p); /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pid. */ funsetownlst(&p->p_sigiolst); /* * If this process has an nlminfo data area (for lockd), release it */ if (nlminfo_release_p != NULL && p->p_nlminfo != NULL) (*nlminfo_release_p)(p); /* * Close open files and release open-file table. * This may block! */ fdescfree(td); /* * If this thread tickled GEOM, we need to wait for the giggling to * stop before we return to userland */ if (td->td_pflags & TDP_GEOM) g_waitidle(); /* * Remove ourself from our leader's peer list and wake our leader. */ if (p->p_leader->p_peers != NULL) { mtx_lock(&ppeers_lock); if (p->p_leader->p_peers != NULL) { q = p->p_leader; while (q->p_peers != p) q = q->p_peers; q->p_peers = p->p_peers; wakeup(p->p_leader); } mtx_unlock(&ppeers_lock); } vmspace_exit(td); sx_xlock(&proctree_lock); if (SESS_LEADER(p)) { struct session *sp = p->p_session; struct tty *tp; /* * s_ttyp is not zero'd; we use this to indicate that * the session once had a controlling terminal. (for * logging and informational purposes) */ SESS_LOCK(sp); ttyvp = sp->s_ttyvp; tp = sp->s_ttyp; sp->s_ttyvp = NULL; sp->s_ttydp = NULL; sp->s_leader = NULL; SESS_UNLOCK(sp); /* * Signal foreground pgrp and revoke access to * controlling terminal if it has not been revoked * already. * * Because the TTY may have been revoked in the mean * time and could already have a new session associated * with it, make sure we don't send a SIGHUP to a * foreground process group that does not belong to this * session. */ if (tp != NULL) { tty_lock(tp); if (tp->t_session == sp) tty_signal_pgrp(tp, SIGHUP); tty_unlock(tp); } if (ttyvp != NULL) { sx_xunlock(&proctree_lock); if (vn_lock(ttyvp, LK_EXCLUSIVE) == 0) { VOP_REVOKE(ttyvp, REVOKEALL); VOP_UNLOCK(ttyvp, 0); } sx_xlock(&proctree_lock); } } fixjobc(p, p->p_pgrp, 0); sx_xunlock(&proctree_lock); (void)acct_process(td); /* Release the TTY now we've unlocked everything. */ if (ttyvp != NULL) vrele(ttyvp); #ifdef KTRACE ktrprocexit(td); #endif /* * Release reference to text vnode */ if (p->p_textvp != NULL) { vrele(p->p_textvp); p->p_textvp = NULL; } /* * Release our limits structure. */ lim_free(p->p_limit); p->p_limit = NULL; tidhash_remove(td); /* * Remove proc from allproc queue and pidhash chain. * Place onto zombproc. Unlink from parent's child list. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); LIST_INSERT_HEAD(&zombproc, p, p_list); LIST_REMOVE(p, p_hash); sx_xunlock(&allproc_lock); /* * Call machine-dependent code to release any * machine-dependent resources other than the address space. * The address space is released by "vmspace_exitfree(p)" in * vm_waitproc(). */ cpu_exit(td); WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid); /* * Reparent all children processes: * - traced ones to the original parent (or init if we are that parent) * - the rest to init */ sx_xlock(&proctree_lock); q = LIST_FIRST(&p->p_children); if (q != NULL) /* only need this if any child is S_ZOMB */ wakeup(q->p_reaper); for (; q != NULL; q = nq) { nq = LIST_NEXT(q, p_sibling); PROC_LOCK(q); q->p_sigparent = SIGCHLD; if (!(q->p_flag & P_TRACED)) { proc_reparent(q, q->p_reaper); } else { /* * Traced processes are killed since their existence * means someone is screwing up. */ t = proc_realparent(q); if (t == p) { proc_reparent(q, q->p_reaper); } else { PROC_LOCK(t); proc_reparent(q, t); PROC_UNLOCK(t); } /* * Since q was found on our children list, the * proc_reparent() call moved q to the orphan * list due to present P_TRACED flag. Clear * orphan link for q now while q is locked. */ clear_orphan(q); q->p_flag &= ~(P_TRACED | P_STOPPED_TRACE); FOREACH_THREAD_IN_PROC(q, tdt) tdt->td_dbgflags &= ~TDB_SUSPEND; kern_psignal(q, SIGKILL); } PROC_UNLOCK(q); } /* * Also get rid of our orphans. */ while ((q = LIST_FIRST(&p->p_orphans)) != NULL) { PROC_LOCK(q); CTR2(KTR_PTRACE, "exit: pid %d, clearing orphan %d", p->p_pid, q->p_pid); clear_orphan(q); PROC_UNLOCK(q); } /* Save exit status. */ PROC_LOCK(p); p->p_xthread = td; /* Tell the prison that we are gone. */ prison_proc_free(p->p_ucred->cr_prison); #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exit if it * has declared an interest. */ if (dtrace_fasttrap_exit) dtrace_fasttrap_exit(p); #endif /* * Notify interested parties of our demise. */ KNOTE_LOCKED(&p->p_klist, NOTE_EXIT); #ifdef KDTRACE_HOOKS int reason = CLD_EXITED; if (WCOREDUMP(signo)) reason = CLD_DUMPED; else if (WIFSIGNALED(signo)) reason = CLD_KILLED; - SDT_PROBE1(proc, kernel, , exit, reason); + SDT_PROBE1(proc, , , exit, reason); #endif /* * Just delete all entries in the p_klist. At this point we won't * report any more events, and there are nasty race conditions that * can beat us if we don't. */ knlist_clear(&p->p_klist, 1); /* * If this is a process with a descriptor, we may not need to deliver * a signal to the parent. proctree_lock is held over * procdesc_exit() to serialize concurrent calls to close() and * exit(). */ if (p->p_procdesc == NULL || procdesc_exit(p)) { /* * Notify parent that we're gone. If parent has the * PS_NOCLDWAIT flag set, or if the handler is set to SIG_IGN, * notify process 1 instead (and hope it will handle this * situation). */ PROC_LOCK(p->p_pptr); mtx_lock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr->p_sigacts->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) { struct proc *pp; mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); pp = p->p_pptr; PROC_UNLOCK(pp); proc_reparent(p, p->p_reaper); p->p_sigparent = SIGCHLD; PROC_LOCK(p->p_pptr); /* * Notify parent, so in case he was wait(2)ing or * executing waitpid(2) with our pid, he will * continue. */ wakeup(pp); } else mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr == p->p_reaper || p->p_pptr == initproc) childproc_exited(p); else if (p->p_sigparent != 0) { if (p->p_sigparent == SIGCHLD) childproc_exited(p); else /* LINUX thread */ kern_psignal(p->p_pptr, p->p_sigparent); } } else PROC_LOCK(p->p_pptr); sx_xunlock(&proctree_lock); /* * The state PRS_ZOMBIE prevents other proesses from sending * signal to the process, to avoid memory leak, we free memory * for signal queue at the time when the state is set. */ sigqueue_flush(&p->p_sigqueue); sigqueue_flush(&td->td_sigqueue); /* * We have to wait until after acquiring all locks before * changing p_state. We need to avoid all possible context * switches (including ones from blocking on a mutex) while * marked as a zombie. We also have to set the zombie state * before we release the parent process' proc lock to avoid * a lost wakeup. So, we first call wakeup, then we grab the * sched lock, update the state, and release the parent process' * proc lock. */ wakeup(p->p_pptr); cv_broadcast(&p->p_pwait); sched_exit(p->p_pptr, td); umtx_thread_exit(td); PROC_SLOCK(p); p->p_state = PRS_ZOMBIE; PROC_UNLOCK(p->p_pptr); /* * Hopefully no one will try to deliver a signal to the process this * late in the game. */ knlist_destroy(&p->p_klist); /* * Save our children's rusage information in our exit rusage. */ PROC_STATLOCK(p); ruadd(&p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux); PROC_STATUNLOCK(p); /* * Make sure the scheduler takes this thread out of its tables etc. * This will also release this thread's reference to the ucred. * Other thread parts to release include pcb bits and such. */ thread_exit(); } #ifndef _SYS_SYSPROTO_H_ struct abort2_args { char *why; int nargs; void **args; }; #endif int sys_abort2(struct thread *td, struct abort2_args *uap) { struct proc *p = td->td_proc; struct sbuf *sb; void *uargs[16]; int error, i, sig; /* * Do it right now so we can log either proper call of abort2(), or * note, that invalid argument was passed. 512 is big enough to * handle 16 arguments' descriptions with additional comments. */ sb = sbuf_new(NULL, NULL, 512, SBUF_FIXEDLEN); sbuf_clear(sb); sbuf_printf(sb, "%s(pid %d uid %d) aborted: ", p->p_comm, p->p_pid, td->td_ucred->cr_uid); /* * Since we can't return from abort2(), send SIGKILL in cases, where * abort2() was called improperly */ sig = SIGKILL; /* Prevent from DoSes from user-space. */ if (uap->nargs < 0 || uap->nargs > 16) goto out; if (uap->nargs > 0) { if (uap->args == NULL) goto out; error = copyin(uap->args, uargs, uap->nargs * sizeof(void *)); if (error != 0) goto out; } /* * Limit size of 'reason' string to 128. Will fit even when * maximal number of arguments was chosen to be logged. */ if (uap->why != NULL) { error = sbuf_copyin(sb, uap->why, 128); if (error < 0) goto out; } else { sbuf_printf(sb, "(null)"); } if (uap->nargs > 0) { sbuf_printf(sb, "("); for (i = 0;i < uap->nargs; i++) sbuf_printf(sb, "%s%p", i == 0 ? "" : ", ", uargs[i]); sbuf_printf(sb, ")"); } /* * Final stage: arguments were proper, string has been * successfully copied from userspace, and copying pointers * from user-space succeed. */ sig = SIGABRT; out: if (sig == SIGKILL) { sbuf_trim(sb); sbuf_printf(sb, " (Reason text inaccessible)"); } sbuf_cat(sb, "\n"); sbuf_finish(sb); log(LOG_INFO, "%s", sbuf_data(sb)); sbuf_delete(sb); exit1(td, 0, sig); return (0); } #ifdef COMPAT_43 /* * The dirty work is handled by kern_wait(). */ int owait(struct thread *td, struct owait_args *uap __unused) { int error, status; error = kern_wait(td, WAIT_ANY, &status, 0, NULL); if (error == 0) td->td_retval[1] = status; return (error); } #endif /* COMPAT_43 */ /* * The dirty work is handled by kern_wait(). */ int sys_wait4(struct thread *td, struct wait4_args *uap) { struct rusage ru, *rup; int error, status; if (uap->rusage != NULL) rup = &ru; else rup = NULL; error = kern_wait(td, uap->pid, &status, uap->options, rup); if (uap->status != NULL && error == 0) error = copyout(&status, uap->status, sizeof(status)); if (uap->rusage != NULL && error == 0) error = copyout(&ru, uap->rusage, sizeof(struct rusage)); return (error); } int sys_wait6(struct thread *td, struct wait6_args *uap) { struct __wrusage wru, *wrup; siginfo_t si, *sip; idtype_t idtype; id_t id; int error, status; idtype = uap->idtype; id = uap->id; if (uap->wrusage != NULL) wrup = &wru; else wrup = NULL; if (uap->info != NULL) { sip = &si; bzero(sip, sizeof(*sip)); } else sip = NULL; /* * We expect all callers of wait6() to know about WEXITED and * WTRAPPED. */ error = kern_wait6(td, idtype, id, &status, uap->options, wrup, sip); if (uap->status != NULL && error == 0) error = copyout(&status, uap->status, sizeof(status)); if (uap->wrusage != NULL && error == 0) error = copyout(&wru, uap->wrusage, sizeof(wru)); if (uap->info != NULL && error == 0) error = copyout(&si, uap->info, sizeof(si)); return (error); } /* * Reap the remains of a zombie process and optionally return status and * rusage. Asserts and will release both the proctree_lock and the process * lock as part of its work. */ void proc_reap(struct thread *td, struct proc *p, int *status, int options) { struct proc *q, *t; sx_assert(&proctree_lock, SA_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); KASSERT(p->p_state == PRS_ZOMBIE, ("proc_reap: !PRS_ZOMBIE")); q = td->td_proc; PROC_SUNLOCK(p); if (status) *status = KW_EXITCODE(p->p_xexit, p->p_xsig); if (options & WNOWAIT) { /* * Only poll, returning the status. Caller does not wish to * release the proc struct just yet. */ PROC_UNLOCK(p); sx_xunlock(&proctree_lock); return; } PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); /* * If we got the child via a ptrace 'attach', we need to give it back * to the old parent. */ if (p->p_oppid != 0 && p->p_oppid != p->p_pptr->p_pid) { PROC_UNLOCK(p); t = proc_realparent(p); PROC_LOCK(t); PROC_LOCK(p); CTR2(KTR_PTRACE, "wait: traced child %d moved back to parent %d", p->p_pid, t->p_pid); proc_reparent(p, t); p->p_oppid = 0; PROC_UNLOCK(p); pksignal(t, SIGCHLD, p->p_ksi); wakeup(t); cv_broadcast(&p->p_pwait); PROC_UNLOCK(t); sx_xunlock(&proctree_lock); return; } p->p_oppid = 0; PROC_UNLOCK(p); /* * Remove other references to this process to ensure we have an * exclusive reference. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); /* off zombproc */ sx_xunlock(&allproc_lock); LIST_REMOVE(p, p_sibling); reaper_abandon_children(p, true); LIST_REMOVE(p, p_reapsibling); PROC_LOCK(p); clear_orphan(p); PROC_UNLOCK(p); leavepgrp(p); if (p->p_procdesc != NULL) procdesc_reap(p); sx_xunlock(&proctree_lock); /* * Removal from allproc list and process group list paired with * PROC_LOCK which was executed during that time should guarantee * nothing can reach this process anymore. As such further locking * is unnecessary. */ p->p_xexit = p->p_xsig = 0; /* XXX: why? */ PROC_LOCK(q); ruadd(&q->p_stats->p_cru, &q->p_crux, &p->p_ru, &p->p_rux); PROC_UNLOCK(q); /* * Decrement the count of procs running with this uid. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0); /* * Destroy resource accounting information associated with the process. */ #ifdef RACCT if (racct_enable) { PROC_LOCK(p); racct_sub(p, RACCT_NPROC, 1); PROC_UNLOCK(p); } #endif racct_proc_exit(p); /* * Free credentials, arguments, and sigacts. */ crfree(p->p_ucred); proc_set_cred(p, NULL); pargs_drop(p->p_args); p->p_args = NULL; sigacts_free(p->p_sigacts); p->p_sigacts = NULL; /* * Do any thread-system specific cleanups. */ thread_wait(p); /* * Give vm and machine-dependent layer a chance to free anything that * cpu_exit couldn't release while still running in process context. */ vm_waitproc(p); #ifdef MAC mac_proc_destroy(p); #endif /* * Free any domain policy that's still hiding around. */ vm_domain_policy_cleanup(&p->p_vm_dom_policy); KASSERT(FIRST_THREAD_IN_PROC(p), ("proc_reap: no residual thread!")); uma_zfree(proc_zone, p); atomic_add_int(&nprocs, -1); } static int proc_to_reap(struct thread *td, struct proc *p, idtype_t idtype, id_t id, int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo, int check_only) { struct rusage *rup; sx_assert(&proctree_lock, SA_XLOCKED); PROC_LOCK(p); switch (idtype) { case P_ALL: if (p->p_procdesc != NULL) { PROC_UNLOCK(p); return (0); } break; case P_PID: if (p->p_pid != (pid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_PGID: if (p->p_pgid != (pid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_SID: if (p->p_session->s_sid != (pid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_UID: if (p->p_ucred->cr_uid != (uid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_GID: if (p->p_ucred->cr_gid != (gid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_JAILID: if (p->p_ucred->cr_prison->pr_id != (int)id) { PROC_UNLOCK(p); return (0); } break; /* * It seems that the thread structures get zeroed out * at process exit. This makes it impossible to * support P_SETID, P_CID or P_CPUID. */ default: PROC_UNLOCK(p); return (0); } if (p_canwait(td, p)) { PROC_UNLOCK(p); return (0); } if (((options & WEXITED) == 0) && (p->p_state == PRS_ZOMBIE)) { PROC_UNLOCK(p); return (0); } /* * This special case handles a kthread spawned by linux_clone * (see linux_misc.c). The linux_wait4 and linux_waitpid * functions need to be able to distinguish between waiting * on a process and waiting on a thread. It is a thread if * p_sigparent is not SIGCHLD, and the WLINUXCLONE option * signifies we want to wait for threads and not processes. */ if ((p->p_sigparent != SIGCHLD) ^ ((options & WLINUXCLONE) != 0)) { PROC_UNLOCK(p); return (0); } if (siginfo != NULL) { bzero(siginfo, sizeof(*siginfo)); siginfo->si_errno = 0; /* * SUSv4 requires that the si_signo value is always * SIGCHLD. Obey it despite the rfork(2) interface * allows to request other signal for child exit * notification. */ siginfo->si_signo = SIGCHLD; /* * This is still a rough estimate. We will fix the * cases TRAPPED, STOPPED, and CONTINUED later. */ if (WCOREDUMP(p->p_xsig)) { siginfo->si_code = CLD_DUMPED; siginfo->si_status = WTERMSIG(p->p_xsig); } else if (WIFSIGNALED(p->p_xsig)) { siginfo->si_code = CLD_KILLED; siginfo->si_status = WTERMSIG(p->p_xsig); } else { siginfo->si_code = CLD_EXITED; siginfo->si_status = p->p_xexit; } siginfo->si_pid = p->p_pid; siginfo->si_uid = p->p_ucred->cr_uid; /* * The si_addr field would be useful additional * detail, but apparently the PC value may be lost * when we reach this point. bzero() above sets * siginfo->si_addr to NULL. */ } /* * There should be no reason to limit resources usage info to * exited processes only. A snapshot about any resources used * by a stopped process may be exactly what is needed. */ if (wrusage != NULL) { rup = &wrusage->wru_self; *rup = p->p_ru; PROC_STATLOCK(p); calcru(p, &rup->ru_utime, &rup->ru_stime); PROC_STATUNLOCK(p); rup = &wrusage->wru_children; *rup = p->p_stats->p_cru; calccru(p, &rup->ru_utime, &rup->ru_stime); } if (p->p_state == PRS_ZOMBIE && !check_only) { PROC_SLOCK(p); proc_reap(td, p, status, options); return (-1); } PROC_UNLOCK(p); return (1); } int kern_wait(struct thread *td, pid_t pid, int *status, int options, struct rusage *rusage) { struct __wrusage wru, *wrup; idtype_t idtype; id_t id; int ret; /* * Translate the special pid values into the (idtype, pid) * pair for kern_wait6. The WAIT_MYPGRP case is handled by * kern_wait6() on its own. */ if (pid == WAIT_ANY) { idtype = P_ALL; id = 0; } else if (pid < 0) { idtype = P_PGID; id = (id_t)-pid; } else { idtype = P_PID; id = (id_t)pid; } if (rusage != NULL) wrup = &wru; else wrup = NULL; /* * For backward compatibility we implicitly add flags WEXITED * and WTRAPPED here. */ options |= WEXITED | WTRAPPED; ret = kern_wait6(td, idtype, id, status, options, wrup, NULL); if (rusage != NULL) *rusage = wru.wru_self; return (ret); } int kern_wait6(struct thread *td, idtype_t idtype, id_t id, int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo) { struct proc *p, *q; pid_t pid; int error, nfound, ret; AUDIT_ARG_VALUE((int)idtype); /* XXX - This is likely wrong! */ AUDIT_ARG_PID((pid_t)id); /* XXX - This may be wrong! */ AUDIT_ARG_VALUE(options); q = td->td_proc; if ((pid_t)id == WAIT_MYPGRP && (idtype == P_PID || idtype == P_PGID)) { PROC_LOCK(q); id = (id_t)q->p_pgid; PROC_UNLOCK(q); idtype = P_PGID; } /* If we don't know the option, just return. */ if ((options & ~(WUNTRACED | WNOHANG | WCONTINUED | WNOWAIT | WEXITED | WTRAPPED | WLINUXCLONE)) != 0) return (EINVAL); if ((options & (WEXITED | WUNTRACED | WCONTINUED | WTRAPPED)) == 0) { /* * We will be unable to find any matching processes, * because there are no known events to look for. * Prefer to return error instead of blocking * indefinitely. */ return (EINVAL); } loop: if (q->p_flag & P_STATCHILD) { PROC_LOCK(q); q->p_flag &= ~P_STATCHILD; PROC_UNLOCK(q); } nfound = 0; sx_xlock(&proctree_lock); LIST_FOREACH(p, &q->p_children, p_sibling) { pid = p->p_pid; ret = proc_to_reap(td, p, idtype, id, status, options, wrusage, siginfo, 0); if (ret == 0) continue; else if (ret == 1) nfound++; else { td->td_retval[0] = pid; return (0); } PROC_LOCK(p); PROC_SLOCK(p); if ((options & WTRAPPED) != 0 && (p->p_flag & P_TRACED) != 0 && (p->p_flag & (P_STOPPED_TRACE | P_STOPPED_SIG)) != 0 && (p->p_suspcount == p->p_numthreads) && ((p->p_flag & P_WAITED) == 0)) { PROC_SUNLOCK(p); if ((options & WNOWAIT) == 0) p->p_flag |= P_WAITED; sx_xunlock(&proctree_lock); if (status != NULL) *status = W_STOPCODE(p->p_xsig); if (siginfo != NULL) { siginfo->si_status = p->p_xsig; siginfo->si_code = CLD_TRAPPED; } if ((options & WNOWAIT) == 0) { PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); } CTR4(KTR_PTRACE, "wait: returning trapped pid %d status %#x (xstat %d) xthread %d", p->p_pid, W_STOPCODE(p->p_xsig), p->p_xsig, p->p_xthread != NULL ? p->p_xthread->td_tid : -1); PROC_UNLOCK(p); td->td_retval[0] = pid; return (0); } if ((options & WUNTRACED) != 0 && (p->p_flag & P_STOPPED_SIG) != 0 && (p->p_suspcount == p->p_numthreads) && ((p->p_flag & P_WAITED) == 0)) { PROC_SUNLOCK(p); if ((options & WNOWAIT) == 0) p->p_flag |= P_WAITED; sx_xunlock(&proctree_lock); if (status != NULL) *status = W_STOPCODE(p->p_xsig); if (siginfo != NULL) { siginfo->si_status = p->p_xsig; siginfo->si_code = CLD_STOPPED; } if ((options & WNOWAIT) == 0) { PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); } PROC_UNLOCK(p); td->td_retval[0] = pid; return (0); } PROC_SUNLOCK(p); if ((options & WCONTINUED) != 0 && (p->p_flag & P_CONTINUED) != 0) { sx_xunlock(&proctree_lock); if ((options & WNOWAIT) == 0) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); } PROC_UNLOCK(p); if (status != NULL) *status = SIGCONT; if (siginfo != NULL) { siginfo->si_status = SIGCONT; siginfo->si_code = CLD_CONTINUED; } td->td_retval[0] = pid; return (0); } PROC_UNLOCK(p); } /* * Look in the orphans list too, to allow the parent to * collect it's child exit status even if child is being * debugged. * * Debugger detaches from the parent upon successful * switch-over from parent to child. At this point due to * re-parenting the parent loses the child to debugger and a * wait4(2) call would report that it has no children to wait * for. By maintaining a list of orphans we allow the parent * to successfully wait until the child becomes a zombie. */ if (nfound == 0) { LIST_FOREACH(p, &q->p_orphans, p_orphan) { ret = proc_to_reap(td, p, idtype, id, NULL, options, NULL, NULL, 1); if (ret != 0) { KASSERT(ret != -1, ("reaped an orphan (pid %d)", (int)td->td_retval[0])); nfound++; break; } } } if (nfound == 0) { sx_xunlock(&proctree_lock); return (ECHILD); } if (options & WNOHANG) { sx_xunlock(&proctree_lock); td->td_retval[0] = 0; return (0); } PROC_LOCK(q); sx_xunlock(&proctree_lock); if (q->p_flag & P_STATCHILD) { q->p_flag &= ~P_STATCHILD; error = 0; } else error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "wait", 0); PROC_UNLOCK(q); if (error) return (error); goto loop; } /* * Make process 'parent' the new parent of process 'child'. * Must be called with an exclusive hold of proctree lock. */ void proc_reparent(struct proc *child, struct proc *parent) { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(child, MA_OWNED); if (child->p_pptr == parent) return; PROC_LOCK(child->p_pptr); sigqueue_take(child->p_ksi); PROC_UNLOCK(child->p_pptr); LIST_REMOVE(child, p_sibling); LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); clear_orphan(child); if (child->p_flag & P_TRACED) { if (LIST_EMPTY(&child->p_pptr->p_orphans)) { child->p_treeflag |= P_TREE_FIRST_ORPHAN; LIST_INSERT_HEAD(&child->p_pptr->p_orphans, child, p_orphan); } else { LIST_INSERT_AFTER(LIST_FIRST(&child->p_pptr->p_orphans), child, p_orphan); } child->p_treeflag |= P_TREE_ORPHANED; } child->p_pptr = parent; } Index: head/sys/kern/kern_fork.c =================================================================== --- head/sys/kern/kern_fork.c (revision 292383) +++ head/sys/kern/kern_fork.c (revision 292384) @@ -1,1090 +1,1089 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_fork_func_t dtrace_fasttrap_fork; #endif SDT_PROVIDER_DECLARE(proc); -SDT_PROBE_DEFINE3(proc, kernel, , create, "struct proc *", - "struct proc *", "int"); +SDT_PROBE_DEFINE3(proc, , , create, "struct proc *", "struct proc *", "int"); #ifndef _SYS_SYSPROTO_H_ struct fork_args { int dummy; }; #endif /* ARGSUSED */ int sys_fork(struct thread *td, struct fork_args *uap) { int error; struct proc *p2; error = fork1(td, RFFDG | RFPROC, 0, &p2, NULL, 0, NULL); if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; } return (error); } /* ARGUSED */ int sys_pdfork(td, uap) struct thread *td; struct pdfork_args *uap; { int error, fd; struct proc *p2; /* * It is necessary to return fd by reference because 0 is a valid file * descriptor number, and the child needs to be able to distinguish * itself from the parent using the return value. */ error = fork1(td, RFFDG | RFPROC | RFPROCDESC, 0, &p2, &fd, uap->flags, NULL); if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; error = copyout(&fd, uap->fdp, sizeof(fd)); } return (error); } /* ARGSUSED */ int sys_vfork(struct thread *td, struct vfork_args *uap) { int error, flags; struct proc *p2; flags = RFFDG | RFPROC | RFPPWAIT | RFMEM; error = fork1(td, flags, 0, &p2, NULL, 0, NULL); if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; } return (error); } int sys_rfork(struct thread *td, struct rfork_args *uap) { struct proc *p2; int error; /* Don't allow kernel-only flags. */ if ((uap->flags & RFKERNELONLY) != 0) return (EINVAL); AUDIT_ARG_FFLAGS(uap->flags); error = fork1(td, uap->flags, 0, &p2, NULL, 0, NULL); if (error == 0) { td->td_retval[0] = p2 ? p2->p_pid : 0; td->td_retval[1] = 0; } return (error); } int nprocs = 1; /* process 0 */ int lastpid = 0; SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, "Last used PID"); /* * Random component to lastpid generation. We mix in a random factor to make * it a little harder to predict. We sanity check the modulus value to avoid * doing it in critical paths. Don't let it be too small or we pointlessly * waste randomness entropy, and don't let it be impossibly large. Using a * modulus that is too big causes a LOT more process table scans and slows * down fork processing as the pidchecked caching is defeated. */ static int randompid = 0; static int sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) { int error, pid; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error != 0) return(error); sx_xlock(&allproc_lock); pid = randompid; error = sysctl_handle_int(oidp, &pid, 0, req); if (error == 0 && req->newptr != NULL) { if (pid < 0 || pid > pid_max - 100) /* out of range */ pid = pid_max - 100; else if (pid < 2) /* NOP */ pid = 0; else if (pid < 100) /* Make it reasonable */ pid = 100; randompid = pid; } sx_xunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); static int fork_findpid(int flags) { struct proc *p; int trypid; static int pidchecked = 0; /* * Requires allproc_lock in order to iterate over the list * of processes, and proctree_lock to access p_pgrp. */ sx_assert(&allproc_lock, SX_LOCKED); sx_assert(&proctree_lock, SX_LOCKED); /* * Find an unused process ID. We remember a range of unused IDs * ready to use (from lastpid+1 through pidchecked-1). * * If RFHIGHPID is set (used during system boot), do not allocate * low-numbered pids. */ trypid = lastpid + 1; if (flags & RFHIGHPID) { if (trypid < 10) trypid = 10; } else { if (randompid) trypid += arc4random() % randompid; } retry: /* * If the process ID prototype has wrapped around, * restart somewhat above 0, as the low-numbered procs * tend to include daemons that don't exit. */ if (trypid >= pid_max) { trypid = trypid % pid_max; if (trypid < 100) trypid += 100; pidchecked = 0; } if (trypid >= pidchecked) { int doingzomb = 0; pidchecked = PID_MAX; /* * Scan the active and zombie procs to check whether this pid * is in use. Remember the lowest pid that's greater * than trypid, so we can avoid checking for a while. * * Avoid reuse of the process group id, session id or * the reaper subtree id. Note that for process group * and sessions, the amount of reserved pids is * limited by process limit. For the subtree ids, the * id is kept reserved only while there is a * non-reaped process in the subtree, so amount of * reserved pids is limited by process limit times * two. */ p = LIST_FIRST(&allproc); again: for (; p != NULL; p = LIST_NEXT(p, p_list)) { while (p->p_pid == trypid || p->p_reapsubtree == trypid || (p->p_pgrp != NULL && (p->p_pgrp->pg_id == trypid || (p->p_session != NULL && p->p_session->s_sid == trypid)))) { trypid++; if (trypid >= pidchecked) goto retry; } if (p->p_pid > trypid && pidchecked > p->p_pid) pidchecked = p->p_pid; if (p->p_pgrp != NULL) { if (p->p_pgrp->pg_id > trypid && pidchecked > p->p_pgrp->pg_id) pidchecked = p->p_pgrp->pg_id; if (p->p_session != NULL && p->p_session->s_sid > trypid && pidchecked > p->p_session->s_sid) pidchecked = p->p_session->s_sid; } } if (!doingzomb) { doingzomb = 1; p = LIST_FIRST(&zombproc); goto again; } } /* * RFHIGHPID does not mess with the lastpid counter during boot. */ if (flags & RFHIGHPID) pidchecked = 0; else lastpid = trypid; return (trypid); } static int fork_norfproc(struct thread *td, int flags) { int error; struct proc *p1; KASSERT((flags & RFPROC) == 0, ("fork_norfproc called with RFPROC set")); p1 = td->td_proc; if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) && (flags & (RFCFDG | RFFDG))) { PROC_LOCK(p1); if (thread_single(p1, SINGLE_BOUNDARY)) { PROC_UNLOCK(p1); return (ERESTART); } PROC_UNLOCK(p1); } error = vm_forkproc(td, NULL, NULL, NULL, flags); if (error) goto fail; /* * Close all file descriptors. */ if (flags & RFCFDG) { struct filedesc *fdtmp; fdtmp = fdinit(td->td_proc->p_fd, false); fdescfree(td); p1->p_fd = fdtmp; } /* * Unshare file descriptors (from parent). */ if (flags & RFFDG) fdunshare(td); fail: if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) && (flags & (RFCFDG | RFFDG))) { PROC_LOCK(p1); thread_single_end(p1, SINGLE_BOUNDARY); PROC_UNLOCK(p1); } return (error); } static void do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2, struct vmspace *vm2, int pdflags) { struct proc *p1, *pptr; int p2_held, trypid; struct filedesc *fd; struct filedesc_to_leader *fdtol; struct sigacts *newsigacts; sx_assert(&proctree_lock, SX_SLOCKED); sx_assert(&allproc_lock, SX_XLOCKED); p2_held = 0; p1 = td->td_proc; trypid = fork_findpid(flags); sx_sunlock(&proctree_lock); p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = trypid; AUDIT_ARG_PID(p2->p_pid); LIST_INSERT_HEAD(&allproc, p2, p_list); allproc_gen++; LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); tidhash_add(td2); PROC_LOCK(p2); PROC_LOCK(p1); sx_xunlock(&allproc_lock); bcopy(&p1->p_startcopy, &p2->p_startcopy, __rangeof(struct proc, p_startcopy, p_endcopy)); pargs_hold(p2->p_args); PROC_UNLOCK(p1); bzero(&p2->p_startzero, __rangeof(struct proc, p_startzero, p_endzero)); /* Tell the prison that we exist. */ prison_proc_hold(p2->p_ucred->cr_prison); PROC_UNLOCK(p2); /* * Malloc things while we don't hold any locks. */ if (flags & RFSIGSHARE) newsigacts = NULL; else newsigacts = sigacts_alloc(); /* * Copy filedesc. */ if (flags & RFCFDG) { fd = fdinit(p1->p_fd, false); fdtol = NULL; } else if (flags & RFFDG) { fd = fdcopy(p1->p_fd); fdtol = NULL; } else { fd = fdshare(p1->p_fd); if (p1->p_fdtol == NULL) p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL, p1->p_leader); if ((flags & RFTHREAD) != 0) { /* * Shared file descriptor table, and shared * process leaders. */ fdtol = p1->p_fdtol; FILEDESC_XLOCK(p1->p_fd); fdtol->fdl_refcount++; FILEDESC_XUNLOCK(p1->p_fd); } else { /* * Shared file descriptor table, and different * process leaders. */ fdtol = filedesc_to_leader_alloc(p1->p_fdtol, p1->p_fd, p2); } } /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. */ PROC_LOCK(p2); PROC_LOCK(p1); bzero(&td2->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &td2->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name)); td2->td_sigstk = td->td_sigstk; td2->td_flags = TDF_INMEM; td2->td_lend_user_pri = PRI_MAX; #ifdef VIMAGE td2->td_vnet = NULL; td2->td_vnet_lpush = NULL; #endif /* * Allow the scheduler to initialize the child. */ thread_lock(td); sched_fork(td, td2); thread_unlock(td); /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. */ p2->p_flag = P_INMEM; p2->p_flag2 = p1->p_flag2 & (P2_NOTRACE | P2_NOTRACE_EXEC); p2->p_swtick = ticks; if (p1->p_flag & P_PROFIL) startprofclock(p2); /* * Whilst the proc lock is held, copy the VM domain data out * using the VM domain method. */ vm_domain_policy_init(&p2->p_vm_dom_policy); vm_domain_policy_localcopy(&p2->p_vm_dom_policy, &p1->p_vm_dom_policy); if (flags & RFSIGSHARE) { p2->p_sigacts = sigacts_hold(p1->p_sigacts); } else { sigacts_copy(newsigacts, p1->p_sigacts); p2->p_sigacts = newsigacts; } if (flags & RFTSIGZMB) p2->p_sigparent = RFTSIGNUM(flags); else if (flags & RFLINUXTHPN) p2->p_sigparent = SIGUSR1; else p2->p_sigparent = SIGCHLD; p2->p_textvp = p1->p_textvp; p2->p_fd = fd; p2->p_fdtol = fdtol; if (p1->p_flag2 & P2_INHERIT_PROTECTED) { p2->p_flag |= P_PROTECTED; p2->p_flag2 |= P2_INHERIT_PROTECTED; } /* * p_limit is copy-on-write. Bump its refcount. */ lim_fork(p1, p2); thread_cow_get_proc(td2, p2); pstats_fork(p1->p_stats, p2->p_stats); PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* Bump references to the text vnode (for procfs). */ if (p2->p_textvp) vref(p2->p_textvp); /* * Set up linkage for kernel based threading. */ if ((flags & RFTHREAD) != 0) { mtx_lock(&ppeers_lock); p2->p_peers = p1->p_peers; p1->p_peers = p2; p2->p_leader = p1->p_leader; mtx_unlock(&ppeers_lock); PROC_LOCK(p1->p_leader); if ((p1->p_leader->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(p1->p_leader); /* * The task leader is exiting, so process p1 is * going to be killed shortly. Since p1 obviously * isn't dead yet, we know that the leader is either * sending SIGKILL's to all the processes in this * task or is sleeping waiting for all the peers to * exit. We let p1 complete the fork, but we need * to go ahead and kill the new process p2 since * the task leader may not get a chance to send * SIGKILL to it. We leave it on the list so that * the task leader will wait for this new process * to commit suicide. */ PROC_LOCK(p2); kern_psignal(p2, SIGKILL); PROC_UNLOCK(p2); } else PROC_UNLOCK(p1->p_leader); } else { p2->p_peers = NULL; p2->p_leader = p2; } sx_xlock(&proctree_lock); PGRP_LOCK(p1->p_pgrp); PROC_LOCK(p2); PROC_LOCK(p1); /* * Preserve some more flags in subprocess. P_PROFIL has already * been preserved. */ p2->p_flag |= p1->p_flag & P_SUGID; td2->td_pflags |= (td->td_pflags & TDP_ALTSTACK) | TDP_FORKING; SESS_LOCK(p1->p_session); if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) p2->p_flag |= P_CONTROLT; SESS_UNLOCK(p1->p_session); if (flags & RFPPWAIT) p2->p_flag |= P_PPWAIT; p2->p_pgrp = p1->p_pgrp; LIST_INSERT_AFTER(p1, p2, p_pglist); PGRP_UNLOCK(p1->p_pgrp); LIST_INIT(&p2->p_children); LIST_INIT(&p2->p_orphans); callout_init_mtx(&p2->p_itcallout, &p2->p_mtx, 0); /* * If PF_FORK is set, the child process inherits the * procfs ioctl flags from its parent. */ if (p1->p_pfsflags & PF_FORK) { p2->p_stops = p1->p_stops; p2->p_pfsflags = p1->p_pfsflags; } /* * This begins the section where we must prevent the parent * from being swapped. */ _PHOLD(p1); PROC_UNLOCK(p1); /* * Attach the new process to its parent. * * If RFNOWAIT is set, the newly created process becomes a child * of init. This effectively disassociates the child from the * parent. */ if ((flags & RFNOWAIT) != 0) { pptr = p1->p_reaper; p2->p_reaper = pptr; } else { p2->p_reaper = (p1->p_treeflag & P_TREE_REAPER) != 0 ? p1 : p1->p_reaper; pptr = p1; } p2->p_pptr = pptr; LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); LIST_INIT(&p2->p_reaplist); LIST_INSERT_HEAD(&p2->p_reaper->p_reaplist, p2, p_reapsibling); if (p2->p_reaper == p1) p2->p_reapsubtree = p2->p_pid; sx_xunlock(&proctree_lock); /* Inform accounting that we have forked. */ p2->p_acflag = AFORK; PROC_UNLOCK(p2); #ifdef KTRACE ktrprocfork(p1, p2); #endif /* * Finish creating the child process. It will return via a different * execution path later. (ie: directly into user mode) */ vm_forkproc(td, p2, td2, vm2, flags); if (flags == (RFFDG | RFPROC)) { PCPU_INC(cnt.v_forks); PCPU_ADD(cnt.v_forkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { PCPU_INC(cnt.v_vforks); PCPU_ADD(cnt.v_vforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (p1 == &proc0) { PCPU_INC(cnt.v_kthreads); PCPU_ADD(cnt.v_kthreadpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else { PCPU_INC(cnt.v_rforks); PCPU_ADD(cnt.v_rforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } /* * Associate the process descriptor with the process before anything * can happen that might cause that process to need the descriptor. * However, don't do this until after fork(2) can no longer fail. */ if (flags & RFPROCDESC) procdesc_new(p2, pdflags); /* * Both processes are set up, now check if any loadable modules want * to adjust anything. */ EVENTHANDLER_INVOKE(process_fork, p1, p2, flags); /* * Set the child start time and mark the process as being complete. */ PROC_LOCK(p2); PROC_LOCK(p1); microuptime(&p2->p_stats->p_start); PROC_SLOCK(p2); p2->p_state = PRS_NORMAL; PROC_SUNLOCK(p2); #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the new process so that any * tracepoints inherited from the parent can be removed. We have to do * this only after p_state is PRS_NORMAL since the fasttrap module will * use pfind() later on. */ if ((flags & RFMEM) == 0 && dtrace_fasttrap_fork) dtrace_fasttrap_fork(p1, p2); #endif if ((p1->p_flag & (P_TRACED | P_FOLLOWFORK)) == (P_TRACED | P_FOLLOWFORK)) { /* * Arrange for debugger to receive the fork event. * * We can report PL_FLAG_FORKED regardless of * P_FOLLOWFORK settings, but it does not make a sense * for runaway child. */ td->td_dbgflags |= TDB_FORK; td->td_dbg_forked = p2->p_pid; td2->td_dbgflags |= TDB_STOPATFORK; _PHOLD(p2); p2_held = 1; } if (flags & RFPPWAIT) { td->td_pflags |= TDP_RFPPWAIT; td->td_rfppwait_p = p2; } PROC_UNLOCK(p2); if ((flags & RFSTOPPED) == 0) { /* * If RFSTOPPED not requested, make child runnable and * add to run queue. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); thread_unlock(td2); } /* * Now can be swapped. */ _PRELE(p1); PROC_UNLOCK(p1); /* * Tell any interested parties about the new process. */ knote_fork(&p1->p_klist, p2->p_pid); - SDT_PROBE3(proc, kernel, , create, p2, p1, flags); + SDT_PROBE3(proc, , , create, p2, p1, flags); /* * Wait until debugger is attached to child. */ PROC_LOCK(p2); while ((td2->td_dbgflags & TDB_STOPATFORK) != 0) cv_wait(&p2->p_dbgwait, &p2->p_mtx); if (p2_held) _PRELE(p2); PROC_UNLOCK(p2); } int fork1(struct thread *td, int flags, int pages, struct proc **procp, int *procdescp, int pdflags, struct filecaps *fcaps) { struct proc *p1, *newproc; struct thread *td2; struct vmspace *vm2; struct file *fp_procdesc; vm_ooffset_t mem_charged; int error, nprocs_new, ok; static int curfail; static struct timeval lastfail; /* Check for the undefined or unimplemented flags. */ if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0) return (EINVAL); /* Signal value requires RFTSIGZMB. */ if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0) return (EINVAL); /* Can't copy and clear. */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); /* Check the validity of the signal number. */ if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG) return (EINVAL); if ((flags & RFPROCDESC) != 0) { /* Can't not create a process yet get a process descriptor. */ if ((flags & RFPROC) == 0) return (EINVAL); /* Must provide a place to put a procdesc if creating one. */ if (procdescp == NULL) return (EINVAL); } p1 = td->td_proc; /* * Here we don't create a new process, but we divorce * certain parts of a process from itself. */ if ((flags & RFPROC) == 0) { *procp = NULL; return (fork_norfproc(td, flags)); } fp_procdesc = NULL; newproc = NULL; vm2 = NULL; /* * Increment the nprocs resource before allocations occur. * Although process entries are dynamically created, we still * keep a global limit on the maximum number we will * create. There are hard-limits as to the number of processes * that can run, established by the KVA and memory usage for * the process data. * * Don't allow a nonprivileged user to use the last ten * processes; don't let root exceed the limit. */ nprocs_new = atomic_fetchadd_int(&nprocs, 1) + 1; if ((nprocs_new >= maxproc - 10 && priv_check_cred(td->td_ucred, PRIV_MAXPROC, 0) != 0) || nprocs_new >= maxproc) { error = EAGAIN; sx_xlock(&allproc_lock); if (ppsratecheck(&lastfail, &curfail, 1)) { printf("maxproc limit exceeded by uid %u (pid %d); " "see tuning(7) and login.conf(5)\n", td->td_ucred->cr_ruid, p1->p_pid); } sx_xunlock(&allproc_lock); goto fail2; } /* * If required, create a process descriptor in the parent first; we * will abandon it if something goes wrong. We don't finit() until * later. */ if (flags & RFPROCDESC) { error = falloc_caps(td, &fp_procdesc, procdescp, 0, fcaps); if (error != 0) goto fail2; } mem_charged = 0; if (pages == 0) pages = kstack_pages; /* Allocate new proc. */ newproc = uma_zalloc(proc_zone, M_WAITOK); td2 = FIRST_THREAD_IN_PROC(newproc); if (td2 == NULL) { td2 = thread_alloc(pages); if (td2 == NULL) { error = ENOMEM; goto fail2; } proc_linkup(newproc, td2); } else { if (td2->td_kstack == 0 || td2->td_kstack_pages != pages) { if (td2->td_kstack != 0) vm_thread_dispose(td2); if (!thread_alloc_stack(td2, pages)) { error = ENOMEM; goto fail2; } } } if ((flags & RFMEM) == 0) { vm2 = vmspace_fork(p1->p_vmspace, &mem_charged); if (vm2 == NULL) { error = ENOMEM; goto fail2; } if (!swap_reserve(mem_charged)) { /* * The swap reservation failed. The accounting * from the entries of the copied vm2 will be * substracted in vmspace_free(), so force the * reservation there. */ swap_reserve_force(mem_charged); error = ENOMEM; goto fail2; } } else vm2 = NULL; /* * XXX: This is ugly; when we copy resource usage, we need to bump * per-cred resource counters. */ proc_set_cred_init(newproc, crhold(td->td_ucred)); /* * Initialize resource accounting for the child process. */ error = racct_proc_fork(p1, newproc); if (error != 0) { error = EAGAIN; goto fail1; } #ifdef MAC mac_proc_init(newproc); #endif knlist_init_mtx(&newproc->p_klist, &newproc->p_mtx); STAILQ_INIT(&newproc->p_ktr); /* We have to lock the process tree while we look for a pid. */ sx_slock(&proctree_lock); sx_xlock(&allproc_lock); /* * Increment the count of procs running with this uid. Don't allow * a nonprivileged user to exceed their current limit. * * XXXRW: Can we avoid privilege here if it's not needed? */ error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0); if (error == 0) ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0); else { ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPROC)); } if (ok) { do_fork(td, flags, newproc, td2, vm2, pdflags); /* * Return child proc pointer to parent. */ *procp = newproc; if (flags & RFPROCDESC) { procdesc_finit(newproc->p_procdesc, fp_procdesc); fdrop(fp_procdesc, td); } racct_proc_fork_done(newproc); return (0); } error = EAGAIN; sx_sunlock(&proctree_lock); sx_xunlock(&allproc_lock); #ifdef MAC mac_proc_destroy(newproc); #endif racct_proc_exit(newproc); fail1: crfree(newproc->p_ucred); newproc->p_ucred = NULL; fail2: if (vm2 != NULL) vmspace_free(vm2); uma_zfree(proc_zone, newproc); if ((flags & RFPROCDESC) != 0 && fp_procdesc != NULL) { fdclose(td, fp_procdesc, *procdescp); fdrop(fp_procdesc, td); } atomic_add_int(&nprocs, -1); pause("fork", hz / 2); return (error); } /* * Handle the return of a child process from fork1(). This function * is called from the MD fork_trampoline() entry point. */ void fork_exit(void (*callout)(void *, struct trapframe *), void *arg, struct trapframe *frame) { struct proc *p; struct thread *td; struct thread *dtd; td = curthread; p = td->td_proc; KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new")); CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)", td, td->td_sched, p->p_pid, td->td_name); sched_fork_exit(td); /* * Processes normally resume in mi_switch() after being * cpu_switch()'ed to, but when children start up they arrive here * instead, so we must do much the same things as mi_switch() would. */ if ((dtd = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(dtd); } thread_unlock(td); /* * cpu_set_fork_handler intercepts this function call to * have this call a non-return function to stay in kernel mode. * initproc has its own fork handler, but it does return. */ KASSERT(callout != NULL, ("NULL callout in fork_exit")); callout(arg, frame); /* * Check if a kernel thread misbehaved and returned from its main * function. */ if (p->p_flag & P_KTHREAD) { printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n", td->td_name, p->p_pid); kproc_exit(0); } mtx_assert(&Giant, MA_NOTOWNED); if (p->p_sysent->sv_schedtail != NULL) (p->p_sysent->sv_schedtail)(td); td->td_pflags &= ~TDP_FORKING; } /* * Simplified back end of syscall(), used when returning from fork() * directly into user mode. Giant is not held on entry, and must not * be held on return. This function is passed in to fork_exit() as the * first parameter and is called when returning to a new userland process. */ void fork_return(struct thread *td, struct trapframe *frame) { struct proc *p, *dbg; p = td->td_proc; if (td->td_dbgflags & TDB_STOPATFORK) { sx_xlock(&proctree_lock); PROC_LOCK(p); if ((p->p_pptr->p_flag & (P_TRACED | P_FOLLOWFORK)) == (P_TRACED | P_FOLLOWFORK)) { /* * If debugger still wants auto-attach for the * parent's children, do it now. */ dbg = p->p_pptr->p_pptr; p->p_flag |= P_TRACED; p->p_oppid = p->p_pptr->p_pid; CTR2(KTR_PTRACE, "fork_return: attaching to new child pid %d: oppid %d", p->p_pid, p->p_oppid); proc_reparent(p, dbg); sx_xunlock(&proctree_lock); td->td_dbgflags |= TDB_CHILD | TDB_SCX; ptracestop(td, SIGSTOP); td->td_dbgflags &= ~(TDB_CHILD | TDB_SCX); } else { /* * ... otherwise clear the request. */ sx_xunlock(&proctree_lock); td->td_dbgflags &= ~TDB_STOPATFORK; cv_broadcast(&p->p_dbgwait); } PROC_UNLOCK(p); } else if (p->p_flag & P_TRACED) { /* * This is the start of a new thread in a traced * process. Report a system call exit event. */ PROC_LOCK(p); td->td_dbgflags |= TDB_SCX; _STOPEVENT(p, S_SCX, td->td_dbg_sc_code); if ((p->p_stops & S_PT_SCX) != 0) ptracestop(td, SIGTRAP); td->td_dbgflags &= ~TDB_SCX; PROC_UNLOCK(p); } userret(td, frame); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(SYS_fork, 0, 0); #endif } Index: head/sys/kern/kern_proc.c =================================================================== --- head/sys/kern/kern_proc.c (revision 292383) +++ head/sys/kern/kern_proc.c (revision 292384) @@ -1,3056 +1,3053 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_ktrace.h" #include "opt_kstack_pages.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #endif SDT_PROVIDER_DEFINE(proc); -SDT_PROBE_DEFINE4(proc, kernel, ctor, entry, "struct proc *", "int", - "void *", "int"); -SDT_PROBE_DEFINE4(proc, kernel, ctor, return, "struct proc *", "int", - "void *", "int"); -SDT_PROBE_DEFINE4(proc, kernel, dtor, entry, "struct proc *", "int", - "void *", "struct thread *"); -SDT_PROBE_DEFINE3(proc, kernel, dtor, return, "struct proc *", "int", - "void *"); -SDT_PROBE_DEFINE3(proc, kernel, init, entry, "struct proc *", "int", +SDT_PROBE_DEFINE4(proc, , ctor, entry, "struct proc *", "int", "void *", "int"); -SDT_PROBE_DEFINE3(proc, kernel, init, return, "struct proc *", "int", +SDT_PROBE_DEFINE4(proc, , ctor, return, "struct proc *", "int", "void *", "int"); +SDT_PROBE_DEFINE4(proc, , dtor, entry, "struct proc *", "int", "void *", + "struct thread *"); +SDT_PROBE_DEFINE3(proc, , dtor, return, "struct proc *", "int", "void *"); +SDT_PROBE_DEFINE3(proc, , init, entry, "struct proc *", "int", "int"); +SDT_PROBE_DEFINE3(proc, , init, return, "struct proc *", "int", "int"); MALLOC_DEFINE(M_PGRP, "pgrp", "process group header"); MALLOC_DEFINE(M_SESSION, "session", "session header"); static MALLOC_DEFINE(M_PROC, "proc", "Proc structures"); MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures"); static void doenterpgrp(struct proc *, struct pgrp *); static void orphanpg(struct pgrp *pg); static void fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp); static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp); static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp, int preferthread); static void pgadjustjobc(struct pgrp *pgrp, int entering); static void pgdelete(struct pgrp *); static int proc_ctor(void *mem, int size, void *arg, int flags); static void proc_dtor(void *mem, int size, void *arg); static int proc_init(void *mem, int size, int flags); static void proc_fini(void *mem, int size); static void pargs_free(struct pargs *pa); static struct proc *zpfind_locked(pid_t pid); /* * Other process lists */ struct pidhashhead *pidhashtbl; u_long pidhash; struct pgrphashhead *pgrphashtbl; u_long pgrphash; struct proclist allproc; struct proclist zombproc; struct sx allproc_lock; struct sx proctree_lock; struct mtx ppeers_lock; uma_zone_t proc_zone; /* * The offset of various fields in struct proc and struct thread. * These are used by kernel debuggers to enumerate kernel threads and * processes. */ const int proc_off_p_pid = offsetof(struct proc, p_pid); const int proc_off_p_comm = offsetof(struct proc, p_comm); const int proc_off_p_list = offsetof(struct proc, p_list); const int proc_off_p_threads = offsetof(struct proc, p_threads); const int thread_off_td_tid = offsetof(struct thread, td_tid); const int thread_off_td_name = offsetof(struct thread, td_name); const int thread_off_td_oncpu = offsetof(struct thread, td_oncpu); const int thread_off_td_pcb = offsetof(struct thread, td_pcb); const int thread_off_td_plist = offsetof(struct thread, td_plist); int kstack_pages = KSTACK_PAGES; SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0, "Kernel stack size in pages"); static int vmmap_skip_res_cnt = 0; SYSCTL_INT(_kern, OID_AUTO, proc_vmmap_skip_resident_count, CTLFLAG_RW, &vmmap_skip_res_cnt, 0, "Skip calculation of the pages resident count in kern.proc.vmmap"); CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE); #ifdef COMPAT_FREEBSD32 CTASSERT(sizeof(struct kinfo_proc32) == KINFO_PROC32_SIZE); #endif /* * Initialize global process hashing structures. */ void procinit() { sx_init(&allproc_lock, "allproc"); sx_init(&proctree_lock, "proctree"); mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF); LIST_INIT(&allproc); LIST_INIT(&zombproc); pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash); pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash); proc_zone = uma_zcreate("PROC", sched_sizeof_proc(), proc_ctor, proc_dtor, proc_init, proc_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uihashinit(); } /* * Prepare a proc for use. */ static int proc_ctor(void *mem, int size, void *arg, int flags) { struct proc *p; p = (struct proc *)mem; - SDT_PROBE4(proc, kernel, ctor , entry, p, size, arg, flags); + SDT_PROBE4(proc, , ctor , entry, p, size, arg, flags); EVENTHANDLER_INVOKE(process_ctor, p); - SDT_PROBE4(proc, kernel, ctor , return, p, size, arg, flags); + SDT_PROBE4(proc, , ctor , return, p, size, arg, flags); return (0); } /* * Reclaim a proc after use. */ static void proc_dtor(void *mem, int size, void *arg) { struct proc *p; struct thread *td; /* INVARIANTS checks go here */ p = (struct proc *)mem; td = FIRST_THREAD_IN_PROC(p); - SDT_PROBE4(proc, kernel, dtor, entry, p, size, arg, td); + SDT_PROBE4(proc, , dtor, entry, p, size, arg, td); if (td != NULL) { #ifdef INVARIANTS KASSERT((p->p_numthreads == 1), ("bad number of threads in exiting process")); KASSERT(STAILQ_EMPTY(&p->p_ktr), ("proc_dtor: non-empty p_ktr")); #endif /* Free all OSD associated to this thread. */ osd_thread_exit(td); } EVENTHANDLER_INVOKE(process_dtor, p); if (p->p_ksi != NULL) KASSERT(! KSI_ONQ(p->p_ksi), ("SIGCHLD queue")); - SDT_PROBE3(proc, kernel, dtor, return, p, size, arg); + SDT_PROBE3(proc, , dtor, return, p, size, arg); } /* * Initialize type-stable parts of a proc (when newly created). */ static int proc_init(void *mem, int size, int flags) { struct proc *p; p = (struct proc *)mem; - SDT_PROBE3(proc, kernel, init, entry, p, size, flags); + SDT_PROBE3(proc, , init, entry, p, size, flags); p->p_sched = (struct p_sched *)&p[1]; mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK | MTX_NEW); mtx_init(&p->p_slock, "process slock", NULL, MTX_SPIN | MTX_NEW); mtx_init(&p->p_statmtx, "pstatl", NULL, MTX_SPIN | MTX_NEW); mtx_init(&p->p_itimmtx, "pitiml", NULL, MTX_SPIN | MTX_NEW); mtx_init(&p->p_profmtx, "pprofl", NULL, MTX_SPIN | MTX_NEW); cv_init(&p->p_pwait, "ppwait"); cv_init(&p->p_dbgwait, "dbgwait"); TAILQ_INIT(&p->p_threads); /* all threads in proc */ EVENTHANDLER_INVOKE(process_init, p); p->p_stats = pstats_alloc(); - SDT_PROBE3(proc, kernel, init, return, p, size, flags); + SDT_PROBE3(proc, , init, return, p, size, flags); return (0); } /* * UMA should ensure that this function is never called. * Freeing a proc structure would violate type stability. */ static void proc_fini(void *mem, int size) { #ifdef notnow struct proc *p; p = (struct proc *)mem; EVENTHANDLER_INVOKE(process_fini, p); pstats_free(p->p_stats); thread_free(FIRST_THREAD_IN_PROC(p)); mtx_destroy(&p->p_mtx); if (p->p_ksi != NULL) ksiginfo_free(p->p_ksi); #else panic("proc reclaimed"); #endif } /* * Is p an inferior of the current process? */ int inferior(struct proc *p) { sx_assert(&proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); for (; p != curproc; p = proc_realparent(p)) { if (p->p_pid == 0) return (0); } return (1); } struct proc * pfind_locked(pid_t pid) { struct proc *p; sx_assert(&allproc_lock, SX_LOCKED); LIST_FOREACH(p, PIDHASH(pid), p_hash) { if (p->p_pid == pid) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); p = NULL; } break; } } return (p); } /* * Locate a process by number; return only "live" processes -- i.e., neither * zombies nor newly born but incompletely initialized processes. By not * returning processes in the PRS_NEW state, we allow callers to avoid * testing for that condition to avoid dereferencing p_ucred, et al. */ struct proc * pfind(pid_t pid) { struct proc *p; sx_slock(&allproc_lock); p = pfind_locked(pid); sx_sunlock(&allproc_lock); return (p); } static struct proc * pfind_tid_locked(pid_t tid) { struct proc *p; struct thread *td; sx_assert(&allproc_lock, SX_LOCKED); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } FOREACH_THREAD_IN_PROC(p, td) { if (td->td_tid == tid) goto found; } PROC_UNLOCK(p); } found: return (p); } /* * Locate a process group by number. * The caller must hold proctree_lock. */ struct pgrp * pgfind(pgid) register pid_t pgid; { register struct pgrp *pgrp; sx_assert(&proctree_lock, SX_LOCKED); LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) { if (pgrp->pg_id == pgid) { PGRP_LOCK(pgrp); return (pgrp); } } return (NULL); } /* * Locate process and do additional manipulations, depending on flags. */ int pget(pid_t pid, int flags, struct proc **pp) { struct proc *p; int error; sx_slock(&allproc_lock); if (pid <= PID_MAX) { p = pfind_locked(pid); if (p == NULL && (flags & PGET_NOTWEXIT) == 0) p = zpfind_locked(pid); } else if ((flags & PGET_NOTID) == 0) { p = pfind_tid_locked(pid); } else { p = NULL; } sx_sunlock(&allproc_lock); if (p == NULL) return (ESRCH); if ((flags & PGET_CANSEE) != 0) { error = p_cansee(curthread, p); if (error != 0) goto errout; } if ((flags & PGET_CANDEBUG) != 0) { error = p_candebug(curthread, p); if (error != 0) goto errout; } if ((flags & PGET_ISCURRENT) != 0 && curproc != p) { error = EPERM; goto errout; } if ((flags & PGET_NOTWEXIT) != 0 && (p->p_flag & P_WEXIT) != 0) { error = ESRCH; goto errout; } if ((flags & PGET_NOTINEXEC) != 0 && (p->p_flag & P_INEXEC) != 0) { /* * XXXRW: Not clear ESRCH is the right error during proc * execve(). */ error = ESRCH; goto errout; } if ((flags & PGET_HOLD) != 0) { _PHOLD(p); PROC_UNLOCK(p); } *pp = p; return (0); errout: PROC_UNLOCK(p); return (error); } /* * Create a new process group. * pgid must be equal to the pid of p. * Begin a new session if required. */ int enterpgrp(p, pgid, pgrp, sess) register struct proc *p; pid_t pgid; struct pgrp *pgrp; struct session *sess; { sx_assert(&proctree_lock, SX_XLOCKED); KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL")); KASSERT(p->p_pid == pgid, ("enterpgrp: new pgrp and pid != pgid")); KASSERT(pgfind(pgid) == NULL, ("enterpgrp: pgrp with pgid exists")); KASSERT(!SESS_LEADER(p), ("enterpgrp: session leader attempted setpgrp")); mtx_init(&pgrp->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); if (sess != NULL) { /* * new session */ mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF); PROC_LOCK(p); p->p_flag &= ~P_CONTROLT; PROC_UNLOCK(p); PGRP_LOCK(pgrp); sess->s_leader = p; sess->s_sid = p->p_pid; refcount_init(&sess->s_count, 1); sess->s_ttyvp = NULL; sess->s_ttydp = NULL; sess->s_ttyp = NULL; bcopy(p->p_session->s_login, sess->s_login, sizeof(sess->s_login)); pgrp->pg_session = sess; KASSERT(p == curproc, ("enterpgrp: mksession and p != curproc")); } else { pgrp->pg_session = p->p_session; sess_hold(pgrp->pg_session); PGRP_LOCK(pgrp); } pgrp->pg_id = pgid; LIST_INIT(&pgrp->pg_members); /* * As we have an exclusive lock of proctree_lock, * this should not deadlock. */ LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash); pgrp->pg_jobc = 0; SLIST_INIT(&pgrp->pg_sigiolst); PGRP_UNLOCK(pgrp); doenterpgrp(p, pgrp); return (0); } /* * Move p to an existing process group */ int enterthispgrp(p, pgrp) register struct proc *p; struct pgrp *pgrp; { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED); KASSERT(pgrp->pg_session == p->p_session, ("%s: pgrp's session %p, p->p_session %p.\n", __func__, pgrp->pg_session, p->p_session)); KASSERT(pgrp != p->p_pgrp, ("%s: p belongs to pgrp.", __func__)); doenterpgrp(p, pgrp); return (0); } /* * Move p to a process group */ static void doenterpgrp(p, pgrp) struct proc *p; struct pgrp *pgrp; { struct pgrp *savepgrp; sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED); savepgrp = p->p_pgrp; /* * Adjust eligibility of affected pgrps to participate in job control. * Increment eligibility counts before decrementing, otherwise we * could reach 0 spuriously during the first call. */ fixjobc(p, pgrp, 1); fixjobc(p, p->p_pgrp, 0); PGRP_LOCK(pgrp); PGRP_LOCK(savepgrp); PROC_LOCK(p); LIST_REMOVE(p, p_pglist); p->p_pgrp = pgrp; PROC_UNLOCK(p); LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist); PGRP_UNLOCK(savepgrp); PGRP_UNLOCK(pgrp); if (LIST_EMPTY(&savepgrp->pg_members)) pgdelete(savepgrp); } /* * remove process from process group */ int leavepgrp(p) register struct proc *p; { struct pgrp *savepgrp; sx_assert(&proctree_lock, SX_XLOCKED); savepgrp = p->p_pgrp; PGRP_LOCK(savepgrp); PROC_LOCK(p); LIST_REMOVE(p, p_pglist); p->p_pgrp = NULL; PROC_UNLOCK(p); PGRP_UNLOCK(savepgrp); if (LIST_EMPTY(&savepgrp->pg_members)) pgdelete(savepgrp); return (0); } /* * delete a process group */ static void pgdelete(pgrp) register struct pgrp *pgrp; { struct session *savesess; struct tty *tp; sx_assert(&proctree_lock, SX_XLOCKED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pgid. */ funsetownlst(&pgrp->pg_sigiolst); PGRP_LOCK(pgrp); tp = pgrp->pg_session->s_ttyp; LIST_REMOVE(pgrp, pg_hash); savesess = pgrp->pg_session; PGRP_UNLOCK(pgrp); /* Remove the reference to the pgrp before deallocating it. */ if (tp != NULL) { tty_lock(tp); tty_rel_pgrp(tp, pgrp); } mtx_destroy(&pgrp->pg_mtx); free(pgrp, M_PGRP); sess_release(savesess); } static void pgadjustjobc(pgrp, entering) struct pgrp *pgrp; int entering; { PGRP_LOCK(pgrp); if (entering) pgrp->pg_jobc++; else { --pgrp->pg_jobc; if (pgrp->pg_jobc == 0) orphanpg(pgrp); } PGRP_UNLOCK(pgrp); } /* * Adjust pgrp jobc counters when specified process changes process group. * We count the number of processes in each process group that "qualify" * the group for terminal job control (those with a parent in a different * process group of the same session). If that count reaches zero, the * process group becomes orphaned. Check both the specified process' * process group and that of its children. * entering == 0 => p is leaving specified group. * entering == 1 => p is entering specified group. */ void fixjobc(p, pgrp, entering) register struct proc *p; register struct pgrp *pgrp; int entering; { register struct pgrp *hispgrp; register struct session *mysession; sx_assert(&proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); /* * Check p's parent to see whether p qualifies its own process * group; if so, adjust count for p's process group. */ mysession = pgrp->pg_session; if ((hispgrp = p->p_pptr->p_pgrp) != pgrp && hispgrp->pg_session == mysession) pgadjustjobc(pgrp, entering); /* * Check this process' children to see whether they qualify * their process groups; if so, adjust counts for children's * process groups. */ LIST_FOREACH(p, &p->p_children, p_sibling) { hispgrp = p->p_pgrp; if (hispgrp == pgrp || hispgrp->pg_session != mysession) continue; PROC_LOCK(p); if (p->p_state == PRS_ZOMBIE) { PROC_UNLOCK(p); continue; } PROC_UNLOCK(p); pgadjustjobc(hispgrp, entering); } } /* * A process group has become orphaned; * if there are any stopped processes in the group, * hang-up all process in that group. */ static void orphanpg(pg) struct pgrp *pg; { register struct proc *p; PGRP_LOCK_ASSERT(pg, MA_OWNED); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (P_SHOULDSTOP(p) == P_STOPPED_SIG) { PROC_UNLOCK(p); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); kern_psignal(p, SIGHUP); kern_psignal(p, SIGCONT); PROC_UNLOCK(p); } return; } PROC_UNLOCK(p); } } void sess_hold(struct session *s) { refcount_acquire(&s->s_count); } void sess_release(struct session *s) { if (refcount_release(&s->s_count)) { if (s->s_ttyp != NULL) { tty_lock(s->s_ttyp); tty_rel_sess(s->s_ttyp, s); } mtx_destroy(&s->s_mtx); free(s, M_SESSION); } } #ifdef DDB DB_SHOW_COMMAND(pgrpdump, pgrpdump) { register struct pgrp *pgrp; register struct proc *p; register int i; for (i = 0; i <= pgrphash; i++) { if (!LIST_EMPTY(&pgrphashtbl[i])) { printf("\tindx %d\n", i); LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) { printf( "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n", (void *)pgrp, (long)pgrp->pg_id, (void *)pgrp->pg_session, pgrp->pg_session->s_count, (void *)LIST_FIRST(&pgrp->pg_members)); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { printf("\t\tpid %ld addr %p pgrp %p\n", (long)p->p_pid, (void *)p, (void *)p->p_pgrp); } } } } } #endif /* DDB */ /* * Calculate the kinfo_proc members which contain process-wide * informations. * Must be called with the target process locked. */ static void fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); kp->ki_estcpu = 0; kp->ki_pctcpu = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); kp->ki_pctcpu += sched_pctcpu(td); kp->ki_estcpu += td->td_estcpu; thread_unlock(td); } } /* * Clear kinfo_proc and fill in any information that is common * to all threads in the process. * Must be called with the target process locked. */ static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp) { struct thread *td0; struct tty *tp; struct session *sp; struct ucred *cred; struct sigacts *ps; /* For proc_realparent. */ sx_assert(&proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); bzero(kp, sizeof(*kp)); kp->ki_structsize = sizeof(*kp); kp->ki_paddr = p; kp->ki_addr =/* p->p_addr; */0; /* XXX */ kp->ki_args = p->p_args; kp->ki_textvp = p->p_textvp; #ifdef KTRACE kp->ki_tracep = p->p_tracevp; kp->ki_traceflag = p->p_traceflag; #endif kp->ki_fd = p->p_fd; kp->ki_vmspace = p->p_vmspace; kp->ki_flag = p->p_flag; kp->ki_flag2 = p->p_flag2; cred = p->p_ucred; if (cred) { kp->ki_uid = cred->cr_uid; kp->ki_ruid = cred->cr_ruid; kp->ki_svuid = cred->cr_svuid; kp->ki_cr_flags = 0; if (cred->cr_flags & CRED_FLAG_CAPMODE) kp->ki_cr_flags |= KI_CRF_CAPABILITY_MODE; /* XXX bde doesn't like KI_NGROUPS */ if (cred->cr_ngroups > KI_NGROUPS) { kp->ki_ngroups = KI_NGROUPS; kp->ki_cr_flags |= KI_CRF_GRP_OVERFLOW; } else kp->ki_ngroups = cred->cr_ngroups; bcopy(cred->cr_groups, kp->ki_groups, kp->ki_ngroups * sizeof(gid_t)); kp->ki_rgid = cred->cr_rgid; kp->ki_svgid = cred->cr_svgid; /* If jailed(cred), emulate the old P_JAILED flag. */ if (jailed(cred)) { kp->ki_flag |= P_JAILED; /* If inside the jail, use 0 as a jail ID. */ if (cred->cr_prison != curthread->td_ucred->cr_prison) kp->ki_jid = cred->cr_prison->pr_id; } strlcpy(kp->ki_loginclass, cred->cr_loginclass->lc_name, sizeof(kp->ki_loginclass)); } ps = p->p_sigacts; if (ps) { mtx_lock(&ps->ps_mtx); kp->ki_sigignore = ps->ps_sigignore; kp->ki_sigcatch = ps->ps_sigcatch; mtx_unlock(&ps->ps_mtx); } if (p->p_state != PRS_NEW && p->p_state != PRS_ZOMBIE && p->p_vmspace != NULL) { struct vmspace *vm = p->p_vmspace; kp->ki_size = vm->vm_map.size; kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/ FOREACH_THREAD_IN_PROC(p, td0) { if (!TD_IS_SWAPPED(td0)) kp->ki_rssize += td0->td_kstack_pages; } kp->ki_swrss = vm->vm_swrss; kp->ki_tsize = vm->vm_tsize; kp->ki_dsize = vm->vm_dsize; kp->ki_ssize = vm->vm_ssize; } else if (p->p_state == PRS_ZOMBIE) kp->ki_stat = SZOMB; if (kp->ki_flag & P_INMEM) kp->ki_sflag = PS_INMEM; else kp->ki_sflag = 0; /* Calculate legacy swtime as seconds since 'swtick'. */ kp->ki_swtime = (ticks - p->p_swtick) / hz; kp->ki_pid = p->p_pid; kp->ki_nice = p->p_nice; kp->ki_fibnum = p->p_fibnum; kp->ki_start = p->p_stats->p_start; timevaladd(&kp->ki_start, &boottime); PROC_STATLOCK(p); rufetch(p, &kp->ki_rusage); kp->ki_runtime = cputick2usec(p->p_rux.rux_runtime); calcru(p, &kp->ki_rusage.ru_utime, &kp->ki_rusage.ru_stime); PROC_STATUNLOCK(p); calccru(p, &kp->ki_childutime, &kp->ki_childstime); /* Some callers want child times in a single value. */ kp->ki_childtime = kp->ki_childstime; timevaladd(&kp->ki_childtime, &kp->ki_childutime); FOREACH_THREAD_IN_PROC(p, td0) kp->ki_cow += td0->td_cow; tp = NULL; if (p->p_pgrp) { kp->ki_pgid = p->p_pgrp->pg_id; kp->ki_jobc = p->p_pgrp->pg_jobc; sp = p->p_pgrp->pg_session; if (sp != NULL) { kp->ki_sid = sp->s_sid; SESS_LOCK(sp); strlcpy(kp->ki_login, sp->s_login, sizeof(kp->ki_login)); if (sp->s_ttyvp) kp->ki_kiflag |= KI_CTTY; if (SESS_LEADER(p)) kp->ki_kiflag |= KI_SLEADER; /* XXX proctree_lock */ tp = sp->s_ttyp; SESS_UNLOCK(sp); } } if ((p->p_flag & P_CONTROLT) && tp != NULL) { kp->ki_tdev = tty_udev(tp); kp->ki_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; if (tp->t_session) kp->ki_tsid = tp->t_session->s_sid; } else kp->ki_tdev = NODEV; if (p->p_comm[0] != '\0') strlcpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm)); if (p->p_sysent && p->p_sysent->sv_name != NULL && p->p_sysent->sv_name[0] != '\0') strlcpy(kp->ki_emul, p->p_sysent->sv_name, sizeof(kp->ki_emul)); kp->ki_siglist = p->p_siglist; kp->ki_xstat = KW_EXITCODE(p->p_xexit, p->p_xsig); kp->ki_acflag = p->p_acflag; kp->ki_lock = p->p_lock; if (p->p_pptr) { kp->ki_ppid = proc_realparent(p)->p_pid; if (p->p_flag & P_TRACED) kp->ki_tracer = p->p_pptr->p_pid; } } /* * Fill in information that is thread specific. Must be called with * target process locked. If 'preferthread' is set, overwrite certain * process-related fields that are maintained for both threads and * processes. */ static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp, int preferthread) { struct proc *p; p = td->td_proc; kp->ki_tdaddr = td; PROC_LOCK_ASSERT(p, MA_OWNED); if (preferthread) PROC_STATLOCK(p); thread_lock(td); if (td->td_wmesg != NULL) strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg)); else bzero(kp->ki_wmesg, sizeof(kp->ki_wmesg)); strlcpy(kp->ki_tdname, td->td_name, sizeof(kp->ki_tdname)); if (TD_ON_LOCK(td)) { kp->ki_kiflag |= KI_LOCKBLOCK; strlcpy(kp->ki_lockname, td->td_lockname, sizeof(kp->ki_lockname)); } else { kp->ki_kiflag &= ~KI_LOCKBLOCK; bzero(kp->ki_lockname, sizeof(kp->ki_lockname)); } if (p->p_state == PRS_NORMAL) { /* approximate. */ if (TD_ON_RUNQ(td) || TD_CAN_RUN(td) || TD_IS_RUNNING(td)) { kp->ki_stat = SRUN; } else if (P_SHOULDSTOP(p)) { kp->ki_stat = SSTOP; } else if (TD_IS_SLEEPING(td)) { kp->ki_stat = SSLEEP; } else if (TD_ON_LOCK(td)) { kp->ki_stat = SLOCK; } else { kp->ki_stat = SWAIT; } } else if (p->p_state == PRS_ZOMBIE) { kp->ki_stat = SZOMB; } else { kp->ki_stat = SIDL; } /* Things in the thread */ kp->ki_wchan = td->td_wchan; kp->ki_pri.pri_level = td->td_priority; kp->ki_pri.pri_native = td->td_base_pri; /* * Note: legacy fields; clamp at the old NOCPU value and/or * the maximum u_char CPU value. */ if (td->td_lastcpu == NOCPU) kp->ki_lastcpu_old = NOCPU_OLD; else if (td->td_lastcpu > MAXCPU_OLD) kp->ki_lastcpu_old = MAXCPU_OLD; else kp->ki_lastcpu_old = td->td_lastcpu; if (td->td_oncpu == NOCPU) kp->ki_oncpu_old = NOCPU_OLD; else if (td->td_oncpu > MAXCPU_OLD) kp->ki_oncpu_old = MAXCPU_OLD; else kp->ki_oncpu_old = td->td_oncpu; kp->ki_lastcpu = td->td_lastcpu; kp->ki_oncpu = td->td_oncpu; kp->ki_tdflags = td->td_flags; kp->ki_tid = td->td_tid; kp->ki_numthreads = p->p_numthreads; kp->ki_pcb = td->td_pcb; kp->ki_kstack = (void *)td->td_kstack; kp->ki_slptime = (ticks - td->td_slptick) / hz; kp->ki_pri.pri_class = td->td_pri_class; kp->ki_pri.pri_user = td->td_user_pri; if (preferthread) { rufetchtd(td, &kp->ki_rusage); kp->ki_runtime = cputick2usec(td->td_rux.rux_runtime); kp->ki_pctcpu = sched_pctcpu(td); kp->ki_estcpu = td->td_estcpu; kp->ki_cow = td->td_cow; } /* We can't get this anymore but ps etc never used it anyway. */ kp->ki_rqindex = 0; if (preferthread) kp->ki_siglist = td->td_siglist; kp->ki_sigmask = td->td_sigmask; thread_unlock(td); if (preferthread) PROC_STATUNLOCK(p); } /* * Fill in a kinfo_proc structure for the specified process. * Must be called with the target process locked. */ void fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp) { MPASS(FIRST_THREAD_IN_PROC(p) != NULL); fill_kinfo_proc_only(p, kp); fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp, 0); fill_kinfo_aggregate(p, kp); } struct pstats * pstats_alloc(void) { return (malloc(sizeof(struct pstats), M_SUBPROC, M_ZERO|M_WAITOK)); } /* * Copy parts of p_stats; zero the rest of p_stats (statistics). */ void pstats_fork(struct pstats *src, struct pstats *dst) { bzero(&dst->pstat_startzero, __rangeof(struct pstats, pstat_startzero, pstat_endzero)); bcopy(&src->pstat_startcopy, &dst->pstat_startcopy, __rangeof(struct pstats, pstat_startcopy, pstat_endcopy)); } void pstats_free(struct pstats *ps) { free(ps, M_SUBPROC); } static struct proc * zpfind_locked(pid_t pid) { struct proc *p; sx_assert(&allproc_lock, SX_LOCKED); LIST_FOREACH(p, &zombproc, p_list) { if (p->p_pid == pid) { PROC_LOCK(p); break; } } return (p); } /* * Locate a zombie process by number */ struct proc * zpfind(pid_t pid) { struct proc *p; sx_slock(&allproc_lock); p = zpfind_locked(pid); sx_sunlock(&allproc_lock); return (p); } #ifdef COMPAT_FREEBSD32 /* * This function is typically used to copy out the kernel address, so * it can be replaced by assignment of zero. */ static inline uint32_t ptr32_trim(void *ptr) { uintptr_t uptr; uptr = (uintptr_t)ptr; return ((uptr > UINT_MAX) ? 0 : uptr); } #define PTRTRIM_CP(src,dst,fld) \ do { (dst).fld = ptr32_trim((src).fld); } while (0) static void freebsd32_kinfo_proc_out(const struct kinfo_proc *ki, struct kinfo_proc32 *ki32) { int i; bzero(ki32, sizeof(struct kinfo_proc32)); ki32->ki_structsize = sizeof(struct kinfo_proc32); CP(*ki, *ki32, ki_layout); PTRTRIM_CP(*ki, *ki32, ki_args); PTRTRIM_CP(*ki, *ki32, ki_paddr); PTRTRIM_CP(*ki, *ki32, ki_addr); PTRTRIM_CP(*ki, *ki32, ki_tracep); PTRTRIM_CP(*ki, *ki32, ki_textvp); PTRTRIM_CP(*ki, *ki32, ki_fd); PTRTRIM_CP(*ki, *ki32, ki_vmspace); PTRTRIM_CP(*ki, *ki32, ki_wchan); CP(*ki, *ki32, ki_pid); CP(*ki, *ki32, ki_ppid); CP(*ki, *ki32, ki_pgid); CP(*ki, *ki32, ki_tpgid); CP(*ki, *ki32, ki_sid); CP(*ki, *ki32, ki_tsid); CP(*ki, *ki32, ki_jobc); CP(*ki, *ki32, ki_tdev); CP(*ki, *ki32, ki_siglist); CP(*ki, *ki32, ki_sigmask); CP(*ki, *ki32, ki_sigignore); CP(*ki, *ki32, ki_sigcatch); CP(*ki, *ki32, ki_uid); CP(*ki, *ki32, ki_ruid); CP(*ki, *ki32, ki_svuid); CP(*ki, *ki32, ki_rgid); CP(*ki, *ki32, ki_svgid); CP(*ki, *ki32, ki_ngroups); for (i = 0; i < KI_NGROUPS; i++) CP(*ki, *ki32, ki_groups[i]); CP(*ki, *ki32, ki_size); CP(*ki, *ki32, ki_rssize); CP(*ki, *ki32, ki_swrss); CP(*ki, *ki32, ki_tsize); CP(*ki, *ki32, ki_dsize); CP(*ki, *ki32, ki_ssize); CP(*ki, *ki32, ki_xstat); CP(*ki, *ki32, ki_acflag); CP(*ki, *ki32, ki_pctcpu); CP(*ki, *ki32, ki_estcpu); CP(*ki, *ki32, ki_slptime); CP(*ki, *ki32, ki_swtime); CP(*ki, *ki32, ki_cow); CP(*ki, *ki32, ki_runtime); TV_CP(*ki, *ki32, ki_start); TV_CP(*ki, *ki32, ki_childtime); CP(*ki, *ki32, ki_flag); CP(*ki, *ki32, ki_kiflag); CP(*ki, *ki32, ki_traceflag); CP(*ki, *ki32, ki_stat); CP(*ki, *ki32, ki_nice); CP(*ki, *ki32, ki_lock); CP(*ki, *ki32, ki_rqindex); CP(*ki, *ki32, ki_oncpu); CP(*ki, *ki32, ki_lastcpu); /* XXX TODO: wrap cpu value as appropriate */ CP(*ki, *ki32, ki_oncpu_old); CP(*ki, *ki32, ki_lastcpu_old); bcopy(ki->ki_tdname, ki32->ki_tdname, TDNAMLEN + 1); bcopy(ki->ki_wmesg, ki32->ki_wmesg, WMESGLEN + 1); bcopy(ki->ki_login, ki32->ki_login, LOGNAMELEN + 1); bcopy(ki->ki_lockname, ki32->ki_lockname, LOCKNAMELEN + 1); bcopy(ki->ki_comm, ki32->ki_comm, COMMLEN + 1); bcopy(ki->ki_emul, ki32->ki_emul, KI_EMULNAMELEN + 1); bcopy(ki->ki_loginclass, ki32->ki_loginclass, LOGINCLASSLEN + 1); CP(*ki, *ki32, ki_tracer); CP(*ki, *ki32, ki_flag2); CP(*ki, *ki32, ki_fibnum); CP(*ki, *ki32, ki_cr_flags); CP(*ki, *ki32, ki_jid); CP(*ki, *ki32, ki_numthreads); CP(*ki, *ki32, ki_tid); CP(*ki, *ki32, ki_pri); freebsd32_rusage_out(&ki->ki_rusage, &ki32->ki_rusage); freebsd32_rusage_out(&ki->ki_rusage_ch, &ki32->ki_rusage_ch); PTRTRIM_CP(*ki, *ki32, ki_pcb); PTRTRIM_CP(*ki, *ki32, ki_kstack); PTRTRIM_CP(*ki, *ki32, ki_udata); CP(*ki, *ki32, ki_sflag); CP(*ki, *ki32, ki_tdflags); } #endif int kern_proc_out(struct proc *p, struct sbuf *sb, int flags) { struct thread *td; struct kinfo_proc ki; #ifdef COMPAT_FREEBSD32 struct kinfo_proc32 ki32; #endif int error; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS(FIRST_THREAD_IN_PROC(p) != NULL); error = 0; fill_kinfo_proc(p, &ki); if ((flags & KERN_PROC_NOTHREADS) != 0) { #ifdef COMPAT_FREEBSD32 if ((flags & KERN_PROC_MASK32) != 0) { freebsd32_kinfo_proc_out(&ki, &ki32); if (sbuf_bcat(sb, &ki32, sizeof(ki32)) != 0) error = ENOMEM; } else #endif if (sbuf_bcat(sb, &ki, sizeof(ki)) != 0) error = ENOMEM; } else { FOREACH_THREAD_IN_PROC(p, td) { fill_kinfo_thread(td, &ki, 1); #ifdef COMPAT_FREEBSD32 if ((flags & KERN_PROC_MASK32) != 0) { freebsd32_kinfo_proc_out(&ki, &ki32); if (sbuf_bcat(sb, &ki32, sizeof(ki32)) != 0) error = ENOMEM; } else #endif if (sbuf_bcat(sb, &ki, sizeof(ki)) != 0) error = ENOMEM; if (error != 0) break; } } PROC_UNLOCK(p); return (error); } static int sysctl_out_proc(struct proc *p, struct sysctl_req *req, int flags, int doingzomb) { struct sbuf sb; struct kinfo_proc ki; struct proc *np; int error, error2; pid_t pid; pid = p->p_pid; sbuf_new_for_sysctl(&sb, (char *)&ki, sizeof(ki), req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = kern_proc_out(p, &sb, flags); error2 = sbuf_finish(&sb); sbuf_delete(&sb); if (error != 0) return (error); else if (error2 != 0) return (error2); if (doingzomb) np = zpfind(pid); else { if (pid == 0) return (0); np = pfind(pid); } if (np == NULL) return (ESRCH); if (np != p) { PROC_UNLOCK(np); return (ESRCH); } PROC_UNLOCK(np); return (0); } static int sysctl_kern_proc(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; int flags, doingzomb, oid_number; int error = 0; oid_number = oidp->oid_number; if (oid_number != KERN_PROC_ALL && (oid_number & KERN_PROC_INC_THREAD) == 0) flags = KERN_PROC_NOTHREADS; else { flags = 0; oid_number &= ~KERN_PROC_INC_THREAD; } #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) flags |= KERN_PROC_MASK32; #endif if (oid_number == KERN_PROC_PID) { if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, 0); if (error) return (error); sx_slock(&proctree_lock); error = pget((pid_t)name[0], PGET_CANSEE, &p); if (error == 0) error = sysctl_out_proc(p, req, flags, 0); sx_sunlock(&proctree_lock); return (error); } switch (oid_number) { case KERN_PROC_ALL: if (namelen != 0) return (EINVAL); break; case KERN_PROC_PROC: if (namelen != 0 && namelen != 1) return (EINVAL); break; default: if (namelen != 1) return (EINVAL); break; } if (!req->oldptr) { /* overestimate by 5 procs */ error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5); if (error) return (error); } error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sx_slock(&proctree_lock); sx_slock(&allproc_lock); for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) { if (!doingzomb) p = LIST_FIRST(&allproc); else p = LIST_FIRST(&zombproc); for (; p != 0; p = LIST_NEXT(p, p_list)) { /* * Skip embryonic processes. */ PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } KASSERT(p->p_ucred != NULL, ("process credential is NULL for non-NEW proc")); /* * Show a user only appropriate processes. */ if (p_cansee(curthread, p)) { PROC_UNLOCK(p); continue; } /* * TODO - make more efficient (see notes below). * do by session. */ switch (oid_number) { case KERN_PROC_GID: if (p->p_ucred->cr_gid != (gid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_PGRP: /* could do this by traversing pgrp */ if (p->p_pgrp == NULL || p->p_pgrp->pg_id != (pid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_RGID: if (p->p_ucred->cr_rgid != (gid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_SESSION: if (p->p_session == NULL || p->p_session->s_sid != (pid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_TTY: if ((p->p_flag & P_CONTROLT) == 0 || p->p_session == NULL) { PROC_UNLOCK(p); continue; } /* XXX proctree_lock */ SESS_LOCK(p->p_session); if (p->p_session->s_ttyp == NULL || tty_udev(p->p_session->s_ttyp) != (dev_t)name[0]) { SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); continue; } SESS_UNLOCK(p->p_session); break; case KERN_PROC_UID: if (p->p_ucred->cr_uid != (uid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_RUID: if (p->p_ucred->cr_ruid != (uid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_PROC: break; default: break; } error = sysctl_out_proc(p, req, flags, doingzomb); if (error) { sx_sunlock(&allproc_lock); sx_sunlock(&proctree_lock); return (error); } } } sx_sunlock(&allproc_lock); sx_sunlock(&proctree_lock); return (0); } struct pargs * pargs_alloc(int len) { struct pargs *pa; pa = malloc(sizeof(struct pargs) + len, M_PARGS, M_WAITOK); refcount_init(&pa->ar_ref, 1); pa->ar_length = len; return (pa); } static void pargs_free(struct pargs *pa) { free(pa, M_PARGS); } void pargs_hold(struct pargs *pa) { if (pa == NULL) return; refcount_acquire(&pa->ar_ref); } void pargs_drop(struct pargs *pa) { if (pa == NULL) return; if (refcount_release(&pa->ar_ref)) pargs_free(pa); } static int proc_read_string(struct thread *td, struct proc *p, const char *sptr, char *buf, size_t len) { ssize_t n; /* * This may return a short read if the string is shorter than the chunk * and is aligned at the end of the page, and the following page is not * mapped. */ n = proc_readmem(td, p, (vm_offset_t)sptr, buf, len); if (n <= 0) return (ENOMEM); return (0); } #define PROC_AUXV_MAX 256 /* Safety limit on auxv size. */ enum proc_vector_type { PROC_ARG, PROC_ENV, PROC_AUX, }; #ifdef COMPAT_FREEBSD32 static int get_proc_vector32(struct thread *td, struct proc *p, char ***proc_vectorp, size_t *vsizep, enum proc_vector_type type) { struct freebsd32_ps_strings pss; Elf32_Auxinfo aux; vm_offset_t vptr, ptr; uint32_t *proc_vector32; char **proc_vector; size_t vsize, size; int i, error; error = 0; if (proc_readmem(td, p, (vm_offset_t)p->p_sysent->sv_psstrings, &pss, sizeof(pss)) != sizeof(pss)) return (ENOMEM); switch (type) { case PROC_ARG: vptr = (vm_offset_t)PTRIN(pss.ps_argvstr); vsize = pss.ps_nargvstr; if (vsize > ARG_MAX) return (ENOEXEC); size = vsize * sizeof(int32_t); break; case PROC_ENV: vptr = (vm_offset_t)PTRIN(pss.ps_envstr); vsize = pss.ps_nenvstr; if (vsize > ARG_MAX) return (ENOEXEC); size = vsize * sizeof(int32_t); break; case PROC_AUX: vptr = (vm_offset_t)PTRIN(pss.ps_envstr) + (pss.ps_nenvstr + 1) * sizeof(int32_t); if (vptr % 4 != 0) return (ENOEXEC); for (ptr = vptr, i = 0; i < PROC_AUXV_MAX; i++) { if (proc_readmem(td, p, ptr, &aux, sizeof(aux)) != sizeof(aux)) return (ENOMEM); if (aux.a_type == AT_NULL) break; ptr += sizeof(aux); } if (aux.a_type != AT_NULL) return (ENOEXEC); vsize = i + 1; size = vsize * sizeof(aux); break; default: KASSERT(0, ("Wrong proc vector type: %d", type)); return (EINVAL); } proc_vector32 = malloc(size, M_TEMP, M_WAITOK); if (proc_readmem(td, p, vptr, proc_vector32, size) != size) { error = ENOMEM; goto done; } if (type == PROC_AUX) { *proc_vectorp = (char **)proc_vector32; *vsizep = vsize; return (0); } proc_vector = malloc(vsize * sizeof(char *), M_TEMP, M_WAITOK); for (i = 0; i < (int)vsize; i++) proc_vector[i] = PTRIN(proc_vector32[i]); *proc_vectorp = proc_vector; *vsizep = vsize; done: free(proc_vector32, M_TEMP); return (error); } #endif static int get_proc_vector(struct thread *td, struct proc *p, char ***proc_vectorp, size_t *vsizep, enum proc_vector_type type) { struct ps_strings pss; Elf_Auxinfo aux; vm_offset_t vptr, ptr; char **proc_vector; size_t vsize, size; int i; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(p, SV_ILP32) != 0) return (get_proc_vector32(td, p, proc_vectorp, vsizep, type)); #endif if (proc_readmem(td, p, (vm_offset_t)p->p_sysent->sv_psstrings, &pss, sizeof(pss)) != sizeof(pss)) return (ENOMEM); switch (type) { case PROC_ARG: vptr = (vm_offset_t)pss.ps_argvstr; vsize = pss.ps_nargvstr; if (vsize > ARG_MAX) return (ENOEXEC); size = vsize * sizeof(char *); break; case PROC_ENV: vptr = (vm_offset_t)pss.ps_envstr; vsize = pss.ps_nenvstr; if (vsize > ARG_MAX) return (ENOEXEC); size = vsize * sizeof(char *); break; case PROC_AUX: /* * The aux array is just above env array on the stack. Check * that the address is naturally aligned. */ vptr = (vm_offset_t)pss.ps_envstr + (pss.ps_nenvstr + 1) * sizeof(char *); #if __ELF_WORD_SIZE == 64 if (vptr % sizeof(uint64_t) != 0) #else if (vptr % sizeof(uint32_t) != 0) #endif return (ENOEXEC); /* * We count the array size reading the aux vectors from the * stack until AT_NULL vector is returned. So (to keep the code * simple) we read the process stack twice: the first time here * to find the size and the second time when copying the vectors * to the allocated proc_vector. */ for (ptr = vptr, i = 0; i < PROC_AUXV_MAX; i++) { if (proc_readmem(td, p, ptr, &aux, sizeof(aux)) != sizeof(aux)) return (ENOMEM); if (aux.a_type == AT_NULL) break; ptr += sizeof(aux); } /* * If the PROC_AUXV_MAX entries are iterated over, and we have * not reached AT_NULL, it is most likely we are reading wrong * data: either the process doesn't have auxv array or data has * been modified. Return the error in this case. */ if (aux.a_type != AT_NULL) return (ENOEXEC); vsize = i + 1; size = vsize * sizeof(aux); break; default: KASSERT(0, ("Wrong proc vector type: %d", type)); return (EINVAL); /* In case we are built without INVARIANTS. */ } proc_vector = malloc(size, M_TEMP, M_WAITOK); if (proc_readmem(td, p, vptr, proc_vector, size) != size) { free(proc_vector, M_TEMP); return (ENOMEM); } *proc_vectorp = proc_vector; *vsizep = vsize; return (0); } #define GET_PS_STRINGS_CHUNK_SZ 256 /* Chunk size (bytes) for ps_strings operations. */ static int get_ps_strings(struct thread *td, struct proc *p, struct sbuf *sb, enum proc_vector_type type) { size_t done, len, nchr, vsize; int error, i; char **proc_vector, *sptr; char pss_string[GET_PS_STRINGS_CHUNK_SZ]; PROC_ASSERT_HELD(p); /* * We are not going to read more than 2 * (PATH_MAX + ARG_MAX) bytes. */ nchr = 2 * (PATH_MAX + ARG_MAX); error = get_proc_vector(td, p, &proc_vector, &vsize, type); if (error != 0) return (error); for (done = 0, i = 0; i < (int)vsize && done < nchr; i++) { /* * The program may have scribbled into its argv array, e.g. to * remove some arguments. If that has happened, break out * before trying to read from NULL. */ if (proc_vector[i] == NULL) break; for (sptr = proc_vector[i]; ; sptr += GET_PS_STRINGS_CHUNK_SZ) { error = proc_read_string(td, p, sptr, pss_string, sizeof(pss_string)); if (error != 0) goto done; len = strnlen(pss_string, GET_PS_STRINGS_CHUNK_SZ); if (done + len >= nchr) len = nchr - done - 1; sbuf_bcat(sb, pss_string, len); if (len != GET_PS_STRINGS_CHUNK_SZ) break; done += GET_PS_STRINGS_CHUNK_SZ; } sbuf_bcat(sb, "", 1); done += len + 1; } done: free(proc_vector, M_TEMP); return (error); } int proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb) { return (get_ps_strings(curthread, p, sb, PROC_ARG)); } int proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb) { return (get_ps_strings(curthread, p, sb, PROC_ENV)); } int proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb) { size_t vsize, size; char **auxv; int error; error = get_proc_vector(td, p, &auxv, &vsize, PROC_AUX); if (error == 0) { #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(p, SV_ILP32) != 0) size = vsize * sizeof(Elf32_Auxinfo); else #endif size = vsize * sizeof(Elf_Auxinfo); if (sbuf_bcat(sb, auxv, size) != 0) error = ENOMEM; free(auxv, M_TEMP); } return (error); } /* * This sysctl allows a process to retrieve the argument list or process * title for another process without groping around in the address space * of the other process. It also allow a process to set its own "process * title to a string of its own choice. */ static int sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct pargs *newpa, *pa; struct proc *p; struct sbuf sb; int flags, error = 0, error2; if (namelen != 1) return (EINVAL); flags = PGET_CANSEE; if (req->newptr != NULL) flags |= PGET_ISCURRENT; error = pget((pid_t)name[0], flags, &p); if (error) return (error); pa = p->p_args; if (pa != NULL) { pargs_hold(pa); PROC_UNLOCK(p); error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length); pargs_drop(pa); } else if ((p->p_flag & (P_WEXIT | P_SYSTEM)) == 0) { _PHOLD(p); PROC_UNLOCK(p); sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = proc_getargv(curthread, p, &sb); error2 = sbuf_finish(&sb); PRELE(p); sbuf_delete(&sb); if (error == 0 && error2 != 0) error = error2; } else { PROC_UNLOCK(p); } if (error != 0 || req->newptr == NULL) return (error); if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit) return (ENOMEM); newpa = pargs_alloc(req->newlen); error = SYSCTL_IN(req, newpa->ar_args, req->newlen); if (error != 0) { pargs_free(newpa); return (error); } PROC_LOCK(p); pa = p->p_args; p->p_args = newpa; PROC_UNLOCK(p); pargs_drop(pa); return (0); } /* * This sysctl allows a process to retrieve environment of another process. */ static int sysctl_kern_proc_env(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; struct sbuf sb; int error, error2; if (namelen != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_WANTREAD, &p); if (error != 0) return (error); if ((p->p_flag & P_SYSTEM) != 0) { PRELE(p); return (0); } sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = proc_getenvv(curthread, p, &sb); error2 = sbuf_finish(&sb); PRELE(p); sbuf_delete(&sb); return (error != 0 ? error : error2); } /* * This sysctl allows a process to retrieve ELF auxiliary vector of * another process. */ static int sysctl_kern_proc_auxv(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; struct sbuf sb; int error, error2; if (namelen != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_WANTREAD, &p); if (error != 0) return (error); if ((p->p_flag & P_SYSTEM) != 0) { PRELE(p); return (0); } sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = proc_getauxv(curthread, p, &sb); error2 = sbuf_finish(&sb); PRELE(p); sbuf_delete(&sb); return (error != 0 ? error : error2); } /* * This sysctl allows a process to retrieve the path of the executable for * itself or another process. */ static int sysctl_kern_proc_pathname(SYSCTL_HANDLER_ARGS) { pid_t *pidp = (pid_t *)arg1; unsigned int arglen = arg2; struct proc *p; struct vnode *vp; char *retbuf, *freebuf; int error; if (arglen != 1) return (EINVAL); if (*pidp == -1) { /* -1 means this process */ p = req->td->td_proc; } else { error = pget(*pidp, PGET_CANSEE, &p); if (error != 0) return (error); } vp = p->p_textvp; if (vp == NULL) { if (*pidp != -1) PROC_UNLOCK(p); return (0); } vref(vp); if (*pidp != -1) PROC_UNLOCK(p); error = vn_fullpath(req->td, vp, &retbuf, &freebuf); vrele(vp); if (error) return (error); error = SYSCTL_OUT(req, retbuf, strlen(retbuf) + 1); free(freebuf, M_TEMP); return (error); } static int sysctl_kern_proc_sv_name(SYSCTL_HANDLER_ARGS) { struct proc *p; char *sv_name; int *name; int namelen; int error; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; error = pget((pid_t)name[0], PGET_CANSEE, &p); if (error != 0) return (error); sv_name = p->p_sysent->sv_name; PROC_UNLOCK(p); return (sysctl_handle_string(oidp, sv_name, 0, req)); } #ifdef KINFO_OVMENTRY_SIZE CTASSERT(sizeof(struct kinfo_ovmentry) == KINFO_OVMENTRY_SIZE); #endif #ifdef COMPAT_FREEBSD7 static int sysctl_kern_proc_ovmmap(SYSCTL_HANDLER_ARGS) { vm_map_entry_t entry, tmp_entry; unsigned int last_timestamp; char *fullpath, *freepath; struct kinfo_ovmentry *kve; struct vattr va; struct ucred *cred; int error, *name; struct vnode *vp; struct proc *p; vm_map_t map; struct vmspace *vm; name = (int *)arg1; error = pget((pid_t)name[0], PGET_WANTREAD, &p); if (error != 0) return (error); vm = vmspace_acquire_ref(p); if (vm == NULL) { PRELE(p); return (ESRCH); } kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK); map = &vm->vm_map; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { vm_object_t obj, tobj, lobj; vm_offset_t addr; if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) continue; bzero(kve, sizeof(*kve)); kve->kve_structsize = sizeof(*kve); kve->kve_private_resident = 0; obj = entry->object.vm_object; if (obj != NULL) { VM_OBJECT_RLOCK(obj); if (obj->shadow_count == 1) kve->kve_private_resident = obj->resident_page_count; } kve->kve_resident = 0; addr = entry->start; while (addr < entry->end) { if (pmap_extract(map->pmap, addr)) kve->kve_resident++; addr += PAGE_SIZE; } for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) { if (tobj != obj) VM_OBJECT_RLOCK(tobj); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); lobj = tobj; } kve->kve_start = (void*)entry->start; kve->kve_end = (void*)entry->end; kve->kve_offset = (off_t)entry->offset; if (entry->protection & VM_PROT_READ) kve->kve_protection |= KVME_PROT_READ; if (entry->protection & VM_PROT_WRITE) kve->kve_protection |= KVME_PROT_WRITE; if (entry->protection & VM_PROT_EXECUTE) kve->kve_protection |= KVME_PROT_EXEC; if (entry->eflags & MAP_ENTRY_COW) kve->kve_flags |= KVME_FLAG_COW; if (entry->eflags & MAP_ENTRY_NEEDS_COPY) kve->kve_flags |= KVME_FLAG_NEEDS_COPY; if (entry->eflags & MAP_ENTRY_NOCOREDUMP) kve->kve_flags |= KVME_FLAG_NOCOREDUMP; last_timestamp = map->timestamp; vm_map_unlock_read(map); kve->kve_fileid = 0; kve->kve_fsid = 0; freepath = NULL; fullpath = ""; if (lobj) { vp = NULL; switch (lobj->type) { case OBJT_DEFAULT: kve->kve_type = KVME_TYPE_DEFAULT; break; case OBJT_VNODE: kve->kve_type = KVME_TYPE_VNODE; vp = lobj->handle; vref(vp); break; case OBJT_SWAP: if ((lobj->flags & OBJ_TMPFS_NODE) != 0) { kve->kve_type = KVME_TYPE_VNODE; if ((lobj->flags & OBJ_TMPFS) != 0) { vp = lobj->un_pager.swp.swp_tmpfs; vref(vp); } } else { kve->kve_type = KVME_TYPE_SWAP; } break; case OBJT_DEVICE: kve->kve_type = KVME_TYPE_DEVICE; break; case OBJT_PHYS: kve->kve_type = KVME_TYPE_PHYS; break; case OBJT_DEAD: kve->kve_type = KVME_TYPE_DEAD; break; case OBJT_SG: kve->kve_type = KVME_TYPE_SG; break; default: kve->kve_type = KVME_TYPE_UNKNOWN; break; } if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); kve->kve_ref_count = obj->ref_count; kve->kve_shadow_count = obj->shadow_count; VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(curthread, vp, &fullpath, &freepath); cred = curthread->td_ucred; vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, cred) == 0) { kve->kve_fileid = va.va_fileid; kve->kve_fsid = va.va_fsid; } vput(vp); } } else { kve->kve_type = KVME_TYPE_NONE; kve->kve_ref_count = 0; kve->kve_shadow_count = 0; } strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path)); if (freepath != NULL) free(freepath, M_TEMP); error = SYSCTL_OUT(req, kve, sizeof(*kve)); vm_map_lock_read(map); if (error) break; if (last_timestamp != map->timestamp) { vm_map_lookup_entry(map, addr - 1, &tmp_entry); entry = tmp_entry; } } vm_map_unlock_read(map); vmspace_free(vm); PRELE(p); free(kve, M_TEMP); return (error); } #endif /* COMPAT_FREEBSD7 */ #ifdef KINFO_VMENTRY_SIZE CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE); #endif static void kern_proc_vmmap_resident(vm_map_t map, vm_map_entry_t entry, struct kinfo_vmentry *kve) { vm_object_t obj, tobj; vm_page_t m, m_adv; vm_offset_t addr; vm_paddr_t locked_pa; vm_pindex_t pi, pi_adv, pindex; locked_pa = 0; obj = entry->object.vm_object; addr = entry->start; m_adv = NULL; pi = OFF_TO_IDX(entry->offset); for (; addr < entry->end; addr += IDX_TO_OFF(pi_adv), pi += pi_adv) { if (m_adv != NULL) { m = m_adv; } else { pi_adv = OFF_TO_IDX(entry->end - addr); pindex = pi; for (tobj = obj;; tobj = tobj->backing_object) { m = vm_page_find_least(tobj, pindex); if (m != NULL) { if (m->pindex == pindex) break; if (pi_adv > m->pindex - pindex) { pi_adv = m->pindex - pindex; m_adv = m; } } if (tobj->backing_object == NULL) goto next; pindex += OFF_TO_IDX(tobj-> backing_object_offset); } } m_adv = NULL; if (m->psind != 0 && addr + pagesizes[1] <= entry->end && (addr & (pagesizes[1] - 1)) == 0 && (pmap_mincore(map->pmap, addr, &locked_pa) & MINCORE_SUPER) != 0) { kve->kve_flags |= KVME_FLAG_SUPER; pi_adv = OFF_TO_IDX(pagesizes[1]); } else { /* * We do not test the found page on validity. * Either the page is busy and being paged in, * or it was invalidated. The first case * should be counted as resident, the second * is not so clear; we do account both. */ pi_adv = 1; } kve->kve_resident += pi_adv; next:; } PA_UNLOCK_COND(locked_pa); } /* * Must be called with the process locked and will return unlocked. */ int kern_proc_vmmap_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, int flags) { vm_map_entry_t entry, tmp_entry; struct vattr va; vm_map_t map; vm_object_t obj, tobj, lobj; char *fullpath, *freepath; struct kinfo_vmentry *kve; struct ucred *cred; struct vnode *vp; struct vmspace *vm; vm_offset_t addr; unsigned int last_timestamp; int error; PROC_LOCK_ASSERT(p, MA_OWNED); _PHOLD(p); PROC_UNLOCK(p); vm = vmspace_acquire_ref(p); if (vm == NULL) { PRELE(p); return (ESRCH); } kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK | M_ZERO); error = 0; map = &vm->vm_map; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) continue; addr = entry->end; bzero(kve, sizeof(*kve)); obj = entry->object.vm_object; if (obj != NULL) { for (tobj = obj; tobj != NULL; tobj = tobj->backing_object) { VM_OBJECT_RLOCK(tobj); lobj = tobj; } if (obj->backing_object == NULL) kve->kve_private_resident = obj->resident_page_count; if (!vmmap_skip_res_cnt) kern_proc_vmmap_resident(map, entry, kve); for (tobj = obj; tobj != NULL; tobj = tobj->backing_object) { if (tobj != obj && tobj != lobj) VM_OBJECT_RUNLOCK(tobj); } } else { lobj = NULL; } kve->kve_start = entry->start; kve->kve_end = entry->end; kve->kve_offset = entry->offset; if (entry->protection & VM_PROT_READ) kve->kve_protection |= KVME_PROT_READ; if (entry->protection & VM_PROT_WRITE) kve->kve_protection |= KVME_PROT_WRITE; if (entry->protection & VM_PROT_EXECUTE) kve->kve_protection |= KVME_PROT_EXEC; if (entry->eflags & MAP_ENTRY_COW) kve->kve_flags |= KVME_FLAG_COW; if (entry->eflags & MAP_ENTRY_NEEDS_COPY) kve->kve_flags |= KVME_FLAG_NEEDS_COPY; if (entry->eflags & MAP_ENTRY_NOCOREDUMP) kve->kve_flags |= KVME_FLAG_NOCOREDUMP; if (entry->eflags & MAP_ENTRY_GROWS_UP) kve->kve_flags |= KVME_FLAG_GROWS_UP; if (entry->eflags & MAP_ENTRY_GROWS_DOWN) kve->kve_flags |= KVME_FLAG_GROWS_DOWN; last_timestamp = map->timestamp; vm_map_unlock_read(map); freepath = NULL; fullpath = ""; if (lobj != NULL) { vp = NULL; switch (lobj->type) { case OBJT_DEFAULT: kve->kve_type = KVME_TYPE_DEFAULT; break; case OBJT_VNODE: kve->kve_type = KVME_TYPE_VNODE; vp = lobj->handle; vref(vp); break; case OBJT_SWAP: if ((lobj->flags & OBJ_TMPFS_NODE) != 0) { kve->kve_type = KVME_TYPE_VNODE; if ((lobj->flags & OBJ_TMPFS) != 0) { vp = lobj->un_pager.swp.swp_tmpfs; vref(vp); } } else { kve->kve_type = KVME_TYPE_SWAP; } break; case OBJT_DEVICE: kve->kve_type = KVME_TYPE_DEVICE; break; case OBJT_PHYS: kve->kve_type = KVME_TYPE_PHYS; break; case OBJT_DEAD: kve->kve_type = KVME_TYPE_DEAD; break; case OBJT_SG: kve->kve_type = KVME_TYPE_SG; break; case OBJT_MGTDEVICE: kve->kve_type = KVME_TYPE_MGTDEVICE; break; default: kve->kve_type = KVME_TYPE_UNKNOWN; break; } if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); kve->kve_ref_count = obj->ref_count; kve->kve_shadow_count = obj->shadow_count; VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(curthread, vp, &fullpath, &freepath); kve->kve_vn_type = vntype_to_kinfo(vp->v_type); cred = curthread->td_ucred; vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, cred) == 0) { kve->kve_vn_fileid = va.va_fileid; kve->kve_vn_fsid = va.va_fsid; kve->kve_vn_mode = MAKEIMODE(va.va_type, va.va_mode); kve->kve_vn_size = va.va_size; kve->kve_vn_rdev = va.va_rdev; kve->kve_status = KF_ATTR_VALID; } vput(vp); } } else { kve->kve_type = KVME_TYPE_NONE; kve->kve_ref_count = 0; kve->kve_shadow_count = 0; } strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path)); if (freepath != NULL) free(freepath, M_TEMP); /* Pack record size down */ if ((flags & KERN_VMMAP_PACK_KINFO) != 0) kve->kve_structsize = offsetof(struct kinfo_vmentry, kve_path) + strlen(kve->kve_path) + 1; else kve->kve_structsize = sizeof(*kve); kve->kve_structsize = roundup(kve->kve_structsize, sizeof(uint64_t)); /* Halt filling and truncate rather than exceeding maxlen */ if (maxlen != -1 && maxlen < kve->kve_structsize) { error = 0; vm_map_lock_read(map); break; } else if (maxlen != -1) maxlen -= kve->kve_structsize; if (sbuf_bcat(sb, kve, kve->kve_structsize) != 0) error = ENOMEM; vm_map_lock_read(map); if (error != 0) break; if (last_timestamp != map->timestamp) { vm_map_lookup_entry(map, addr - 1, &tmp_entry); entry = tmp_entry; } } vm_map_unlock_read(map); vmspace_free(vm); PRELE(p); free(kve, M_TEMP); return (error); } static int sysctl_kern_proc_vmmap(SYSCTL_HANDLER_ARGS) { struct proc *p; struct sbuf sb; int error, error2, *name; name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_vmentry), req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } error = kern_proc_vmmap_out(p, &sb, -1, KERN_VMMAP_PACK_KINFO); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } #if defined(STACK) || defined(DDB) static int sysctl_kern_proc_kstack(SYSCTL_HANDLER_ARGS) { struct kinfo_kstack *kkstp; int error, i, *name, numthreads; lwpid_t *lwpidarray; struct thread *td; struct stack *st; struct sbuf sb; struct proc *p; name = (int *)arg1; error = pget((pid_t)name[0], PGET_NOTINEXEC | PGET_WANTREAD, &p); if (error != 0) return (error); kkstp = malloc(sizeof(*kkstp), M_TEMP, M_WAITOK); st = stack_create(); lwpidarray = NULL; numthreads = 0; PROC_LOCK(p); repeat: if (numthreads < p->p_numthreads) { if (lwpidarray != NULL) { free(lwpidarray, M_TEMP); lwpidarray = NULL; } numthreads = p->p_numthreads; PROC_UNLOCK(p); lwpidarray = malloc(sizeof(*lwpidarray) * numthreads, M_TEMP, M_WAITOK | M_ZERO); PROC_LOCK(p); goto repeat; } i = 0; /* * XXXRW: During the below loop, execve(2) and countless other sorts * of changes could have taken place. Should we check to see if the * vmspace has been replaced, or the like, in order to prevent * giving a snapshot that spans, say, execve(2), with some threads * before and some after? Among other things, the credentials could * have changed, in which case the right to extract debug info might * no longer be assured. */ FOREACH_THREAD_IN_PROC(p, td) { KASSERT(i < numthreads, ("sysctl_kern_proc_kstack: numthreads")); lwpidarray[i] = td->td_tid; i++; } numthreads = i; for (i = 0; i < numthreads; i++) { td = thread_find(p, lwpidarray[i]); if (td == NULL) { continue; } bzero(kkstp, sizeof(*kkstp)); (void)sbuf_new(&sb, kkstp->kkst_trace, sizeof(kkstp->kkst_trace), SBUF_FIXEDLEN); thread_lock(td); kkstp->kkst_tid = td->td_tid; if (TD_IS_SWAPPED(td)) { kkstp->kkst_state = KKST_STATE_SWAPPED; } else if (TD_IS_RUNNING(td)) { if (stack_save_td_running(st, td) == 0) kkstp->kkst_state = KKST_STATE_STACKOK; else kkstp->kkst_state = KKST_STATE_RUNNING; } else { kkstp->kkst_state = KKST_STATE_STACKOK; stack_save_td(st, td); } thread_unlock(td); PROC_UNLOCK(p); stack_sbuf_print(&sb, st); sbuf_finish(&sb); sbuf_delete(&sb); error = SYSCTL_OUT(req, kkstp, sizeof(*kkstp)); PROC_LOCK(p); if (error) break; } _PRELE(p); PROC_UNLOCK(p); if (lwpidarray != NULL) free(lwpidarray, M_TEMP); stack_destroy(st); free(kkstp, M_TEMP); return (error); } #endif /* * This sysctl allows a process to retrieve the full list of groups from * itself or another process. */ static int sysctl_kern_proc_groups(SYSCTL_HANDLER_ARGS) { pid_t *pidp = (pid_t *)arg1; unsigned int arglen = arg2; struct proc *p; struct ucred *cred; int error; if (arglen != 1) return (EINVAL); if (*pidp == -1) { /* -1 means this process */ p = req->td->td_proc; PROC_LOCK(p); } else { error = pget(*pidp, PGET_CANSEE, &p); if (error != 0) return (error); } cred = crhold(p->p_ucred); PROC_UNLOCK(p); error = SYSCTL_OUT(req, cred->cr_groups, cred->cr_ngroups * sizeof(gid_t)); crfree(cred); return (error); } /* * This sysctl allows a process to retrieve or/and set the resource limit for * another process. */ static int sysctl_kern_proc_rlimit(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct rlimit rlim; struct proc *p; u_int which; int flags, error; if (namelen != 2) return (EINVAL); which = (u_int)name[1]; if (which >= RLIM_NLIMITS) return (EINVAL); if (req->newptr != NULL && req->newlen != sizeof(rlim)) return (EINVAL); flags = PGET_HOLD | PGET_NOTWEXIT; if (req->newptr != NULL) flags |= PGET_CANDEBUG; else flags |= PGET_CANSEE; error = pget((pid_t)name[0], flags, &p); if (error != 0) return (error); /* * Retrieve limit. */ if (req->oldptr != NULL) { PROC_LOCK(p); lim_rlimit_proc(p, which, &rlim); PROC_UNLOCK(p); } error = SYSCTL_OUT(req, &rlim, sizeof(rlim)); if (error != 0) goto errout; /* * Set limit. */ if (req->newptr != NULL) { error = SYSCTL_IN(req, &rlim, sizeof(rlim)); if (error == 0) error = kern_proc_setrlimit(curthread, p, which, &rlim); } errout: PRELE(p); return (error); } /* * This sysctl allows a process to retrieve ps_strings structure location of * another process. */ static int sysctl_kern_proc_ps_strings(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; vm_offset_t ps_strings; int error; #ifdef COMPAT_FREEBSD32 uint32_t ps_strings32; #endif if (namelen != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_CANDEBUG, &p); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 if ((req->flags & SCTL_MASK32) != 0) { /* * We return 0 if the 32 bit emulation request is for a 64 bit * process. */ ps_strings32 = SV_PROC_FLAG(p, SV_ILP32) != 0 ? PTROUT(p->p_sysent->sv_psstrings) : 0; PROC_UNLOCK(p); error = SYSCTL_OUT(req, &ps_strings32, sizeof(ps_strings32)); return (error); } #endif ps_strings = p->p_sysent->sv_psstrings; PROC_UNLOCK(p); error = SYSCTL_OUT(req, &ps_strings, sizeof(ps_strings)); return (error); } /* * This sysctl allows a process to retrieve umask of another process. */ static int sysctl_kern_proc_umask(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; int error; u_short fd_cmask; if (namelen != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_WANTREAD, &p); if (error != 0) return (error); FILEDESC_SLOCK(p->p_fd); fd_cmask = p->p_fd->fd_cmask; FILEDESC_SUNLOCK(p->p_fd); PRELE(p); error = SYSCTL_OUT(req, &fd_cmask, sizeof(fd_cmask)); return (error); } /* * This sysctl allows a process to set and retrieve binary osreldate of * another process. */ static int sysctl_kern_proc_osrel(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; int flags, error, osrel; if (namelen != 1) return (EINVAL); if (req->newptr != NULL && req->newlen != sizeof(osrel)) return (EINVAL); flags = PGET_HOLD | PGET_NOTWEXIT; if (req->newptr != NULL) flags |= PGET_CANDEBUG; else flags |= PGET_CANSEE; error = pget((pid_t)name[0], flags, &p); if (error != 0) return (error); error = SYSCTL_OUT(req, &p->p_osrel, sizeof(p->p_osrel)); if (error != 0) goto errout; if (req->newptr != NULL) { error = SYSCTL_IN(req, &osrel, sizeof(osrel)); if (error != 0) goto errout; if (osrel < 0) { error = EINVAL; goto errout; } p->p_osrel = osrel; } errout: PRELE(p); return (error); } static int sysctl_kern_proc_sigtramp(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; struct kinfo_sigtramp kst; const struct sysentvec *sv; int error; #ifdef COMPAT_FREEBSD32 struct kinfo_sigtramp32 kst32; #endif if (namelen != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_CANDEBUG, &p); if (error != 0) return (error); sv = p->p_sysent; #ifdef COMPAT_FREEBSD32 if ((req->flags & SCTL_MASK32) != 0) { bzero(&kst32, sizeof(kst32)); if (SV_PROC_FLAG(p, SV_ILP32)) { if (sv->sv_sigcode_base != 0) { kst32.ksigtramp_start = sv->sv_sigcode_base; kst32.ksigtramp_end = sv->sv_sigcode_base + *sv->sv_szsigcode; } else { kst32.ksigtramp_start = sv->sv_psstrings - *sv->sv_szsigcode; kst32.ksigtramp_end = sv->sv_psstrings; } } PROC_UNLOCK(p); error = SYSCTL_OUT(req, &kst32, sizeof(kst32)); return (error); } #endif bzero(&kst, sizeof(kst)); if (sv->sv_sigcode_base != 0) { kst.ksigtramp_start = (char *)sv->sv_sigcode_base; kst.ksigtramp_end = (char *)sv->sv_sigcode_base + *sv->sv_szsigcode; } else { kst.ksigtramp_start = (char *)sv->sv_psstrings - *sv->sv_szsigcode; kst.ksigtramp_end = (char *)sv->sv_psstrings; } PROC_UNLOCK(p); error = SYSCTL_OUT(req, &kst, sizeof(kst)); return (error); } SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table"); SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT| CTLFLAG_MPSAFE, 0, 0, sysctl_kern_proc, "S,proc", "Return entire process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_GID, gid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_RGID, rgid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_SESSION, sid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PROC, proc, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Return process table, no threads"); static SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args, CTLFLAG_RW | CTLFLAG_CAPWR | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_args, "Process argument list"); static SYSCTL_NODE(_kern_proc, KERN_PROC_ENV, env, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_env, "Process environment"); static SYSCTL_NODE(_kern_proc, KERN_PROC_AUXV, auxv, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_auxv, "Process ELF auxiliary vector"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PATHNAME, pathname, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_pathname, "Process executable path"); static SYSCTL_NODE(_kern_proc, KERN_PROC_SV_NAME, sv_name, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_sv_name, "Process syscall vector name (ABI type)"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_GID | KERN_PROC_INC_THREAD), gid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_PGRP | KERN_PROC_INC_THREAD), pgrp_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_RGID | KERN_PROC_INC_THREAD), rgid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_SESSION | KERN_PROC_INC_THREAD), sid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_TTY | KERN_PROC_INC_THREAD), tty_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_UID | KERN_PROC_INC_THREAD), uid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_RUID | KERN_PROC_INC_THREAD), ruid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_PID | KERN_PROC_INC_THREAD), pid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_PROC | KERN_PROC_INC_THREAD), proc_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Return process table, no threads"); #ifdef COMPAT_FREEBSD7 static SYSCTL_NODE(_kern_proc, KERN_PROC_OVMMAP, ovmmap, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_ovmmap, "Old Process vm map entries"); #endif static SYSCTL_NODE(_kern_proc, KERN_PROC_VMMAP, vmmap, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_vmmap, "Process vm map entries"); #if defined(STACK) || defined(DDB) static SYSCTL_NODE(_kern_proc, KERN_PROC_KSTACK, kstack, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_kstack, "Process kernel stacks"); #endif static SYSCTL_NODE(_kern_proc, KERN_PROC_GROUPS, groups, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_groups, "Process groups"); static SYSCTL_NODE(_kern_proc, KERN_PROC_RLIMIT, rlimit, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_rlimit, "Process resource limits"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PS_STRINGS, ps_strings, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_ps_strings, "Process ps_strings location"); static SYSCTL_NODE(_kern_proc, KERN_PROC_UMASK, umask, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_umask, "Process umask"); static SYSCTL_NODE(_kern_proc, KERN_PROC_OSREL, osrel, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_osrel, "Process binary osreldate"); static SYSCTL_NODE(_kern_proc, KERN_PROC_SIGTRAMP, sigtramp, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_sigtramp, "Process signal trampoline location"); int allproc_gen; void stop_all_proc(void) { struct proc *cp, *p; int r, gen; bool restart, seen_stopped, seen_exiting, stopped_some; cp = curproc; /* * stop_all_proc() assumes that all process which have * usermode must be stopped, except current process, for * obvious reasons. Since other threads in the process * establishing global stop could unstop something, disable * calls from multithreaded processes as precaution. The * service must not be user-callable anyway. */ KASSERT((cp->p_flag & P_HADTHREADS) == 0 || (cp->p_flag & P_KTHREAD) != 0, ("mt stop_all_proc")); allproc_loop: sx_xlock(&allproc_lock); gen = allproc_gen; seen_exiting = seen_stopped = stopped_some = restart = false; LIST_REMOVE(cp, p_list); LIST_INSERT_HEAD(&allproc, cp, p_list); for (;;) { p = LIST_NEXT(cp, p_list); if (p == NULL) break; LIST_REMOVE(cp, p_list); LIST_INSERT_AFTER(p, cp, p_list); PROC_LOCK(p); if ((p->p_flag & (P_KTHREAD | P_SYSTEM | P_TOTAL_STOP)) != 0) { PROC_UNLOCK(p); continue; } if ((p->p_flag & P_WEXIT) != 0) { seen_exiting = true; PROC_UNLOCK(p); continue; } if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { /* * Stopped processes are tolerated when there * are no other processes which might continue * them. P_STOPPED_SINGLE but not * P_TOTAL_STOP process still has at least one * thread running. */ seen_stopped = true; PROC_UNLOCK(p); continue; } _PHOLD(p); sx_xunlock(&allproc_lock); r = thread_single(p, SINGLE_ALLPROC); if (r != 0) restart = true; else stopped_some = true; _PRELE(p); PROC_UNLOCK(p); sx_xlock(&allproc_lock); } /* Catch forked children we did not see in iteration. */ if (gen != allproc_gen) restart = true; sx_xunlock(&allproc_lock); if (restart || stopped_some || seen_exiting || seen_stopped) { kern_yield(PRI_USER); goto allproc_loop; } } void resume_all_proc(void) { struct proc *cp, *p; cp = curproc; sx_xlock(&allproc_lock); LIST_REMOVE(cp, p_list); LIST_INSERT_HEAD(&allproc, cp, p_list); for (;;) { p = LIST_NEXT(cp, p_list); if (p == NULL) break; LIST_REMOVE(cp, p_list); LIST_INSERT_AFTER(p, cp, p_list); PROC_LOCK(p); if ((p->p_flag & P_TOTAL_STOP) != 0) { sx_xunlock(&allproc_lock); _PHOLD(p); thread_single_end(p, SINGLE_ALLPROC); _PRELE(p); PROC_UNLOCK(p); sx_xlock(&allproc_lock); } else { PROC_UNLOCK(p); } } sx_xunlock(&allproc_lock); } #define TOTAL_STOP_DEBUG 1 #ifdef TOTAL_STOP_DEBUG volatile static int ap_resume; #include static int sysctl_debug_stop_all_proc(SYSCTL_HANDLER_ARGS) { int error, val; val = 0; ap_resume = 0; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val != 0) { stop_all_proc(); syncer_suspend(); while (ap_resume == 0) ; syncer_resume(); resume_all_proc(); } return (0); } SYSCTL_PROC(_debug, OID_AUTO, stop_all_proc, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, __DEVOLATILE(int *, &ap_resume), 0, sysctl_debug_stop_all_proc, "I", ""); #endif Index: head/sys/kern/kern_racct.c =================================================================== --- head/sys/kern/kern_racct.c (revision 292383) +++ head/sys/kern/kern_racct.c (revision 292384) @@ -1,1316 +1,1317 @@ /*- * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RCTL #include #endif #ifdef RACCT FEATURE(racct, "Resource Accounting"); /* * Do not block processes that have their %cpu usage <= pcpu_threshold. */ static int pcpu_threshold = 1; #ifdef RACCT_DEFAULT_TO_DISABLED int racct_enable = 0; #else int racct_enable = 1; #endif SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting"); SYSCTL_UINT(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable, 0, "Enable RACCT/RCTL"); SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold, 0, "Processes with higher %cpu usage than this value can be throttled."); /* * How many seconds it takes to use the scheduler %cpu calculations. When a * process starts, we compute its %cpu usage by dividing its runtime by the * process wall clock time. After RACCT_PCPU_SECS pass, we use the value * provided by the scheduler. */ #define RACCT_PCPU_SECS 3 static struct mtx racct_lock; MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF); static uma_zone_t racct_zone; static void racct_sub_racct(struct racct *dest, const struct racct *src); static void racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount); static void racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount); SDT_PROVIDER_DEFINE(racct); -SDT_PROBE_DEFINE3(racct, kernel, rusage, add, "struct proc *", "int", - "uint64_t"); -SDT_PROBE_DEFINE3(racct, kernel, rusage, add__failure, +SDT_PROBE_DEFINE3(racct, , rusage, add, "struct proc *", "int", "uint64_t"); -SDT_PROBE_DEFINE3(racct, kernel, rusage, add__cred, "struct ucred *", - "int", "uint64_t"); -SDT_PROBE_DEFINE3(racct, kernel, rusage, add__force, "struct proc *", - "int", "uint64_t"); -SDT_PROBE_DEFINE3(racct, kernel, rusage, set, "struct proc *", "int", - "uint64_t"); -SDT_PROBE_DEFINE3(racct, kernel, rusage, set__failure, +SDT_PROBE_DEFINE3(racct, , rusage, add__failure, "struct proc *", "int", "uint64_t"); -SDT_PROBE_DEFINE3(racct, kernel, rusage, sub, "struct proc *", "int", - "uint64_t"); -SDT_PROBE_DEFINE3(racct, kernel, rusage, sub__cred, "struct ucred *", - "int", "uint64_t"); -SDT_PROBE_DEFINE1(racct, kernel, racct, create, "struct racct *"); -SDT_PROBE_DEFINE1(racct, kernel, racct, destroy, "struct racct *"); -SDT_PROBE_DEFINE2(racct, kernel, racct, join, "struct racct *", +SDT_PROBE_DEFINE3(racct, , rusage, add__cred, + "struct ucred *", "int", "uint64_t"); +SDT_PROBE_DEFINE3(racct, , rusage, add__force, + "struct proc *", "int", "uint64_t"); +SDT_PROBE_DEFINE3(racct, , rusage, set, + "struct proc *", "int", "uint64_t"); +SDT_PROBE_DEFINE3(racct, , rusage, set__failure, + "struct proc *", "int", "uint64_t"); +SDT_PROBE_DEFINE3(racct, , rusage, sub, + "struct proc *", "int", "uint64_t"); +SDT_PROBE_DEFINE3(racct, , rusage, sub__cred, + "struct ucred *", "int", "uint64_t"); +SDT_PROBE_DEFINE1(racct, , racct, create, "struct racct *"); -SDT_PROBE_DEFINE2(racct, kernel, racct, join__failure, - "struct racct *", "struct racct *"); -SDT_PROBE_DEFINE2(racct, kernel, racct, leave, "struct racct *", +SDT_PROBE_DEFINE1(racct, , racct, destroy, "struct racct *"); +SDT_PROBE_DEFINE2(racct, , racct, join, + "struct racct *", "struct racct *"); +SDT_PROBE_DEFINE2(racct, , racct, join__failure, + "struct racct *", "struct racct *"); +SDT_PROBE_DEFINE2(racct, , racct, leave, + "struct racct *", "struct racct *"); int racct_types[] = { [RACCT_CPU] = RACCT_IN_MILLIONS, [RACCT_DATA] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_STACK] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_CORE] = RACCT_DENIABLE, [RACCT_RSS] = RACCT_RECLAIMABLE, [RACCT_MEMLOCK] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_NPROC] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_NOFILE] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_VMEM] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_NPTS] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_SWAP] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NTHR] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_MSGQQUEUED] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_MSGQSIZE] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NMSGQ] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NSEM] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NSEMOP] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_NSHM] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_SHMSIZE] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_WALLCLOCK] = RACCT_IN_MILLIONS, [RACCT_PCTCPU] = RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS }; static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE; #ifdef SCHED_4BSD /* * Contains intermediate values for %cpu calculations to avoid using floating * point in the kernel. * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20) * It is needed only for the 4BSD scheduler, because in ULE, the ccpu equals to * zero so the calculations are more straightforward. */ fixpt_t ccpu_exp[] = { [0] = FSCALE * 1, [1] = FSCALE * 0.95122942450071400909, [2] = FSCALE * 0.90483741803595957316, [3] = FSCALE * 0.86070797642505780722, [4] = FSCALE * 0.81873075307798185866, [5] = FSCALE * 0.77880078307140486824, [6] = FSCALE * 0.74081822068171786606, [7] = FSCALE * 0.70468808971871343435, [8] = FSCALE * 0.67032004603563930074, [9] = FSCALE * 0.63762815162177329314, [10] = FSCALE * 0.60653065971263342360, [11] = FSCALE * 0.57694981038048669531, [12] = FSCALE * 0.54881163609402643262, [13] = FSCALE * 0.52204577676101604789, [14] = FSCALE * 0.49658530379140951470, [15] = FSCALE * 0.47236655274101470713, [16] = FSCALE * 0.44932896411722159143, [17] = FSCALE * 0.42741493194872666992, [18] = FSCALE * 0.40656965974059911188, [19] = FSCALE * 0.38674102345450120691, [20] = FSCALE * 0.36787944117144232159, [21] = FSCALE * 0.34993774911115535467, [22] = FSCALE * 0.33287108369807955328, [23] = FSCALE * 0.31663676937905321821, [24] = FSCALE * 0.30119421191220209664, [25] = FSCALE * 0.28650479686019010032, [26] = FSCALE * 0.27253179303401260312, [27] = FSCALE * 0.25924026064589150757, [28] = FSCALE * 0.24659696394160647693, [29] = FSCALE * 0.23457028809379765313, [30] = FSCALE * 0.22313016014842982893, [31] = FSCALE * 0.21224797382674305771, [32] = FSCALE * 0.20189651799465540848, [33] = FSCALE * 0.19204990862075411423, [34] = FSCALE * 0.18268352405273465022, [35] = FSCALE * 0.17377394345044512668, [36] = FSCALE * 0.16529888822158653829, [37] = FSCALE * 0.15723716631362761621, [38] = FSCALE * 0.14956861922263505264, [39] = FSCALE * 0.14227407158651357185, [40] = FSCALE * 0.13533528323661269189, [41] = FSCALE * 0.12873490358780421886, [42] = FSCALE * 0.12245642825298191021, [43] = FSCALE * 0.11648415777349695786, [44] = FSCALE * 0.11080315836233388333, [45] = FSCALE * 0.10539922456186433678, [46] = FSCALE * 0.10025884372280373372, [47] = FSCALE * 0.09536916221554961888, [48] = FSCALE * 0.09071795328941250337, [49] = FSCALE * 0.08629358649937051097, [50] = FSCALE * 0.08208499862389879516, [51] = FSCALE * 0.07808166600115315231, [52] = FSCALE * 0.07427357821433388042, [53] = FSCALE * 0.07065121306042958674, [54] = FSCALE * 0.06720551273974976512, [55] = FSCALE * 0.06392786120670757270, [56] = FSCALE * 0.06081006262521796499, [57] = FSCALE * 0.05784432087483846296, [58] = FSCALE * 0.05502322005640722902, [59] = FSCALE * 0.05233970594843239308, [60] = FSCALE * 0.04978706836786394297, [61] = FSCALE * 0.04735892439114092119, [62] = FSCALE * 0.04504920239355780606, [63] = FSCALE * 0.04285212686704017991, [64] = FSCALE * 0.04076220397836621516, [65] = FSCALE * 0.03877420783172200988, [66] = FSCALE * 0.03688316740124000544, [67] = FSCALE * 0.03508435410084502588, [68] = FSCALE * 0.03337326996032607948, [69] = FSCALE * 0.03174563637806794323, [70] = FSCALE * 0.03019738342231850073, [71] = FSCALE * 0.02872463965423942912, [72] = FSCALE * 0.02732372244729256080, [73] = FSCALE * 0.02599112877875534358, [74] = FSCALE * 0.02472352647033939120, [75] = FSCALE * 0.02351774585600910823, [76] = FSCALE * 0.02237077185616559577, [77] = FSCALE * 0.02127973643837716938, [78] = FSCALE * 0.02024191144580438847, [79] = FSCALE * 0.01925470177538692429, [80] = FSCALE * 0.01831563888873418029, [81] = FSCALE * 0.01742237463949351138, [82] = FSCALE * 0.01657267540176124754, [83] = FSCALE * 0.01576441648485449082, [84] = FSCALE * 0.01499557682047770621, [85] = FSCALE * 0.01426423390899925527, [86] = FSCALE * 0.01356855901220093175, [87] = FSCALE * 0.01290681258047986886, [88] = FSCALE * 0.01227733990306844117, [89] = FSCALE * 0.01167856697039544521, [90] = FSCALE * 0.01110899653824230649, [91] = FSCALE * 0.01056720438385265337, [92] = FSCALE * 0.01005183574463358164, [93] = FSCALE * 0.00956160193054350793, [94] = FSCALE * 0.00909527710169581709, [95] = FSCALE * 0.00865169520312063417, [96] = FSCALE * 0.00822974704902002884, [97] = FSCALE * 0.00782837754922577143, [98] = FSCALE * 0.00744658307092434051, [99] = FSCALE * 0.00708340892905212004, [100] = FSCALE * 0.00673794699908546709, [101] = FSCALE * 0.00640933344625638184, [102] = FSCALE * 0.00609674656551563610, [103] = FSCALE * 0.00579940472684214321, [104] = FSCALE * 0.00551656442076077241, [105] = FSCALE * 0.00524751839918138427, [106] = FSCALE * 0.00499159390691021621, [107] = FSCALE * 0.00474815099941147558, [108] = FSCALE * 0.00451658094261266798, [109] = FSCALE * 0.00429630469075234057, [110] = FSCALE * 0.00408677143846406699, }; #endif #define CCPU_EXP_MAX 110 /* * This function is analogical to the getpcpu() function in the ps(1) command. * They should both calculate in the same way so that the racct %cpu * calculations are consistent with the values showed by the ps(1) tool. * The calculations are more complex in the 4BSD scheduler because of the value * of the ccpu variable. In ULE it is defined to be zero which saves us some * work. */ static uint64_t racct_getpcpu(struct proc *p, u_int pcpu) { u_int swtime; #ifdef SCHED_4BSD fixpt_t pctcpu, pctcpu_next; #endif #ifdef SMP struct pcpu *pc; int found; #endif fixpt_t p_pctcpu; struct thread *td; ASSERT_RACCT_ENABLED(); /* * If the process is swapped out, we count its %cpu usage as zero. * This behaviour is consistent with the userland ps(1) tool. */ if ((p->p_flag & P_INMEM) == 0) return (0); swtime = (ticks - p->p_swtick) / hz; /* * For short-lived processes, the sched_pctcpu() returns small * values even for cpu intensive processes. Therefore we use * our own estimate in this case. */ if (swtime < RACCT_PCPU_SECS) return (pcpu); p_pctcpu = 0; FOREACH_THREAD_IN_PROC(p, td) { if (td == PCPU_GET(idlethread)) continue; #ifdef SMP found = 0; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (td == pc->pc_idlethread) { found = 1; break; } } if (found) continue; #endif thread_lock(td); #ifdef SCHED_4BSD pctcpu = sched_pctcpu(td); /* Count also the yet unfinished second. */ pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT; pctcpu_next += sched_pctcpu_delta(td); p_pctcpu += max(pctcpu, pctcpu_next); #else /* * In ULE the %cpu statistics are updated on every * sched_pctcpu() call. So special calculations to * account for the latest (unfinished) second are * not needed. */ p_pctcpu += sched_pctcpu(td); #endif thread_unlock(td); } #ifdef SCHED_4BSD if (swtime <= CCPU_EXP_MAX) return ((100 * (uint64_t)p_pctcpu * 1000000) / (FSCALE - ccpu_exp[swtime])); #endif return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE); } static void racct_add_racct(struct racct *dest, const struct racct *src) { int i; ASSERT_RACCT_ENABLED(); mtx_assert(&racct_lock, MA_OWNED); /* * Update resource usage in dest. */ for (i = 0; i <= RACCT_MAX; i++) { KASSERT(dest->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: dest < 0", __func__, i)); KASSERT(src->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: src < 0", __func__, i)); dest->r_resources[i] += src->r_resources[i]; } } static void racct_sub_racct(struct racct *dest, const struct racct *src) { int i; ASSERT_RACCT_ENABLED(); mtx_assert(&racct_lock, MA_OWNED); /* * Update resource usage in dest. */ for (i = 0; i <= RACCT_MAX; i++) { if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) { KASSERT(dest->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: dest < 0", __func__, i)); KASSERT(src->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: src < 0", __func__, i)); KASSERT(src->r_resources[i] <= dest->r_resources[i], ("%s: resource %d propagation meltdown: src > dest", __func__, i)); } if (RACCT_CAN_DROP(i)) { dest->r_resources[i] -= src->r_resources[i]; if (dest->r_resources[i] < 0) { KASSERT(RACCT_IS_SLOPPY(i) || RACCT_IS_DECAYING(i), ("%s: resource %d usage < 0", __func__, i)); dest->r_resources[i] = 0; } } } } void racct_create(struct racct **racctp) { if (!racct_enable) return; - SDT_PROBE1(racct, kernel, racct, create, racctp); + SDT_PROBE1(racct, , racct, create, racctp); KASSERT(*racctp == NULL, ("racct already allocated")); *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO); } static void racct_destroy_locked(struct racct **racctp) { int i; struct racct *racct; ASSERT_RACCT_ENABLED(); - SDT_PROBE1(racct, kernel, racct, destroy, racctp); + SDT_PROBE1(racct, , racct, destroy, racctp); mtx_assert(&racct_lock, MA_OWNED); KASSERT(racctp != NULL, ("NULL racctp")); KASSERT(*racctp != NULL, ("NULL racct")); racct = *racctp; for (i = 0; i <= RACCT_MAX; i++) { if (RACCT_IS_SLOPPY(i)) continue; if (!RACCT_IS_RECLAIMABLE(i)) continue; KASSERT(racct->r_resources[i] == 0, ("destroying non-empty racct: " "%ju allocated for resource %d\n", racct->r_resources[i], i)); } uma_zfree(racct_zone, racct); *racctp = NULL; } void racct_destroy(struct racct **racct) { if (!racct_enable) return; mtx_lock(&racct_lock); racct_destroy_locked(racct); mtx_unlock(&racct_lock); } /* * Increase consumption of 'resource' by 'amount' for 'racct', * but not its parents. Differently from other cases, 'amount' here * may be less than zero. */ static void racct_adjust_resource(struct racct *racct, int resource, int64_t amount) { ASSERT_RACCT_ENABLED(); mtx_assert(&racct_lock, MA_OWNED); KASSERT(racct != NULL, ("NULL racct")); racct->r_resources[resource] += amount; if (racct->r_resources[resource] < 0) { KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource), ("%s: resource %d usage < 0", __func__, resource)); racct->r_resources[resource] = 0; } /* * There are some cases where the racct %cpu resource would grow * beyond 100% per core. For example in racct_proc_exit() we add * the process %cpu usage to the ucred racct containers. If too * many processes terminated in a short time span, the ucred %cpu * resource could grow too much. Also, the 4BSD scheduler sometimes * returns for a thread more than 100% cpu usage. So we set a sane * boundary here to 100% * the maxumum number of CPUs. */ if ((resource == RACCT_PCTCPU) && (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU)) racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU; } static int racct_add_locked(struct proc *p, int resource, uint64_t amount) { #ifdef RCTL int error; #endif ASSERT_RACCT_ENABLED(); - SDT_PROBE3(racct, kernel, rusage, add, p, resource, amount); + SDT_PROBE3(racct, , rusage, add, p, resource, amount); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); #ifdef RCTL error = rctl_enforce(p, resource, amount); if (error && RACCT_IS_DENIABLE(resource)) { - SDT_PROBE3(racct, kernel, rusage, add__failure, p, resource, - amount); + SDT_PROBE3(racct, , rusage, add__failure, p, resource, amount); return (error); } #endif racct_adjust_resource(p->p_racct, resource, amount); racct_add_cred_locked(p->p_ucred, resource, amount); return (0); } /* * Increase allocation of 'resource' by 'amount' for process 'p'. * Return 0 if it's below limits, or errno, if it's not. */ int racct_add(struct proc *p, int resource, uint64_t amount) { int error; if (!racct_enable) return (0); mtx_lock(&racct_lock); error = racct_add_locked(p, resource, amount); mtx_unlock(&racct_lock); return (error); } static void racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) { struct prison *pr; ASSERT_RACCT_ENABLED(); - SDT_PROBE3(racct, kernel, rusage, add__cred, cred, resource, amount); + SDT_PROBE3(racct, , rusage, add__cred, cred, resource, amount); racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, amount); for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, amount); racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, amount); } /* * Increase allocation of 'resource' by 'amount' for credential 'cred'. * Doesn't check for limits and never fails. * * XXX: Shouldn't this ever return an error? */ void racct_add_cred(struct ucred *cred, int resource, uint64_t amount) { if (!racct_enable) return; mtx_lock(&racct_lock); racct_add_cred_locked(cred, resource, amount); mtx_unlock(&racct_lock); } /* * Increase allocation of 'resource' by 'amount' for process 'p'. * Doesn't check for limits and never fails. */ void racct_add_force(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; - SDT_PROBE3(racct, kernel, rusage, add__force, p, resource, amount); + SDT_PROBE3(racct, , rusage, add__force, p, resource, amount); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); mtx_lock(&racct_lock); racct_adjust_resource(p->p_racct, resource, amount); racct_add_cred_locked(p->p_ucred, resource, amount); mtx_unlock(&racct_lock); } static int racct_set_locked(struct proc *p, int resource, uint64_t amount) { int64_t old_amount, decayed_amount; int64_t diff_proc, diff_cred; #ifdef RCTL int error; #endif ASSERT_RACCT_ENABLED(); - SDT_PROBE3(racct, kernel, rusage, set, p, resource, amount); + SDT_PROBE3(racct, , rusage, set, p, resource, amount); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); old_amount = p->p_racct->r_resources[resource]; /* * The diffs may be negative. */ diff_proc = amount - old_amount; if (RACCT_IS_DECAYING(resource)) { /* * Resources in per-credential racct containers may decay. * If this is the case, we need to calculate the difference * between the new amount and the proportional value of the * old amount that has decayed in the ucred racct containers. */ decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; diff_cred = amount - decayed_amount; } else diff_cred = diff_proc; #ifdef notyet KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource), ("%s: usage of non-droppable resource %d dropping", __func__, resource)); #endif #ifdef RCTL if (diff_proc > 0) { error = rctl_enforce(p, resource, diff_proc); if (error && RACCT_IS_DENIABLE(resource)) { - SDT_PROBE3(racct, kernel, rusage, set__failure, p, - resource, amount); + SDT_PROBE3(racct, , rusage, set__failure, p, resource, + amount); return (error); } } #endif racct_adjust_resource(p->p_racct, resource, diff_proc); if (diff_cred > 0) racct_add_cred_locked(p->p_ucred, resource, diff_cred); else if (diff_cred < 0) racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); return (0); } /* * Set allocation of 'resource' to 'amount' for process 'p'. * Return 0 if it's below limits, or errno, if it's not. * * Note that decreasing the allocation always returns 0, * even if it's above the limit. */ int racct_set(struct proc *p, int resource, uint64_t amount) { int error; if (!racct_enable) return (0); mtx_lock(&racct_lock); error = racct_set_locked(p, resource, amount); mtx_unlock(&racct_lock); return (error); } static void racct_set_force_locked(struct proc *p, int resource, uint64_t amount) { int64_t old_amount, decayed_amount; int64_t diff_proc, diff_cred; ASSERT_RACCT_ENABLED(); - SDT_PROBE3(racct, kernel, rusage, set, p, resource, amount); + SDT_PROBE3(racct, , rusage, set, p, resource, amount); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); old_amount = p->p_racct->r_resources[resource]; /* * The diffs may be negative. */ diff_proc = amount - old_amount; if (RACCT_IS_DECAYING(resource)) { /* * Resources in per-credential racct containers may decay. * If this is the case, we need to calculate the difference * between the new amount and the proportional value of the * old amount that has decayed in the ucred racct containers. */ decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; diff_cred = amount - decayed_amount; } else diff_cred = diff_proc; racct_adjust_resource(p->p_racct, resource, diff_proc); if (diff_cred > 0) racct_add_cred_locked(p->p_ucred, resource, diff_cred); else if (diff_cred < 0) racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); } void racct_set_force(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; mtx_lock(&racct_lock); racct_set_force_locked(p, resource, amount); mtx_unlock(&racct_lock); } /* * Returns amount of 'resource' the process 'p' can keep allocated. * Allocating more than that would be denied, unless the resource * is marked undeniable. Amount of already allocated resource does * not matter. */ uint64_t racct_get_limit(struct proc *p, int resource) { if (!racct_enable) return (UINT64_MAX); #ifdef RCTL return (rctl_get_limit(p, resource)); #else return (UINT64_MAX); #endif } /* * Returns amount of 'resource' the process 'p' can keep allocated. * Allocating more than that would be denied, unless the resource * is marked undeniable. Amount of already allocated resource does * matter. */ uint64_t racct_get_available(struct proc *p, int resource) { if (!racct_enable) return (UINT64_MAX); #ifdef RCTL return (rctl_get_available(p, resource)); #else return (UINT64_MAX); #endif } /* * Returns amount of the %cpu resource that process 'p' can add to its %cpu * utilization. Adding more than that would lead to the process being * throttled. */ static int64_t racct_pcpu_available(struct proc *p) { ASSERT_RACCT_ENABLED(); #ifdef RCTL return (rctl_pcpu_available(p)); #else return (INT64_MAX); #endif } /* * Decrease allocation of 'resource' by 'amount' for process 'p'. */ void racct_sub(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; - SDT_PROBE3(racct, kernel, rusage, sub, p, resource, amount); + SDT_PROBE3(racct, , rusage, sub, p, resource, amount); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(RACCT_CAN_DROP(resource), ("%s: called for non-droppable resource %d", __func__, resource)); mtx_lock(&racct_lock); KASSERT(amount <= p->p_racct->r_resources[resource], ("%s: freeing %ju of resource %d, which is more " "than allocated %jd for %s (pid %d)", __func__, amount, resource, (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid)); racct_adjust_resource(p->p_racct, resource, -amount); racct_sub_cred_locked(p->p_ucred, resource, amount); mtx_unlock(&racct_lock); } static void racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) { struct prison *pr; ASSERT_RACCT_ENABLED(); - SDT_PROBE3(racct, kernel, rusage, sub__cred, cred, resource, amount); + SDT_PROBE3(racct, , rusage, sub__cred, cred, resource, amount); #ifdef notyet KASSERT(RACCT_CAN_DROP(resource), ("%s: called for resource %d which can not drop", __func__, resource)); #endif racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, -amount); for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, -amount); racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, -amount); } /* * Decrease allocation of 'resource' by 'amount' for credential 'cred'. */ void racct_sub_cred(struct ucred *cred, int resource, uint64_t amount) { if (!racct_enable) return; mtx_lock(&racct_lock); racct_sub_cred_locked(cred, resource, amount); mtx_unlock(&racct_lock); } /* * Inherit resource usage information from the parent process. */ int racct_proc_fork(struct proc *parent, struct proc *child) { int i, error = 0; if (!racct_enable) return (0); /* * Create racct for the child process. */ racct_create(&child->p_racct); PROC_LOCK(parent); PROC_LOCK(child); mtx_lock(&racct_lock); #ifdef RCTL error = rctl_proc_fork(parent, child); if (error != 0) goto out; #endif /* Init process cpu time. */ child->p_prev_runtime = 0; child->p_throttled = 0; /* * Inherit resource usage. */ for (i = 0; i <= RACCT_MAX; i++) { if (parent->p_racct->r_resources[i] == 0 || !RACCT_IS_INHERITABLE(i)) continue; error = racct_set_locked(child, i, parent->p_racct->r_resources[i]); if (error != 0) goto out; } error = racct_add_locked(child, RACCT_NPROC, 1); error += racct_add_locked(child, RACCT_NTHR, 1); out: mtx_unlock(&racct_lock); PROC_UNLOCK(child); PROC_UNLOCK(parent); if (error != 0) racct_proc_exit(child); return (error); } /* * Called at the end of fork1(), to handle rules that require the process * to be fully initialized. */ void racct_proc_fork_done(struct proc *child) { #ifdef RCTL if (!racct_enable) return; PROC_LOCK(child); mtx_lock(&racct_lock); rctl_enforce(child, RACCT_NPROC, 0); rctl_enforce(child, RACCT_NTHR, 0); mtx_unlock(&racct_lock); PROC_UNLOCK(child); #endif } void racct_proc_exit(struct proc *p) { int i; uint64_t runtime; struct timeval wallclock; uint64_t pct_estimate, pct; if (!racct_enable) return; PROC_LOCK(p); /* * We don't need to calculate rux, proc_reap() has already done this. */ runtime = cputick2usec(p->p_rux.rux_runtime); #ifdef notyet KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); #else if (runtime < p->p_prev_runtime) runtime = p->p_prev_runtime; #endif microuptime(&wallclock); timevalsub(&wallclock, &p->p_stats->p_start); if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { pct_estimate = (1000000 * runtime * 100) / ((uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec); } else pct_estimate = 0; pct = racct_getpcpu(p, pct_estimate); mtx_lock(&racct_lock); racct_set_locked(p, RACCT_CPU, runtime); racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct); for (i = 0; i <= RACCT_MAX; i++) { if (p->p_racct->r_resources[i] == 0) continue; if (!RACCT_IS_RECLAIMABLE(i)) continue; racct_set_locked(p, i, 0); } mtx_unlock(&racct_lock); PROC_UNLOCK(p); #ifdef RCTL rctl_racct_release(p->p_racct); #endif racct_destroy(&p->p_racct); } /* * Called after credentials change, to move resource utilisation * between raccts. */ void racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred, struct ucred *newcred) { struct uidinfo *olduip, *newuip; struct loginclass *oldlc, *newlc; struct prison *oldpr, *newpr, *pr; if (!racct_enable) return; PROC_LOCK_ASSERT(p, MA_NOTOWNED); newuip = newcred->cr_ruidinfo; olduip = oldcred->cr_ruidinfo; newlc = newcred->cr_loginclass; oldlc = oldcred->cr_loginclass; newpr = newcred->cr_prison; oldpr = oldcred->cr_prison; mtx_lock(&racct_lock); if (newuip != olduip) { racct_sub_racct(olduip->ui_racct, p->p_racct); racct_add_racct(newuip->ui_racct, p->p_racct); } if (newlc != oldlc) { racct_sub_racct(oldlc->lc_racct, p->p_racct); racct_add_racct(newlc->lc_racct, p->p_racct); } if (newpr != oldpr) { for (pr = oldpr; pr != NULL; pr = pr->pr_parent) racct_sub_racct(pr->pr_prison_racct->prr_racct, p->p_racct); for (pr = newpr; pr != NULL; pr = pr->pr_parent) racct_add_racct(pr->pr_prison_racct->prr_racct, p->p_racct); } mtx_unlock(&racct_lock); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); #endif } void racct_move(struct racct *dest, struct racct *src) { ASSERT_RACCT_ENABLED(); mtx_lock(&racct_lock); racct_add_racct(dest, src); racct_sub_racct(src, src); mtx_unlock(&racct_lock); } static void racct_proc_throttle(struct proc *p) { struct thread *td; #ifdef SMP int cpuid; #endif ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); /* * Do not block kernel processes. Also do not block processes with * low %cpu utilization to improve interactivity. */ if (((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0) || (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold)) return; p->p_throttled = 1; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); switch (td->td_state) { case TDS_RUNQ: /* * If the thread is on the scheduler run-queue, we can * not just remove it from there. So we set the flag * TDF_NEEDRESCHED for the thread, so that once it is * running, it is taken off the cpu as soon as possible. */ td->td_flags |= TDF_NEEDRESCHED; break; case TDS_RUNNING: /* * If the thread is running, we request a context * switch for it by setting the TDF_NEEDRESCHED flag. */ td->td_flags |= TDF_NEEDRESCHED; #ifdef SMP cpuid = td->td_oncpu; if ((cpuid != NOCPU) && (td != curthread)) ipi_cpu(cpuid, IPI_AST); #endif break; default: break; } thread_unlock(td); } } static void racct_proc_wakeup(struct proc *p) { ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_throttled) { p->p_throttled = 0; wakeup(p->p_racct); } } static void racct_decay_resource(struct racct *racct, void * res, void* dummy) { int resource; int64_t r_old, r_new; ASSERT_RACCT_ENABLED(); mtx_assert(&racct_lock, MA_OWNED); resource = *(int *)res; r_old = racct->r_resources[resource]; /* If there is nothing to decay, just exit. */ if (r_old <= 0) return; r_new = r_old * RACCT_DECAY_FACTOR / FSCALE; racct->r_resources[resource] = r_new; } static void racct_decay_pre(void) { mtx_lock(&racct_lock); } static void racct_decay_post(void) { mtx_unlock(&racct_lock); } static void racct_decay(int resource) { ASSERT_RACCT_ENABLED(); ui_racct_foreach(racct_decay_resource, racct_decay_pre, racct_decay_post, &resource, NULL); loginclass_racct_foreach(racct_decay_resource, racct_decay_pre, racct_decay_post, &resource, NULL); prison_racct_foreach(racct_decay_resource, racct_decay_pre, racct_decay_post, &resource, NULL); } static void racctd(void) { struct thread *td; struct proc *p; struct timeval wallclock; uint64_t runtime; uint64_t pct, pct_estimate; ASSERT_RACCT_ENABLED(); for (;;) { racct_decay(RACCT_PCTCPU); sx_slock(&allproc_lock); LIST_FOREACH(p, &zombproc, p_list) { PROC_LOCK(p); racct_set(p, RACCT_PCTCPU, 0); PROC_UNLOCK(p); } FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NORMAL) { PROC_UNLOCK(p); continue; } microuptime(&wallclock); timevalsub(&wallclock, &p->p_stats->p_start); PROC_STATLOCK(p); FOREACH_THREAD_IN_PROC(p, td) ruxagg(p, td); runtime = cputick2usec(p->p_rux.rux_runtime); PROC_STATUNLOCK(p); #ifdef notyet KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); #else if (runtime < p->p_prev_runtime) runtime = p->p_prev_runtime; #endif p->p_prev_runtime = runtime; if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { pct_estimate = (1000000 * runtime * 100) / ((uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec); } else pct_estimate = 0; pct = racct_getpcpu(p, pct_estimate); mtx_lock(&racct_lock); racct_set_force_locked(p, RACCT_PCTCPU, pct); racct_set_locked(p, RACCT_CPU, runtime); racct_set_locked(p, RACCT_WALLCLOCK, (uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec); mtx_unlock(&racct_lock); PROC_UNLOCK(p); } /* * To ensure that processes are throttled in a fair way, we need * to iterate over all processes again and check the limits * for %cpu resource only after ucred racct containers have been * properly filled. */ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NORMAL) { PROC_UNLOCK(p); continue; } if (racct_pcpu_available(p) <= 0) racct_proc_throttle(p); else if (p->p_throttled) racct_proc_wakeup(p); PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); pause("-", hz); } } static struct kproc_desc racctd_kp = { "racctd", racctd, NULL }; static void racctd_init(void) { if (!racct_enable) return; kproc_start(&racctd_kp); } SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL); static void racct_init(void) { if (!racct_enable) return; racct_zone = uma_zcreate("racct", sizeof(struct racct), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); /* * XXX: Move this somewhere. */ prison0.pr_prison_racct = prison_racct_find("0"); } SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL); #endif /* !RACCT */ Index: head/sys/kern/kern_sig.c =================================================================== --- head/sys/kern/kern_sig.c (revision 292383) +++ head/sys/kern/kern_sig.c (revision 292384) @@ -1,3575 +1,3575 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_gzio.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ONSIG 32 /* NSIG for osig* syscalls. XXX. */ SDT_PROVIDER_DECLARE(proc); -SDT_PROBE_DEFINE3(proc, kernel, , signal__send, "struct thread *", - "struct proc *", "int"); -SDT_PROBE_DEFINE2(proc, kernel, , signal__clear, "int", - "ksiginfo_t *"); -SDT_PROBE_DEFINE3(proc, kernel, , signal__discard, +SDT_PROBE_DEFINE3(proc, , , signal__send, "struct thread *", "struct proc *", "int"); +SDT_PROBE_DEFINE2(proc, , , signal__clear, + "int", "ksiginfo_t *"); +SDT_PROBE_DEFINE3(proc, , , signal__discard, + "struct thread *", "struct proc *", "int"); static int coredump(struct thread *); static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi); static int issignal(struct thread *td); static int sigprop(int sig); static void tdsigwakeup(struct thread *, int, sig_t, int); static void sig_suspend_threads(struct thread *, struct proc *, int); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); static struct thread *sigtd(struct proc *p, int sig, int prop); static void sigqueue_start(void); static uma_zone_t ksiginfo_zone = NULL; struct filterops sig_filtops = { .f_isfd = 0, .f_attach = filt_sigattach, .f_detach = filt_sigdetach, .f_event = filt_signal, }; static int kern_logsigexit = 1; SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "Log processes quitting on abnormal signals to syslog(3)"); static int kern_forcesigexit = 1; SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW, &kern_forcesigexit, 0, "Force trap signal to be handled"); static SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0, "POSIX real time signal"); static int max_pending_per_proc = 128; SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW, &max_pending_per_proc, 0, "Max pending signals per proc"); static int preallocate_siginfo = 1024; SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RDTUN, &preallocate_siginfo, 0, "Preallocated signal memory size"); static int signal_overflow = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD, &signal_overflow, 0, "Number of signals overflew"); static int signal_alloc_fail = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD, &signal_alloc_fail, 0, "signals failed to be allocated"); SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL); /* * Policy -- Can ucred cr1 send SIGIO to process cr2? * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG * in the right situations. */ #define CANSIGIO(cr1, cr2) \ ((cr1)->cr_uid == 0 || \ (cr1)->cr_ruid == (cr2)->cr_ruid || \ (cr1)->cr_uid == (cr2)->cr_ruid || \ (cr1)->cr_ruid == (cr2)->cr_uid || \ (cr1)->cr_uid == (cr2)->cr_uid) static int sugid_coredump; SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN, &sugid_coredump, 0, "Allow setuid and setgid processes to dump core"); static int capmode_coredump; SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN, &capmode_coredump, 0, "Allow processes in capability mode to dump core"); static int do_coredump = 1; SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, &do_coredump, 0, "Enable/Disable coredumps"); static int set_core_nodump_flag = 0; SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag, 0, "Enable setting the NODUMP flag on coredump files"); static int coredump_devctl = 0; SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl, 0, "Generate a devctl notification when processes coredump"); /* * Signal properties and actions. * The array below categorizes the signals and their default actions * according to the following properties: */ #define SA_KILL 0x01 /* terminates process by default */ #define SA_CORE 0x02 /* ditto and coredumps */ #define SA_STOP 0x04 /* suspend process */ #define SA_TTYSTOP 0x08 /* ditto, from tty */ #define SA_IGNORE 0x10 /* ignore by default */ #define SA_CONT 0x20 /* continue if suspended */ #define SA_CANTMASK 0x40 /* non-maskable, catchable */ static int sigproptbl[NSIG] = { SA_KILL, /* SIGHUP */ SA_KILL, /* SIGINT */ SA_KILL|SA_CORE, /* SIGQUIT */ SA_KILL|SA_CORE, /* SIGILL */ SA_KILL|SA_CORE, /* SIGTRAP */ SA_KILL|SA_CORE, /* SIGABRT */ SA_KILL|SA_CORE, /* SIGEMT */ SA_KILL|SA_CORE, /* SIGFPE */ SA_KILL, /* SIGKILL */ SA_KILL|SA_CORE, /* SIGBUS */ SA_KILL|SA_CORE, /* SIGSEGV */ SA_KILL|SA_CORE, /* SIGSYS */ SA_KILL, /* SIGPIPE */ SA_KILL, /* SIGALRM */ SA_KILL, /* SIGTERM */ SA_IGNORE, /* SIGURG */ SA_STOP, /* SIGSTOP */ SA_STOP|SA_TTYSTOP, /* SIGTSTP */ SA_IGNORE|SA_CONT, /* SIGCONT */ SA_IGNORE, /* SIGCHLD */ SA_STOP|SA_TTYSTOP, /* SIGTTIN */ SA_STOP|SA_TTYSTOP, /* SIGTTOU */ SA_IGNORE, /* SIGIO */ SA_KILL, /* SIGXCPU */ SA_KILL, /* SIGXFSZ */ SA_KILL, /* SIGVTALRM */ SA_KILL, /* SIGPROF */ SA_IGNORE, /* SIGWINCH */ SA_IGNORE, /* SIGINFO */ SA_KILL, /* SIGUSR1 */ SA_KILL, /* SIGUSR2 */ }; static void reschedule_signals(struct proc *p, sigset_t block, int flags); static void sigqueue_start(void) { ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_prealloc(ksiginfo_zone, preallocate_siginfo); p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS); p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1); p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc); } ksiginfo_t * ksiginfo_alloc(int wait) { int flags; flags = M_ZERO; if (! wait) flags |= M_NOWAIT; if (ksiginfo_zone != NULL) return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, flags)); return (NULL); } void ksiginfo_free(ksiginfo_t *ksi) { uma_zfree(ksiginfo_zone, ksi); } static __inline int ksiginfo_tryfree(ksiginfo_t *ksi) { if (!(ksi->ksi_flags & KSI_EXT)) { uma_zfree(ksiginfo_zone, ksi); return (1); } return (0); } void sigqueue_init(sigqueue_t *list, struct proc *p) { SIGEMPTYSET(list->sq_signals); SIGEMPTYSET(list->sq_kill); TAILQ_INIT(&list->sq_list); list->sq_proc = p; list->sq_flags = SQ_INIT; } /* * Get a signal's ksiginfo. * Return: * 0 - signal not found * others - signal number */ static int sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi, *next; int count = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (!SIGISMEMBER(sq->sq_signals, signo)) return (0); if (SIGISMEMBER(sq->sq_kill, signo)) { count++; SIGDELSET(sq->sq_kill, signo); } TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (ksi->ksi_signo == signo) { if (count == 0) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; ksiginfo_copy(ksi, si); if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } if (++count > 1) break; } } if (count <= 1) SIGDELSET(sq->sq_signals, signo); si->ksi_signo = signo; return (signo); } void sigqueue_take(ksiginfo_t *ksi) { struct ksiginfo *kp; struct proc *p; sigqueue_t *sq; if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL) return; p = sq->sq_proc; TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (!(ksi->ksi_flags & KSI_EXT) && p != NULL) p->p_pendingcnt--; for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL; kp = TAILQ_NEXT(kp, ksi_link)) { if (kp->ksi_signo == ksi->ksi_signo) break; } if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo)) SIGDELSET(sq->sq_signals, ksi->ksi_signo); } static int sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi; int ret = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (signo == SIGKILL || signo == SIGSTOP || si == NULL) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } /* directly insert the ksi, don't copy it */ if (si->ksi_flags & KSI_INS) { if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, si, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link); si->ksi_sigq = sq; goto out_set_bit; } if (__predict_false(ksiginfo_zone == NULL)) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) { signal_overflow++; ret = EAGAIN; } else if ((ksi = ksiginfo_alloc(0)) == NULL) { signal_alloc_fail++; ret = EAGAIN; } else { if (p != NULL) p->p_pendingcnt++; ksiginfo_copy(si, ksi); ksi->ksi_signo = signo; if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, ksi, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = sq; } if ((si->ksi_flags & KSI_TRAP) != 0 || (si->ksi_flags & KSI_SIGQ) == 0) { if (ret != 0) SIGADDSET(sq->sq_kill, signo); ret = 0; goto out_set_bit; } if (ret != 0) return (ret); out_set_bit: SIGADDSET(sq->sq_signals, signo); return (ret); } void sigqueue_flush(sigqueue_t *sq) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (p != NULL) PROC_LOCK_ASSERT(p, MA_OWNED); while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } SIGEMPTYSET(sq->sq_signals); SIGEMPTYSET(sq->sq_kill); } static void sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, const sigset_t *set) { sigset_t tmp; struct proc *p1, *p2; ksiginfo_t *ksi, *next; KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited")); KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited")); p1 = src->sq_proc; p2 = dst->sq_proc; /* Move siginfo to target list */ TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&src->sq_list, ksi, ksi_link); if (p1 != NULL) p1->p_pendingcnt--; TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link); ksi->ksi_sigq = dst; if (p2 != NULL) p2->p_pendingcnt++; } } /* Move pending bits to target list */ tmp = src->sq_kill; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_kill, tmp); SIGSETNAND(src->sq_kill, tmp); tmp = src->sq_signals; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_signals, tmp); SIGSETNAND(src->sq_signals, tmp); } #if 0 static void sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_move_set(src, dst, &set); } #endif static void sigqueue_delete_set(sigqueue_t *sq, const sigset_t *set) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi, *next; KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited")); /* Remove siginfo queue */ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } } SIGSETNAND(sq->sq_kill, *set); SIGSETNAND(sq->sq_signals, *set); } void sigqueue_delete(sigqueue_t *sq, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set(sq, &set); } /* Remove a set of signals for a process */ static void sigqueue_delete_set_proc(struct proc *p, const sigset_t *set) { sigqueue_t worklist; struct thread *td0; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_init(&worklist, NULL); sigqueue_move_set(&p->p_sigqueue, &worklist, set); FOREACH_THREAD_IN_PROC(p, td0) sigqueue_move_set(&td0->td_sigqueue, &worklist, set); sigqueue_flush(&worklist); } void sigqueue_delete_proc(struct proc *p, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set_proc(p, &set); } static void sigqueue_delete_stopmask_proc(struct proc *p) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, SIGSTOP); SIGADDSET(set, SIGTSTP); SIGADDSET(set, SIGTTIN); SIGADDSET(set, SIGTTOU); sigqueue_delete_set_proc(p, &set); } /* * Determine signal that should be delivered to thread td, the current * thread, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). */ int cursig(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } /* * Arrange for ast() to handle unmasked pending signals on return to user * mode. This must be called whenever a signal is added to td_sigqueue or * unmasked in td_sigmask. */ void signotify(struct thread *td) { struct proc *p; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if (SIGPENDING(td)) { thread_lock(td); td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING; thread_unlock(td); } } int sigonstack(size_t sp) { struct thread *td = curthread; return ((td->td_pflags & TDP_ALTSTACK) ? #if defined(COMPAT_43) ((td->td_sigstk.ss_size == 0) ? (td->td_sigstk.ss_flags & SS_ONSTACK) : ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size)) #else ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size) #endif : 0); } static __inline int sigprop(int sig) { if (sig > 0 && sig < NSIG) return (sigproptbl[_SIG_IDX(sig)]); return (0); } int sig_ffs(sigset_t *set) { int i; for (i = 0; i < _SIG_WORDS; i++) if (set->__bits[i]) return (ffs(set->__bits[i]) + (i * 32)); return (0); } static bool sigact_flag_test(const struct sigaction *act, int flag) { /* * SA_SIGINFO is reset when signal disposition is set to * ignore or default. Other flags are kept according to user * settings. */ return ((act->sa_flags & flag) != 0 && (flag != SA_SIGINFO || ((__sighandler_t *)act->sa_sigaction != SIG_IGN && (__sighandler_t *)act->sa_sigaction != SIG_DFL))); } /* * kern_sigaction * sigaction * freebsd4_sigaction * osigaction */ int kern_sigaction(struct thread *td, int sig, const struct sigaction *act, struct sigaction *oact, int flags) { struct sigacts *ps; struct proc *p = td->td_proc; if (!_SIG_VALID(sig)) return (EINVAL); if (act != NULL && act->sa_handler != SIG_DFL && act->sa_handler != SIG_IGN && (act->sa_flags & ~(SA_ONSTACK | SA_RESTART | SA_RESETHAND | SA_NOCLDSTOP | SA_NODEFER | SA_NOCLDWAIT | SA_SIGINFO)) != 0) return (EINVAL); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if (oact) { oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)]; oact->sa_flags = 0; if (SIGISMEMBER(ps->ps_sigonstack, sig)) oact->sa_flags |= SA_ONSTACK; if (!SIGISMEMBER(ps->ps_sigintr, sig)) oact->sa_flags |= SA_RESTART; if (SIGISMEMBER(ps->ps_sigreset, sig)) oact->sa_flags |= SA_RESETHAND; if (SIGISMEMBER(ps->ps_signodefer, sig)) oact->sa_flags |= SA_NODEFER; if (SIGISMEMBER(ps->ps_siginfo, sig)) { oact->sa_flags |= SA_SIGINFO; oact->sa_sigaction = (__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)]; } else oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)]; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP) oact->sa_flags |= SA_NOCLDSTOP; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT) oact->sa_flags |= SA_NOCLDWAIT; } if (act) { if ((sig == SIGKILL || sig == SIGSTOP) && act->sa_handler != SIG_DFL) { mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (EINVAL); } /* * Change setting atomically. */ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); if (sigact_flag_test(act, SA_SIGINFO)) { ps->ps_sigact[_SIG_IDX(sig)] = (__sighandler_t *)act->sa_sigaction; SIGADDSET(ps->ps_siginfo, sig); } else { ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; SIGDELSET(ps->ps_siginfo, sig); } if (!sigact_flag_test(act, SA_RESTART)) SIGADDSET(ps->ps_sigintr, sig); else SIGDELSET(ps->ps_sigintr, sig); if (sigact_flag_test(act, SA_ONSTACK)) SIGADDSET(ps->ps_sigonstack, sig); else SIGDELSET(ps->ps_sigonstack, sig); if (sigact_flag_test(act, SA_RESETHAND)) SIGADDSET(ps->ps_sigreset, sig); else SIGDELSET(ps->ps_sigreset, sig); if (sigact_flag_test(act, SA_NODEFER)) SIGADDSET(ps->ps_signodefer, sig); else SIGDELSET(ps->ps_signodefer, sig); if (sig == SIGCHLD) { if (act->sa_flags & SA_NOCLDSTOP) ps->ps_flag |= PS_NOCLDSTOP; else ps->ps_flag &= ~PS_NOCLDSTOP; if (act->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented * by reparenting the dying child to PID 1 (and * trust it to reap the zombie), PID 1 itself * is forbidden to set SA_NOCLDWAIT. */ if (p->p_pid == 1) ps->ps_flag &= ~PS_NOCLDWAIT; else ps->ps_flag |= PS_NOCLDWAIT; } else ps->ps_flag &= ~PS_NOCLDWAIT; if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_flag |= PS_CLDSIGIGN; else ps->ps_flag &= ~PS_CLDSIGIGN; } /* * Set bit in ps_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in ps_sigignore, as we * have to restart the process. */ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || (sigprop(sig) & SA_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { /* never to be seen again */ sigqueue_delete_proc(p, sig); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); SIGDELSET(ps->ps_sigcatch, sig); } else { SIGDELSET(ps->ps_sigignore, sig); if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) SIGDELSET(ps->ps_sigcatch, sig); else SIGADDSET(ps->ps_sigcatch, sig); } #ifdef COMPAT_FREEBSD4 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_FREEBSD4) == 0) SIGDELSET(ps->ps_freebsd4, sig); else SIGADDSET(ps->ps_freebsd4, sig); #endif #ifdef COMPAT_43 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_OSIGSET) == 0) SIGDELSET(ps->ps_osigset, sig); else SIGADDSET(ps->ps_osigset, sig); #endif } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int sys_sigaction(td, uap) struct thread *td; register struct sigaction_args *uap; { struct sigaction act, oact; register struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, 0); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #ifdef COMPAT_FREEBSD4 #ifndef _SYS_SYSPROTO_H_ struct freebsd4_sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int freebsd4_sigaction(td, uap) struct thread *td; register struct freebsd4_sigaction_args *uap; { struct sigaction act, oact; register struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #endif /* COMAPT_FREEBSD4 */ #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigaction_args { int signum; struct osigaction *nsa; struct osigaction *osa; }; #endif int osigaction(td, uap) struct thread *td; register struct osigaction_args *uap; { struct osigaction sa; struct sigaction nsa, osa; register struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsa != NULL) ? &nsa : NULL; osap = (uap->osa != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsa, &sa, sizeof(sa)); if (error) return (error); nsap->sa_handler = sa.sa_handler; nsap->sa_flags = sa.sa_flags; OSIG2SIG(sa.sa_mask, nsap->sa_mask); } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { sa.sa_handler = osap->sa_handler; sa.sa_flags = osap->sa_flags; SIG2OSIG(osap->sa_mask, sa.sa_mask); error = copyout(&sa, uap->osa, sizeof(sa)); } return (error); } #if !defined(__i386__) /* Avoid replicating the same stub everywhere */ int osigreturn(td, uap) struct thread *td; struct osigreturn_args *uap; { return (nosys(td, (struct nosys_args *)uap)); } #endif #endif /* COMPAT_43 */ /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(p) struct proc *p; { register int i; struct sigacts *ps; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); for (i = 1; i <= NSIG; i++) { if (sigprop(i) & SA_IGNORE && i != SIGCONT) { SIGADDSET(ps->ps_sigignore, i); } } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); } /* * Reset specified signal to the default disposition. */ static void sigdflt(struct sigacts *ps, int sig) { mtx_assert(&ps->ps_mtx, MA_OWNED); SIGDELSET(ps->ps_sigcatch, sig); if ((sigprop(sig) & SA_IGNORE) != 0 && sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; SIGDELSET(ps->ps_siginfo, sig); } /* * Reset signals for an exec of the specified process. */ void execsigs(struct proc *p) { sigset_t osigignore; struct sigacts *ps; int sig; struct thread *td; /* * Reset caught signals. Held signals remain held * through td_sigmask (unless they were caught, * and are now ignored by default). */ PROC_LOCK_ASSERT(p, MA_OWNED); td = FIRST_THREAD_IN_PROC(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); while (SIGNOTEMPTY(ps->ps_sigcatch)) { sig = sig_ffs(&ps->ps_sigcatch); sigdflt(ps, sig); if ((sigprop(sig) & SA_IGNORE) != 0) sigqueue_delete_proc(p, sig); } /* * As CloudABI processes cannot modify signal handlers, fully * reset all signals to their default behavior. Do ignore * SIGPIPE, as it would otherwise be impossible to recover from * writes to broken pipes and sockets. */ if (SV_PROC_ABI(p) == SV_ABI_CLOUDABI) { osigignore = ps->ps_sigignore; while (SIGNOTEMPTY(osigignore)) { sig = sig_ffs(&osigignore); SIGDELSET(osigignore, sig); if (sig != SIGPIPE) sigdflt(ps, sig); } SIGADDSET(ps->ps_sigignore, SIGPIPE); } /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ td->td_sigstk.ss_flags = SS_DISABLE; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_sp = 0; td->td_pflags &= ~TDP_ALTSTACK; /* * Reset no zombies if child dies flag as Solaris does. */ ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN); if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; mtx_unlock(&ps->ps_mtx); } /* * kern_sigprocmask() * * Manipulate signal mask. */ int kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset, int flags) { sigset_t new_block, oset1; struct proc *p; int error; p = td->td_proc; if ((flags & SIGPROCMASK_PROC_LOCKED) != 0) PROC_LOCK_ASSERT(p, MA_OWNED); else PROC_LOCK(p); mtx_assert(&p->p_sigacts->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0 ? MA_OWNED : MA_NOTOWNED); if (oset != NULL) *oset = td->td_sigmask; error = 0; if (set != NULL) { switch (how) { case SIG_BLOCK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; SIGSETOR(td->td_sigmask, *set); new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); break; case SIG_UNBLOCK: SIGSETNAND(td->td_sigmask, *set); signotify(td); goto out; case SIG_SETMASK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; if (flags & SIGPROCMASK_OLD) SIGSETLO(td->td_sigmask, *set); else td->td_sigmask = *set; new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); signotify(td); break; default: error = EINVAL; goto out; } /* * The new_block set contains signals that were not previously * blocked, but are blocked now. * * In case we block any signal that was not previously blocked * for td, and process has the signal pending, try to schedule * signal delivery to some thread that does not block the * signal, possibly waking it up. */ if (p->p_numthreads != 1) reschedule_signals(p, new_block, flags); } out: if (!(flags & SIGPROCMASK_PROC_LOCKED)) PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigprocmask_args { int how; const sigset_t *set; sigset_t *oset; }; #endif int sys_sigprocmask(td, uap) register struct thread *td; struct sigprocmask_args *uap; { sigset_t set, oset; sigset_t *setp, *osetp; int error; setp = (uap->set != NULL) ? &set : NULL; osetp = (uap->oset != NULL) ? &oset : NULL; if (setp) { error = copyin(uap->set, setp, sizeof(set)); if (error) return (error); } error = kern_sigprocmask(td, uap->how, setp, osetp, 0); if (osetp && !error) { error = copyout(osetp, uap->oset, sizeof(oset)); } return (error); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigprocmask_args { int how; osigset_t mask; }; #endif int osigprocmask(td, uap) register struct thread *td; struct osigprocmask_args *uap; { sigset_t set, oset; int error; OSIG2SIG(uap->mask, set); error = kern_sigprocmask(td, uap->how, &set, &oset, 1); SIG2OSIG(oset, td->td_retval[0]); return (error); } #endif /* COMPAT_43 */ int sys_sigwait(struct thread *td, struct sigwait_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) { td->td_retval[0] = error; return (0); } error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) { if (error == EINTR && td->td_proc->p_osrel < P_OSREL_SIGWAIT) error = ERESTART; if (error == ERESTART) return (error); td->td_retval[0] = error; return (0); } error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo)); td->td_retval[0] = error; return (0); } int sys_sigtimedwait(struct thread *td, struct sigtimedwait_args *uap) { struct timespec ts; struct timespec *timeout; sigset_t set; ksiginfo_t ksi; int error; if (uap->timeout) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); timeout = &ts; } else timeout = NULL; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, timeout); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } int sys_sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } int kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi, struct timespec *timeout) { struct sigacts *ps; sigset_t saved_mask, new_block; struct proc *p; int error, sig, timo, timevalid = 0; struct timespec rts, ets, ts; struct timeval tv; p = td->td_proc; error = 0; ets.tv_sec = 0; ets.tv_nsec = 0; if (timeout != NULL) { if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) { timevalid = 1; getnanouptime(&rts); ets = rts; timespecadd(&ets, timeout); } } ksiginfo_init(ksi); /* Some signals can not be waited for. */ SIG_CANTMASK(waitset); ps = p->p_sigacts; PROC_LOCK(p); saved_mask = td->td_sigmask; SIGSETNAND(td->td_sigmask, waitset); for (;;) { mtx_lock(&ps->ps_mtx); sig = cursig(td); mtx_unlock(&ps->ps_mtx); if (sig != 0 && SIGISMEMBER(waitset, sig)) { if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 || sigqueue_get(&p->p_sigqueue, sig, ksi) != 0) { error = 0; break; } } if (error != 0) break; /* * POSIX says this must be checked after looking for pending * signals. */ if (timeout != NULL) { if (!timevalid) { error = EINVAL; break; } getnanouptime(&rts); if (timespeccmp(&rts, &ets, >=)) { error = EAGAIN; break; } ts = ets; timespecsub(&ts, &rts); TIMESPEC_TO_TIMEVAL(&tv, &ts); timo = tvtohz(&tv); } else { timo = 0; } error = msleep(ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", timo); if (timeout != NULL) { if (error == ERESTART) { /* Timeout can not be restarted. */ error = EINTR; } else if (error == EAGAIN) { /* We will calculate timeout by ourself. */ error = 0; } } } new_block = saved_mask; SIGSETNAND(new_block, td->td_sigmask); td->td_sigmask = saved_mask; /* * Fewer signals can be delivered to us, reschedule signal * notification. */ if (p->p_numthreads != 1) reschedule_signals(p, new_block, 0); if (error == 0) { - SDT_PROBE2(proc, kernel, , signal__clear, sig, ksi); + SDT_PROBE2(proc, , , signal__clear, sig, ksi); if (ksi->ksi_code == SI_TIMER) itimer_accept(p, ksi->ksi_timerid, ksi); #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) { sig_t action; mtx_lock(&ps->ps_mtx); action = ps->ps_sigact[_SIG_IDX(sig)]; mtx_unlock(&ps->ps_mtx); ktrpsig(sig, action, &td->td_sigmask, ksi->ksi_code); } #endif if (sig == SIGKILL) sigexit(td, sig); } PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigpending_args { sigset_t *set; }; #endif int sys_sigpending(td, uap) struct thread *td; struct sigpending_args *uap; { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); return (copyout(&pending, uap->set, sizeof(sigset_t))); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigpending_args { int dummy; }; #endif int osigpending(td, uap) struct thread *td; struct osigpending_args *uap; { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); SIG2OSIG(pending, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) /* * Generalized interface signal handler, 4.3-compatible. */ #ifndef _SYS_SYSPROTO_H_ struct osigvec_args { int signum; struct sigvec *nsv; struct sigvec *osv; }; #endif /* ARGSUSED */ int osigvec(td, uap) struct thread *td; register struct osigvec_args *uap; { struct sigvec vec; struct sigaction nsa, osa; register struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsv != NULL) ? &nsa : NULL; osap = (uap->osv != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsv, &vec, sizeof(vec)); if (error) return (error); nsap->sa_handler = vec.sv_handler; OSIG2SIG(vec.sv_mask, nsap->sa_mask); nsap->sa_flags = vec.sv_flags; nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { vec.sv_handler = osap->sa_handler; SIG2OSIG(osap->sa_mask, vec.sv_mask); vec.sv_flags = osap->sa_flags; vec.sv_flags &= ~SA_NOCLDWAIT; vec.sv_flags ^= SA_RESTART; error = copyout(&vec, uap->osv, sizeof(vec)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct osigblock_args { int mask; }; #endif int osigblock(td, uap) register struct thread *td; struct osigblock_args *uap; { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_BLOCK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #ifndef _SYS_SYSPROTO_H_ struct osigsetmask_args { int mask; }; #endif int osigsetmask(td, uap) struct thread *td; struct osigsetmask_args *uap; { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_SETMASK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ /* * Suspend calling thread until signal, providing mask to be set in the * meantime. */ #ifndef _SYS_SYSPROTO_H_ struct sigsuspend_args { const sigset_t *sigmask; }; #endif /* ARGSUSED */ int sys_sigsuspend(td, uap) struct thread *td; struct sigsuspend_args *uap; { sigset_t mask; int error; error = copyin(uap->sigmask, &mask, sizeof(mask)); if (error) return (error); return (kern_sigsuspend(td, mask)); } int kern_sigsuspend(struct thread *td, sigset_t mask) { struct proc *p = td->td_proc; int has_sig, sig; /* * When returning from sigsuspend, we want * the old mask to be restored after the * signal handler has finished. Thus, we * save it here and mark the sigacts structure * to indicate this. */ PROC_LOCK(p); kern_sigprocmask(td, SIG_SETMASK, &mask, &td->td_oldsigmask, SIGPROCMASK_PROC_LOCKED); td->td_pflags |= TDP_OLDMASK; /* * Process signals now. Otherwise, we can get spurious wakeup * due to signal entered process queue, but delivered to other * thread. But sigsuspend should return only on signal * delivery. */ (p->p_sysent->sv_set_syscall_retval)(td, EINTR); for (has_sig = 0; !has_sig;) { while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0) /* void */; thread_suspend_check(0); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) has_sig += postsig(sig); mtx_unlock(&p->p_sigacts->ps_mtx); } PROC_UNLOCK(p); td->td_errno = EINTR; td->td_pflags |= TDP_NERRNO; return (EJUSTRETURN); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ /* * Compatibility sigsuspend call for old binaries. Note nonstandard calling * convention: libc stub passes mask, not pointer, to save a copyin. */ #ifndef _SYS_SYSPROTO_H_ struct osigsuspend_args { osigset_t mask; }; #endif /* ARGSUSED */ int osigsuspend(td, uap) struct thread *td; struct osigsuspend_args *uap; { sigset_t mask; OSIG2SIG(uap->mask, mask); return (kern_sigsuspend(td, mask)); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osigstack_args { struct sigstack *nss; struct sigstack *oss; }; #endif /* ARGSUSED */ int osigstack(td, uap) struct thread *td; register struct osigstack_args *uap; { struct sigstack nss, oss; int error = 0; if (uap->nss != NULL) { error = copyin(uap->nss, &nss, sizeof(nss)); if (error) return (error); } oss.ss_sp = td->td_sigstk.ss_sp; oss.ss_onstack = sigonstack(cpu_getstack(td)); if (uap->nss != NULL) { td->td_sigstk.ss_sp = nss.ss_sp; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK; td->td_pflags |= TDP_ALTSTACK; } if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(oss)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigaltstack_args { stack_t *ss; stack_t *oss; }; #endif /* ARGSUSED */ int sys_sigaltstack(td, uap) struct thread *td; register struct sigaltstack_args *uap; { stack_t ss, oss; int error; if (uap->ss != NULL) { error = copyin(uap->ss, &ss, sizeof(ss)); if (error) return (error); } error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL, (uap->oss != NULL) ? &oss : NULL); if (error) return (error); if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(stack_t)); return (error); } int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss) { struct proc *p = td->td_proc; int oonstack; oonstack = sigonstack(cpu_getstack(td)); if (oss != NULL) { *oss = td->td_sigstk; oss->ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; } if (ss != NULL) { if (oonstack) return (EPERM); if ((ss->ss_flags & ~SS_DISABLE) != 0) return (EINVAL); if (!(ss->ss_flags & SS_DISABLE)) { if (ss->ss_size < p->p_sysent->sv_minsigstksz) return (ENOMEM); td->td_sigstk = *ss; td->td_pflags |= TDP_ALTSTACK; } else { td->td_pflags &= ~TDP_ALTSTACK; } } return (0); } /* * Common code for kill process group/broadcast kill. * cp is calling process. */ static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi) { struct proc *p; struct pgrp *pgrp; int err; int ret; ret = ESRCH; if (all) { /* * broadcast */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p == td->td_proc || p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } err = p_cansignal(td, p, sig); if (err == 0) { if (sig) pksignal(p, sig, ksi); ret = err; } else if (ret == ESRCH) ret = err; PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } else { sx_slock(&proctree_lock); if (pgid == 0) { /* * zero pgid means send to my process group. */ pgrp = td->td_proc->p_pgrp; PGRP_LOCK(pgrp); } else { pgrp = pgfind(pgid); if (pgrp == NULL) { sx_sunlock(&proctree_lock); return (ESRCH); } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } err = p_cansignal(td, p, sig); if (err == 0) { if (sig) pksignal(p, sig, ksi); ret = err; } else if (ret == ESRCH) ret = err; PROC_UNLOCK(p); } PGRP_UNLOCK(pgrp); } return (ret); } #ifndef _SYS_SYSPROTO_H_ struct kill_args { int pid; int signum; }; #endif /* ARGSUSED */ int sys_kill(struct thread *td, struct kill_args *uap) { ksiginfo_t ksi; struct proc *p; int error; /* * A process in capability mode can send signals only to himself. * The main rationale behind this is that abort(3) is implemented as * kill(getpid(), SIGABRT). */ if (IN_CAPABILITY_MODE(td) && uap->pid != td->td_proc->p_pid) return (ECAPMODE); AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_PID(uap->pid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; if (uap->pid > 0) { /* kill single process */ if ((p = pfind(uap->pid)) == NULL) { if ((p = zpfind(uap->pid)) == NULL) return (ESRCH); } AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) pksignal(p, uap->signum, &ksi); PROC_UNLOCK(p); return (error); } switch (uap->pid) { case -1: /* broadcast signal */ return (killpg1(td, uap->signum, 0, 1, &ksi)); case 0: /* signal own process group */ return (killpg1(td, uap->signum, 0, 0, &ksi)); default: /* negative explicit process group */ return (killpg1(td, uap->signum, -uap->pid, 0, &ksi)); } /* NOTREACHED */ } int sys_pdkill(td, uap) struct thread *td; struct pdkill_args *uap; { struct proc *p; cap_rights_t rights; int error; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_FD(uap->fd); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); error = procdesc_find(td, uap->fd, cap_rights_init(&rights, CAP_PDKILL), &p); if (error) return (error); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) kern_psignal(p, uap->signum); PROC_UNLOCK(p); return (error); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct okillpg_args { int pgid; int signum; }; #endif /* ARGSUSED */ int okillpg(struct thread *td, struct okillpg_args *uap) { ksiginfo_t ksi; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_PID(uap->pgid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; return (killpg1(td, uap->signum, uap->pgid, 0, &ksi)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigqueue_args { pid_t pid; int signum; /* union sigval */ void *value; }; #endif int sys_sigqueue(struct thread *td, struct sigqueue_args *uap) { ksiginfo_t ksi; struct proc *p; int error; if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); /* * Specification says sigqueue can only send signal to * single process. */ if (uap->pid <= 0) return (EINVAL); if ((p = pfind(uap->pid)) == NULL) { if ((p = zpfind(uap->pid)) == NULL) return (ESRCH); } error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum != 0) { ksiginfo_init(&ksi); ksi.ksi_flags = KSI_SIGQ; ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_QUEUE; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; ksi.ksi_value.sival_ptr = uap->value; error = pksignal(p, ksi.ksi_signo, &ksi); } PROC_UNLOCK(p); return (error); } /* * Send a signal to a process group. */ void gsignal(int pgid, int sig, ksiginfo_t *ksi) { struct pgrp *pgrp; if (pgid != 0) { sx_slock(&proctree_lock); pgrp = pgfind(pgid); sx_sunlock(&proctree_lock); if (pgrp != NULL) { pgsignal(pgrp, sig, 0, ksi); PGRP_UNLOCK(pgrp); } } } /* * Send a signal to a process group. If checktty is 1, * limit to members which have a controlling terminal. */ void pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi) { struct proc *p; if (pgrp) { PGRP_LOCK_ASSERT(pgrp, MA_OWNED); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && (checkctty == 0 || p->p_flag & P_CONTROLT)) pksignal(p, sig, ksi); PROC_UNLOCK(p); } } } /* * Recalculate the signal mask and reset the signal disposition after * usermode frame for delivery is formed. Should be called after * mach-specific routine, because sysent->sv_sendsig() needs correct * ps_siginfo and signal mask. */ static void postsig_done(int sig, struct thread *td, struct sigacts *ps) { sigset_t mask; mtx_assert(&ps->ps_mtx, MA_OWNED); td->td_ru.ru_nsignals++; mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(mask, sig); kern_sigprocmask(td, SIG_BLOCK, &mask, NULL, SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED); if (SIGISMEMBER(ps->ps_sigreset, sig)) sigdflt(ps, sig); } /* * Send a signal caused by a trap to the current thread. If it will be * caught immediately, deliver it with correct code. Otherwise, post it * normally. */ void trapsignal(struct thread *td, ksiginfo_t *ksi) { struct sigacts *ps; struct proc *p; int sig; int code; p = td->td_proc; sig = ksi->ksi_signo; code = ksi->ksi_code; KASSERT(_SIG_VALID(sig), ("invalid signal")); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig)) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_PSIG)) ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], &td->td_sigmask, code); #endif (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); postsig_done(sig, td, ps); mtx_unlock(&ps->ps_mtx); } else { /* * Avoid a possible infinite loop if the thread * masking the signal or process is ignoring the * signal. */ if (kern_forcesigexit && (SIGISMEMBER(td->td_sigmask, sig) || ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) { SIGDELSET(td->td_sigmask, sig); SIGDELSET(ps->ps_sigcatch, sig); SIGDELSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } mtx_unlock(&ps->ps_mtx); p->p_code = code; /* XXX for core dump/debugger */ p->p_sig = sig; /* XXX to verify code */ tdsendsignal(p, td, sig, ksi); } PROC_UNLOCK(p); } static struct thread * sigtd(struct proc *p, int sig, int prop) { struct thread *td, *signal_td; PROC_LOCK_ASSERT(p, MA_OWNED); /* * Check if current thread can handle the signal without * switching context to another thread. */ if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig)) return (curthread); signal_td = NULL; FOREACH_THREAD_IN_PROC(p, td) { if (!SIGISMEMBER(td->td_sigmask, sig)) { signal_td = td; break; } } if (signal_td == NULL) signal_td = FIRST_THREAD_IN_PROC(p); return (signal_td); } /* * Send the signal to the process. If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it. * o SIGCONT restarts stopped processes (or puts them back to sleep) * regardless of the signal action (eg, blocked or ignored). * * Other ignored signals are discarded immediately. * * NB: This function may be entered from the debugger via the "kill" DDB * command. There is little that can be done to mitigate the possibly messy * side effects of this unwise possibility. */ void kern_psignal(struct proc *p, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(p, NULL, sig, &ksi); } int pksignal(struct proc *p, int sig, ksiginfo_t *ksi) { return (tdsendsignal(p, NULL, sig, ksi)); } /* Utility function for finding a thread to send signal event to. */ int sigev_findtd(struct proc *p ,struct sigevent *sigev, struct thread **ttd) { struct thread *td; if (sigev->sigev_notify == SIGEV_THREAD_ID) { td = tdfind(sigev->sigev_notify_thread_id, p->p_pid); if (td == NULL) return (ESRCH); *ttd = td; } else { *ttd = NULL; PROC_LOCK(p); } return (0); } void tdsignal(struct thread *td, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(td->td_proc, td, sig, &ksi); } void tdksignal(struct thread *td, int sig, ksiginfo_t *ksi) { (void) tdsendsignal(td->td_proc, td, sig, ksi); } int tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) { sig_t action; sigqueue_t *sigqueue; int prop; struct sigacts *ps; int intrval; int ret = 0; int wakeup_swapper; MPASS(td == NULL || p == td->td_proc); PROC_LOCK_ASSERT(p, MA_OWNED); if (!_SIG_VALID(sig)) panic("%s(): invalid signal %d", __func__, sig); KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("%s: ksi on queue", __func__)); /* * IEEE Std 1003.1-2001: return success when killing a zombie. */ if (p->p_state == PRS_ZOMBIE) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } ps = p->p_sigacts; KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); if (td == NULL) { td = sigtd(p, sig, prop); sigqueue = &p->p_sigqueue; } else sigqueue = &td->td_sigqueue; - SDT_PROBE3(proc, kernel, , signal__send, td, p, sig); + SDT_PROBE3(proc, , , signal__send, td, p, sig); /* * If the signal is being ignored, * then we forget about it immediately. * (Note: we don't set SIGCONT in ps_sigignore, * and if it is set to SIG_IGN, * action will be SIG_DFL here.) */ mtx_lock(&ps->ps_mtx); if (SIGISMEMBER(ps->ps_sigignore, sig)) { - SDT_PROBE3(proc, kernel, , signal__discard, td, p, sig); + SDT_PROBE3(proc, , , signal__discard, td, p, sig); mtx_unlock(&ps->ps_mtx); if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } if (SIGISMEMBER(td->td_sigmask, sig)) action = SIG_HOLD; else if (SIGISMEMBER(ps->ps_sigcatch, sig)) action = SIG_CATCH; else action = SIG_DFL; if (SIGISMEMBER(ps->ps_sigintr, sig)) intrval = EINTR; else intrval = ERESTART; mtx_unlock(&ps->ps_mtx); if (prop & SA_CONT) sigqueue_delete_stopmask_proc(p); else if (prop & SA_STOP) { /* * If sending a tty stop signal to a member of an orphaned * process group, discard the signal here if the action * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ if ((prop & SA_TTYSTOP) && (p->p_pgrp->pg_jobc == 0) && (action == SIG_DFL)) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } sigqueue_delete_proc(p, SIGCONT); if (p->p_flag & P_CONTINUED) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); } } ret = sigqueue_add(sigqueue, sig, ksi); if (ret != 0) return (ret); signotify(td); /* * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. */ if (action == SIG_HOLD && !((prop & SA_CONT) && (p->p_flag & P_STOPPED_SIG))) return (ret); /* * SIGKILL: Remove procfs STOPEVENTs. */ if (sig == SIGKILL) { /* from procfs_ioctl.c: PIOCBIC */ p->p_stops = 0; /* from procfs_ioctl.c: PIOCCONT */ p->p_step = 0; wakeup(&p->p_step); } /* * Some signals have a process-wide effect and a per-thread * component. Most processing occurs when the process next * tries to cross the user boundary, however there are some * times when processing needs to be done immediately, such as * waking up threads so that they can cross the user boundary. * We try to do the per-process part here. */ if (P_SHOULDSTOP(p)) { KASSERT(!(p->p_flag & P_WEXIT), ("signal to stopped but exiting process")); if (sig == SIGKILL) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * SIGKILL sets process running. * It will die elsewhere. * All threads must be restarted. */ p->p_flag &= ~P_STOPPED_SIG; goto runfast; } if (prop & SA_CONT) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * If SIGCONT is default (or ignored), we continue the * process but don't leave the signal in sigqueue as * it has no further action. If SIGCONT is held, we * continue the process and leave the signal in * sigqueue. If the process catches SIGCONT, let it * handle the signal itself. If it isn't waiting on * an event, it goes back to run state. * Otherwise, process goes back to sleep state. */ p->p_flag &= ~P_STOPPED_SIG; PROC_SLOCK(p); if (p->p_numthreads == p->p_suspcount) { PROC_SUNLOCK(p); p->p_flag |= P_CONTINUED; p->p_xsig = SIGCONT; PROC_LOCK(p->p_pptr); childproc_continued(p); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } if (action == SIG_DFL) { thread_unsuspend(p); PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); goto out; } if (action == SIG_CATCH) { /* * The process wants to catch it so it needs * to run at least one thread, but which one? */ PROC_SUNLOCK(p); goto runfast; } /* * The signal is not ignored or caught. */ thread_unsuspend(p); PROC_SUNLOCK(p); goto out; } if (prop & SA_STOP) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * Already stopped, don't need to stop again * (If we did the shell could get confused). * Just make sure the signal STOP bit set. */ p->p_flag |= P_STOPPED_SIG; sigqueue_delete(sigqueue, sig); goto out; } /* * All other kinds of signals: * If a thread is sleeping interruptibly, simulate a * wakeup so that when it is continued it will be made * runnable and can look at the signal. However, don't make * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check(). */ wakeup_swapper = 0; PROC_SLOCK(p); thread_lock(td); if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR)) wakeup_swapper = sleepq_abort(td, intrval); thread_unlock(td); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); goto out; /* * Mutexes are short lived. Threads waiting on them will * hit thread_suspend_check() soon. */ } else if (p->p_state == PRS_NORMAL) { if (p->p_flag & P_TRACED || action == SIG_CATCH) { tdsigwakeup(td, sig, action, intrval); goto out; } MPASS(action == SIG_DFL); if (prop & SA_STOP) { if (p->p_flag & (P_PPWAIT|P_WEXIT)) goto out; p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); sig_suspend_threads(td, p, 1); if (p->p_numthreads == p->p_suspcount) { /* * only thread sending signal to another * process can reach here, if thread is sending * signal to its process, because thread does * not suspend itself here, p_numthreads * should never be equal to p_suspcount. */ thread_stopped(p); PROC_SUNLOCK(p); sigqueue_delete_proc(p, p->p_xsig); } else PROC_SUNLOCK(p); goto out; } } else { /* Not in "NORMAL" state. discard the signal. */ sigqueue_delete(sigqueue, sig); goto out; } /* * The process is not stopped so we need to apply the signal to all the * running threads. */ runfast: tdsigwakeup(td, sig, action, intrval); PROC_SLOCK(p); thread_unsuspend(p); PROC_SUNLOCK(p); out: /* If we jump here, proc slock should not be owned. */ PROC_SLOCK_ASSERT(p, MA_NOTOWNED); return (ret); } /* * The force of a signal has been directed against a single * thread. We need to see what we can do about knocking it * out of any sleep it may be in etc. */ static void tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval) { struct proc *p = td->td_proc; register int prop; int wakeup_swapper; wakeup_swapper = 0; PROC_LOCK_ASSERT(p, MA_OWNED); prop = sigprop(sig); PROC_SLOCK(p); thread_lock(td); /* * Bring the priority of a thread up if we want it to get * killed in this lifetime. Be careful to avoid bumping the * priority of the idle thread, since we still allow to signal * kernel processes. */ if (action == SIG_DFL && (prop & SA_KILL) != 0 && td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); if (TD_ON_SLEEPQ(td)) { /* * If thread is sleeping uninterruptibly * we can't interrupt the sleep... the signal will * be noticed when the process returns through * trap() or syscall(). */ if ((td->td_flags & TDF_SINTR) == 0) goto out; /* * If SIGCONT is default (or ignored) and process is * asleep, we are finished; the process should not * be awakened. */ if ((prop & SA_CONT) && action == SIG_DFL) { thread_unlock(td); PROC_SUNLOCK(p); sigqueue_delete(&p->p_sigqueue, sig); /* * It may be on either list in this state. * Remove from both for now. */ sigqueue_delete(&td->td_sigqueue, sig); return; } /* * Don't awaken a sleeping thread for SIGSTOP if the * STOP signal is deferred. */ if ((prop & SA_STOP) && (td->td_flags & TDF_SBDRY)) goto out; /* * Give low priority threads a better chance to run. */ if (td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); wakeup_swapper = sleepq_abort(td, intrval); } else { /* * Other states do nothing with the signal immediately, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon. */ #ifdef SMP if (TD_IS_RUNNING(td) && td != curthread) forward_signal(td); #endif } out: PROC_SUNLOCK(p); thread_unlock(td); if (wakeup_swapper) kick_proc0(); } static void sig_suspend_threads(struct thread *td, struct proc *p, int sending) { struct thread *td2; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td2) { thread_lock(td2); td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_SBDRY) { /* * Once a thread is asleep with * TDF_SBDRY set, it should never * become suspended due to this check. */ KASSERT(!TD_IS_SUSPENDED(td2), ("thread with deferred stops suspended")); } else if (!TD_IS_SUSPENDED(td2)) { thread_suspend_one(td2); } } else if (!TD_IS_SUSPENDED(td2)) { if (sending || td != td2) td2->td_flags |= TDF_ASTPENDING; #ifdef SMP if (TD_IS_RUNNING(td2) && td2 != td) forward_signal(td2); #endif } thread_unlock(td2); } } int ptracestop(struct thread *td, int sig) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!(p->p_flag & P_WEXIT), ("Stopping exiting process")); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Stopping for traced signal"); td->td_dbgflags |= TDB_XSIG; td->td_xsig = sig; CTR4(KTR_PTRACE, "ptracestop: tid %d (pid %d) flags %#x sig %d", td->td_tid, p->p_pid, td->td_dbgflags, sig); PROC_SLOCK(p); while ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_XSIG)) { if (p->p_flag & P_SINGLE_EXIT) { td->td_dbgflags &= ~TDB_XSIG; PROC_SUNLOCK(p); return (sig); } /* * Just make wait() to work, the last stopped thread * will win. */ p->p_xsig = sig; p->p_xthread = td; p->p_flag |= (P_STOPPED_SIG|P_STOPPED_TRACE); sig_suspend_threads(td, p, 0); if ((td->td_dbgflags & TDB_STOPATFORK) != 0) { td->td_dbgflags &= ~TDB_STOPATFORK; cv_broadcast(&p->p_dbgwait); } stopme: thread_suspend_switch(td, p); if (p->p_xthread == td) p->p_xthread = NULL; if (!(p->p_flag & P_TRACED)) break; if (td->td_dbgflags & TDB_SUSPEND) { if (p->p_flag & P_SINGLE_EXIT) break; goto stopme; } } PROC_SUNLOCK(p); return (td->td_xsig); } static void reschedule_signals(struct proc *p, sigset_t block, int flags) { struct sigacts *ps; struct thread *td; int sig; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0 ? MA_OWNED : MA_NOTOWNED); if (SIGISEMPTY(p->p_siglist)) return; SIGSETAND(block, p->p_siglist); while ((sig = sig_ffs(&block)) != 0) { SIGDELSET(block, sig); td = sigtd(p, sig, 0); signotify(td); if (!(flags & SIGPROCMASK_PS_LOCKED)) mtx_lock(&ps->ps_mtx); if (p->p_flag & P_TRACED || SIGISMEMBER(ps->ps_sigcatch, sig)) tdsigwakeup(td, sig, SIG_CATCH, (SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR : ERESTART)); if (!(flags & SIGPROCMASK_PS_LOCKED)) mtx_unlock(&ps->ps_mtx); } } void tdsigcleanup(struct thread *td) { struct proc *p; sigset_t unblocked; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_flush(&td->td_sigqueue); if (p->p_numthreads == 1) return; /* * Since we cannot handle signals, notify signal post code * about this by filling the sigmask. * * Also, if needed, wake up thread(s) that do not block the * same signals as the exiting thread, since the thread might * have been selected for delivery and woken up. */ SIGFILLSET(unblocked); SIGSETNAND(unblocked, td->td_sigmask); SIGFILLSET(td->td_sigmask); reschedule_signals(p, unblocked, 0); } /* * Defer the delivery of SIGSTOP for the current thread. Returns true * if stops were deferred and false if they were already deferred. */ int sigdeferstop(void) { struct thread *td; td = curthread; if (td->td_flags & TDF_SBDRY) return (0); thread_lock(td); td->td_flags |= TDF_SBDRY; thread_unlock(td); return (1); } /* * Permit the delivery of SIGSTOP for the current thread. This does * not immediately suspend if a stop was posted. Instead, the thread * will suspend either via ast() or a subsequent interruptible sleep. */ int sigallowstop(void) { struct thread *td; int prev; td = curthread; thread_lock(td); prev = (td->td_flags & TDF_SBDRY) != 0; td->td_flags &= ~TDF_SBDRY; thread_unlock(td); return (prev); } /* * If the current process has received a signal (should be caught or cause * termination, should interrupt current syscall), return the signal number. * Stop signals with default action are processed immediately, then cleared; * they aren't returned. This is checked after each entry to the system for * a syscall or trap (though this can usually be done without calling issignal * by checking the pending signal masks in cursig.) The normal call * sequence is * * while (sig = cursig(curthread)) * postsig(sig); */ static int issignal(struct thread *td) { struct proc *p; struct sigacts *ps; struct sigqueue *queue; sigset_t sigpending; int sig, prop, newsig; p = td->td_proc; ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); for (;;) { int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG); sigpending = td->td_sigqueue.sq_signals; SIGSETOR(sigpending, p->p_sigqueue.sq_signals); SIGSETNAND(sigpending, td->td_sigmask); if (p->p_flag & P_PPWAIT || td->td_flags & TDF_SBDRY) SIG_STOPSIGMASK(sigpending); if (SIGISEMPTY(sigpending)) /* no signal to send */ return (0); sig = sig_ffs(&sigpending); if (p->p_stops & S_SIG) { mtx_unlock(&ps->ps_mtx); stopevent(p, S_SIG, sig); mtx_lock(&ps->ps_mtx); } /* * We should see pending but ignored signals * only if P_TRACED was on when they were posted. */ if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) { sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); continue; } if (p->p_flag & P_TRACED && (p->p_flag & P_PPTRACE) == 0) { /* * If traced, always stop. * Remove old signal from queue before the stop. * XXX shrug off debugger, it causes siginfo to * be thrown away. */ queue = &td->td_sigqueue; td->td_dbgksi.ksi_signo = 0; if (sigqueue_get(queue, sig, &td->td_dbgksi) == 0) { queue = &p->p_sigqueue; sigqueue_get(queue, sig, &td->td_dbgksi); } mtx_unlock(&ps->ps_mtx); newsig = ptracestop(td, sig); mtx_lock(&ps->ps_mtx); if (sig != newsig) { /* * If parent wants us to take the signal, * then it will leave it in p->p_xsig; * otherwise we just look for signals again. */ if (newsig == 0) continue; sig = newsig; /* * Put the new signal into td_sigqueue. If the * signal is being masked, look for other * signals. */ sigqueue_add(queue, sig, NULL); if (SIGISMEMBER(td->td_sigmask, sig)) continue; signotify(td); } else { if (td->td_dbgksi.ksi_signo != 0) { td->td_dbgksi.ksi_flags |= KSI_HEAD; if (sigqueue_add(&td->td_sigqueue, sig, &td->td_dbgksi) != 0) td->td_dbgksi.ksi_signo = 0; } if (td->td_dbgksi.ksi_signo == 0) sigqueue_add(&td->td_sigqueue, sig, NULL); } /* * If the traced bit got turned off, go back up * to the top to rescan signals. This ensures * that p_sig* and p_sigact are consistent. */ if ((p->p_flag & P_TRACED) == 0) continue; } prop = sigprop(sig); /* * Decide whether the signal should be returned. * Return the signal's number, or fall through * to clear it from the pending mask. */ switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) { case (intptr_t)SIG_DFL: /* * Don't take default actions on system processes. */ if (p->p_pid <= 1) { #ifdef DIAGNOSTIC /* * Are you sure you want to ignore SIGSEGV * in init? XXX */ printf("Process (pid %lu) got signal %d\n", (u_long)p->p_pid, sig); #endif break; /* == ignore */ } /* * If there is a pending stop signal to process * with default action, stop here, * then clear the signal. However, * if process is member of an orphaned * process group, ignore tty stop signals. */ if (prop & SA_STOP) { if (p->p_flag & (P_TRACED|P_WEXIT) || (p->p_pgrp->pg_jobc == 0 && prop & SA_TTYSTOP)) break; /* == ignore */ mtx_unlock(&ps->ps_mtx); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Catching SIGSTOP"); p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); sig_suspend_threads(td, p, 0); thread_suspend_switch(td, p); PROC_SUNLOCK(p); mtx_lock(&ps->ps_mtx); break; } else if (prop & SA_IGNORE) { /* * Except for SIGCONT, shouldn't get here. * Default action is to ignore; drop it. */ break; /* == ignore */ } else return (sig); /*NOTREACHED*/ case (intptr_t)SIG_IGN: /* * Masking above should prevent us ever trying * to take action on an ignored signal other * than SIGCONT, unless process is traced. */ if ((prop & SA_CONT) == 0 && (p->p_flag & P_TRACED) == 0) printf("issignal\n"); break; /* == ignore */ default: /* * This signal has an action, let * postsig() process it. */ return (sig); } sigqueue_delete(&td->td_sigqueue, sig); /* take the signal! */ sigqueue_delete(&p->p_sigqueue, sig); } /* NOTREACHED */ } void thread_stopped(struct proc *p) { int n; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); n = p->p_suspcount; if (p == curproc) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { PROC_SUNLOCK(p); p->p_flag &= ~P_WAITED; PROC_LOCK(p->p_pptr); childproc_stopped(p, (p->p_flag & P_TRACED) ? CLD_TRAPPED : CLD_STOPPED); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } } /* * Take the action for the specified signal * from the current set of pending signals. */ int postsig(sig) register int sig; { struct thread *td = curthread; register struct proc *p = td->td_proc; struct sigacts *ps; sig_t action; ksiginfo_t ksi; sigset_t returnmask; KASSERT(sig != 0, ("postsig")); PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); ksiginfo_init(&ksi); if (sigqueue_get(&td->td_sigqueue, sig, &ksi) == 0 && sigqueue_get(&p->p_sigqueue, sig, &ksi) == 0) return (0); ksi.ksi_signo = sig; if (ksi.ksi_code == SI_TIMER) itimer_accept(p, ksi.ksi_timerid, &ksi); action = ps->ps_sigact[_SIG_IDX(sig)]; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ? &td->td_oldsigmask : &td->td_sigmask, ksi.ksi_code); #endif if (p->p_stops & S_SIG) { mtx_unlock(&ps->ps_mtx); stopevent(p, S_SIG, sig); mtx_lock(&ps->ps_mtx); } if (action == SIG_DFL) { /* * Default action, where the default is to kill * the process. (Other cases were ignored above.) */ mtx_unlock(&ps->ps_mtx); sigexit(td, sig); /* NOTREACHED */ } else { /* * If we get here, the signal must be caught. */ KASSERT(action != SIG_IGN && !SIGISMEMBER(td->td_sigmask, sig), ("postsig action")); /* * Set the new mask value and also defer further * occurrences of this signal. * * Special case: user has done a sigsuspend. Here the * current mask is not of interest, but rather the * mask from before the sigsuspend is what we want * restored after the signal processing is completed. */ if (td->td_pflags & TDP_OLDMASK) { returnmask = td->td_oldsigmask; td->td_pflags &= ~TDP_OLDMASK; } else returnmask = td->td_sigmask; if (p->p_sig == sig) { p->p_code = 0; p->p_sig = 0; } (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); postsig_done(sig, td, ps); } return (1); } /* * Kill the current process for stated reason. */ void killproc(p, why) struct proc *p; char *why; { PROC_LOCK_ASSERT(p, MA_OWNED); CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_ucred ? p->p_ucred->cr_uid : -1, why); p->p_flag |= P_WKILLED; kern_psignal(p, SIGKILL); } /* * Force the current process to exit with the specified signal, dumping core * if appropriate. We bypass the normal tests for masked and caught signals, * allowing unrecoverable failures to terminate the process without changing * signal state. Mark the accounting record with the signal termination. * If dumping core, save the signal number for the debugger. Calls exit and * does not return. */ void sigexit(td, sig) struct thread *td; int sig; { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_acflag |= AXSIG; /* * We must be single-threading to generate a core dump. This * ensures that the registers in the core file are up-to-date. * Also, the ELF dump handler assumes that the thread list doesn't * change out from under it. * * XXX If another thread attempts to single-thread before us * (e.g. via fork()), we won't get a dump at all. */ if ((sigprop(sig) & SA_CORE) && thread_single(p, SINGLE_NO_EXIT) == 0) { p->p_sig = sig; /* * Log signals which would cause core dumps * (Log as LOG_INFO to appease those who don't want * these messages.) * XXX : Todo, as well as euid, write out ruid too * Note that coredump() drops proc lock. */ if (coredump(td) == 0) sig |= WCOREFLAG; if (kern_logsigexit) log(LOG_INFO, "pid %d (%s), uid %d: exited on signal %d%s\n", p->p_pid, p->p_comm, td->td_ucred ? td->td_ucred->cr_uid : -1, sig &~ WCOREFLAG, sig & WCOREFLAG ? " (core dumped)" : ""); } else PROC_UNLOCK(p); exit1(td, 0, sig); /* NOTREACHED */ } /* * Send queued SIGCHLD to parent when child process's state * is changed. */ static void sigparent(struct proc *p, int reason, int status) { PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); if (p->p_ksi != NULL) { p->p_ksi->ksi_signo = SIGCHLD; p->p_ksi->ksi_code = reason; p->p_ksi->ksi_status = status; p->p_ksi->ksi_pid = p->p_pid; p->p_ksi->ksi_uid = p->p_ucred->cr_ruid; if (KSI_ONQ(p->p_ksi)) return; } pksignal(p->p_pptr, SIGCHLD, p->p_ksi); } static void childproc_jobstate(struct proc *p, int reason, int sig) { struct sigacts *ps; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); /* * Wake up parent sleeping in kern_wait(), also send * SIGCHLD to parent, but SIGCHLD does not guarantee * that parent will awake, because parent may masked * the signal. */ p->p_pptr->p_flag |= P_STATCHILD; wakeup(p->p_pptr); ps = p->p_pptr->p_sigacts; mtx_lock(&ps->ps_mtx); if ((ps->ps_flag & PS_NOCLDSTOP) == 0) { mtx_unlock(&ps->ps_mtx); sigparent(p, reason, sig); } else mtx_unlock(&ps->ps_mtx); } void childproc_stopped(struct proc *p, int reason) { childproc_jobstate(p, reason, p->p_xsig); } void childproc_continued(struct proc *p) { childproc_jobstate(p, CLD_CONTINUED, SIGCONT); } void childproc_exited(struct proc *p) { int reason, status; if (WCOREDUMP(p->p_xsig)) { reason = CLD_DUMPED; status = WTERMSIG(p->p_xsig); } else if (WIFSIGNALED(p->p_xsig)) { reason = CLD_KILLED; status = WTERMSIG(p->p_xsig); } else { reason = CLD_EXITED; status = p->p_xexit; } /* * XXX avoid calling wakeup(p->p_pptr), the work is * done in exit1(). */ sigparent(p, reason, status); } /* * We only have 1 character for the core count in the format * string, so the range will be 0-9 */ #define MAX_NUM_CORES 10 static int num_cores = 5; static int sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS) { int error; int new_val; new_val = num_cores; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val > MAX_NUM_CORES) new_val = MAX_NUM_CORES; if (new_val < 0) new_val = 0; num_cores = new_val; return (0); } SYSCTL_PROC(_debug, OID_AUTO, ncores, CTLTYPE_INT|CTLFLAG_RW, 0, sizeof(int), sysctl_debug_num_cores_check, "I", ""); #define GZ_SUFFIX ".gz" #ifdef GZIO static int compress_user_cores = 1; SYSCTL_INT(_kern, OID_AUTO, compress_user_cores, CTLFLAG_RWTUN, &compress_user_cores, 0, "Compression of user corefiles"); int compress_user_cores_gzlevel = 6; SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_gzlevel, CTLFLAG_RWTUN, &compress_user_cores_gzlevel, 0, "Corefile gzip compression level"); #else static int compress_user_cores = 0; #endif /* * Protect the access to corefilename[] by allproc_lock. */ #define corefilename_lock allproc_lock static char corefilename[MAXPATHLEN] = {"%N.core"}; static int sysctl_kern_corefile(SYSCTL_HANDLER_ARGS) { int error; sx_xlock(&corefilename_lock); error = sysctl_handle_string(oidp, corefilename, sizeof(corefilename), req); sx_xunlock(&corefilename_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A", "Process corefile name format string"); /* * corefile_open(comm, uid, pid, td, compress, vpp, namep) * Expand the name described in corefilename, using name, uid, and pid * and open/create core file. * corefilename is a printf-like string, with three format specifiers: * %N name of process ("name") * %P process id (pid) * %U user id (uid) * For example, "%N.core" is the default; they can be disabled completely * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". * This is controlled by the sysctl variable kern.corefile (see above). */ static int corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td, int compress, struct vnode **vpp, char **namep) { struct nameidata nd; struct sbuf sb; const char *format; char *hostname, *name; int indexpos, i, error, cmode, flags, oflags; hostname = NULL; format = corefilename; name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO); indexpos = -1; (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN); sx_slock(&corefilename_lock); for (i = 0; format[i] != '\0'; i++) { switch (format[i]) { case '%': /* Format character */ i++; switch (format[i]) { case '%': sbuf_putc(&sb, '%'); break; case 'H': /* hostname */ if (hostname == NULL) { hostname = malloc(MAXHOSTNAMELEN, M_TEMP, M_WAITOK); } getcredhostname(td->td_ucred, hostname, MAXHOSTNAMELEN); sbuf_printf(&sb, "%s", hostname); break; case 'I': /* autoincrementing index */ sbuf_printf(&sb, "0"); indexpos = sbuf_len(&sb) - 1; break; case 'N': /* process name */ sbuf_printf(&sb, "%s", comm); break; case 'P': /* process id */ sbuf_printf(&sb, "%u", pid); break; case 'U': /* user id */ sbuf_printf(&sb, "%u", uid); break; default: log(LOG_ERR, "Unknown format character %c in " "corename `%s'\n", format[i], format); break; } break; default: sbuf_putc(&sb, format[i]); break; } } sx_sunlock(&corefilename_lock); free(hostname, M_TEMP); if (compress) sbuf_printf(&sb, GZ_SUFFIX); if (sbuf_error(&sb) != 0) { log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too " "long\n", (long)pid, comm, (u_long)uid); sbuf_delete(&sb); free(name, M_TEMP); return (ENOMEM); } sbuf_finish(&sb); sbuf_delete(&sb); cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0); /* * If the core format has a %I in it, then we need to check * for existing corefiles before returning a name. * To do this we iterate over 0..num_cores to find a * non-existing core file name to use. */ if (indexpos != -1) { for (i = 0; i < num_cores; i++) { flags = O_CREAT | O_EXCL | FWRITE | O_NOFOLLOW; name[indexpos] = '0' + i; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error) { if (error == EEXIST) continue; log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s' failed " "on initial open test, error = %d\n", pid, comm, uid, name, error); } goto out; } } flags = O_CREAT | FWRITE | O_NOFOLLOW; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); out: if (error) { #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); *vpp = nd.ni_vp; *namep = name; return (0); } static int coredump_sanitise_path(const char *path) { size_t i; /* * Only send a subset of ASCII to devd(8) because it * might pass these strings to sh -c. */ for (i = 0; path[i]; i++) if (!(isalpha(path[i]) || isdigit(path[i])) && path[i] != '/' && path[i] != '.' && path[i] != '-') return (0); return (1); } /* * Dump a process' core. The main routine does some * policy checking, and creates the name of the coredump; * then it passes on a vnode and a size limit to the process-specific * coredump routine if there is one; if there _is not_ one, it returns * ENOSYS; otherwise it returns the error from the process-specific routine. */ static int coredump(struct thread *td) { struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; struct vnode *vp; struct flock lf; struct vattr vattr; int error, error1, locked; char *name; /* name of corefile */ void *rl_cookie; off_t limit; char *data = NULL; char *fullpath, *freepath = NULL; size_t len; static const char comm_name[] = "comm="; static const char core_name[] = "core="; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td); _STOPEVENT(p, S_CORE, 0); if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) || (p->p_flag2 & P2_NOTRACE) != 0) { PROC_UNLOCK(p); return (EFAULT); } /* * Note that the bulk of limit checking is done after * the corefile is created. The exception is if the limit * for corefiles is 0, in which case we don't bother * creating the corefile at all. This layout means that * a corefile is truncated instead of not being created, * if it is larger than the limit. */ limit = (off_t)lim_cur(td, RLIMIT_CORE); if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) { PROC_UNLOCK(p); return (EFBIG); } PROC_UNLOCK(p); error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td, compress_user_cores, &vp, &name); if (error != 0) return (error); /* * Don't dump to non-regular files or files with links. * Do not dump into system files. */ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 || vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0) { VOP_UNLOCK(vp, 0); error = EFAULT; goto out; } VOP_UNLOCK(vp, 0); /* Postpone other writers, including core dumps of other processes. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_WRLCK; locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0); VATTR_NULL(&vattr); vattr.va_size = 0; if (set_core_nodump_flag) vattr.va_flags = UF_NODUMP; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp, 0); PROC_LOCK(p); p->p_acflag |= ACORE; PROC_UNLOCK(p); if (p->p_sysent->sv_coredump != NULL) { error = p->p_sysent->sv_coredump(td, vp, limit, compress_user_cores ? IMGACT_CORE_COMPRESS : 0); } else { error = ENOSYS; } if (locked) { lf.l_type = F_UNLCK; VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); } vn_rangelock_unlock(vp, rl_cookie); /* * Notify the userland helper that a process triggered a core dump. * This allows the helper to run an automated debugging session. */ if (error != 0 || coredump_devctl == 0) goto out; len = MAXPATHLEN * 2 + sizeof(comm_name) - 1 + sizeof(' ') + sizeof(core_name) - 1; data = malloc(len, M_TEMP, M_WAITOK); if (vn_fullpath_global(td, p->p_textvp, &fullpath, &freepath) != 0) goto out; if (!coredump_sanitise_path(fullpath)) goto out; snprintf(data, len, "%s%s ", comm_name, fullpath); free(freepath, M_TEMP); freepath = NULL; if (vn_fullpath_global(td, vp, &fullpath, &freepath) != 0) goto out; if (!coredump_sanitise_path(fullpath)) goto out; strlcat(data, core_name, len); strlcat(data, fullpath, len); devctl_notify("kernel", "signal", "coredump", data); out: error1 = vn_close(vp, FWRITE, cred, td); if (error == 0) error = error1; #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(freepath, M_TEMP); free(data, M_TEMP); free(name, M_TEMP); return (error); } /* * Nonexistent system call-- signal process (may want to handle it). Flag * error in case process won't see signal immediately (blocked or ignored). */ #ifndef _SYS_SYSPROTO_H_ struct nosys_args { int dummy; }; #endif /* ARGSUSED */ int nosys(td, args) struct thread *td; struct nosys_args *args; { struct proc *p = td->td_proc; PROC_LOCK(p); tdsignal(td, SIGSYS); PROC_UNLOCK(p); return (ENOSYS); } /* * Send a SIGIO or SIGURG signal to a process or process group using stored * credentials rather than those of the current process. */ void pgsigio(sigiop, sig, checkctty) struct sigio **sigiop; int sig, checkctty; { ksiginfo_t ksi; struct sigio *sigio; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } if (sigio->sio_pgid > 0) { PROC_LOCK(sigio->sio_proc); if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred)) kern_psignal(sigio->sio_proc, sig); PROC_UNLOCK(sigio->sio_proc); } else if (sigio->sio_pgid < 0) { struct proc *p; PGRP_LOCK(sigio->sio_pgrp); LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && CANSIGIO(sigio->sio_ucred, p->p_ucred) && (checkctty == 0 || (p->p_flag & P_CONTROLT))) kern_psignal(p, sig); PROC_UNLOCK(p); } PGRP_UNLOCK(sigio->sio_pgrp); } SIGIO_UNLOCK(); } static int filt_sigattach(struct knote *kn) { struct proc *p = curproc; kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ knlist_add(&p->p_klist, kn, 0); return (0); } static void filt_sigdetach(struct knote *kn) { struct proc *p = kn->kn_ptr.p_proc; knlist_remove(&p->p_klist, kn, 0); } /* * signal knotes are shared with proc knotes, so we apply a mask to * the hint in order to differentiate them from process hints. This * could be avoided by using a signal-specific knote list, but probably * isn't worth the trouble. */ static int filt_signal(struct knote *kn, long hint) { if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; if (kn->kn_id == hint) kn->kn_data++; } return (kn->kn_data != 0); } struct sigacts * sigacts_alloc(void) { struct sigacts *ps; ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO); refcount_init(&ps->ps_refcnt, 1); mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF); return (ps); } void sigacts_free(struct sigacts *ps) { if (refcount_release(&ps->ps_refcnt) == 0) return; mtx_destroy(&ps->ps_mtx); free(ps, M_SUBPROC); } struct sigacts * sigacts_hold(struct sigacts *ps) { refcount_acquire(&ps->ps_refcnt); return (ps); } void sigacts_copy(struct sigacts *dest, struct sigacts *src) { KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest")); mtx_lock(&src->ps_mtx); bcopy(src, dest, offsetof(struct sigacts, ps_refcnt)); mtx_unlock(&src->ps_mtx); } int sigacts_shared(struct sigacts *ps) { return (ps->ps_refcnt > 1); } Index: head/sys/kern/kern_timeout.c =================================================================== --- head/sys/kern/kern_timeout.c (revision 292383) +++ head/sys/kern/kern_timeout.c (revision 292384) @@ -1,1619 +1,1617 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_callout_profiling.h" #if defined(__arm__) #include "opt_timer.h" #endif #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifndef NO_EVENTTIMERS DPCPU_DECLARE(sbintime_t, hardclocktime); #endif SDT_PROVIDER_DEFINE(callout_execute); -SDT_PROBE_DEFINE1(callout_execute, kernel, , callout__start, - "struct callout *"); -SDT_PROBE_DEFINE1(callout_execute, kernel, , callout__end, - "struct callout *"); +SDT_PROBE_DEFINE1(callout_execute, , , callout__start, "struct callout *"); +SDT_PROBE_DEFINE1(callout_execute, , , callout__end, "struct callout *"); #ifdef CALLOUT_PROFILING static int avg_depth; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0, "Average number of items examined per softclock call. Units = 1/1000"); static int avg_gcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0, "Average number of Giant callouts made per softclock call. Units = 1/1000"); static int avg_lockcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0, "Average number of lock callouts made per softclock call. Units = 1/1000"); static int avg_mpcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, "Average number of MP callouts made per softclock call. Units = 1/1000"); static int avg_depth_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0, "Average number of direct callouts examined per callout_process call. " "Units = 1/1000"); static int avg_lockcalls_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD, &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per " "callout_process call. Units = 1/1000"); static int avg_mpcalls_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir, 0, "Average number of MP direct callouts made per callout_process call. " "Units = 1/1000"); #endif static int ncallout; SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &ncallout, 0, "Number of entries in callwheel and size of timeout() preallocation"); #ifdef RSS static int pin_default_swi = 1; static int pin_pcpu_swi = 1; #else static int pin_default_swi = 0; static int pin_pcpu_swi = 0; #endif SYSCTL_INT(_kern, OID_AUTO, pin_default_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_default_swi, 0, "Pin the default (non-per-cpu) swi (shared with PCPU 0 swi)"); SYSCTL_INT(_kern, OID_AUTO, pin_pcpu_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_pcpu_swi, 0, "Pin the per-CPU swis (except PCPU 0, which is also default"); /* * TODO: * allocate more timeout table slots when table overflows. */ u_int callwheelsize, callwheelmask; /* * The callout cpu exec entities represent informations necessary for * describing the state of callouts currently running on the CPU and the ones * necessary for migrating callouts to the new callout cpu. In particular, * the first entry of the array cc_exec_entity holds informations for callout * running in SWI thread context, while the second one holds informations * for callout running directly from hardware interrupt context. * The cached informations are very important for deferring migration when * the migrating callout is already running. */ struct cc_exec { struct callout *cc_curr; void (*cc_drain)(void *); #ifdef SMP void (*ce_migration_func)(void *); void *ce_migration_arg; int ce_migration_cpu; sbintime_t ce_migration_time; sbintime_t ce_migration_prec; #endif bool cc_cancel; bool cc_waiting; }; /* * There is one struct callout_cpu per cpu, holding all relevant * state for the callout processing thread on the individual CPU. */ struct callout_cpu { struct mtx_padalign cc_lock; struct cc_exec cc_exec_entity[2]; struct callout *cc_next; struct callout *cc_callout; struct callout_list *cc_callwheel; struct callout_tailq cc_expireq; struct callout_slist cc_callfree; sbintime_t cc_firstevent; sbintime_t cc_lastscan; void *cc_cookie; u_int cc_bucket; u_int cc_inited; char cc_ktr_event_name[20]; }; #define callout_migrating(c) ((c)->c_iflags & CALLOUT_DFRMIGRATION) #define cc_exec_curr(cc, dir) cc->cc_exec_entity[dir].cc_curr #define cc_exec_drain(cc, dir) cc->cc_exec_entity[dir].cc_drain #define cc_exec_next(cc) cc->cc_next #define cc_exec_cancel(cc, dir) cc->cc_exec_entity[dir].cc_cancel #define cc_exec_waiting(cc, dir) cc->cc_exec_entity[dir].cc_waiting #ifdef SMP #define cc_migration_func(cc, dir) cc->cc_exec_entity[dir].ce_migration_func #define cc_migration_arg(cc, dir) cc->cc_exec_entity[dir].ce_migration_arg #define cc_migration_cpu(cc, dir) cc->cc_exec_entity[dir].ce_migration_cpu #define cc_migration_time(cc, dir) cc->cc_exec_entity[dir].ce_migration_time #define cc_migration_prec(cc, dir) cc->cc_exec_entity[dir].ce_migration_prec struct callout_cpu cc_cpu[MAXCPU]; #define CPUBLOCK MAXCPU #define CC_CPU(cpu) (&cc_cpu[(cpu)]) #define CC_SELF() CC_CPU(PCPU_GET(cpuid)) #else struct callout_cpu cc_cpu; #define CC_CPU(cpu) &cc_cpu #define CC_SELF() &cc_cpu #endif #define CC_LOCK(cc) mtx_lock_spin(&(cc)->cc_lock) #define CC_UNLOCK(cc) mtx_unlock_spin(&(cc)->cc_lock) #define CC_LOCK_ASSERT(cc) mtx_assert(&(cc)->cc_lock, MA_OWNED) static int timeout_cpu; static void callout_cpu_init(struct callout_cpu *cc, int cpu); static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, #ifdef CALLOUT_PROFILING int *mpcalls, int *lockcalls, int *gcalls, #endif int direct); static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures"); /** * Locked by cc_lock: * cc_curr - If a callout is in progress, it is cc_curr. * If cc_curr is non-NULL, threads waiting in * callout_drain() will be woken up as soon as the * relevant callout completes. * cc_cancel - Changing to 1 with both callout_lock and cc_lock held * guarantees that the current callout will not run. * The softclock() function sets this to 0 before it * drops callout_lock to acquire c_lock, and it calls * the handler only if curr_cancelled is still 0 after * cc_lock is successfully acquired. * cc_waiting - If a thread is waiting in callout_drain(), then * callout_wait is nonzero. Set only when * cc_curr is non-NULL. */ /* * Resets the execution entity tied to a specific callout cpu. */ static void cc_cce_cleanup(struct callout_cpu *cc, int direct) { cc_exec_curr(cc, direct) = NULL; cc_exec_cancel(cc, direct) = false; cc_exec_waiting(cc, direct) = false; #ifdef SMP cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif } /* * Checks if migration is requested by a specific callout cpu. */ static int cc_cce_migrating(struct callout_cpu *cc, int direct) { #ifdef SMP return (cc_migration_cpu(cc, direct) != CPUBLOCK); #else return (0); #endif } /* * Kernel low level callwheel initialization * called on cpu0 during kernel startup. */ static void callout_callwheel_init(void *dummy) { struct callout_cpu *cc; /* * Calculate the size of the callout wheel and the preallocated * timeout() structures. * XXX: Clip callout to result of previous function of maxusers * maximum 384. This is still huge, but acceptable. */ memset(CC_CPU(0), 0, sizeof(cc_cpu)); ncallout = imin(16 + maxproc + maxfiles, 18508); TUNABLE_INT_FETCH("kern.ncallout", &ncallout); /* * Calculate callout wheel size, should be next power of two higher * than 'ncallout'. */ callwheelsize = 1 << fls(ncallout); callwheelmask = callwheelsize - 1; /* * Fetch whether we're pinning the swi's or not. */ TUNABLE_INT_FETCH("kern.pin_default_swi", &pin_default_swi); TUNABLE_INT_FETCH("kern.pin_pcpu_swi", &pin_pcpu_swi); /* * Only cpu0 handles timeout(9) and receives a preallocation. * * XXX: Once all timeout(9) consumers are converted this can * be removed. */ timeout_cpu = PCPU_GET(cpuid); cc = CC_CPU(timeout_cpu); cc->cc_callout = malloc(ncallout * sizeof(struct callout), M_CALLOUT, M_WAITOK); callout_cpu_init(cc, timeout_cpu); } SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL); /* * Initialize the per-cpu callout structures. */ static void callout_cpu_init(struct callout_cpu *cc, int cpu) { struct callout *c; int i; mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE); SLIST_INIT(&cc->cc_callfree); cc->cc_inited = 1; cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize, M_CALLOUT, M_WAITOK); for (i = 0; i < callwheelsize; i++) LIST_INIT(&cc->cc_callwheel[i]); TAILQ_INIT(&cc->cc_expireq); cc->cc_firstevent = SBT_MAX; for (i = 0; i < 2; i++) cc_cce_cleanup(cc, i); snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name), "callwheel cpu %d", cpu); if (cc->cc_callout == NULL) /* Only cpu0 handles timeout(9) */ return; for (i = 0; i < ncallout; i++) { c = &cc->cc_callout[i]; callout_init(c, 0); c->c_iflags = CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } } #ifdef SMP /* * Switches the cpu tied to a specific callout. * The function expects a locked incoming callout cpu and returns with * locked outcoming callout cpu. */ static struct callout_cpu * callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu) { struct callout_cpu *new_cc; MPASS(c != NULL && cc != NULL); CC_LOCK_ASSERT(cc); /* * Avoid interrupts and preemption firing after the callout cpu * is blocked in order to avoid deadlocks as the new thread * may be willing to acquire the callout cpu lock. */ c->c_cpu = CPUBLOCK; spinlock_enter(); CC_UNLOCK(cc); new_cc = CC_CPU(new_cpu); CC_LOCK(new_cc); spinlock_exit(); c->c_cpu = new_cpu; return (new_cc); } #endif /* * Start standard softclock thread. */ static void start_softclock(void *dummy) { struct callout_cpu *cc; char name[MAXCOMLEN]; #ifdef SMP int cpu; struct intr_event *ie; #endif cc = CC_CPU(timeout_cpu); snprintf(name, sizeof(name), "clock (%d)", timeout_cpu); if (swi_add(&clk_intr_event, name, softclock, cc, SWI_CLOCK, INTR_MPSAFE, &cc->cc_cookie)) panic("died while creating standard software ithreads"); if (pin_default_swi && (intr_event_bind(clk_intr_event, timeout_cpu) != 0)) { printf("%s: timeout clock couldn't be pinned to cpu %d\n", __func__, timeout_cpu); } #ifdef SMP CPU_FOREACH(cpu) { if (cpu == timeout_cpu) continue; cc = CC_CPU(cpu); cc->cc_callout = NULL; /* Only cpu0 handles timeout(9). */ callout_cpu_init(cc, cpu); snprintf(name, sizeof(name), "clock (%d)", cpu); ie = NULL; if (swi_add(&ie, name, softclock, cc, SWI_CLOCK, INTR_MPSAFE, &cc->cc_cookie)) panic("died while creating standard software ithreads"); if (pin_pcpu_swi && (intr_event_bind(ie, cpu) != 0)) { printf("%s: per-cpu clock couldn't be pinned to " "cpu %d\n", __func__, cpu); } } #endif } SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL); #define CC_HASH_SHIFT 8 static inline u_int callout_hash(sbintime_t sbt) { return (sbt >> (32 - CC_HASH_SHIFT)); } static inline u_int callout_get_bucket(sbintime_t sbt) { return (callout_hash(sbt) & callwheelmask); } void callout_process(sbintime_t now) { struct callout *tmp, *tmpn; struct callout_cpu *cc; struct callout_list *sc; sbintime_t first, last, max, tmp_max; uint32_t lookahead; u_int firstb, lastb, nowb; #ifdef CALLOUT_PROFILING int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0; #endif cc = CC_SELF(); mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); /* Compute the buckets of the last scan and present times. */ firstb = callout_hash(cc->cc_lastscan); cc->cc_lastscan = now; nowb = callout_hash(now); /* Compute the last bucket and minimum time of the bucket after it. */ if (nowb == firstb) lookahead = (SBT_1S / 16); else if (nowb - firstb == 1) lookahead = (SBT_1S / 8); else lookahead = (SBT_1S / 2); first = last = now; first += (lookahead / 2); last += lookahead; last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT)); lastb = callout_hash(last) - 1; max = last; /* * Check if we wrapped around the entire wheel from the last scan. * In case, we need to scan entirely the wheel for pending callouts. */ if (lastb - firstb >= callwheelsize) { lastb = firstb + callwheelsize - 1; if (nowb - firstb >= callwheelsize) nowb = lastb; } /* Iterate callwheel from firstb to nowb and then up to lastb. */ do { sc = &cc->cc_callwheel[firstb & callwheelmask]; tmp = LIST_FIRST(sc); while (tmp != NULL) { /* Run the callout if present time within allowed. */ if (tmp->c_time <= now) { /* * Consumer told us the callout may be run * directly from hardware interrupt context. */ if (tmp->c_iflags & CALLOUT_DIRECT) { #ifdef CALLOUT_PROFILING ++depth_dir; #endif cc_exec_next(cc) = LIST_NEXT(tmp, c_links.le); cc->cc_bucket = firstb & callwheelmask; LIST_REMOVE(tmp, c_links.le); softclock_call_cc(tmp, cc, #ifdef CALLOUT_PROFILING &mpcalls_dir, &lockcalls_dir, NULL, #endif 1); tmp = cc_exec_next(cc); cc_exec_next(cc) = NULL; } else { tmpn = LIST_NEXT(tmp, c_links.le); LIST_REMOVE(tmp, c_links.le); TAILQ_INSERT_TAIL(&cc->cc_expireq, tmp, c_links.tqe); tmp->c_iflags |= CALLOUT_PROCESSED; tmp = tmpn; } continue; } /* Skip events from distant future. */ if (tmp->c_time >= max) goto next; /* * Event minimal time is bigger than present maximal * time, so it cannot be aggregated. */ if (tmp->c_time > last) { lastb = nowb; goto next; } /* Update first and last time, respecting this event. */ if (tmp->c_time < first) first = tmp->c_time; tmp_max = tmp->c_time + tmp->c_precision; if (tmp_max < last) last = tmp_max; next: tmp = LIST_NEXT(tmp, c_links.le); } /* Proceed with the next bucket. */ firstb++; /* * Stop if we looked after present time and found * some event we can't execute at now. * Stop if we looked far enough into the future. */ } while (((int)(firstb - lastb)) <= 0); cc->cc_firstevent = last; #ifndef NO_EVENTTIMERS cpu_new_callout(curcpu, last, first); #endif #ifdef CALLOUT_PROFILING avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8; avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8; avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8; #endif mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); /* * swi_sched acquires the thread lock, so we don't want to call it * with cc_lock held; incorrect locking order. */ if (!TAILQ_EMPTY(&cc->cc_expireq)) swi_sched(cc->cc_cookie, 0); } static struct callout_cpu * callout_lock(struct callout *c) { struct callout_cpu *cc; int cpu; for (;;) { cpu = c->c_cpu; #ifdef SMP if (cpu == CPUBLOCK) { while (c->c_cpu == CPUBLOCK) cpu_spinwait(); continue; } #endif cc = CC_CPU(cpu); CC_LOCK(cc); if (cpu == c->c_cpu) break; CC_UNLOCK(cc); } return (cc); } static void callout_cc_add(struct callout *c, struct callout_cpu *cc, sbintime_t sbt, sbintime_t precision, void (*func)(void *), void *arg, int cpu, int flags) { int bucket; CC_LOCK_ASSERT(cc); if (sbt < cc->cc_lastscan) sbt = cc->cc_lastscan; c->c_arg = arg; c->c_iflags |= CALLOUT_PENDING; c->c_iflags &= ~CALLOUT_PROCESSED; c->c_flags |= CALLOUT_ACTIVE; if (flags & C_DIRECT_EXEC) c->c_iflags |= CALLOUT_DIRECT; c->c_func = func; c->c_time = sbt; c->c_precision = precision; bucket = callout_get_bucket(c->c_time); CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x", c, (int)(c->c_precision >> 32), (u_int)(c->c_precision & 0xffffffff)); LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le); if (cc->cc_bucket == bucket) cc_exec_next(cc) = c; #ifndef NO_EVENTTIMERS /* * Inform the eventtimers(4) subsystem there's a new callout * that has been inserted, but only if really required. */ if (SBT_MAX - c->c_time < c->c_precision) c->c_precision = SBT_MAX - c->c_time; sbt = c->c_time + c->c_precision; if (sbt < cc->cc_firstevent) { cc->cc_firstevent = sbt; cpu_new_callout(cpu, sbt, c->c_time); } #endif } static void callout_cc_del(struct callout *c, struct callout_cpu *cc) { if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) == 0) return; c->c_func = NULL; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, #ifdef CALLOUT_PROFILING int *mpcalls, int *lockcalls, int *gcalls, #endif int direct) { struct rm_priotracker tracker; void (*c_func)(void *); void *c_arg; struct lock_class *class; struct lock_object *c_lock; uintptr_t lock_status; int c_iflags; #ifdef SMP struct callout_cpu *new_cc; void (*new_func)(void *); void *new_arg; int flags, new_cpu; sbintime_t new_prec, new_time; #endif #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbintime_t sbt1, sbt2; struct timespec ts2; static sbintime_t maxdt = 2 * SBT_1MS; /* 2 msec */ static timeout_t *lastfunc; #endif KASSERT((c->c_iflags & CALLOUT_PENDING) == CALLOUT_PENDING, ("softclock_call_cc: pend %p %x", c, c->c_iflags)); KASSERT((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE, ("softclock_call_cc: act %p %x", c, c->c_flags)); class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL; lock_status = 0; if (c->c_flags & CALLOUT_SHAREDLOCK) { if (class == &lock_class_rm) lock_status = (uintptr_t)&tracker; else lock_status = 1; } c_lock = c->c_lock; c_func = c->c_func; c_arg = c->c_arg; c_iflags = c->c_iflags; if (c->c_iflags & CALLOUT_LOCAL_ALLOC) c->c_iflags = CALLOUT_LOCAL_ALLOC; else c->c_iflags &= ~CALLOUT_PENDING; cc_exec_curr(cc, direct) = c; cc_exec_cancel(cc, direct) = false; cc_exec_drain(cc, direct) = NULL; CC_UNLOCK(cc); if (c_lock != NULL) { class->lc_lock(c_lock, lock_status); /* * The callout may have been cancelled * while we switched locks. */ if (cc_exec_cancel(cc, direct)) { class->lc_unlock(c_lock); goto skip; } /* The callout cannot be stopped now. */ cc_exec_cancel(cc, direct) = true; if (c_lock == &Giant.lock_object) { #ifdef CALLOUT_PROFILING (*gcalls)++; #endif CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p", c, c_func, c_arg); } else { #ifdef CALLOUT_PROFILING (*lockcalls)++; #endif CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p", c, c_func, c_arg); } } else { #ifdef CALLOUT_PROFILING (*mpcalls)++; #endif CTR3(KTR_CALLOUT, "callout %p func %p arg %p", c, c_func, c_arg); } KTR_STATE3(KTR_SCHED, "callout", cc->cc_ktr_event_name, "running", "func:%p", c_func, "arg:%p", c_arg, "direct:%d", direct); #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbt1 = sbinuptime(); #endif THREAD_NO_SLEEPING(); - SDT_PROBE1(callout_execute, kernel, , callout__start, c); + SDT_PROBE1(callout_execute, , , callout__start, c); c_func(c_arg); - SDT_PROBE1(callout_execute, kernel, , callout__end, c); + SDT_PROBE1(callout_execute, , , callout__end, c); THREAD_SLEEPING_OK(); #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbt2 = sbinuptime(); sbt2 -= sbt1; if (sbt2 > maxdt) { if (lastfunc != c_func || sbt2 > maxdt * 2) { ts2 = sbttots(sbt2); printf( "Expensive timeout(9) function: %p(%p) %jd.%09ld s\n", c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec); } maxdt = sbt2; lastfunc = c_func; } #endif KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle"); CTR1(KTR_CALLOUT, "callout %p finished", c); if ((c_iflags & CALLOUT_RETURNUNLOCKED) == 0) class->lc_unlock(c_lock); skip: CC_LOCK(cc); KASSERT(cc_exec_curr(cc, direct) == c, ("mishandled cc_curr")); cc_exec_curr(cc, direct) = NULL; if (cc_exec_drain(cc, direct)) { void (*drain)(void *); drain = cc_exec_drain(cc, direct); cc_exec_drain(cc, direct) = NULL; CC_UNLOCK(cc); drain(c_arg); CC_LOCK(cc); } if (cc_exec_waiting(cc, direct)) { /* * There is someone waiting for the * callout to complete. * If the callout was scheduled for * migration just cancel it. */ if (cc_cce_migrating(cc, direct)) { cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not * destroyed but that is not easy. */ c->c_iflags &= ~CALLOUT_DFRMIGRATION; } cc_exec_waiting(cc, direct) = false; CC_UNLOCK(cc); wakeup(&cc_exec_waiting(cc, direct)); CC_LOCK(cc); } else if (cc_cce_migrating(cc, direct)) { KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0, ("Migrating legacy callout %p", c)); #ifdef SMP /* * If the callout was scheduled for * migration just perform it now. */ new_cpu = cc_migration_cpu(cc, direct); new_time = cc_migration_time(cc, direct); new_prec = cc_migration_prec(cc, direct); new_func = cc_migration_func(cc, direct); new_arg = cc_migration_arg(cc, direct); cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not destroyed * but that is not easy. * * As first thing, handle deferred callout stops. */ if (!callout_migrating(c)) { CTR3(KTR_CALLOUT, "deferred cancelled %p func %p arg %p", c, new_func, new_arg); callout_cc_del(c, cc); return; } c->c_iflags &= ~CALLOUT_DFRMIGRATION; new_cc = callout_cpu_switch(c, cc, new_cpu); flags = (direct) ? C_DIRECT_EXEC : 0; callout_cc_add(c, new_cc, new_time, new_prec, new_func, new_arg, new_cpu, flags); CC_UNLOCK(new_cc); CC_LOCK(cc); #else panic("migration should not happen"); #endif } /* * If the current callout is locally allocated (from * timeout(9)) then put it on the freelist. * * Note: we need to check the cached copy of c_iflags because * if it was not local, then it's not safe to deref the * callout pointer. */ KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0 || c->c_iflags == CALLOUT_LOCAL_ALLOC, ("corrupted callout")); if (c_iflags & CALLOUT_LOCAL_ALLOC) callout_cc_del(c, cc); } /* * The callout mechanism is based on the work of Adam M. Costello and * George Varghese, published in a technical report entitled "Redesigning * the BSD Callout and Timer Facilities" and modified slightly for inclusion * in FreeBSD by Justin T. Gibbs. The original work on the data structures * used in this implementation was published by G. Varghese and T. Lauck in * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for * the Efficient Implementation of a Timer Facility" in the Proceedings of * the 11th ACM Annual Symposium on Operating Systems Principles, * Austin, Texas Nov 1987. */ /* * Software (low priority) clock interrupt. * Run periodic events from timeout queue. */ void softclock(void *arg) { struct callout_cpu *cc; struct callout *c; #ifdef CALLOUT_PROFILING int depth = 0, gcalls = 0, lockcalls = 0, mpcalls = 0; #endif cc = (struct callout_cpu *)arg; CC_LOCK(cc); while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); softclock_call_cc(c, cc, #ifdef CALLOUT_PROFILING &mpcalls, &lockcalls, &gcalls, #endif 0); #ifdef CALLOUT_PROFILING ++depth; #endif } #ifdef CALLOUT_PROFILING avg_depth += (depth * 1000 - avg_depth) >> 8; avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8; avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8; avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8; #endif CC_UNLOCK(cc); } /* * timeout -- * Execute a function after a specified length of time. * * untimeout -- * Cancel previous timeout function call. * * callout_handle_init -- * Initialize a handle so that using it with untimeout is benign. * * See AT&T BCI Driver Reference Manual for specification. This * implementation differs from that one in that although an * identification value is returned from timeout, the original * arguments to timeout as well as the identifier are used to * identify entries for untimeout. */ struct callout_handle timeout(timeout_t *ftn, void *arg, int to_ticks) { struct callout_cpu *cc; struct callout *new; struct callout_handle handle; cc = CC_CPU(timeout_cpu); CC_LOCK(cc); /* Fill in the next free callout structure. */ new = SLIST_FIRST(&cc->cc_callfree); if (new == NULL) /* XXX Attempt to malloc first */ panic("timeout table full"); SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle); callout_reset(new, to_ticks, ftn, arg); handle.callout = new; CC_UNLOCK(cc); return (handle); } void untimeout(timeout_t *ftn, void *arg, struct callout_handle handle) { struct callout_cpu *cc; /* * Check for a handle that was initialized * by callout_handle_init, but never used * for a real timeout. */ if (handle.callout == NULL) return; cc = callout_lock(handle.callout); if (handle.callout->c_func == ftn && handle.callout->c_arg == arg) callout_stop(handle.callout); CC_UNLOCK(cc); } void callout_handle_init(struct callout_handle *handle) { handle->callout = NULL; } /* * New interface; clients allocate their own callout structures. * * callout_reset() - establish or change a timeout * callout_stop() - disestablish a timeout * callout_init() - initialize a callout structure so that it can * safely be passed to callout_reset() and callout_stop() * * defines three convenience macros: * * callout_active() - returns truth if callout has not been stopped, * drained, or deactivated since the last time the callout was * reset. * callout_pending() - returns truth if callout is still waiting for timeout * callout_deactivate() - marks the callout as having been serviced */ int callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision, void (*ftn)(void *), void *arg, int cpu, int flags) { sbintime_t to_sbt, pr; struct callout_cpu *cc; int cancelled, direct; int ignore_cpu=0; cancelled = 0; if (cpu == -1) { ignore_cpu = 1; } else if ((cpu >= MAXCPU) || ((CC_CPU(cpu))->cc_inited == 0)) { /* Invalid CPU spec */ panic("Invalid CPU in callout %d", cpu); } if (flags & C_ABSOLUTE) { to_sbt = sbt; } else { if ((flags & C_HARDCLOCK) && (sbt < tick_sbt)) sbt = tick_sbt; if ((flags & C_HARDCLOCK) || #ifdef NO_EVENTTIMERS sbt >= sbt_timethreshold) { to_sbt = getsbinuptime(); /* Add safety belt for the case of hz > 1000. */ to_sbt += tc_tick_sbt - tick_sbt; #else sbt >= sbt_tickthreshold) { /* * Obtain the time of the last hardclock() call on * this CPU directly from the kern_clocksource.c. * This value is per-CPU, but it is equal for all * active ones. */ #ifdef __LP64__ to_sbt = DPCPU_GET(hardclocktime); #else spinlock_enter(); to_sbt = DPCPU_GET(hardclocktime); spinlock_exit(); #endif #endif if ((flags & C_HARDCLOCK) == 0) to_sbt += tick_sbt; } else to_sbt = sbinuptime(); if (SBT_MAX - to_sbt < sbt) to_sbt = SBT_MAX; else to_sbt += sbt; pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp : sbt >> C_PRELGET(flags)); if (pr > precision) precision = pr; } /* * This flag used to be added by callout_cc_add, but the * first time you call this we could end up with the * wrong direct flag if we don't do it before we add. */ if (flags & C_DIRECT_EXEC) { direct = 1; } else { direct = 0; } KASSERT(!direct || c->c_lock == NULL, ("%s: direct callout %p has lock", __func__, c)); cc = callout_lock(c); /* * Don't allow migration of pre-allocated callouts lest they * become unbalanced or handle the case where the user does * not care. */ if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) || ignore_cpu) { cpu = c->c_cpu; } if (cc_exec_curr(cc, direct) == c) { /* * We're being asked to reschedule a callout which is * currently in progress. If there is a lock then we * can cancel the callout if it has not really started. */ if (c->c_lock != NULL && !cc_exec_cancel(cc, direct)) cancelled = cc_exec_cancel(cc, direct) = true; if (cc_exec_waiting(cc, direct)) { /* * Someone has called callout_drain to kill this * callout. Don't reschedule. */ CTR4(KTR_CALLOUT, "%s %p func %p arg %p", cancelled ? "cancelled" : "failed to cancel", c, c->c_func, c->c_arg); CC_UNLOCK(cc); return (cancelled); } #ifdef SMP if (callout_migrating(c)) { /* * This only occurs when a second callout_reset_sbt_on * is made after a previous one moved it into * deferred migration (below). Note we do *not* change * the prev_cpu even though the previous target may * be different. */ cc_migration_cpu(cc, direct) = cpu; cc_migration_time(cc, direct) = to_sbt; cc_migration_prec(cc, direct) = precision; cc_migration_func(cc, direct) = ftn; cc_migration_arg(cc, direct) = arg; cancelled = 1; CC_UNLOCK(cc); return (cancelled); } #endif } if (c->c_iflags & CALLOUT_PENDING) { if ((c->c_iflags & CALLOUT_PROCESSED) == 0) { if (cc_exec_next(cc) == c) cc_exec_next(cc) = LIST_NEXT(c, c_links.le); LIST_REMOVE(c, c_links.le); } else { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); } cancelled = 1; c->c_iflags &= ~ CALLOUT_PENDING; c->c_flags &= ~ CALLOUT_ACTIVE; } #ifdef SMP /* * If the callout must migrate try to perform it immediately. * If the callout is currently running, just defer the migration * to a more appropriate moment. */ if (c->c_cpu != cpu) { if (cc_exec_curr(cc, direct) == c) { /* * Pending will have been removed since we are * actually executing the callout on another * CPU. That callout should be waiting on the * lock the caller holds. If we set both * active/and/pending after we return and the * lock on the executing callout proceeds, it * will then see pending is true and return. * At the return from the actual callout execution * the migration will occur in softclock_call_cc * and this new callout will be placed on the * new CPU via a call to callout_cpu_switch() which * will get the lock on the right CPU followed * by a call callout_cc_add() which will add it there. * (see above in softclock_call_cc()). */ cc_migration_cpu(cc, direct) = cpu; cc_migration_time(cc, direct) = to_sbt; cc_migration_prec(cc, direct) = precision; cc_migration_func(cc, direct) = ftn; cc_migration_arg(cc, direct) = arg; c->c_iflags |= (CALLOUT_DFRMIGRATION | CALLOUT_PENDING); c->c_flags |= CALLOUT_ACTIVE; CTR6(KTR_CALLOUT, "migration of %p func %p arg %p in %d.%08x to %u deferred", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), (u_int)(to_sbt & 0xffffffff), cpu); CC_UNLOCK(cc); return (cancelled); } cc = callout_cpu_switch(c, cc, cpu); } #endif callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags); CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x", cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), (u_int)(to_sbt & 0xffffffff)); CC_UNLOCK(cc); return (cancelled); } /* * Common idioms that can be optimized in the future. */ int callout_schedule_on(struct callout *c, int to_ticks, int cpu) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, cpu); } int callout_schedule(struct callout *c, int to_ticks) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, c->c_cpu); } int _callout_stop_safe(struct callout *c, int safe, void (*drain)(void *)) { struct callout_cpu *cc, *old_cc; struct lock_class *class; int direct, sq_locked, use_lock; int not_on_a_list; if (safe) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, c->c_lock, "calling %s", __func__); /* * Some old subsystems don't hold Giant while running a callout_stop(), * so just discard this check for the moment. */ if (!safe && c->c_lock != NULL) { if (c->c_lock == &Giant.lock_object) use_lock = mtx_owned(&Giant); else { use_lock = 1; class = LOCK_CLASS(c->c_lock); class->lc_assert(c->c_lock, LA_XLOCKED); } } else use_lock = 0; if (c->c_iflags & CALLOUT_DIRECT) { direct = 1; } else { direct = 0; } sq_locked = 0; old_cc = NULL; again: cc = callout_lock(c); if ((c->c_iflags & (CALLOUT_DFRMIGRATION | CALLOUT_PENDING)) == (CALLOUT_DFRMIGRATION | CALLOUT_PENDING) && ((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE)) { /* * Special case where this slipped in while we * were migrating *as* the callout is about to * execute. The caller probably holds the lock * the callout wants. * * Get rid of the migration first. Then set * the flag that tells this code *not* to * try to remove it from any lists (its not * on one yet). When the callout wheel runs, * it will ignore this callout. */ c->c_iflags &= ~CALLOUT_PENDING; c->c_flags &= ~CALLOUT_ACTIVE; not_on_a_list = 1; } else { not_on_a_list = 0; } /* * If the callout was migrating while the callout cpu lock was * dropped, just drop the sleepqueue lock and check the states * again. */ if (sq_locked != 0 && cc != old_cc) { #ifdef SMP CC_UNLOCK(cc); sleepq_release(&cc_exec_waiting(old_cc, direct)); sq_locked = 0; old_cc = NULL; goto again; #else panic("migration should not happen"); #endif } /* * If the callout isn't pending, it's not on the queue, so * don't attempt to remove it from the queue. We can try to * stop it by other means however. */ if (!(c->c_iflags & CALLOUT_PENDING)) { /* * If it wasn't on the queue and it isn't the current * callout, then we can't stop it, so just bail. * It probably has already been run (if locking * is properly done). You could get here if the caller * calls stop twice in a row for example. The second * call would fall here without CALLOUT_ACTIVE set. */ c->c_flags &= ~CALLOUT_ACTIVE; if (cc_exec_curr(cc, direct) != c) { CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); CC_UNLOCK(cc); if (sq_locked) sleepq_release(&cc_exec_waiting(cc, direct)); return (-1); } if (safe) { /* * The current callout is running (or just * about to run) and blocking is allowed, so * just wait for the current invocation to * finish. */ while (cc_exec_curr(cc, direct) == c) { /* * Use direct calls to sleepqueue interface * instead of cv/msleep in order to avoid * a LOR between cc_lock and sleepqueue * chain spinlocks. This piece of code * emulates a msleep_spin() call actually. * * If we already have the sleepqueue chain * locked, then we can safely block. If we * don't already have it locked, however, * we have to drop the cc_lock to lock * it. This opens several races, so we * restart at the beginning once we have * both locks. If nothing has changed, then * we will end up back here with sq_locked * set. */ if (!sq_locked) { CC_UNLOCK(cc); sleepq_lock( &cc_exec_waiting(cc, direct)); sq_locked = 1; old_cc = cc; goto again; } /* * Migration could be cancelled here, but * as long as it is still not sure when it * will be packed up, just let softclock() * take care of it. */ cc_exec_waiting(cc, direct) = true; DROP_GIANT(); CC_UNLOCK(cc); sleepq_add( &cc_exec_waiting(cc, direct), &cc->cc_lock.lock_object, "codrain", SLEEPQ_SLEEP, 0); sleepq_wait( &cc_exec_waiting(cc, direct), 0); sq_locked = 0; old_cc = NULL; /* Reacquire locks previously released. */ PICKUP_GIANT(); CC_LOCK(cc); } } else if (use_lock && !cc_exec_cancel(cc, direct) && (drain == NULL)) { /* * The current callout is waiting for its * lock which we hold. Cancel the callout * and return. After our caller drops the * lock, the callout will be skipped in * softclock(). This *only* works with a * callout_stop() *not* callout_drain() or * callout_async_drain(). */ cc_exec_cancel(cc, direct) = true; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); KASSERT(!cc_cce_migrating(cc, direct), ("callout wrongly scheduled for migration")); if (callout_migrating(c)) { c->c_iflags &= ~CALLOUT_DFRMIGRATION; #ifdef SMP cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif } CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain locked")); return (1); } else if (callout_migrating(c)) { /* * The callout is currently being serviced * and the "next" callout is scheduled at * its completion with a migration. We remove * the migration flag so it *won't* get rescheduled, * but we can't stop the one thats running so * we return 0. */ c->c_iflags &= ~CALLOUT_DFRMIGRATION; #ifdef SMP /* * We can't call cc_cce_cleanup here since * if we do it will remove .ce_curr and * its still running. This will prevent a * reschedule of the callout when the * execution completes. */ cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p", c, c->c_func, c->c_arg); if (drain) { cc_exec_drain(cc, direct) = drain; } CC_UNLOCK(cc); return (0); } CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); if (drain) { cc_exec_drain(cc, direct) = drain; } CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain still locked")); return (0); } if (sq_locked) sleepq_release(&cc_exec_waiting(cc, direct)); c->c_iflags &= ~CALLOUT_PENDING; c->c_flags &= ~CALLOUT_ACTIVE; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); if (not_on_a_list == 0) { if ((c->c_iflags & CALLOUT_PROCESSED) == 0) { if (cc_exec_next(cc) == c) cc_exec_next(cc) = LIST_NEXT(c, c_links.le); LIST_REMOVE(c, c_links.le); } else { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); } } callout_cc_del(c, cc); CC_UNLOCK(cc); return (1); } void callout_init(struct callout *c, int mpsafe) { bzero(c, sizeof *c); if (mpsafe) { c->c_lock = NULL; c->c_iflags = CALLOUT_RETURNUNLOCKED; } else { c->c_lock = &Giant.lock_object; c->c_iflags = 0; } c->c_cpu = timeout_cpu; } void _callout_init_lock(struct callout *c, struct lock_object *lock, int flags) { bzero(c, sizeof *c); c->c_lock = lock; KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0, ("callout_init_lock: bad flags %d", flags)); KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0, ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock")); KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags & (LC_SPINLOCK | LC_SLEEPABLE)), ("%s: invalid lock class", __func__)); c->c_iflags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK); c->c_cpu = timeout_cpu; } #ifdef APM_FIXUP_CALLTODO /* * Adjust the kernel calltodo timeout list. This routine is used after * an APM resume to recalculate the calltodo timer list values with the * number of hz's we have been sleeping. The next hardclock() will detect * that there are fired timers and run softclock() to execute them. * * Please note, I have not done an exhaustive analysis of what code this * might break. I am motivated to have my select()'s and alarm()'s that * have expired during suspend firing upon resume so that the applications * which set the timer can do the maintanence the timer was for as close * as possible to the originally intended time. Testing this code for a * week showed that resuming from a suspend resulted in 22 to 25 timers * firing, which seemed independant on whether the suspend was 2 hours or * 2 days. Your milage may vary. - Ken Key */ void adjust_timeout_calltodo(struct timeval *time_change) { register struct callout *p; unsigned long delta_ticks; /* * How many ticks were we asleep? * (stolen from tvtohz()). */ /* Don't do anything */ if (time_change->tv_sec < 0) return; else if (time_change->tv_sec <= LONG_MAX / 1000000) delta_ticks = (time_change->tv_sec * 1000000 + time_change->tv_usec + (tick - 1)) / tick + 1; else if (time_change->tv_sec <= LONG_MAX / hz) delta_ticks = time_change->tv_sec * hz + (time_change->tv_usec + (tick - 1)) / tick + 1; else delta_ticks = LONG_MAX; if (delta_ticks > INT_MAX) delta_ticks = INT_MAX; /* * Now rip through the timer calltodo list looking for timers * to expire. */ /* don't collide with softclock() */ CC_LOCK(cc); for (p = calltodo.c_next; p != NULL; p = p->c_next) { p->c_time -= delta_ticks; /* Break if the timer had more time on it than delta_ticks */ if (p->c_time > 0) break; /* take back the ticks the timer didn't use (p->c_time <= 0) */ delta_ticks = -p->c_time; } CC_UNLOCK(cc); return; } #endif /* APM_FIXUP_CALLTODO */ static int flssbt(sbintime_t sbt) { sbt += (uint64_t)sbt >> 1; if (sizeof(long) >= sizeof(sbintime_t)) return (flsl(sbt)); if (sbt >= SBT_1S) return (flsl(((uint64_t)sbt) >> 32) + 32); return (flsl(sbt)); } /* * Dump immediate statistic snapshot of the scheduled callouts. */ static int sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS) { struct callout *tmp; struct callout_cpu *cc; struct callout_list *sc; sbintime_t maxpr, maxt, medpr, medt, now, spr, st, t; int ct[64], cpr[64], ccpbk[32]; int error, val, i, count, tcum, pcum, maxc, c, medc; #ifdef SMP int cpu; #endif val = 0; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); count = maxc = 0; st = spr = maxt = maxpr = 0; bzero(ccpbk, sizeof(ccpbk)); bzero(ct, sizeof(ct)); bzero(cpr, sizeof(cpr)); now = sbinuptime(); #ifdef SMP CPU_FOREACH(cpu) { cc = CC_CPU(cpu); #else cc = CC_CPU(timeout_cpu); #endif CC_LOCK(cc); for (i = 0; i < callwheelsize; i++) { sc = &cc->cc_callwheel[i]; c = 0; LIST_FOREACH(tmp, sc, c_links.le) { c++; t = tmp->c_time - now; if (t < 0) t = 0; st += t / SBT_1US; spr += tmp->c_precision / SBT_1US; if (t > maxt) maxt = t; if (tmp->c_precision > maxpr) maxpr = tmp->c_precision; ct[flssbt(t)]++; cpr[flssbt(tmp->c_precision)]++; } if (c > maxc) maxc = c; ccpbk[fls(c + c / 2)]++; count += c; } CC_UNLOCK(cc); #ifdef SMP } #endif for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++) tcum += ct[i]; medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++) pcum += cpr[i]; medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; for (i = 0, c = 0; i < 32 && c < count / 2; i++) c += ccpbk[i]; medc = (i >= 2) ? (1 << (i - 2)) : 0; printf("Scheduled callouts statistic snapshot:\n"); printf(" Callouts: %6d Buckets: %6d*%-3d Bucket size: 0.%06ds\n", count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT); printf(" C/Bk: med %5d avg %6d.%06jd max %6d\n", medc, count / callwheelsize / mp_ncpus, (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000, maxc); printf(" Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32, (st / count) / 1000000, (st / count) % 1000000, maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32); printf(" Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32, (spr / count) / 1000000, (spr / count) % 1000000, maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32); printf(" Distribution: \tbuckets\t time\t tcum\t" " prec\t pcum\n"); for (i = 0, tcum = pcum = 0; i < 64; i++) { if (ct[i] == 0 && cpr[i] == 0) continue; t = (i != 0) ? (((sbintime_t)1) << (i - 1)) : 0; tcum += ct[i]; pcum += cpr[i]; printf(" %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n", t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32, i - 1 - (32 - CC_HASH_SHIFT), ct[i], tcum, cpr[i], pcum); } return (error); } SYSCTL_PROC(_kern, OID_AUTO, callout_stat, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_callout_stat, "I", "Dump immediate statistic snapshot of the scheduled callouts"); Index: head/sys/kern/vfs_cache.c =================================================================== --- head/sys/kern/vfs_cache.c (revision 292383) +++ head/sys/kern/vfs_cache.c (revision 292384) @@ -1,1520 +1,1518 @@ /*- * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Poul-Henning Kamp of the FreeBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include SDT_PROVIDER_DECLARE(vfs); SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", "struct vnode *", "char *"); SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", "char *"); SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", "char *"); /* * This structure describes the elements in the cache of recent * names looked up by namei. */ struct namecache { LIST_ENTRY(namecache) nc_hash; /* hash chain */ LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ struct vnode *nc_dvp; /* vnode of parent of name */ struct vnode *nc_vp; /* vnode the name refers to */ u_char nc_flag; /* flag bits */ u_char nc_nlen; /* length of name */ char nc_name[0]; /* segment name + nul */ }; /* * struct namecache_ts repeats struct namecache layout up to the * nc_nlen member. * struct namecache_ts is used in place of struct namecache when time(s) need * to be stored. The nc_dotdottime field is used when a cache entry is mapping * both a non-dotdot directory name plus dotdot for the directory's * parent. */ struct namecache_ts { LIST_ENTRY(namecache) nc_hash; /* hash chain */ LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ struct vnode *nc_dvp; /* vnode of parent of name */ struct vnode *nc_vp; /* vnode the name refers to */ u_char nc_flag; /* flag bits */ u_char nc_nlen; /* length of name */ struct timespec nc_time; /* timespec provided by fs */ struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ int nc_ticks; /* ticks value when entry was added */ char nc_name[0]; /* segment name + nul */ }; /* * Flags in namecache.nc_flag */ #define NCF_WHITE 0x01 #define NCF_ISDOTDOT 0x02 #define NCF_TS 0x04 #define NCF_DTS 0x08 /* * Name caching works as follows: * * Names found by directory scans are retained in a cache * for future reference. It is managed LRU, so frequently * used names will hang around. Cache is indexed by hash value * obtained from (vp, name) where vp refers to the directory * containing name. * * If it is a "negative" entry, (i.e. for a name that is known NOT to * exist) the vnode pointer will be NULL. * * Upon reaching the last segment of a path, if the reference * is for DELETE, or NOCACHE is set (rewrite), and the * name is located in the cache, it will be dropped. */ /* * Structures associated with name cacheing. */ #define NCHHASH(hash) \ (&nchashtbl[(hash) & nchash]) static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ static TAILQ_HEAD(, namecache) ncneg; /* Hash Table */ static u_long nchash; /* size of hash table */ SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "Size of namecache hash table"); static u_long ncnegfactor = 16; /* ratio of negative entries */ SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "Ratio of negative namecache entries"); static u_long numneg; /* number of negative entries allocated */ SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "Number of negative entries in namecache"); static u_long numcache; /* number of cache entries allocated */ SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "Number of namecache entries"); static u_long numcachehv; /* number of cache entries with vnodes held */ SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "Number of namecache entries with vnodes held"); static u_int ncsizefactor = 2; SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0, "Size factor for namecache"); struct nchstats nchstats; /* cache effectiveness statistics */ static struct rwlock cache_lock; RW_SYSINIT(vfscache, &cache_lock, "Name Cache"); #define CACHE_UPGRADE_LOCK() rw_try_upgrade(&cache_lock) #define CACHE_RLOCK() rw_rlock(&cache_lock) #define CACHE_RUNLOCK() rw_runlock(&cache_lock) #define CACHE_WLOCK() rw_wlock(&cache_lock) #define CACHE_WUNLOCK() rw_wunlock(&cache_lock) /* * UMA zones for the VFS cache. * * The small cache is used for entries with short names, which are the * most common. The large cache is used for entries which are too big to * fit in the small cache. */ static uma_zone_t cache_zone_small; static uma_zone_t cache_zone_small_ts; static uma_zone_t cache_zone_large; static uma_zone_t cache_zone_large_ts; #define CACHE_PATH_CUTOFF 35 static struct namecache * cache_alloc(int len, int ts) { if (len > CACHE_PATH_CUTOFF) { if (ts) return (uma_zalloc(cache_zone_large_ts, M_WAITOK)); else return (uma_zalloc(cache_zone_large, M_WAITOK)); } if (ts) return (uma_zalloc(cache_zone_small_ts, M_WAITOK)); else return (uma_zalloc(cache_zone_small, M_WAITOK)); } static void cache_free(struct namecache *ncp) { int ts; if (ncp == NULL) return; ts = ncp->nc_flag & NCF_TS; if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) { if (ts) uma_zfree(cache_zone_small_ts, ncp); else uma_zfree(cache_zone_small, ncp); } else if (ts) uma_zfree(cache_zone_large_ts, ncp); else uma_zfree(cache_zone_large, ncp); } static char * nc_get_name(struct namecache *ncp) { struct namecache_ts *ncp_ts; if ((ncp->nc_flag & NCF_TS) == 0) return (ncp->nc_name); ncp_ts = (struct namecache_ts *)ncp; return (ncp_ts->nc_name); } static void cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) { KASSERT((ncp->nc_flag & NCF_TS) != 0 || (tsp == NULL && ticksp == NULL), ("No NCF_TS")); if (tsp != NULL) *tsp = ((struct namecache_ts *)ncp)->nc_time; if (ticksp != NULL) *ticksp = ((struct namecache_ts *)ncp)->nc_ticks; } static int doingcache = 1; /* 1 => enable the cache */ SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "VFS namecache enabled"); /* Export size information to userland */ SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct namecache), "sizeof(struct namecache)"); /* * The new name cache statistics */ static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); #define STATNODE(mode, name, var, descr) \ SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, descr); STATNODE(CTLFLAG_RD, numneg, &numneg, "Number of negative cache entries"); STATNODE(CTLFLAG_RD, numcache, &numcache, "Number of cache entries"); static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls, "Number of cache lookups"); static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits, "Number of '.' hits"); static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits, "Number of '..' hits"); static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks, "Number of checks in lookup"); static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss, "Number of cache misses"); static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap, "Number of cache misses we do not want to cache"); static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps, "Number of cache hits (positive) we do not want to cache"); static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits, "Number of cache hits (positive)"); static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps, "Number of cache hits (negative) we do not want to cache"); static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits, "Number of cache hits (negative)"); static u_long numupgrades; STATNODE(CTLFLAG_RD, numupgrades, &numupgrades, "Number of updates of the cache after lookup (write lock + retry)"); SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD | CTLFLAG_MPSAFE, &nchstats, sizeof(nchstats), "LU", "VFS cache effectiveness statistics"); static void cache_zap(struct namecache *ncp); static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen); static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, u_int buflen); static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); #ifdef DIAGNOSTIC /* * Grab an atomic snapshot of the name cache hash chain lengths */ static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats"); static int sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) { struct nchashhead *ncpp; struct namecache *ncp; int i, error, n_nchash, *cntbuf; retry: n_nchash = nchash + 1; /* nchash is max index, not count */ if (req->oldptr == NULL) return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); CACHE_RLOCK(); if (n_nchash != nchash + 1) { CACHE_RUNLOCK(); free(cntbuf, M_TEMP); goto retry; } /* Scan hash tables counting entries */ for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) LIST_FOREACH(ncp, ncpp, nc_hash) cntbuf[i]++; CACHE_RUNLOCK(); for (error = 0, i = 0; i < n_nchash; i++) if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) break; free(cntbuf, M_TEMP); return (error); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths"); static int sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) { int error; struct nchashhead *ncpp; struct namecache *ncp; int n_nchash; int count, maxlength, used, pct; if (!req->oldptr) return SYSCTL_OUT(req, 0, 4 * sizeof(int)); CACHE_RLOCK(); n_nchash = nchash + 1; /* nchash is max index, not count */ used = 0; maxlength = 0; /* Scan hash tables for applicable entries */ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { count = 0; LIST_FOREACH(ncp, ncpp, nc_hash) { count++; } if (count) used++; if (maxlength < count) maxlength = count; } n_nchash = nchash + 1; CACHE_RUNLOCK(); pct = (used * 100) / (n_nchash / 100); error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); if (error) return (error); error = SYSCTL_OUT(req, &used, sizeof(used)); if (error) return (error); error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); if (error) return (error); error = SYSCTL_OUT(req, &pct, sizeof(pct)); if (error) return (error); return (0); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); #endif /* * cache_zap(): * * Removes a namecache entry from cache, whether it contains an actual * pointer to a vnode or if it is just a negative cache entry. */ static void cache_zap(ncp) struct namecache *ncp; { struct vnode *vp; rw_assert(&cache_lock, RA_WLOCKED); CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp); -#ifdef KDTRACE_HOOKS if (ncp->nc_vp != NULL) { SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, nc_get_name(ncp), ncp->nc_vp); } else { SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, nc_get_name(ncp)); } -#endif vp = NULL; LIST_REMOVE(ncp, nc_hash); if (ncp->nc_flag & NCF_ISDOTDOT) { if (ncp == ncp->nc_dvp->v_cache_dd) ncp->nc_dvp->v_cache_dd = NULL; } else { LIST_REMOVE(ncp, nc_src); if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { vp = ncp->nc_dvp; numcachehv--; } } if (ncp->nc_vp) { TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); if (ncp == ncp->nc_vp->v_cache_dd) ncp->nc_vp->v_cache_dd = NULL; } else { TAILQ_REMOVE(&ncneg, ncp, nc_dst); numneg--; } numcache--; cache_free(ncp); if (vp) vdrop(vp); } /* * Lookup an entry in the cache * * Lookup is called with dvp pointing to the directory to search, * cnp pointing to the name of the entry being sought. If the lookup * succeeds, the vnode is returned in *vpp, and a status of -1 is * returned. If the lookup determines that the name does not exist * (negative cacheing), a status of ENOENT is returned. If the lookup * fails, a status of zero is returned. If the directory vnode is * recycled out from under us due to a forced unmount, a status of * ENOENT is returned. * * vpp is locked and ref'd on return. If we're looking up DOTDOT, dvp is * unlocked. If we're looking up . an extra ref is taken, but the lock is * not recursively acquired. */ int cache_lookup(dvp, vpp, cnp, tsp, ticksp) struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; struct timespec *tsp; int *ticksp; { struct namecache *ncp; uint32_t hash; int error, ltype, wlocked; if (!doingcache) { cnp->cn_flags &= ~MAKEENTRY; return (0); } retry: CACHE_RLOCK(); wlocked = 0; numcalls++; error = 0; retry_wlocked: if (cnp->cn_nameptr[0] == '.') { if (cnp->cn_namelen == 1) { *vpp = dvp; CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", dvp, cnp->cn_nameptr); dothits++; SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); if (tsp != NULL) timespecclear(tsp); if (ticksp != NULL) *ticksp = ticks; goto success; } if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { dotdothits++; if (dvp->v_cache_dd == NULL) { SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL); goto unlock; } if ((cnp->cn_flags & MAKEENTRY) == 0) { if (!wlocked && !CACHE_UPGRADE_LOCK()) goto wlock; if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT) cache_zap(dvp->v_cache_dd); dvp->v_cache_dd = NULL; CACHE_WUNLOCK(); return (0); } ncp = dvp->v_cache_dd; if (ncp->nc_flag & NCF_ISDOTDOT) *vpp = ncp->nc_vp; else *vpp = ncp->nc_dvp; /* Return failure if negative entry was found. */ if (*vpp == NULL) goto negative_success; CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", dvp, cnp->cn_nameptr, *vpp); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); cache_out_ts(ncp, tsp, ticksp); if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == NCF_DTS && tsp != NULL) *tsp = ((struct namecache_ts *)ncp)-> nc_dotdottime; goto success; } } hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT); hash = fnv_32_buf(&dvp, sizeof(dvp), hash); LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { numchecks++; if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(nc_get_name(ncp), cnp->cn_nameptr, ncp->nc_nlen)) break; } /* We failed to find an entry */ if (ncp == NULL) { SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); if ((cnp->cn_flags & MAKEENTRY) == 0) { nummisszap++; } else { nummiss++; } nchstats.ncs_miss++; goto unlock; } /* We don't want to have an entry, so dump it */ if ((cnp->cn_flags & MAKEENTRY) == 0) { numposzaps++; nchstats.ncs_badhits++; if (!wlocked && !CACHE_UPGRADE_LOCK()) goto wlock; cache_zap(ncp); CACHE_WUNLOCK(); return (0); } /* We found a "positive" match, return the vnode */ if (ncp->nc_vp) { numposhits++; nchstats.ncs_goodhits++; *vpp = ncp->nc_vp; CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", dvp, cnp->cn_nameptr, *vpp, ncp); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, nc_get_name(ncp), *vpp); cache_out_ts(ncp, tsp, ticksp); goto success; } negative_success: /* We found a negative match, and want to create it, so purge */ if (cnp->cn_nameiop == CREATE) { numnegzaps++; nchstats.ncs_badhits++; if (!wlocked && !CACHE_UPGRADE_LOCK()) goto wlock; cache_zap(ncp); CACHE_WUNLOCK(); return (0); } if (!wlocked && !CACHE_UPGRADE_LOCK()) goto wlock; numneghits++; /* * We found a "negative" match, so we shift it to the end of * the "negative" cache entries queue to satisfy LRU. Also, * check to see if the entry is a whiteout; indicate this to * the componentname, if so. */ TAILQ_REMOVE(&ncneg, ncp, nc_dst); TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); nchstats.ncs_neghits++; if (ncp->nc_flag & NCF_WHITE) cnp->cn_flags |= ISWHITEOUT; SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, nc_get_name(ncp)); cache_out_ts(ncp, tsp, ticksp); CACHE_WUNLOCK(); return (ENOENT); wlock: /* * We need to update the cache after our lookup, so upgrade to * a write lock and retry the operation. */ CACHE_RUNLOCK(); CACHE_WLOCK(); numupgrades++; wlocked = 1; goto retry_wlocked; success: /* * On success we return a locked and ref'd vnode as per the lookup * protocol. */ if (dvp == *vpp) { /* lookup on "." */ VREF(*vpp); if (wlocked) CACHE_WUNLOCK(); else CACHE_RUNLOCK(); /* * When we lookup "." we still can be asked to lock it * differently... */ ltype = cnp->cn_lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(*vpp)) { if (ltype == LK_EXCLUSIVE) { vn_lock(*vpp, LK_UPGRADE | LK_RETRY); if ((*vpp)->v_iflag & VI_DOOMED) { /* forced unmount */ vrele(*vpp); *vpp = NULL; return (ENOENT); } } else vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); } return (-1); } ltype = 0; /* silence gcc warning */ if (cnp->cn_flags & ISDOTDOT) { ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp, 0); } vhold(*vpp); if (wlocked) CACHE_WUNLOCK(); else CACHE_RUNLOCK(); error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread); if (cnp->cn_flags & ISDOTDOT) { vn_lock(dvp, ltype | LK_RETRY); if (dvp->v_iflag & VI_DOOMED) { if (error == 0) vput(*vpp); *vpp = NULL; return (ENOENT); } } if (error) { *vpp = NULL; goto retry; } if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); } return (-1); unlock: if (wlocked) CACHE_WUNLOCK(); else CACHE_RUNLOCK(); return (0); } /* * Add an entry to the cache. */ void cache_enter_time(dvp, vp, cnp, tsp, dtsp) struct vnode *dvp; struct vnode *vp; struct componentname *cnp; struct timespec *tsp; struct timespec *dtsp; { struct namecache *ncp, *n2; struct namecache_ts *n3; struct nchashhead *ncpp; uint32_t hash; int flag; int hold; int zap; int len; CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp, ("cache_enter: Adding a doomed vnode")); VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp, ("cache_enter: Doomed vnode used as src")); if (!doingcache) return; /* * Avoid blowout in namecache entries. */ if (numcache >= desiredvnodes * ncsizefactor) return; flag = 0; if (cnp->cn_nameptr[0] == '.') { if (cnp->cn_namelen == 1) return; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { CACHE_WLOCK(); /* * If dotdot entry already exists, just retarget it * to new parent vnode, otherwise continue with new * namecache entry allocation. */ if ((ncp = dvp->v_cache_dd) != NULL && ncp->nc_flag & NCF_ISDOTDOT) { KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); if (ncp->nc_vp != NULL) { TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); } else { TAILQ_REMOVE(&ncneg, ncp, nc_dst); numneg--; } if (vp != NULL) { TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); } else { TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); numneg++; } ncp->nc_vp = vp; CACHE_WUNLOCK(); return; } dvp->v_cache_dd = NULL; SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp); CACHE_WUNLOCK(); flag = NCF_ISDOTDOT; } } hold = 0; zap = 0; /* * Calculate the hash key and setup as much of the new * namecache entry as possible before acquiring the lock. */ ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); ncp->nc_vp = vp; ncp->nc_dvp = dvp; ncp->nc_flag = flag; if (tsp != NULL) { n3 = (struct namecache_ts *)ncp; n3->nc_time = *tsp; n3->nc_ticks = ticks; n3->nc_flag |= NCF_TS; if (dtsp != NULL) { n3->nc_dotdottime = *dtsp; n3->nc_flag |= NCF_DTS; } } len = ncp->nc_nlen = cnp->cn_namelen; hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT); strlcpy(nc_get_name(ncp), cnp->cn_nameptr, len + 1); hash = fnv_32_buf(&dvp, sizeof(dvp), hash); CACHE_WLOCK(); /* * See if this vnode or negative entry is already in the cache * with this name. This can happen with concurrent lookups of * the same path name. */ ncpp = NCHHASH(hash); LIST_FOREACH(n2, ncpp, nc_hash) { if (n2->nc_dvp == dvp && n2->nc_nlen == cnp->cn_namelen && !bcmp(nc_get_name(n2), cnp->cn_nameptr, n2->nc_nlen)) { if (tsp != NULL) { KASSERT((n2->nc_flag & NCF_TS) != 0, ("no NCF_TS")); n3 = (struct namecache_ts *)n2; n3->nc_time = ((struct namecache_ts *)ncp)->nc_time; n3->nc_ticks = ((struct namecache_ts *)ncp)->nc_ticks; if (dtsp != NULL) { n3->nc_dotdottime = ((struct namecache_ts *)ncp)-> nc_dotdottime; n3->nc_flag |= NCF_DTS; } } CACHE_WUNLOCK(); cache_free(ncp); return; } } if (flag == NCF_ISDOTDOT) { /* * See if we are trying to add .. entry, but some other lookup * has populated v_cache_dd pointer already. */ if (dvp->v_cache_dd != NULL) { CACHE_WUNLOCK(); cache_free(ncp); return; } KASSERT(vp == NULL || vp->v_type == VDIR, ("wrong vnode type %p", vp)); dvp->v_cache_dd = ncp; } numcache++; if (!vp) { numneg++; if (cnp->cn_flags & ISWHITEOUT) ncp->nc_flag |= NCF_WHITE; } else if (vp->v_type == VDIR) { if (flag != NCF_ISDOTDOT) { /* * For this case, the cache entry maps both the * directory name in it and the name ".." for the * directory's parent. */ if ((n2 = vp->v_cache_dd) != NULL && (n2->nc_flag & NCF_ISDOTDOT) != 0) cache_zap(n2); vp->v_cache_dd = ncp; } } else { vp->v_cache_dd = NULL; } /* * Insert the new namecache entry into the appropriate chain * within the cache entries table. */ LIST_INSERT_HEAD(ncpp, ncp, nc_hash); if (flag != NCF_ISDOTDOT) { if (LIST_EMPTY(&dvp->v_cache_src)) { hold = 1; numcachehv++; } LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); } /* * If the entry is "negative", we place it into the * "negative" cache queue, otherwise, we place it into the * destination vnode's cache entries queue. */ if (vp) { TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); SDT_PROBE3(vfs, namecache, enter, done, dvp, nc_get_name(ncp), vp); } else { TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, nc_get_name(ncp)); } if (numneg * ncnegfactor > numcache) { ncp = TAILQ_FIRST(&ncneg); KASSERT(ncp->nc_vp == NULL, ("ncp %p vp %p on ncneg", ncp, ncp->nc_vp)); zap = 1; } if (hold) vhold(dvp); if (zap) cache_zap(ncp); CACHE_WUNLOCK(); } /* * Name cache initialization, from vfs_init() when we are booting */ static void nchinit(void *dummy __unused) { TAILQ_INIT(&ncneg); cache_zone_small = uma_zcreate("S VFS Cache", sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); cache_zone_small_ts = uma_zcreate("STS VFS Cache", sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); cache_zone_large = uma_zcreate("L VFS Cache", sizeof(struct namecache) + NAME_MAX + 1, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); cache_zone_large_ts = uma_zcreate("LTS VFS Cache", sizeof(struct namecache_ts) + NAME_MAX + 1, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); void cache_changesize(int newmaxvnodes) { struct nchashhead *new_nchashtbl, *old_nchashtbl; u_long new_nchash, old_nchash; struct namecache *ncp; uint32_t hash; int i; new_nchashtbl = hashinit(newmaxvnodes * 2, M_VFSCACHE, &new_nchash); /* If same hash table size, nothing to do */ if (nchash == new_nchash) { free(new_nchashtbl, M_VFSCACHE); return; } /* * Move everything from the old hash table to the new table. * None of the namecache entries in the table can be removed * because to do so, they have to be removed from the hash table. */ CACHE_WLOCK(); old_nchashtbl = nchashtbl; old_nchash = nchash; nchashtbl = new_nchashtbl; nchash = new_nchash; for (i = 0; i <= old_nchash; i++) { while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) { hash = fnv_32_buf(nc_get_name(ncp), ncp->nc_nlen, FNV1_32_INIT); hash = fnv_32_buf(&ncp->nc_dvp, sizeof(ncp->nc_dvp), hash); LIST_REMOVE(ncp, nc_hash); LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); } } CACHE_WUNLOCK(); free(old_nchashtbl, M_VFSCACHE); } /* * Invalidate all entries to a particular vnode. */ void cache_purge(vp) struct vnode *vp; { CTR1(KTR_VFS, "cache_purge(%p)", vp); SDT_PROBE1(vfs, namecache, purge, done, vp); CACHE_WLOCK(); while (!LIST_EMPTY(&vp->v_cache_src)) cache_zap(LIST_FIRST(&vp->v_cache_src)); while (!TAILQ_EMPTY(&vp->v_cache_dst)) cache_zap(TAILQ_FIRST(&vp->v_cache_dst)); if (vp->v_cache_dd != NULL) { KASSERT(vp->v_cache_dd->nc_flag & NCF_ISDOTDOT, ("lost dotdot link")); cache_zap(vp->v_cache_dd); } KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); CACHE_WUNLOCK(); } /* * Invalidate all negative entries for a particular directory vnode. */ void cache_purge_negative(vp) struct vnode *vp; { struct namecache *cp, *ncp; CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); SDT_PROBE1(vfs, namecache, purge_negative, done, vp); CACHE_WLOCK(); LIST_FOREACH_SAFE(cp, &vp->v_cache_src, nc_src, ncp) { if (cp->nc_vp == NULL) cache_zap(cp); } CACHE_WUNLOCK(); } /* * Flush all entries referencing a particular filesystem. */ void cache_purgevfs(mp) struct mount *mp; { struct nchashhead *ncpp; struct namecache *ncp, *nnp; /* Scan hash tables for applicable entries */ SDT_PROBE1(vfs, namecache, purgevfs, done, mp); CACHE_WLOCK(); for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) { if (ncp->nc_dvp->v_mount == mp) cache_zap(ncp); } } CACHE_WUNLOCK(); } /* * Perform canonical checks and cache lookup and pass on to filesystem * through the vop_cachedlookup only if needed. */ int vfs_cache_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct vnode *dvp; int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; int flags = cnp->cn_flags; struct thread *td = cnp->cn_thread; *vpp = NULL; dvp = ap->a_dvp; if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); error = VOP_ACCESS(dvp, VEXEC, cred, td); if (error) return (error); error = cache_lookup(dvp, vpp, cnp, NULL, NULL); if (error == 0) return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); if (error == -1) return (0); return (error); } /* * XXX All of these sysctls would probably be more productive dead. */ static int disablecwd; SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, "Disable the getcwd syscall"); /* Implementation of the getcwd syscall. */ int sys___getcwd(td, uap) struct thread *td; struct __getcwd_args *uap; { return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen, MAXPATHLEN)); } int kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, u_int buflen, u_int path_max) { char *bp, *tmpbuf; struct filedesc *fdp; struct vnode *cdir, *rdir; int error; if (disablecwd) return (ENODEV); if (buflen < 2) return (EINVAL); if (buflen > path_max) buflen = path_max; tmpbuf = malloc(buflen, M_TEMP, M_WAITOK); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); cdir = fdp->fd_cdir; VREF(cdir); rdir = fdp->fd_rdir; VREF(rdir); FILEDESC_SUNLOCK(fdp); error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen); vrele(rdir); vrele(cdir); if (!error) { if (bufseg == UIO_SYSSPACE) bcopy(bp, buf, strlen(bp) + 1); else error = copyout(bp, buf, strlen(bp) + 1); #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(bp); #endif } free(tmpbuf, M_TEMP); return (error); } /* * Thus begins the fullpath magic. */ #undef STATNODE #define STATNODE(name, descr) \ static u_int name; \ SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr) static int disablefullpath; SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0, "Disable the vn_fullpath function"); /* These count for kern___getcwd(), too. */ STATNODE(numfullpathcalls, "Number of fullpath search calls"); STATNODE(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); STATNODE(numfullpathfail2, "Number of fullpath search errors (VOP_VPTOCNP failures)"); STATNODE(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); STATNODE(numfullpathfound, "Number of successful fullpath calls"); /* * Retrieve the full filesystem path that correspond to a vnode from the name * cache (if available) */ int vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) { char *buf; struct filedesc *fdp; struct vnode *rdir; int error; if (disablefullpath) return (ENODEV); if (vn == NULL) return (EINVAL); buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); rdir = fdp->fd_rdir; VREF(rdir); FILEDESC_SUNLOCK(fdp); error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN); vrele(rdir); if (!error) *freebuf = buf; else free(buf, M_TEMP); return (error); } /* * This function is similar to vn_fullpath, but it attempts to lookup the * pathname relative to the global root mount point. This is required for the * auditing sub-system, as audited pathnames must be absolute, relative to the * global root mount point. */ int vn_fullpath_global(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) { char *buf; int error; if (disablefullpath) return (ENODEV); if (vn == NULL) return (EINVAL); buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN); if (!error) *freebuf = buf; else free(buf, M_TEMP); return (error); } int vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen) { int error; CACHE_RLOCK(); error = vn_vptocnp_locked(vp, cred, buf, buflen); if (error == 0) CACHE_RUNLOCK(); return (error); } static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen) { struct vnode *dvp; struct namecache *ncp; int error; TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; } if (ncp != NULL) { if (*buflen < ncp->nc_nlen) { CACHE_RUNLOCK(); vrele(*vp); numfullpathfail4++; error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *buflen -= ncp->nc_nlen; memcpy(buf + *buflen, nc_get_name(ncp), ncp->nc_nlen); SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, nc_get_name(ncp), vp); dvp = *vp; *vp = ncp->nc_dvp; vref(*vp); CACHE_RUNLOCK(); vrele(dvp); CACHE_RLOCK(); return (0); } SDT_PROBE1(vfs, namecache, fullpath, miss, vp); CACHE_RUNLOCK(); vn_lock(*vp, LK_SHARED | LK_RETRY); error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); vput(*vp); if (error) { numfullpathfail2++; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *vp = dvp; CACHE_RLOCK(); if (dvp->v_iflag & VI_DOOMED) { /* forced unmount */ CACHE_RUNLOCK(); vrele(dvp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } /* * *vp has its use count incremented still. */ return (0); } /* * The magic behind kern___getcwd() and vn_fullpath(). */ static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, u_int buflen) { int error, slash_prefixed; #ifdef KDTRACE_HOOKS struct vnode *startvp = vp; #endif struct vnode *vp1; buflen--; buf[buflen] = '\0'; error = 0; slash_prefixed = 0; SDT_PROBE1(vfs, namecache, fullpath, entry, vp); numfullpathcalls++; vref(vp); CACHE_RLOCK(); if (vp->v_type != VDIR) { error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen); if (error) return (error); if (buflen == 0) { CACHE_RUNLOCK(); vrele(vp); return (ENOMEM); } buf[--buflen] = '/'; slash_prefixed = 1; } while (vp != rdir && vp != rootvnode) { if (vp->v_vflag & VV_ROOT) { if (vp->v_iflag & VI_DOOMED) { /* forced unmount */ CACHE_RUNLOCK(); vrele(vp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); break; } vp1 = vp->v_mount->mnt_vnodecovered; vref(vp1); CACHE_RUNLOCK(); vrele(vp); vp = vp1; CACHE_RLOCK(); continue; } if (vp->v_type != VDIR) { CACHE_RUNLOCK(); vrele(vp); numfullpathfail1++; error = ENOTDIR; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); break; } error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen); if (error) break; if (buflen == 0) { CACHE_RUNLOCK(); vrele(vp); error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, startvp, NULL); break; } buf[--buflen] = '/'; slash_prefixed = 1; } if (error) return (error); if (!slash_prefixed) { if (buflen == 0) { CACHE_RUNLOCK(); vrele(vp); numfullpathfail4++; SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, startvp, NULL); return (ENOMEM); } buf[--buflen] = '/'; } numfullpathfound++; CACHE_RUNLOCK(); vrele(vp); SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen); *retbuf = buf + buflen; return (0); } struct vnode * vn_dir_dd_ino(struct vnode *vp) { struct namecache *ncp; struct vnode *ddvp; ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); CACHE_RLOCK(); TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) continue; ddvp = ncp->nc_dvp; vhold(ddvp); CACHE_RUNLOCK(); if (vget(ddvp, LK_SHARED | LK_NOWAIT | LK_VNHELD, curthread)) return (NULL); return (ddvp); } CACHE_RUNLOCK(); return (NULL); } int vn_commname(struct vnode *vp, char *buf, u_int buflen) { struct namecache *ncp; int l; CACHE_RLOCK(); TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; if (ncp == NULL) { CACHE_RUNLOCK(); return (ENOENT); } l = min(ncp->nc_nlen, buflen - 1); memcpy(buf, nc_get_name(ncp), l); CACHE_RUNLOCK(); buf[l] = '\0'; return (0); } /* ABI compat shims for old kernel modules. */ #undef cache_enter void cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp); void cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { cache_enter_time(dvp, vp, cnp, NULL, NULL); } /* * This function updates path string to vnode's full global path * and checks the size of the new path string against the pathlen argument. * * Requires a locked, referenced vnode. * Vnode is re-locked on success or ENODEV, otherwise unlocked. * * If sysctl debug.disablefullpath is set, ENODEV is returned, * vnode is left locked and path remain untouched. * * If vp is a directory, the call to vn_fullpath_global() always succeeds * because it falls back to the ".." lookup if the namecache lookup fails. */ int vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, u_int pathlen) { struct nameidata nd; struct vnode *vp1; char *rpath, *fbuf; int error; ASSERT_VOP_ELOCKED(vp, __func__); /* Return ENODEV if sysctl debug.disablefullpath==1 */ if (disablefullpath) return (ENODEV); /* Construct global filesystem path from vp. */ VOP_UNLOCK(vp, 0); error = vn_fullpath_global(td, vp, &rpath, &fbuf); if (error != 0) { vrele(vp); return (error); } if (strlen(rpath) >= pathlen) { vrele(vp); error = ENAMETOOLONG; goto out; } /* * Re-lookup the vnode by path to detect a possible rename. * As a side effect, the vnode is relocked. * If vnode was renamed, return ENOENT. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path, td); error = namei(&nd); if (error != 0) { vrele(vp); goto out; } NDFREE(&nd, NDF_ONLY_PNBUF); vp1 = nd.ni_vp; vrele(vp); if (vp1 == vp) strcpy(path, rpath); else { vput(vp1); error = ENOENT; } out: free(fbuf, M_TEMP); return (error); } Index: head/sys/netinet/in_kdtrace.c =================================================================== --- head/sys/netinet/in_kdtrace.c (revision 292383) +++ head/sys/netinet/in_kdtrace.c (revision 292384) @@ -1,147 +1,147 @@ /*- * Copyright (c) 2013 Mark Johnston * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include SDT_PROVIDER_DEFINE(ip); SDT_PROVIDER_DEFINE(tcp); SDT_PROVIDER_DEFINE(udp); SDT_PROBE_DEFINE6_XLATE(ip, , , receive, "void *", "pktinfo_t *", "void *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct ifnet *", "ifinfo_t *", "struct ip *", "ipv4info_t *", "struct ip6_hdr *", "ipv6info_t *"); SDT_PROBE_DEFINE6_XLATE(ip, , , send, "void *", "pktinfo_t *", "void *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct ifnet *", "ifinfo_t *", "struct ip *", "ipv4info_t *", "struct ip6_hdr *", "ipv6info_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , accept__established, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , accept__refused, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfo_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , connect__established, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , connect__refused, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , connect__request, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfo_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , receive, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , send, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfo_t *"); SDT_PROBE_DEFINE1_XLATE(tcp, , , siftr, "struct pkt_node *", "siftrinfo_t *"); SDT_PROBE_DEFINE3_XLATE(tcp, , , debug__input, "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfo_t *", "uint8_t *", "ipinfo_t *"); SDT_PROBE_DEFINE3_XLATE(tcp, , , debug__output, "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfo_t *", "uint8_t *", "ipinfo_t *"); SDT_PROBE_DEFINE2_XLATE(tcp, , , debug__user, "struct tcpcb *", "tcpsinfo_t *" , "int", "int"); SDT_PROBE_DEFINE3_XLATE(tcp, , , debug__drop, "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfo_t *", - "uint8_t *", "ipinfo_t *") + "uint8_t *", "ipinfo_t *"); SDT_PROBE_DEFINE6_XLATE(tcp, , , state__change, "void *", "void *", "struct tcpcb *", "csinfo_t *", "void *", "void *", "struct tcpcb *", "tcpsinfo_t *", "void *", "void *", "int", "tcplsinfo_t *"); SDT_PROBE_DEFINE5_XLATE(udp, , , receive, "void *", "pktinfo_t *", "struct inpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct inpcb *", "udpsinfo_t *", "struct udphdr *", "udpinfo_t *"); SDT_PROBE_DEFINE5_XLATE(udp, , , send, "void *", "pktinfo_t *", "struct inpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct inpcb *", "udpsinfo_t *", "struct udphdr *", "udpinfo_t *"); Index: head/sys/netinet/in_kdtrace.h =================================================================== --- head/sys/netinet/in_kdtrace.h (revision 292383) +++ head/sys/netinet/in_kdtrace.h (revision 292384) @@ -1,72 +1,72 @@ /*- * Copyright (c) 2013 Mark Johnston * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_IN_KDTRACE_H_ #define _SYS_IN_KDTRACE_H_ #define IP_PROBE(probe, arg0, arg1, arg2, arg3, arg4, arg5) \ SDT_PROBE6(ip, , , probe, arg0, arg1, arg2, arg3, arg4, arg5) #define UDP_PROBE(probe, arg0, arg1, arg2, arg3, arg4) \ SDT_PROBE5(udp, , , probe, arg0, arg1, arg2, arg3, arg4) -#define TCP_PROBE1(probe, arg0) \ +#define TCP_PROBE1(probe, arg0) \ SDT_PROBE1(tcp, , , probe, arg0) -#define TCP_PROBE2(probe, arg0, arg1) \ +#define TCP_PROBE2(probe, arg0, arg1) \ SDT_PROBE2(tcp, , , probe, arg0, arg1) -#define TCP_PROBE3(probe, arg0, arg1, arg2) \ +#define TCP_PROBE3(probe, arg0, arg1, arg2) \ SDT_PROBE3(tcp, , , probe, arg0, arg1, arg2) -#define TCP_PROBE4(probe, arg0, arg1, arg2, arg3) \ +#define TCP_PROBE4(probe, arg0, arg1, arg2, arg3) \ SDT_PROBE4(tcp, , , probe, arg0, arg1, arg2, arg3) #define TCP_PROBE5(probe, arg0, arg1, arg2, arg3, arg4) \ SDT_PROBE5(tcp, , , probe, arg0, arg1, arg2, arg3, arg4) #define TCP_PROBE6(probe, arg0, arg1, arg2, arg3, arg4, arg5) \ SDT_PROBE6(tcp, , , probe, arg0, arg1, arg2, arg3, arg4, arg5) SDT_PROVIDER_DECLARE(ip); SDT_PROVIDER_DECLARE(tcp); SDT_PROVIDER_DECLARE(udp); SDT_PROBE_DECLARE(ip, , , receive); SDT_PROBE_DECLARE(ip, , , send); SDT_PROBE_DECLARE(tcp, , , accept__established); SDT_PROBE_DECLARE(tcp, , , accept__refused); SDT_PROBE_DECLARE(tcp, , , connect__established); SDT_PROBE_DECLARE(tcp, , , connect__refused); SDT_PROBE_DECLARE(tcp, , , connect__request); SDT_PROBE_DECLARE(tcp, , , receive); SDT_PROBE_DECLARE(tcp, , , send); SDT_PROBE_DECLARE(tcp, , , siftr); SDT_PROBE_DECLARE(tcp, , , state__change); SDT_PROBE_DECLARE(tcp, , , debug__input); SDT_PROBE_DECLARE(tcp, , , debug__output); SDT_PROBE_DECLARE(tcp, , , debug__user); SDT_PROBE_DECLARE(tcp, , , debug__drop); SDT_PROBE_DECLARE(udp, , , receive); SDT_PROBE_DECLARE(udp, , , send); #endif Index: head/sys/netinet/sctp_cc_functions.c =================================================================== --- head/sys/netinet/sctp_cc_functions.c (revision 292383) +++ head/sys/netinet/sctp_cc_functions.c (revision 292384) @@ -1,2373 +1,2373 @@ /*- * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define SHIFT_MPTCP_MULTI_N 40 #define SHIFT_MPTCP_MULTI_Z 16 #define SHIFT_MPTCP_MULTI 8 static void sctp_enforce_cwnd_limit(struct sctp_association *assoc, struct sctp_nets *net) { if ((assoc->max_cwnd > 0) && (net->cwnd > assoc->max_cwnd) && (net->cwnd > (net->mtu - sizeof(struct sctphdr)))) { net->cwnd = assoc->max_cwnd; if (net->cwnd < (net->mtu - sizeof(struct sctphdr))) { net->cwnd = net->mtu - sizeof(struct sctphdr); } } } static void sctp_set_initial_cc_param(struct sctp_tcb *stcb, struct sctp_nets *net) { struct sctp_association *assoc; uint32_t cwnd_in_mtu; assoc = &stcb->asoc; cwnd_in_mtu = SCTP_BASE_SYSCTL(sctp_initial_cwnd); if (cwnd_in_mtu == 0) { /* Using 0 means that the value of RFC 4960 is used. */ net->cwnd = min((net->mtu * 4), max((2 * net->mtu), SCTP_INITIAL_CWND)); } else { /* * We take the minimum of the burst limit and the initial * congestion window. */ if ((assoc->max_burst > 0) && (cwnd_in_mtu > assoc->max_burst)) cwnd_in_mtu = assoc->max_burst; net->cwnd = (net->mtu - sizeof(struct sctphdr)) * cwnd_in_mtu; } if ((stcb->asoc.sctp_cmt_on_off == SCTP_CMT_RPV1) || (stcb->asoc.sctp_cmt_on_off == SCTP_CMT_RPV2)) { /* In case of resource pooling initialize appropriately */ net->cwnd /= assoc->numnets; if (net->cwnd < (net->mtu - sizeof(struct sctphdr))) { net->cwnd = net->mtu - sizeof(struct sctphdr); } } sctp_enforce_cwnd_limit(assoc, net); net->ssthresh = assoc->peers_rwnd; - SDT_PROBE(sctp, cwnd, net, init, + SDT_PROBE5(sctp, cwnd, net, init, stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net, 0, net->cwnd); if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_CWND_MONITOR_ENABLE | SCTP_CWND_LOGGING_ENABLE)) { sctp_log_cwnd(stcb, net, 0, SCTP_CWND_INITIALIZATION); } } static void sctp_cwnd_update_after_fr(struct sctp_tcb *stcb, struct sctp_association *asoc) { struct sctp_nets *net; uint32_t t_ssthresh, t_cwnd; uint64_t t_ucwnd_sbw; /* MT FIXME: Don't compute this over and over again */ t_ssthresh = 0; t_cwnd = 0; t_ucwnd_sbw = 0; if ((asoc->sctp_cmt_on_off == SCTP_CMT_RPV1) || (asoc->sctp_cmt_on_off == SCTP_CMT_RPV2)) { TAILQ_FOREACH(net, &asoc->nets, sctp_next) { t_ssthresh += net->ssthresh; t_cwnd += net->cwnd; if (net->lastsa > 0) { t_ucwnd_sbw += (uint64_t) net->cwnd / (uint64_t) net->lastsa; } } if (t_ucwnd_sbw == 0) { t_ucwnd_sbw = 1; } } /*- * CMT fast recovery code. Need to debug. ((sctp_cmt_on_off > 0) && * (net->fast_retran_loss_recovery == 0))) */ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { if ((asoc->fast_retran_loss_recovery == 0) || (asoc->sctp_cmt_on_off > 0)) { /* out of a RFC2582 Fast recovery window? */ if (net->net_ack > 0) { /* * per section 7.2.3, are there any * destinations that had a fast retransmit * to them. If so what we need to do is * adjust ssthresh and cwnd. */ struct sctp_tmit_chunk *lchk; int old_cwnd = net->cwnd; if ((asoc->sctp_cmt_on_off == SCTP_CMT_RPV1) || (asoc->sctp_cmt_on_off == SCTP_CMT_RPV2)) { if (asoc->sctp_cmt_on_off == SCTP_CMT_RPV1) { net->ssthresh = (uint32_t) (((uint64_t) 4 * (uint64_t) net->mtu * (uint64_t) net->ssthresh) / (uint64_t) t_ssthresh); } if (asoc->sctp_cmt_on_off == SCTP_CMT_RPV2) { uint32_t srtt; srtt = net->lastsa; /* * lastsa>>3; we don't need * to devide ... */ if (srtt == 0) { srtt = 1; } /* * Short Version => Equal to * Contel Version MBe */ net->ssthresh = (uint32_t) (((uint64_t) 4 * (uint64_t) net->mtu * (uint64_t) net->cwnd) / ((uint64_t) srtt * t_ucwnd_sbw)); /* INCREASE FACTOR */ ; } if ((net->cwnd > t_cwnd / 2) && (net->ssthresh < net->cwnd - t_cwnd / 2)) { net->ssthresh = net->cwnd - t_cwnd / 2; } if (net->ssthresh < net->mtu) { net->ssthresh = net->mtu; } } else { net->ssthresh = net->cwnd / 2; if (net->ssthresh < (net->mtu * 2)) { net->ssthresh = 2 * net->mtu; } } net->cwnd = net->ssthresh; sctp_enforce_cwnd_limit(asoc, net); - SDT_PROBE(sctp, cwnd, net, fr, + SDT_PROBE5(sctp, cwnd, net, fr, stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net, old_cwnd, net->cwnd); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_FR); } lchk = TAILQ_FIRST(&asoc->send_queue); net->partial_bytes_acked = 0; /* Turn on fast recovery window */ asoc->fast_retran_loss_recovery = 1; if (lchk == NULL) { /* Mark end of the window */ asoc->fast_recovery_tsn = asoc->sending_seq - 1; } else { asoc->fast_recovery_tsn = lchk->rec.data.TSN_seq - 1; } /* * CMT fast recovery -- per destination * recovery variable. */ net->fast_retran_loss_recovery = 1; if (lchk == NULL) { /* Mark end of the window */ net->fast_recovery_tsn = asoc->sending_seq - 1; } else { net->fast_recovery_tsn = lchk->rec.data.TSN_seq - 1; } sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_CC_FUNCTIONS + SCTP_LOC_1); sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net); } } else if (net->net_ack > 0) { /* * Mark a peg that we WOULD have done a cwnd * reduction but RFC2582 prevented this action. */ SCTP_STAT_INCR(sctps_fastretransinrtt); } } } /* Defines for instantaneous bw decisions */ #define SCTP_INST_LOOSING 1 /* Loosing to other flows */ #define SCTP_INST_NEUTRAL 2 /* Neutral, no indication */ #define SCTP_INST_GAINING 3 /* Gaining, step down possible */ static int cc_bw_same(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint64_t rtt_offset, uint64_t vtag, uint8_t inst_ind) { uint64_t oth, probepoint; probepoint = (((uint64_t) net->cwnd) << 32); if (net->rtt > net->cc_mod.rtcc.lbw_rtt + rtt_offset) { /* * rtt increased we don't update bw.. so we don't update the * rtt either. */ /* Probe point 5 */ probepoint |= ((5 << 16) | 1); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), net->flight_size, probepoint); if ((net->cc_mod.rtcc.steady_step) && (inst_ind != SCTP_INST_LOOSING)) { if (net->cc_mod.rtcc.last_step_state == 5) net->cc_mod.rtcc.step_cnt++; else net->cc_mod.rtcc.step_cnt = 1; net->cc_mod.rtcc.last_step_state = 5; if ((net->cc_mod.rtcc.step_cnt == net->cc_mod.rtcc.steady_step) || ((net->cc_mod.rtcc.step_cnt > net->cc_mod.rtcc.steady_step) && ((net->cc_mod.rtcc.step_cnt % net->cc_mod.rtcc.steady_step) == 0))) { /* Try a step down */ oth = net->cc_mod.rtcc.vol_reduce; oth <<= 16; oth |= net->cc_mod.rtcc.step_cnt; oth <<= 16; oth |= net->cc_mod.rtcc.last_step_state; - SDT_PROBE(sctp, cwnd, net, rttstep, + SDT_PROBE5(sctp, cwnd, net, rttstep, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), oth, probepoint); if (net->cwnd > (4 * net->mtu)) { net->cwnd -= net->mtu; net->cc_mod.rtcc.vol_reduce++; } else { net->cc_mod.rtcc.step_cnt = 0; } } } return (1); } if (net->rtt < net->cc_mod.rtcc.lbw_rtt - rtt_offset) { /* * rtt decreased, there could be more room. we update both * the bw and the rtt here to lock this in as a good step * down. */ /* Probe point 6 */ probepoint |= ((6 << 16) | 0); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), net->flight_size, probepoint); if (net->cc_mod.rtcc.steady_step) { oth = net->cc_mod.rtcc.vol_reduce; oth <<= 16; oth |= net->cc_mod.rtcc.step_cnt; oth <<= 16; oth |= net->cc_mod.rtcc.last_step_state; - SDT_PROBE(sctp, cwnd, net, rttstep, + SDT_PROBE5(sctp, cwnd, net, rttstep, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), oth, probepoint); if ((net->cc_mod.rtcc.last_step_state == 5) && (net->cc_mod.rtcc.step_cnt > net->cc_mod.rtcc.steady_step)) { /* Step down worked */ net->cc_mod.rtcc.step_cnt = 0; return (1); } else { net->cc_mod.rtcc.last_step_state = 6; net->cc_mod.rtcc.step_cnt = 0; } } net->cc_mod.rtcc.lbw = nbw; net->cc_mod.rtcc.lbw_rtt = net->rtt; net->cc_mod.rtcc.cwnd_at_bw_set = net->cwnd; if (inst_ind == SCTP_INST_GAINING) return (1); else if (inst_ind == SCTP_INST_NEUTRAL) return (1); else return (0); } /* * Ok bw and rtt remained the same .. no update to any */ /* Probe point 7 */ probepoint |= ((7 << 16) | net->cc_mod.rtcc.ret_from_eq); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), net->flight_size, probepoint); if ((net->cc_mod.rtcc.steady_step) && (inst_ind != SCTP_INST_LOOSING)) { if (net->cc_mod.rtcc.last_step_state == 5) net->cc_mod.rtcc.step_cnt++; else net->cc_mod.rtcc.step_cnt = 1; net->cc_mod.rtcc.last_step_state = 5; if ((net->cc_mod.rtcc.step_cnt == net->cc_mod.rtcc.steady_step) || ((net->cc_mod.rtcc.step_cnt > net->cc_mod.rtcc.steady_step) && ((net->cc_mod.rtcc.step_cnt % net->cc_mod.rtcc.steady_step) == 0))) { /* Try a step down */ if (net->cwnd > (4 * net->mtu)) { net->cwnd -= net->mtu; net->cc_mod.rtcc.vol_reduce++; return (1); } else { net->cc_mod.rtcc.step_cnt = 0; } } } if (inst_ind == SCTP_INST_GAINING) return (1); else if (inst_ind == SCTP_INST_NEUTRAL) return (1); else return ((int)net->cc_mod.rtcc.ret_from_eq); } static int cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint64_t rtt_offset, uint64_t vtag, uint8_t inst_ind) { uint64_t oth, probepoint; /* Bandwidth decreased. */ probepoint = (((uint64_t) net->cwnd) << 32); if (net->rtt > net->cc_mod.rtcc.lbw_rtt + rtt_offset) { /* rtt increased */ /* Did we add more */ if ((net->cwnd > net->cc_mod.rtcc.cwnd_at_bw_set) && (inst_ind != SCTP_INST_LOOSING)) { /* We caused it maybe.. back off? */ /* PROBE POINT 1 */ probepoint |= ((1 << 16) | 1); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), net->flight_size, probepoint); if (net->cc_mod.rtcc.ret_from_eq) { /* * Switch over to CA if we are less * aggressive */ net->ssthresh = net->cwnd - 1; net->partial_bytes_acked = 0; } return (1); } /* Probe point 2 */ probepoint |= ((2 << 16) | 0); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), net->flight_size, probepoint); /* Someone else - fight for more? */ if (net->cc_mod.rtcc.steady_step) { oth = net->cc_mod.rtcc.vol_reduce; oth <<= 16; oth |= net->cc_mod.rtcc.step_cnt; oth <<= 16; oth |= net->cc_mod.rtcc.last_step_state; - SDT_PROBE(sctp, cwnd, net, rttstep, + SDT_PROBE5(sctp, cwnd, net, rttstep, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), oth, probepoint); /* * Did we voluntarily give up some? if so take one * back please */ if ((net->cc_mod.rtcc.vol_reduce) && (inst_ind != SCTP_INST_GAINING)) { net->cwnd += net->mtu; sctp_enforce_cwnd_limit(&stcb->asoc, net); net->cc_mod.rtcc.vol_reduce--; } net->cc_mod.rtcc.last_step_state = 2; net->cc_mod.rtcc.step_cnt = 0; } goto out_decision; } else if (net->rtt < net->cc_mod.rtcc.lbw_rtt - rtt_offset) { /* bw & rtt decreased */ /* Probe point 3 */ probepoint |= ((3 << 16) | 0); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), net->flight_size, probepoint); if (net->cc_mod.rtcc.steady_step) { oth = net->cc_mod.rtcc.vol_reduce; oth <<= 16; oth |= net->cc_mod.rtcc.step_cnt; oth <<= 16; oth |= net->cc_mod.rtcc.last_step_state; - SDT_PROBE(sctp, cwnd, net, rttstep, + SDT_PROBE5(sctp, cwnd, net, rttstep, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), oth, probepoint); if ((net->cc_mod.rtcc.vol_reduce) && (inst_ind != SCTP_INST_GAINING)) { net->cwnd += net->mtu; sctp_enforce_cwnd_limit(&stcb->asoc, net); net->cc_mod.rtcc.vol_reduce--; } net->cc_mod.rtcc.last_step_state = 3; net->cc_mod.rtcc.step_cnt = 0; } goto out_decision; } /* The bw decreased but rtt stayed the same */ /* Probe point 4 */ probepoint |= ((4 << 16) | 0); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), net->flight_size, probepoint); if (net->cc_mod.rtcc.steady_step) { oth = net->cc_mod.rtcc.vol_reduce; oth <<= 16; oth |= net->cc_mod.rtcc.step_cnt; oth <<= 16; oth |= net->cc_mod.rtcc.last_step_state; - SDT_PROBE(sctp, cwnd, net, rttstep, + SDT_PROBE5(sctp, cwnd, net, rttstep, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), oth, probepoint); if ((net->cc_mod.rtcc.vol_reduce) && (inst_ind != SCTP_INST_GAINING)) { net->cwnd += net->mtu; sctp_enforce_cwnd_limit(&stcb->asoc, net); net->cc_mod.rtcc.vol_reduce--; } net->cc_mod.rtcc.last_step_state = 4; net->cc_mod.rtcc.step_cnt = 0; } out_decision: net->cc_mod.rtcc.lbw = nbw; net->cc_mod.rtcc.lbw_rtt = net->rtt; net->cc_mod.rtcc.cwnd_at_bw_set = net->cwnd; if (inst_ind == SCTP_INST_GAINING) { return (1); } else { return (0); } } static int cc_bw_increase(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint64_t vtag) { uint64_t oth, probepoint; /* * BW increased, so update and return 0, since all actions in our * table say to do the normal CC update. Note that we pay no * attention to the inst_ind since our overall sum is increasing. */ /* PROBE POINT 0 */ probepoint = (((uint64_t) net->cwnd) << 32); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), net->flight_size, probepoint); if (net->cc_mod.rtcc.steady_step) { oth = net->cc_mod.rtcc.vol_reduce; oth <<= 16; oth |= net->cc_mod.rtcc.step_cnt; oth <<= 16; oth |= net->cc_mod.rtcc.last_step_state; - SDT_PROBE(sctp, cwnd, net, rttstep, + SDT_PROBE5(sctp, cwnd, net, rttstep, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), oth, probepoint); net->cc_mod.rtcc.last_step_state = 0; net->cc_mod.rtcc.step_cnt = 0; net->cc_mod.rtcc.vol_reduce = 0; } net->cc_mod.rtcc.lbw = nbw; net->cc_mod.rtcc.lbw_rtt = net->rtt; net->cc_mod.rtcc.cwnd_at_bw_set = net->cwnd; return (0); } /* RTCC Algoritm to limit growth of cwnd, return * true if you want to NOT allow cwnd growth */ static int cc_bw_limit(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw) { uint64_t bw_offset, rtt_offset; uint64_t probepoint, rtt, vtag; uint64_t bytes_for_this_rtt, inst_bw; uint64_t div, inst_off; int bw_shift; uint8_t inst_ind; int ret; /*- * Here we need to see if we want * to limit cwnd growth due to increase * in overall rtt but no increase in bw. * We use the following table to figure * out what we should do. When we return * 0, cc update goes on as planned. If we * return 1, then no cc update happens and cwnd * stays where it is at. * ---------------------------------- * BW | RTT | Action * ********************************* * INC | INC | return 0 * ---------------------------------- * INC | SAME | return 0 * ---------------------------------- * INC | DECR | return 0 * ---------------------------------- * SAME | INC | return 1 * ---------------------------------- * SAME | SAME | return 1 * ---------------------------------- * SAME | DECR | return 0 * ---------------------------------- * DECR | INC | return 0 or 1 based on if we caused. * ---------------------------------- * DECR | SAME | return 0 * ---------------------------------- * DECR | DECR | return 0 * ---------------------------------- * * We are a bit fuzz on what an increase or * decrease is. For BW it is the same if * it did not change within 1/64th. For * RTT it stayed the same if it did not * change within 1/32nd */ bw_shift = SCTP_BASE_SYSCTL(sctp_rttvar_bw); rtt = stcb->asoc.my_vtag; vtag = (rtt << 32) | (((uint32_t) (stcb->sctp_ep->sctp_lport)) << 16) | (stcb->rport); probepoint = (((uint64_t) net->cwnd) << 32); rtt = net->rtt; if (net->cc_mod.rtcc.rtt_set_this_sack) { net->cc_mod.rtcc.rtt_set_this_sack = 0; bytes_for_this_rtt = net->cc_mod.rtcc.bw_bytes - net->cc_mod.rtcc.bw_bytes_at_last_rttc; net->cc_mod.rtcc.bw_bytes_at_last_rttc = net->cc_mod.rtcc.bw_bytes; if (net->rtt) { div = net->rtt / 1000; if (div) { inst_bw = bytes_for_this_rtt / div; inst_off = inst_bw >> bw_shift; if (inst_bw > nbw) inst_ind = SCTP_INST_GAINING; else if ((inst_bw + inst_off) < nbw) inst_ind = SCTP_INST_LOOSING; else inst_ind = SCTP_INST_NEUTRAL; probepoint |= ((0xb << 16) | inst_ind); } else { inst_ind = net->cc_mod.rtcc.last_inst_ind; inst_bw = bytes_for_this_rtt / (uint64_t) (net->rtt); /* Can't determine do not change */ probepoint |= ((0xc << 16) | inst_ind); } } else { inst_ind = net->cc_mod.rtcc.last_inst_ind; inst_bw = bytes_for_this_rtt; /* Can't determine do not change */ probepoint |= ((0xd << 16) | inst_ind); } - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((nbw << 32) | inst_bw), ((net->cc_mod.rtcc.lbw_rtt << 32) | rtt), net->flight_size, probepoint); } else { /* No rtt measurement, use last one */ inst_ind = net->cc_mod.rtcc.last_inst_ind; } bw_offset = net->cc_mod.rtcc.lbw >> bw_shift; if (nbw > net->cc_mod.rtcc.lbw + bw_offset) { ret = cc_bw_increase(stcb, net, nbw, vtag); goto out; } rtt_offset = net->cc_mod.rtcc.lbw_rtt >> SCTP_BASE_SYSCTL(sctp_rttvar_rtt); if (nbw < net->cc_mod.rtcc.lbw - bw_offset) { ret = cc_bw_decrease(stcb, net, nbw, rtt_offset, vtag, inst_ind); goto out; } /* * If we reach here then we are in a situation where the bw stayed * the same. */ ret = cc_bw_same(stcb, net, nbw, rtt_offset, vtag, inst_ind); out: net->cc_mod.rtcc.last_inst_ind = inst_ind; return (ret); } static void sctp_cwnd_update_after_sack_common(struct sctp_tcb *stcb, struct sctp_association *asoc, int accum_moved, int reneged_all SCTP_UNUSED, int will_exit, int use_rtcc) { struct sctp_nets *net; int old_cwnd; uint32_t t_ssthresh, t_cwnd, incr; uint64_t t_ucwnd_sbw; uint64_t t_path_mptcp; uint64_t mptcp_like_alpha; uint32_t srtt; uint64_t max_path; /* MT FIXME: Don't compute this over and over again */ t_ssthresh = 0; t_cwnd = 0; t_ucwnd_sbw = 0; t_path_mptcp = 0; mptcp_like_alpha = 1; if ((stcb->asoc.sctp_cmt_on_off == SCTP_CMT_RPV1) || (stcb->asoc.sctp_cmt_on_off == SCTP_CMT_RPV2) || (stcb->asoc.sctp_cmt_on_off == SCTP_CMT_MPTCP)) { max_path = 0; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { t_ssthresh += net->ssthresh; t_cwnd += net->cwnd; /* lastsa>>3; we don't need to devide ... */ srtt = net->lastsa; if (srtt > 0) { uint64_t tmp; t_ucwnd_sbw += (uint64_t) net->cwnd / (uint64_t) srtt; t_path_mptcp += (((uint64_t) net->cwnd) << SHIFT_MPTCP_MULTI_Z) / (((uint64_t) net->mtu) * (uint64_t) srtt); tmp = (((uint64_t) net->cwnd) << SHIFT_MPTCP_MULTI_N) / ((uint64_t) net->mtu * (uint64_t) (srtt * srtt)); if (tmp > max_path) { max_path = tmp; } } } if (t_path_mptcp > 0) { mptcp_like_alpha = max_path / (t_path_mptcp * t_path_mptcp); } else { mptcp_like_alpha = 1; } } if (t_ssthresh == 0) { t_ssthresh = 1; } if (t_ucwnd_sbw == 0) { t_ucwnd_sbw = 1; } /******************************/ /* update cwnd and Early FR */ /******************************/ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { #ifdef JANA_CMT_FAST_RECOVERY /* * CMT fast recovery code. Need to debug. */ if (net->fast_retran_loss_recovery && net->new_pseudo_cumack) { if (SCTP_TSN_GE(asoc->last_acked_seq, net->fast_recovery_tsn) || SCTP_TSN_GE(net->pseudo_cumack, net->fast_recovery_tsn)) { net->will_exit_fast_recovery = 1; } } #endif /* if nothing was acked on this destination skip it */ if (net->net_ack == 0) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, 0, SCTP_CWND_LOG_FROM_SACK); } continue; } #ifdef JANA_CMT_FAST_RECOVERY /* * CMT fast recovery code */ /* * if (sctp_cmt_on_off > 0 && net->fast_retran_loss_recovery * && net->will_exit_fast_recovery == 0) { @@@ Do something * } else if (sctp_cmt_on_off == 0 && * asoc->fast_retran_loss_recovery && will_exit == 0) { */ #endif if (asoc->fast_retran_loss_recovery && (will_exit == 0) && (asoc->sctp_cmt_on_off == 0)) { /* * If we are in loss recovery we skip any cwnd * update */ return; } /* * Did any measurements go on for this network? */ if (use_rtcc && (net->cc_mod.rtcc.tls_needs_set > 0)) { uint64_t nbw; /* * At this point our bw_bytes has been updated by * incoming sack information. * * But our bw may not yet be set. * */ if ((net->cc_mod.rtcc.new_tot_time / 1000) > 0) { nbw = net->cc_mod.rtcc.bw_bytes / (net->cc_mod.rtcc.new_tot_time / 1000); } else { nbw = net->cc_mod.rtcc.bw_bytes; } if (net->cc_mod.rtcc.lbw) { if (cc_bw_limit(stcb, net, nbw)) { /* Hold here, no update */ continue; } } else { uint64_t vtag, probepoint; probepoint = (((uint64_t) net->cwnd) << 32); probepoint |= ((0xa << 16) | 0); vtag = (net->rtt << 32) | (((uint32_t) (stcb->sctp_ep->sctp_lport)) << 16) | (stcb->rport); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, nbw, ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), net->flight_size, probepoint); net->cc_mod.rtcc.lbw = nbw; net->cc_mod.rtcc.lbw_rtt = net->rtt; if (net->cc_mod.rtcc.rtt_set_this_sack) { net->cc_mod.rtcc.rtt_set_this_sack = 0; net->cc_mod.rtcc.bw_bytes_at_last_rttc = net->cc_mod.rtcc.bw_bytes; } } } /* * CMT: CUC algorithm. Update cwnd if pseudo-cumack has * moved. */ if (accum_moved || ((asoc->sctp_cmt_on_off > 0) && net->new_pseudo_cumack)) { /* If the cumulative ack moved we can proceed */ if (net->cwnd <= net->ssthresh) { /* We are in slow start */ if (net->flight_size + net->net_ack >= net->cwnd) { uint32_t limit; old_cwnd = net->cwnd; switch (asoc->sctp_cmt_on_off) { case SCTP_CMT_RPV1: limit = (uint32_t) (((uint64_t) net->mtu * (uint64_t) SCTP_BASE_SYSCTL(sctp_L2_abc_variable) * (uint64_t) net->ssthresh) / (uint64_t) t_ssthresh); incr = (uint32_t) (((uint64_t) net->net_ack * (uint64_t) net->ssthresh) / (uint64_t) t_ssthresh); if (incr > limit) { incr = limit; } if (incr == 0) { incr = 1; } break; case SCTP_CMT_RPV2: /* * lastsa>>3; we don't need * to divide ... */ srtt = net->lastsa; if (srtt == 0) { srtt = 1; } limit = (uint32_t) (((uint64_t) net->mtu * (uint64_t) SCTP_BASE_SYSCTL(sctp_L2_abc_variable) * (uint64_t) net->cwnd) / ((uint64_t) srtt * t_ucwnd_sbw)); /* INCREASE FACTOR */ incr = (uint32_t) (((uint64_t) net->net_ack * (uint64_t) net->cwnd) / ((uint64_t) srtt * t_ucwnd_sbw)); /* INCREASE FACTOR */ if (incr > limit) { incr = limit; } if (incr == 0) { incr = 1; } break; case SCTP_CMT_MPTCP: limit = (uint32_t) (((uint64_t) net->mtu * mptcp_like_alpha * (uint64_t) SCTP_BASE_SYSCTL(sctp_L2_abc_variable)) >> SHIFT_MPTCP_MULTI); incr = (uint32_t) (((uint64_t) net->net_ack * mptcp_like_alpha) >> SHIFT_MPTCP_MULTI); if (incr > limit) { incr = limit; } if (incr > net->net_ack) { incr = net->net_ack; } if (incr > net->mtu) { incr = net->mtu; } break; default: incr = net->net_ack; if (incr > net->mtu * SCTP_BASE_SYSCTL(sctp_L2_abc_variable)) { incr = net->mtu * SCTP_BASE_SYSCTL(sctp_L2_abc_variable); } break; } net->cwnd += incr; sctp_enforce_cwnd_limit(asoc, net); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, incr, SCTP_CWND_LOG_FROM_SS); } - SDT_PROBE(sctp, cwnd, net, ack, + SDT_PROBE5(sctp, cwnd, net, ack, stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net, old_cwnd, net->cwnd); } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, net->net_ack, SCTP_CWND_LOG_NOADV_SS); } } } else { /* We are in congestion avoidance */ /* * Add to pba */ net->partial_bytes_acked += net->net_ack; if ((net->flight_size + net->net_ack >= net->cwnd) && (net->partial_bytes_acked >= net->cwnd)) { net->partial_bytes_acked -= net->cwnd; old_cwnd = net->cwnd; switch (asoc->sctp_cmt_on_off) { case SCTP_CMT_RPV1: incr = (uint32_t) (((uint64_t) net->mtu * (uint64_t) net->ssthresh) / (uint64_t) t_ssthresh); if (incr == 0) { incr = 1; } break; case SCTP_CMT_RPV2: /* * lastsa>>3; we don't need * to divide ... */ srtt = net->lastsa; if (srtt == 0) { srtt = 1; } incr = (uint32_t) ((uint64_t) net->mtu * (uint64_t) net->cwnd / ((uint64_t) srtt * t_ucwnd_sbw)); /* INCREASE FACTOR */ if (incr == 0) { incr = 1; } break; case SCTP_CMT_MPTCP: incr = (uint32_t) ((mptcp_like_alpha * (uint64_t) net->cwnd) >> SHIFT_MPTCP_MULTI); if (incr > net->mtu) { incr = net->mtu; } break; default: incr = net->mtu; break; } net->cwnd += incr; sctp_enforce_cwnd_limit(asoc, net); - SDT_PROBE(sctp, cwnd, net, ack, + SDT_PROBE5(sctp, cwnd, net, ack, stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net, old_cwnd, net->cwnd); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, net->mtu, SCTP_CWND_LOG_FROM_CA); } } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, net->net_ack, SCTP_CWND_LOG_NOADV_CA); } } } } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, net->mtu, SCTP_CWND_LOG_NO_CUMACK); } } } } static void sctp_cwnd_update_exit_pf_common(struct sctp_tcb *stcb, struct sctp_nets *net) { int old_cwnd; old_cwnd = net->cwnd; net->cwnd = net->mtu; - SDT_PROBE(sctp, cwnd, net, ack, + SDT_PROBE5(sctp, cwnd, net, ack, stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net, old_cwnd, net->cwnd); SCTPDBG(SCTP_DEBUG_INDATA1, "Destination %p moved from PF to reachable with cwnd %d.\n", (void *)net, net->cwnd); } static void sctp_cwnd_update_after_timeout(struct sctp_tcb *stcb, struct sctp_nets *net) { int old_cwnd = net->cwnd; uint32_t t_ssthresh, t_cwnd; uint64_t t_ucwnd_sbw; /* MT FIXME: Don't compute this over and over again */ t_ssthresh = 0; t_cwnd = 0; if ((stcb->asoc.sctp_cmt_on_off == SCTP_CMT_RPV1) || (stcb->asoc.sctp_cmt_on_off == SCTP_CMT_RPV2)) { struct sctp_nets *lnet; uint32_t srtt; t_ucwnd_sbw = 0; TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) { t_ssthresh += lnet->ssthresh; t_cwnd += lnet->cwnd; srtt = lnet->lastsa; /* lastsa>>3; we don't need to divide ... */ if (srtt > 0) { t_ucwnd_sbw += (uint64_t) lnet->cwnd / (uint64_t) srtt; } } if (t_ssthresh < 1) { t_ssthresh = 1; } if (t_ucwnd_sbw < 1) { t_ucwnd_sbw = 1; } if (stcb->asoc.sctp_cmt_on_off == SCTP_CMT_RPV1) { net->ssthresh = (uint32_t) (((uint64_t) 4 * (uint64_t) net->mtu * (uint64_t) net->ssthresh) / (uint64_t) t_ssthresh); } else { uint64_t cc_delta; srtt = net->lastsa; /* lastsa>>3; we don't need to divide ... */ if (srtt == 0) { srtt = 1; } cc_delta = t_ucwnd_sbw * (uint64_t) srtt / 2; if (cc_delta < t_cwnd) { net->ssthresh = (uint32_t) ((uint64_t) t_cwnd - cc_delta); } else { net->ssthresh = net->mtu; } } if ((net->cwnd > t_cwnd / 2) && (net->ssthresh < net->cwnd - t_cwnd / 2)) { net->ssthresh = net->cwnd - t_cwnd / 2; } if (net->ssthresh < net->mtu) { net->ssthresh = net->mtu; } } else { net->ssthresh = max(net->cwnd / 2, 4 * net->mtu); } net->cwnd = net->mtu; net->partial_bytes_acked = 0; - SDT_PROBE(sctp, cwnd, net, to, + SDT_PROBE5(sctp, cwnd, net, to, stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net, old_cwnd, net->cwnd); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, net->cwnd - old_cwnd, SCTP_CWND_LOG_FROM_RTX); } } static void sctp_cwnd_update_after_ecn_echo_common(struct sctp_tcb *stcb, struct sctp_nets *net, int in_window, int num_pkt_lost, int use_rtcc) { int old_cwnd = net->cwnd; if ((use_rtcc) && (net->lan_type == SCTP_LAN_LOCAL) && (net->cc_mod.rtcc.use_dccc_ecn)) { /* Data center Congestion Control */ if (in_window == 0) { /* * Go to CA with the cwnd at the point we sent the * TSN that was marked with a CE. */ if (net->ecn_prev_cwnd < net->cwnd) { /* Restore to prev cwnd */ net->cwnd = net->ecn_prev_cwnd - (net->mtu * num_pkt_lost); } else { /* Just cut in 1/2 */ net->cwnd /= 2; } /* Drop to CA */ net->ssthresh = net->cwnd - (num_pkt_lost * net->mtu); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_SAT); } } else { /* * Further tuning down required over the drastic * orginal cut */ net->ssthresh -= (net->mtu * num_pkt_lost); net->cwnd -= (net->mtu * num_pkt_lost); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_SAT); } } SCTP_STAT_INCR(sctps_ecnereducedcwnd); } else { if (in_window == 0) { SCTP_STAT_INCR(sctps_ecnereducedcwnd); net->ssthresh = net->cwnd / 2; if (net->ssthresh < net->mtu) { net->ssthresh = net->mtu; /* * here back off the timer as well, to slow * us down */ net->RTO <<= 1; } net->cwnd = net->ssthresh; - SDT_PROBE(sctp, cwnd, net, ecn, + SDT_PROBE5(sctp, cwnd, net, ecn, stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net, old_cwnd, net->cwnd); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_SAT); } } } } static void sctp_cwnd_update_after_packet_dropped(struct sctp_tcb *stcb, struct sctp_nets *net, struct sctp_pktdrop_chunk *cp, uint32_t * bottle_bw, uint32_t * on_queue) { uint32_t bw_avail; unsigned int incr; int old_cwnd = net->cwnd; /* get bottle neck bw */ *bottle_bw = ntohl(cp->bottle_bw); /* and whats on queue */ *on_queue = ntohl(cp->current_onq); /* * adjust the on-queue if our flight is more it could be that the * router has not yet gotten data "in-flight" to it */ if (*on_queue < net->flight_size) { *on_queue = net->flight_size; } /* rtt is measured in micro seconds, bottle_bw in bytes per second */ bw_avail = (uint32_t) (((uint64_t) (*bottle_bw) * net->rtt) / (uint64_t) 1000000); if (bw_avail > *bottle_bw) { /* * Cap the growth to no more than the bottle neck. This can * happen as RTT slides up due to queues. It also means if * you have more than a 1 second RTT with a empty queue you * will be limited to the bottle_bw per second no matter if * other points have 1/2 the RTT and you could get more * out... */ bw_avail = *bottle_bw; } if (*on_queue > bw_avail) { /* * No room for anything else don't allow anything else to be * "added to the fire". */ int seg_inflight, seg_onqueue, my_portion; net->partial_bytes_acked = 0; /* how much are we over queue size? */ incr = *on_queue - bw_avail; if (stcb->asoc.seen_a_sack_this_pkt) { /* * undo any cwnd adjustment that the sack might have * made */ net->cwnd = net->prev_cwnd; } /* Now how much of that is mine? */ seg_inflight = net->flight_size / net->mtu; seg_onqueue = *on_queue / net->mtu; my_portion = (incr * seg_inflight) / seg_onqueue; /* Have I made an adjustment already */ if (net->cwnd > net->flight_size) { /* * for this flight I made an adjustment we need to * decrease the portion by a share our previous * adjustment. */ int diff_adj; diff_adj = net->cwnd - net->flight_size; if (diff_adj > my_portion) my_portion = 0; else my_portion -= diff_adj; } /* * back down to the previous cwnd (assume we have had a sack * before this packet). minus what ever portion of the * overage is my fault. */ net->cwnd -= my_portion; /* we will NOT back down more than 1 MTU */ if (net->cwnd <= net->mtu) { net->cwnd = net->mtu; } /* force into CA */ net->ssthresh = net->cwnd - 1; } else { /* * Take 1/4 of the space left or max burst up .. whichever * is less. */ incr = (bw_avail - *on_queue) >> 2; if ((stcb->asoc.max_burst > 0) && (stcb->asoc.max_burst * net->mtu < incr)) { incr = stcb->asoc.max_burst * net->mtu; } net->cwnd += incr; } if (net->cwnd > bw_avail) { /* We can't exceed the pipe size */ net->cwnd = bw_avail; } if (net->cwnd < net->mtu) { /* We always have 1 MTU */ net->cwnd = net->mtu; } sctp_enforce_cwnd_limit(&stcb->asoc, net); if (net->cwnd - old_cwnd != 0) { /* log only changes */ - SDT_PROBE(sctp, cwnd, net, pd, + SDT_PROBE5(sctp, cwnd, net, pd, stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net, old_cwnd, net->cwnd); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_SAT); } } } static void sctp_cwnd_update_after_output(struct sctp_tcb *stcb, struct sctp_nets *net, int burst_limit) { int old_cwnd = net->cwnd; if (net->ssthresh < net->cwnd) net->ssthresh = net->cwnd; if (burst_limit) { net->cwnd = (net->flight_size + (burst_limit * net->mtu)); sctp_enforce_cwnd_limit(&stcb->asoc, net); - SDT_PROBE(sctp, cwnd, net, bl, + SDT_PROBE5(sctp, cwnd, net, bl, stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net, old_cwnd, net->cwnd); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_BRST); } } } static void sctp_cwnd_update_after_sack(struct sctp_tcb *stcb, struct sctp_association *asoc, int accum_moved, int reneged_all, int will_exit) { /* Passing a zero argument in last disables the rtcc algoritm */ sctp_cwnd_update_after_sack_common(stcb, asoc, accum_moved, reneged_all, will_exit, 0); } static void sctp_cwnd_update_after_ecn_echo(struct sctp_tcb *stcb, struct sctp_nets *net, int in_window, int num_pkt_lost) { /* Passing a zero argument in last disables the rtcc algoritm */ sctp_cwnd_update_after_ecn_echo_common(stcb, net, in_window, num_pkt_lost, 0); } /* Here starts the RTCCVAR type CC invented by RRS which * is a slight mod to RFC2581. We reuse a common routine or * two since these algoritms are so close and need to * remain the same. */ static void sctp_cwnd_update_rtcc_after_ecn_echo(struct sctp_tcb *stcb, struct sctp_nets *net, int in_window, int num_pkt_lost) { sctp_cwnd_update_after_ecn_echo_common(stcb, net, in_window, num_pkt_lost, 1); } static void sctp_cwnd_update_rtcc_tsn_acknowledged(struct sctp_nets *net, struct sctp_tmit_chunk *tp1) { net->cc_mod.rtcc.bw_bytes += tp1->send_size; } static void sctp_cwnd_prepare_rtcc_net_for_sack(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_nets *net) { if (net->cc_mod.rtcc.tls_needs_set > 0) { /* We had a bw measurment going on */ struct timeval ltls; SCTP_GETPTIME_TIMEVAL(<ls); timevalsub(<ls, &net->cc_mod.rtcc.tls); net->cc_mod.rtcc.new_tot_time = (ltls.tv_sec * 1000000) + ltls.tv_usec; } } static void sctp_cwnd_new_rtcc_transmission_begins(struct sctp_tcb *stcb, struct sctp_nets *net) { uint64_t vtag, probepoint; if (net->cc_mod.rtcc.lbw) { /* Clear the old bw.. we went to 0 in-flight */ vtag = (net->rtt << 32) | (((uint32_t) (stcb->sctp_ep->sctp_lport)) << 16) | (stcb->rport); probepoint = (((uint64_t) net->cwnd) << 32); /* Probe point 8 */ probepoint |= ((8 << 16) | 0); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((net->cc_mod.rtcc.lbw << 32) | 0), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), net->flight_size, probepoint); net->cc_mod.rtcc.lbw_rtt = 0; net->cc_mod.rtcc.cwnd_at_bw_set = 0; net->cc_mod.rtcc.lbw = 0; net->cc_mod.rtcc.bw_bytes_at_last_rttc = 0; net->cc_mod.rtcc.vol_reduce = 0; net->cc_mod.rtcc.bw_tot_time = 0; net->cc_mod.rtcc.bw_bytes = 0; net->cc_mod.rtcc.tls_needs_set = 0; if (net->cc_mod.rtcc.steady_step) { net->cc_mod.rtcc.vol_reduce = 0; net->cc_mod.rtcc.step_cnt = 0; net->cc_mod.rtcc.last_step_state = 0; } if (net->cc_mod.rtcc.ret_from_eq) { /* less aggressive one - reset cwnd too */ uint32_t cwnd_in_mtu, cwnd; cwnd_in_mtu = SCTP_BASE_SYSCTL(sctp_initial_cwnd); if (cwnd_in_mtu == 0) { /* * Using 0 means that the value of RFC 4960 * is used. */ cwnd = min((net->mtu * 4), max((2 * net->mtu), SCTP_INITIAL_CWND)); } else { /* * We take the minimum of the burst limit * and the initial congestion window. */ if ((stcb->asoc.max_burst > 0) && (cwnd_in_mtu > stcb->asoc.max_burst)) cwnd_in_mtu = stcb->asoc.max_burst; cwnd = (net->mtu - sizeof(struct sctphdr)) * cwnd_in_mtu; } if (net->cwnd > cwnd) { /* * Only set if we are not a timeout (i.e. * down to 1 mtu) */ net->cwnd = cwnd; } } } } static void sctp_set_rtcc_initial_cc_param(struct sctp_tcb *stcb, struct sctp_nets *net) { uint64_t vtag, probepoint; sctp_set_initial_cc_param(stcb, net); stcb->asoc.use_precise_time = 1; probepoint = (((uint64_t) net->cwnd) << 32); probepoint |= ((9 << 16) | 0); vtag = (net->rtt << 32) | (((uint32_t) (stcb->sctp_ep->sctp_lport)) << 16) | (stcb->rport); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, 0, 0, 0, probepoint); net->cc_mod.rtcc.lbw_rtt = 0; net->cc_mod.rtcc.cwnd_at_bw_set = 0; net->cc_mod.rtcc.vol_reduce = 0; net->cc_mod.rtcc.lbw = 0; net->cc_mod.rtcc.vol_reduce = 0; net->cc_mod.rtcc.bw_bytes_at_last_rttc = 0; net->cc_mod.rtcc.bw_tot_time = 0; net->cc_mod.rtcc.bw_bytes = 0; net->cc_mod.rtcc.tls_needs_set = 0; net->cc_mod.rtcc.ret_from_eq = SCTP_BASE_SYSCTL(sctp_rttvar_eqret); net->cc_mod.rtcc.steady_step = SCTP_BASE_SYSCTL(sctp_steady_step); net->cc_mod.rtcc.use_dccc_ecn = SCTP_BASE_SYSCTL(sctp_use_dccc_ecn); net->cc_mod.rtcc.step_cnt = 0; net->cc_mod.rtcc.last_step_state = 0; } static int sctp_cwnd_rtcc_socket_option(struct sctp_tcb *stcb, int setorget, struct sctp_cc_option *cc_opt) { struct sctp_nets *net; if (setorget == 1) { /* a set */ if (cc_opt->option == SCTP_CC_OPT_RTCC_SETMODE) { if ((cc_opt->aid_value.assoc_value != 0) && (cc_opt->aid_value.assoc_value != 1)) { return (EINVAL); } TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { net->cc_mod.rtcc.ret_from_eq = cc_opt->aid_value.assoc_value; } } else if (cc_opt->option == SCTP_CC_OPT_USE_DCCC_ECN) { if ((cc_opt->aid_value.assoc_value != 0) && (cc_opt->aid_value.assoc_value != 1)) { return (EINVAL); } TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { net->cc_mod.rtcc.use_dccc_ecn = cc_opt->aid_value.assoc_value; } } else if (cc_opt->option == SCTP_CC_OPT_STEADY_STEP) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { net->cc_mod.rtcc.steady_step = cc_opt->aid_value.assoc_value; } } else { return (EINVAL); } } else { /* a get */ if (cc_opt->option == SCTP_CC_OPT_RTCC_SETMODE) { net = TAILQ_FIRST(&stcb->asoc.nets); if (net == NULL) { return (EFAULT); } cc_opt->aid_value.assoc_value = net->cc_mod.rtcc.ret_from_eq; } else if (cc_opt->option == SCTP_CC_OPT_USE_DCCC_ECN) { net = TAILQ_FIRST(&stcb->asoc.nets); if (net == NULL) { return (EFAULT); } cc_opt->aid_value.assoc_value = net->cc_mod.rtcc.use_dccc_ecn; } else if (cc_opt->option == SCTP_CC_OPT_STEADY_STEP) { net = TAILQ_FIRST(&stcb->asoc.nets); if (net == NULL) { return (EFAULT); } cc_opt->aid_value.assoc_value = net->cc_mod.rtcc.steady_step; } else { return (EINVAL); } } return (0); } static void sctp_cwnd_update_rtcc_packet_transmitted(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_nets *net) { if (net->cc_mod.rtcc.tls_needs_set == 0) { SCTP_GETPTIME_TIMEVAL(&net->cc_mod.rtcc.tls); net->cc_mod.rtcc.tls_needs_set = 2; } } static void sctp_cwnd_update_rtcc_after_sack(struct sctp_tcb *stcb, struct sctp_association *asoc, int accum_moved, int reneged_all, int will_exit) { /* Passing a one argument at the last enables the rtcc algoritm */ sctp_cwnd_update_after_sack_common(stcb, asoc, accum_moved, reneged_all, will_exit, 1); } static void sctp_rtt_rtcc_calculated(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_nets *net, struct timeval *now SCTP_UNUSED) { net->cc_mod.rtcc.rtt_set_this_sack = 1; } /* Here starts Sally Floyds HS-TCP */ struct sctp_hs_raise_drop { int32_t cwnd; int32_t increase; int32_t drop_percent; }; #define SCTP_HS_TABLE_SIZE 73 struct sctp_hs_raise_drop sctp_cwnd_adjust[SCTP_HS_TABLE_SIZE] = { {38, 1, 50}, /* 0 */ {118, 2, 44}, /* 1 */ {221, 3, 41}, /* 2 */ {347, 4, 38}, /* 3 */ {495, 5, 37}, /* 4 */ {663, 6, 35}, /* 5 */ {851, 7, 34}, /* 6 */ {1058, 8, 33}, /* 7 */ {1284, 9, 32}, /* 8 */ {1529, 10, 31}, /* 9 */ {1793, 11, 30}, /* 10 */ {2076, 12, 29}, /* 11 */ {2378, 13, 28}, /* 12 */ {2699, 14, 28}, /* 13 */ {3039, 15, 27}, /* 14 */ {3399, 16, 27}, /* 15 */ {3778, 17, 26}, /* 16 */ {4177, 18, 26}, /* 17 */ {4596, 19, 25}, /* 18 */ {5036, 20, 25}, /* 19 */ {5497, 21, 24}, /* 20 */ {5979, 22, 24}, /* 21 */ {6483, 23, 23}, /* 22 */ {7009, 24, 23}, /* 23 */ {7558, 25, 22}, /* 24 */ {8130, 26, 22}, /* 25 */ {8726, 27, 22}, /* 26 */ {9346, 28, 21}, /* 27 */ {9991, 29, 21}, /* 28 */ {10661, 30, 21}, /* 29 */ {11358, 31, 20}, /* 30 */ {12082, 32, 20}, /* 31 */ {12834, 33, 20}, /* 32 */ {13614, 34, 19}, /* 33 */ {14424, 35, 19}, /* 34 */ {15265, 36, 19}, /* 35 */ {16137, 37, 19}, /* 36 */ {17042, 38, 18}, /* 37 */ {17981, 39, 18}, /* 38 */ {18955, 40, 18}, /* 39 */ {19965, 41, 17}, /* 40 */ {21013, 42, 17}, /* 41 */ {22101, 43, 17}, /* 42 */ {23230, 44, 17}, /* 43 */ {24402, 45, 16}, /* 44 */ {25618, 46, 16}, /* 45 */ {26881, 47, 16}, /* 46 */ {28193, 48, 16}, /* 47 */ {29557, 49, 15}, /* 48 */ {30975, 50, 15}, /* 49 */ {32450, 51, 15}, /* 50 */ {33986, 52, 15}, /* 51 */ {35586, 53, 14}, /* 52 */ {37253, 54, 14}, /* 53 */ {38992, 55, 14}, /* 54 */ {40808, 56, 14}, /* 55 */ {42707, 57, 13}, /* 56 */ {44694, 58, 13}, /* 57 */ {46776, 59, 13}, /* 58 */ {48961, 60, 13}, /* 59 */ {51258, 61, 13}, /* 60 */ {53677, 62, 12}, /* 61 */ {56230, 63, 12}, /* 62 */ {58932, 64, 12}, /* 63 */ {61799, 65, 12}, /* 64 */ {64851, 66, 11}, /* 65 */ {68113, 67, 11}, /* 66 */ {71617, 68, 11}, /* 67 */ {75401, 69, 10}, /* 68 */ {79517, 70, 10}, /* 69 */ {84035, 71, 10}, /* 70 */ {89053, 72, 10}, /* 71 */ {94717, 73, 9} /* 72 */ }; static void sctp_hs_cwnd_increase(struct sctp_tcb *stcb, struct sctp_nets *net) { int cur_val, i, indx, incr; int old_cwnd = net->cwnd; cur_val = net->cwnd >> 10; indx = SCTP_HS_TABLE_SIZE - 1; if (cur_val < sctp_cwnd_adjust[0].cwnd) { /* normal mode */ if (net->net_ack > net->mtu) { net->cwnd += net->mtu; } else { net->cwnd += net->net_ack; } } else { for (i = net->last_hs_used; i < SCTP_HS_TABLE_SIZE; i++) { if (cur_val < sctp_cwnd_adjust[i].cwnd) { indx = i; break; } } net->last_hs_used = indx; incr = ((sctp_cwnd_adjust[indx].increase) << 10); net->cwnd += incr; } sctp_enforce_cwnd_limit(&stcb->asoc, net); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_SS); } } static void sctp_hs_cwnd_decrease(struct sctp_tcb *stcb, struct sctp_nets *net) { int cur_val, i, indx; int old_cwnd = net->cwnd; cur_val = net->cwnd >> 10; if (cur_val < sctp_cwnd_adjust[0].cwnd) { /* normal mode */ net->ssthresh = net->cwnd / 2; if (net->ssthresh < (net->mtu * 2)) { net->ssthresh = 2 * net->mtu; } net->cwnd = net->ssthresh; } else { /* drop by the proper amount */ net->ssthresh = net->cwnd - (int)((net->cwnd / 100) * sctp_cwnd_adjust[net->last_hs_used].drop_percent); net->cwnd = net->ssthresh; /* now where are we */ indx = net->last_hs_used; cur_val = net->cwnd >> 10; /* reset where we are in the table */ if (cur_val < sctp_cwnd_adjust[0].cwnd) { /* feel out of hs */ net->last_hs_used = 0; } else { for (i = indx; i >= 1; i--) { if (cur_val > sctp_cwnd_adjust[i - 1].cwnd) { break; } } net->last_hs_used = indx; } } sctp_enforce_cwnd_limit(&stcb->asoc, net); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_FR); } } static void sctp_hs_cwnd_update_after_fr(struct sctp_tcb *stcb, struct sctp_association *asoc) { struct sctp_nets *net; /* * CMT fast recovery code. Need to debug. ((sctp_cmt_on_off > 0) && * (net->fast_retran_loss_recovery == 0))) */ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { if ((asoc->fast_retran_loss_recovery == 0) || (asoc->sctp_cmt_on_off > 0)) { /* out of a RFC2582 Fast recovery window? */ if (net->net_ack > 0) { /* * per section 7.2.3, are there any * destinations that had a fast retransmit * to them. If so what we need to do is * adjust ssthresh and cwnd. */ struct sctp_tmit_chunk *lchk; sctp_hs_cwnd_decrease(stcb, net); lchk = TAILQ_FIRST(&asoc->send_queue); net->partial_bytes_acked = 0; /* Turn on fast recovery window */ asoc->fast_retran_loss_recovery = 1; if (lchk == NULL) { /* Mark end of the window */ asoc->fast_recovery_tsn = asoc->sending_seq - 1; } else { asoc->fast_recovery_tsn = lchk->rec.data.TSN_seq - 1; } /* * CMT fast recovery -- per destination * recovery variable. */ net->fast_retran_loss_recovery = 1; if (lchk == NULL) { /* Mark end of the window */ net->fast_recovery_tsn = asoc->sending_seq - 1; } else { net->fast_recovery_tsn = lchk->rec.data.TSN_seq - 1; } sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_CC_FUNCTIONS + SCTP_LOC_2); sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net); } } else if (net->net_ack > 0) { /* * Mark a peg that we WOULD have done a cwnd * reduction but RFC2582 prevented this action. */ SCTP_STAT_INCR(sctps_fastretransinrtt); } } } static void sctp_hs_cwnd_update_after_sack(struct sctp_tcb *stcb, struct sctp_association *asoc, int accum_moved, int reneged_all SCTP_UNUSED, int will_exit) { struct sctp_nets *net; /******************************/ /* update cwnd and Early FR */ /******************************/ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { #ifdef JANA_CMT_FAST_RECOVERY /* * CMT fast recovery code. Need to debug. */ if (net->fast_retran_loss_recovery && net->new_pseudo_cumack) { if (SCTP_TSN_GE(asoc->last_acked_seq, net->fast_recovery_tsn) || SCTP_TSN_GE(net->pseudo_cumack, net->fast_recovery_tsn)) { net->will_exit_fast_recovery = 1; } } #endif /* if nothing was acked on this destination skip it */ if (net->net_ack == 0) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, 0, SCTP_CWND_LOG_FROM_SACK); } continue; } #ifdef JANA_CMT_FAST_RECOVERY /* * CMT fast recovery code */ /* * if (sctp_cmt_on_off > 0 && net->fast_retran_loss_recovery * && net->will_exit_fast_recovery == 0) { @@@ Do something * } else if (sctp_cmt_on_off == 0 && * asoc->fast_retran_loss_recovery && will_exit == 0) { */ #endif if (asoc->fast_retran_loss_recovery && (will_exit == 0) && (asoc->sctp_cmt_on_off == 0)) { /* * If we are in loss recovery we skip any cwnd * update */ return; } /* * CMT: CUC algorithm. Update cwnd if pseudo-cumack has * moved. */ if (accum_moved || ((asoc->sctp_cmt_on_off > 0) && net->new_pseudo_cumack)) { /* If the cumulative ack moved we can proceed */ if (net->cwnd <= net->ssthresh) { /* We are in slow start */ if (net->flight_size + net->net_ack >= net->cwnd) { sctp_hs_cwnd_increase(stcb, net); } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, net->net_ack, SCTP_CWND_LOG_NOADV_SS); } } } else { /* We are in congestion avoidance */ net->partial_bytes_acked += net->net_ack; if ((net->flight_size + net->net_ack >= net->cwnd) && (net->partial_bytes_acked >= net->cwnd)) { net->partial_bytes_acked -= net->cwnd; net->cwnd += net->mtu; sctp_enforce_cwnd_limit(asoc, net); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, net->mtu, SCTP_CWND_LOG_FROM_CA); } } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, net->net_ack, SCTP_CWND_LOG_NOADV_CA); } } } } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, net->mtu, SCTP_CWND_LOG_NO_CUMACK); } } } } /* * H-TCP congestion control. The algorithm is detailed in: * R.N.Shorten, D.J.Leith: * "H-TCP: TCP for high-speed and long-distance networks" * Proc. PFLDnet, Argonne, 2004. * http://www.hamilton.ie/net/htcp3.pdf */ static int use_rtt_scaling = 1; static int use_bandwidth_switch = 1; static inline int between(uint32_t seq1, uint32_t seq2, uint32_t seq3) { return (seq3 - seq2 >= seq1 - seq2); } static inline uint32_t htcp_cong_time(struct htcp *ca) { return (sctp_get_tick_count() - ca->last_cong); } static inline uint32_t htcp_ccount(struct htcp *ca) { return (htcp_cong_time(ca) / ca->minRTT); } static inline void htcp_reset(struct htcp *ca) { ca->undo_last_cong = ca->last_cong; ca->undo_maxRTT = ca->maxRTT; ca->undo_old_maxB = ca->old_maxB; ca->last_cong = sctp_get_tick_count(); } #ifdef SCTP_NOT_USED static uint32_t htcp_cwnd_undo(struct sctp_tcb *stcb, struct sctp_nets *net) { net->cc_mod.htcp_ca.last_cong = net->cc_mod.htcp_ca.undo_last_cong; net->cc_mod.htcp_ca.maxRTT = net->cc_mod.htcp_ca.undo_maxRTT; net->cc_mod.htcp_ca.old_maxB = net->cc_mod.htcp_ca.undo_old_maxB; return (max(net->cwnd, ((net->ssthresh / net->mtu << 7) / net->cc_mod.htcp_ca.beta) * net->mtu)); } #endif static inline void measure_rtt(struct sctp_nets *net) { uint32_t srtt = net->lastsa >> SCTP_RTT_SHIFT; /* keep track of minimum RTT seen so far, minRTT is zero at first */ if (net->cc_mod.htcp_ca.minRTT > srtt || !net->cc_mod.htcp_ca.minRTT) net->cc_mod.htcp_ca.minRTT = srtt; /* max RTT */ if (net->fast_retran_ip == 0 && net->ssthresh < 0xFFFF && htcp_ccount(&net->cc_mod.htcp_ca) > 3) { if (net->cc_mod.htcp_ca.maxRTT < net->cc_mod.htcp_ca.minRTT) net->cc_mod.htcp_ca.maxRTT = net->cc_mod.htcp_ca.minRTT; if (net->cc_mod.htcp_ca.maxRTT < srtt && srtt <= net->cc_mod.htcp_ca.maxRTT + MSEC_TO_TICKS(20)) net->cc_mod.htcp_ca.maxRTT = srtt; } } static void measure_achieved_throughput(struct sctp_nets *net) { uint32_t now = sctp_get_tick_count(); if (net->fast_retran_ip == 0) net->cc_mod.htcp_ca.bytes_acked = net->net_ack; if (!use_bandwidth_switch) return; /* achieved throughput calculations */ /* JRS - not 100% sure of this statement */ if (net->fast_retran_ip == 1) { net->cc_mod.htcp_ca.bytecount = 0; net->cc_mod.htcp_ca.lasttime = now; return; } net->cc_mod.htcp_ca.bytecount += net->net_ack; if ((net->cc_mod.htcp_ca.bytecount >= net->cwnd - (((net->cc_mod.htcp_ca.alpha >> 7) ? (net->cc_mod.htcp_ca.alpha >> 7) : 1) * net->mtu)) && (now - net->cc_mod.htcp_ca.lasttime >= net->cc_mod.htcp_ca.minRTT) && (net->cc_mod.htcp_ca.minRTT > 0)) { uint32_t cur_Bi = net->cc_mod.htcp_ca.bytecount / net->mtu * hz / (now - net->cc_mod.htcp_ca.lasttime); if (htcp_ccount(&net->cc_mod.htcp_ca) <= 3) { /* just after backoff */ net->cc_mod.htcp_ca.minB = net->cc_mod.htcp_ca.maxB = net->cc_mod.htcp_ca.Bi = cur_Bi; } else { net->cc_mod.htcp_ca.Bi = (3 * net->cc_mod.htcp_ca.Bi + cur_Bi) / 4; if (net->cc_mod.htcp_ca.Bi > net->cc_mod.htcp_ca.maxB) net->cc_mod.htcp_ca.maxB = net->cc_mod.htcp_ca.Bi; if (net->cc_mod.htcp_ca.minB > net->cc_mod.htcp_ca.maxB) net->cc_mod.htcp_ca.minB = net->cc_mod.htcp_ca.maxB; } net->cc_mod.htcp_ca.bytecount = 0; net->cc_mod.htcp_ca.lasttime = now; } } static inline void htcp_beta_update(struct htcp *ca, uint32_t minRTT, uint32_t maxRTT) { if (use_bandwidth_switch) { uint32_t maxB = ca->maxB; uint32_t old_maxB = ca->old_maxB; ca->old_maxB = ca->maxB; if (!between(5 * maxB, 4 * old_maxB, 6 * old_maxB)) { ca->beta = BETA_MIN; ca->modeswitch = 0; return; } } if (ca->modeswitch && minRTT > (uint32_t) MSEC_TO_TICKS(10) && maxRTT) { ca->beta = (minRTT << 7) / maxRTT; if (ca->beta < BETA_MIN) ca->beta = BETA_MIN; else if (ca->beta > BETA_MAX) ca->beta = BETA_MAX; } else { ca->beta = BETA_MIN; ca->modeswitch = 1; } } static inline void htcp_alpha_update(struct htcp *ca) { uint32_t minRTT = ca->minRTT; uint32_t factor = 1; uint32_t diff = htcp_cong_time(ca); if (diff > (uint32_t) hz) { diff -= hz; factor = 1 + (10 * diff + ((diff / 2) * (diff / 2) / hz)) / hz; } if (use_rtt_scaling && minRTT) { uint32_t scale = (hz << 3) / (10 * minRTT); scale = min(max(scale, 1U << 2), 10U << 3); /* clamping ratio to * interval [0.5,10]<<3 */ factor = (factor << 3) / scale; if (!factor) factor = 1; } ca->alpha = 2 * factor * ((1 << 7) - ca->beta); if (!ca->alpha) ca->alpha = ALPHA_BASE; } /* After we have the rtt data to calculate beta, we'd still prefer to wait one * rtt before we adjust our beta to ensure we are working from a consistent * data. * * This function should be called when we hit a congestion event since only at * that point do we really have a real sense of maxRTT (the queues en route * were getting just too full now). */ static void htcp_param_update(struct sctp_nets *net) { uint32_t minRTT = net->cc_mod.htcp_ca.minRTT; uint32_t maxRTT = net->cc_mod.htcp_ca.maxRTT; htcp_beta_update(&net->cc_mod.htcp_ca, minRTT, maxRTT); htcp_alpha_update(&net->cc_mod.htcp_ca); /* * add slowly fading memory for maxRTT to accommodate routing * changes etc */ if (minRTT > 0 && maxRTT > minRTT) net->cc_mod.htcp_ca.maxRTT = minRTT + ((maxRTT - minRTT) * 95) / 100; } static uint32_t htcp_recalc_ssthresh(struct sctp_nets *net) { htcp_param_update(net); return (max(((net->cwnd / net->mtu * net->cc_mod.htcp_ca.beta) >> 7) * net->mtu, 2U * net->mtu)); } static void htcp_cong_avoid(struct sctp_tcb *stcb, struct sctp_nets *net) { /*- * How to handle these functions? * if (!tcp_is_cwnd_limited(sk, in_flight)) RRS - good question. * return; */ if (net->cwnd <= net->ssthresh) { /* We are in slow start */ if (net->flight_size + net->net_ack >= net->cwnd) { if (net->net_ack > (net->mtu * SCTP_BASE_SYSCTL(sctp_L2_abc_variable))) { net->cwnd += (net->mtu * SCTP_BASE_SYSCTL(sctp_L2_abc_variable)); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, net->mtu, SCTP_CWND_LOG_FROM_SS); } } else { net->cwnd += net->net_ack; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, net->net_ack, SCTP_CWND_LOG_FROM_SS); } } sctp_enforce_cwnd_limit(&stcb->asoc, net); } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, net->net_ack, SCTP_CWND_LOG_NOADV_SS); } } } else { measure_rtt(net); /* * In dangerous area, increase slowly. In theory this is * net->cwnd += alpha / net->cwnd */ /* What is snd_cwnd_cnt?? */ if (((net->partial_bytes_acked / net->mtu * net->cc_mod.htcp_ca.alpha) >> 7) * net->mtu >= net->cwnd) { /*- * Does SCTP have a cwnd clamp? * if (net->snd_cwnd < net->snd_cwnd_clamp) - Nope (RRS). */ net->cwnd += net->mtu; net->partial_bytes_acked = 0; sctp_enforce_cwnd_limit(&stcb->asoc, net); htcp_alpha_update(&net->cc_mod.htcp_ca); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, net->mtu, SCTP_CWND_LOG_FROM_CA); } } else { net->partial_bytes_acked += net->net_ack; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, net->net_ack, SCTP_CWND_LOG_NOADV_CA); } } net->cc_mod.htcp_ca.bytes_acked = net->mtu; } } #ifdef SCTP_NOT_USED /* Lower bound on congestion window. */ static uint32_t htcp_min_cwnd(struct sctp_tcb *stcb, struct sctp_nets *net) { return (net->ssthresh); } #endif static void htcp_init(struct sctp_nets *net) { memset(&net->cc_mod.htcp_ca, 0, sizeof(struct htcp)); net->cc_mod.htcp_ca.alpha = ALPHA_BASE; net->cc_mod.htcp_ca.beta = BETA_MIN; net->cc_mod.htcp_ca.bytes_acked = net->mtu; net->cc_mod.htcp_ca.last_cong = sctp_get_tick_count(); } static void sctp_htcp_set_initial_cc_param(struct sctp_tcb *stcb, struct sctp_nets *net) { /* * We take the max of the burst limit times a MTU or the * INITIAL_CWND. We then limit this to 4 MTU's of sending. */ net->cwnd = min((net->mtu * 4), max((2 * net->mtu), SCTP_INITIAL_CWND)); net->ssthresh = stcb->asoc.peers_rwnd; sctp_enforce_cwnd_limit(&stcb->asoc, net); htcp_init(net); if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_CWND_MONITOR_ENABLE | SCTP_CWND_LOGGING_ENABLE)) { sctp_log_cwnd(stcb, net, 0, SCTP_CWND_INITIALIZATION); } } static void sctp_htcp_cwnd_update_after_sack(struct sctp_tcb *stcb, struct sctp_association *asoc, int accum_moved, int reneged_all SCTP_UNUSED, int will_exit) { struct sctp_nets *net; /******************************/ /* update cwnd and Early FR */ /******************************/ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { #ifdef JANA_CMT_FAST_RECOVERY /* * CMT fast recovery code. Need to debug. */ if (net->fast_retran_loss_recovery && net->new_pseudo_cumack) { if (SCTP_TSN_GE(asoc->last_acked_seq, net->fast_recovery_tsn) || SCTP_TSN_GE(net->pseudo_cumack, net->fast_recovery_tsn)) { net->will_exit_fast_recovery = 1; } } #endif /* if nothing was acked on this destination skip it */ if (net->net_ack == 0) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, 0, SCTP_CWND_LOG_FROM_SACK); } continue; } #ifdef JANA_CMT_FAST_RECOVERY /* * CMT fast recovery code */ /* * if (sctp_cmt_on_off > 0 && net->fast_retran_loss_recovery * && net->will_exit_fast_recovery == 0) { @@@ Do something * } else if (sctp_cmt_on_off == 0 && * asoc->fast_retran_loss_recovery && will_exit == 0) { */ #endif if (asoc->fast_retran_loss_recovery && will_exit == 0 && (asoc->sctp_cmt_on_off == 0)) { /* * If we are in loss recovery we skip any cwnd * update */ return; } /* * CMT: CUC algorithm. Update cwnd if pseudo-cumack has * moved. */ if (accum_moved || ((asoc->sctp_cmt_on_off > 0) && net->new_pseudo_cumack)) { htcp_cong_avoid(stcb, net); measure_achieved_throughput(net); } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, net->mtu, SCTP_CWND_LOG_NO_CUMACK); } } } } static void sctp_htcp_cwnd_update_after_fr(struct sctp_tcb *stcb, struct sctp_association *asoc) { struct sctp_nets *net; /* * CMT fast recovery code. Need to debug. ((sctp_cmt_on_off > 0) && * (net->fast_retran_loss_recovery == 0))) */ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { if ((asoc->fast_retran_loss_recovery == 0) || (asoc->sctp_cmt_on_off > 0)) { /* out of a RFC2582 Fast recovery window? */ if (net->net_ack > 0) { /* * per section 7.2.3, are there any * destinations that had a fast retransmit * to them. If so what we need to do is * adjust ssthresh and cwnd. */ struct sctp_tmit_chunk *lchk; int old_cwnd = net->cwnd; /* JRS - reset as if state were changed */ htcp_reset(&net->cc_mod.htcp_ca); net->ssthresh = htcp_recalc_ssthresh(net); net->cwnd = net->ssthresh; sctp_enforce_cwnd_limit(asoc, net); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_FR); } lchk = TAILQ_FIRST(&asoc->send_queue); net->partial_bytes_acked = 0; /* Turn on fast recovery window */ asoc->fast_retran_loss_recovery = 1; if (lchk == NULL) { /* Mark end of the window */ asoc->fast_recovery_tsn = asoc->sending_seq - 1; } else { asoc->fast_recovery_tsn = lchk->rec.data.TSN_seq - 1; } /* * CMT fast recovery -- per destination * recovery variable. */ net->fast_retran_loss_recovery = 1; if (lchk == NULL) { /* Mark end of the window */ net->fast_recovery_tsn = asoc->sending_seq - 1; } else { net->fast_recovery_tsn = lchk->rec.data.TSN_seq - 1; } sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_CC_FUNCTIONS + SCTP_LOC_3); sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net); } } else if (net->net_ack > 0) { /* * Mark a peg that we WOULD have done a cwnd * reduction but RFC2582 prevented this action. */ SCTP_STAT_INCR(sctps_fastretransinrtt); } } } static void sctp_htcp_cwnd_update_after_timeout(struct sctp_tcb *stcb, struct sctp_nets *net) { int old_cwnd = net->cwnd; /* JRS - reset as if the state were being changed to timeout */ htcp_reset(&net->cc_mod.htcp_ca); net->ssthresh = htcp_recalc_ssthresh(net); net->cwnd = net->mtu; net->partial_bytes_acked = 0; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, net->cwnd - old_cwnd, SCTP_CWND_LOG_FROM_RTX); } } static void sctp_htcp_cwnd_update_after_ecn_echo(struct sctp_tcb *stcb, struct sctp_nets *net, int in_window, int num_pkt_lost SCTP_UNUSED) { int old_cwnd; old_cwnd = net->cwnd; /* JRS - reset hctp as if state changed */ if (in_window == 0) { htcp_reset(&net->cc_mod.htcp_ca); SCTP_STAT_INCR(sctps_ecnereducedcwnd); net->ssthresh = htcp_recalc_ssthresh(net); if (net->ssthresh < net->mtu) { net->ssthresh = net->mtu; /* here back off the timer as well, to slow us down */ net->RTO <<= 1; } net->cwnd = net->ssthresh; sctp_enforce_cwnd_limit(&stcb->asoc, net); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_SAT); } } } struct sctp_cc_functions sctp_cc_functions[] = { { .sctp_set_initial_cc_param = sctp_set_initial_cc_param, .sctp_cwnd_update_after_sack = sctp_cwnd_update_after_sack, .sctp_cwnd_update_exit_pf = sctp_cwnd_update_exit_pf_common, .sctp_cwnd_update_after_fr = sctp_cwnd_update_after_fr, .sctp_cwnd_update_after_timeout = sctp_cwnd_update_after_timeout, .sctp_cwnd_update_after_ecn_echo = sctp_cwnd_update_after_ecn_echo, .sctp_cwnd_update_after_packet_dropped = sctp_cwnd_update_after_packet_dropped, .sctp_cwnd_update_after_output = sctp_cwnd_update_after_output, }, { .sctp_set_initial_cc_param = sctp_set_initial_cc_param, .sctp_cwnd_update_after_sack = sctp_hs_cwnd_update_after_sack, .sctp_cwnd_update_exit_pf = sctp_cwnd_update_exit_pf_common, .sctp_cwnd_update_after_fr = sctp_hs_cwnd_update_after_fr, .sctp_cwnd_update_after_timeout = sctp_cwnd_update_after_timeout, .sctp_cwnd_update_after_ecn_echo = sctp_cwnd_update_after_ecn_echo, .sctp_cwnd_update_after_packet_dropped = sctp_cwnd_update_after_packet_dropped, .sctp_cwnd_update_after_output = sctp_cwnd_update_after_output, }, { .sctp_set_initial_cc_param = sctp_htcp_set_initial_cc_param, .sctp_cwnd_update_after_sack = sctp_htcp_cwnd_update_after_sack, .sctp_cwnd_update_exit_pf = sctp_cwnd_update_exit_pf_common, .sctp_cwnd_update_after_fr = sctp_htcp_cwnd_update_after_fr, .sctp_cwnd_update_after_timeout = sctp_htcp_cwnd_update_after_timeout, .sctp_cwnd_update_after_ecn_echo = sctp_htcp_cwnd_update_after_ecn_echo, .sctp_cwnd_update_after_packet_dropped = sctp_cwnd_update_after_packet_dropped, .sctp_cwnd_update_after_output = sctp_cwnd_update_after_output, }, { .sctp_set_initial_cc_param = sctp_set_rtcc_initial_cc_param, .sctp_cwnd_update_after_sack = sctp_cwnd_update_rtcc_after_sack, .sctp_cwnd_update_exit_pf = sctp_cwnd_update_exit_pf_common, .sctp_cwnd_update_after_fr = sctp_cwnd_update_after_fr, .sctp_cwnd_update_after_timeout = sctp_cwnd_update_after_timeout, .sctp_cwnd_update_after_ecn_echo = sctp_cwnd_update_rtcc_after_ecn_echo, .sctp_cwnd_update_after_packet_dropped = sctp_cwnd_update_after_packet_dropped, .sctp_cwnd_update_after_output = sctp_cwnd_update_after_output, .sctp_cwnd_update_packet_transmitted = sctp_cwnd_update_rtcc_packet_transmitted, .sctp_cwnd_update_tsn_acknowledged = sctp_cwnd_update_rtcc_tsn_acknowledged, .sctp_cwnd_new_transmission_begins = sctp_cwnd_new_rtcc_transmission_begins, .sctp_cwnd_prepare_net_for_sack = sctp_cwnd_prepare_rtcc_net_for_sack, .sctp_cwnd_socket_option = sctp_cwnd_rtcc_socket_option, .sctp_rtt_calculated = sctp_rtt_rtcc_calculated } }; Index: head/sys/security/mac/mac_framework.c =================================================================== --- head/sys/security/mac/mac_framework.c (revision 292383) +++ head/sys/security/mac/mac_framework.c (revision 292384) @@ -1,596 +1,596 @@ /*- * Copyright (c) 1999-2002, 2006, 2009 Robert N. M. Watson * Copyright (c) 2001 Ilmar S. Habibulin * Copyright (c) 2001-2005 Networks Associates Technology, Inc. * Copyright (c) 2005-2006 SPARTA, Inc. * Copyright (c) 2008-2009 Apple Inc. * All rights reserved. * * This software was developed by Robert Watson and Ilmar Habibulin for the * TrustedBSD Project. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * This software was enhanced by SPARTA ISSO under SPAWAR contract * N66001-04-C-6019 ("SEFOS"). * * This software was developed at the University of Cambridge Computer * Laboratory with support from a grant from Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Framework for extensible kernel access control. This file contains core * kernel infrastructure for the TrustedBSD MAC Framework, including policy * registration, versioning, locking, error composition operator, and system * calls. * * The MAC Framework implements three programming interfaces: * * - The kernel MAC interface, defined in mac_framework.h, and invoked * throughout the kernel to request security decisions, notify of security * related events, etc. * * - The MAC policy module interface, defined in mac_policy.h, which is * implemented by MAC policy modules and invoked by the MAC Framework to * forward kernel security requests and notifications to policy modules. * * - The user MAC API, defined in mac.h, which allows user programs to query * and set label state on objects. * * The majority of the MAC Framework implementation may be found in * src/sys/security/mac. Sample policy modules may be found in * src/sys/security/mac_*. */ #include "opt_mac.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * DTrace SDT providers for MAC. */ SDT_PROVIDER_DEFINE(mac); SDT_PROVIDER_DEFINE(mac_framework); -SDT_PROBE_DEFINE2(mac, kernel, policy, modevent, "int", +SDT_PROBE_DEFINE2(mac, , policy, modevent, "int", "struct mac_policy_conf *"); -SDT_PROBE_DEFINE1(mac, kernel, policy, register, +SDT_PROBE_DEFINE1(mac, , policy, register, "struct mac_policy_conf *"); -SDT_PROBE_DEFINE1(mac, kernel, policy, unregister, +SDT_PROBE_DEFINE1(mac, , policy, unregister, "struct mac_policy_conf *"); /* * Root sysctl node for all MAC and MAC policy controls. */ SYSCTL_NODE(_security, OID_AUTO, mac, CTLFLAG_RW, 0, "TrustedBSD MAC policy controls"); /* * Declare that the kernel provides MAC support, version 3 (FreeBSD 7.x). * This permits modules to refuse to be loaded if the necessary support isn't * present, even if it's pre-boot. */ MODULE_VERSION(kernel_mac_support, MAC_VERSION); static unsigned int mac_version = MAC_VERSION; SYSCTL_UINT(_security_mac, OID_AUTO, version, CTLFLAG_RD, &mac_version, 0, ""); /* * Labels consist of a indexed set of "slots", which are allocated policies * as required. The MAC Framework maintains a bitmask of slots allocated so * far to prevent reuse. Slots cannot be reused, as the MAC Framework * guarantees that newly allocated slots in labels will be NULL unless * otherwise initialized, and because we do not have a mechanism to garbage * collect slots on policy unload. As labeled policies tend to be statically * loaded during boot, and not frequently unloaded and reloaded, this is not * generally an issue. */ #if MAC_MAX_SLOTS > 32 #error "MAC_MAX_SLOTS too large" #endif static unsigned int mac_max_slots = MAC_MAX_SLOTS; static unsigned int mac_slot_offsets_free = (1 << MAC_MAX_SLOTS) - 1; SYSCTL_UINT(_security_mac, OID_AUTO, max_slots, CTLFLAG_RD, &mac_max_slots, 0, ""); /* * Has the kernel started generating labeled objects yet? All read/write * access to this variable is serialized during the boot process. Following * the end of serialization, we don't update this flag; no locking. */ static int mac_late = 0; /* * Each policy declares a mask of object types requiring labels to be * allocated for them. For convenience, we combine and cache the bitwise or * of the per-policy object flags to track whether we will allocate a label * for an object type at run-time. */ uint64_t mac_labeled; SYSCTL_UQUAD(_security_mac, OID_AUTO, labeled, CTLFLAG_RD, &mac_labeled, 0, "Mask of object types being labeled"); MALLOC_DEFINE(M_MACTEMP, "mactemp", "MAC temporary label storage"); /* * MAC policy modules are placed in one of two lists: mac_static_policy_list, * for policies that are loaded early and cannot be unloaded, and * mac_policy_list, which holds policies either loaded later in the boot * cycle or that may be unloaded. The static policy list does not require * locks to iterate over, but the dynamic list requires synchronization. * Support for dynamic policy loading can be compiled out using the * MAC_STATIC kernel option. * * The dynamic policy list is protected by two locks: modifying the list * requires both locks to be held exclusively. One of the locks, * mac_policy_rm, is acquired over policy entry points that will never sleep; * the other, mac_policy_sx, is acquire over policy entry points that may * sleep. The former category will be used when kernel locks may be held * over calls to the MAC Framework, during network processing in ithreads, * etc. The latter will tend to involve potentially blocking memory * allocations, extended attribute I/O, etc. */ #ifndef MAC_STATIC static struct rmlock mac_policy_rm; /* Non-sleeping entry points. */ static struct sx mac_policy_sx; /* Sleeping entry points. */ #endif struct mac_policy_list_head mac_policy_list; struct mac_policy_list_head mac_static_policy_list; u_int mac_policy_count; /* Registered policy count. */ static void mac_policy_xlock(void); static void mac_policy_xlock_assert(void); static void mac_policy_xunlock(void); void mac_policy_slock_nosleep(struct rm_priotracker *tracker) { #ifndef MAC_STATIC if (!mac_late) return; rm_rlock(&mac_policy_rm, tracker); #endif } void mac_policy_slock_sleep(void) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "mac_policy_slock_sleep"); #ifndef MAC_STATIC if (!mac_late) return; sx_slock(&mac_policy_sx); #endif } void mac_policy_sunlock_nosleep(struct rm_priotracker *tracker) { #ifndef MAC_STATIC if (!mac_late) return; rm_runlock(&mac_policy_rm, tracker); #endif } void mac_policy_sunlock_sleep(void) { #ifndef MAC_STATIC if (!mac_late) return; sx_sunlock(&mac_policy_sx); #endif } static void mac_policy_xlock(void) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "mac_policy_xlock()"); #ifndef MAC_STATIC if (!mac_late) return; sx_xlock(&mac_policy_sx); rm_wlock(&mac_policy_rm); #endif } static void mac_policy_xunlock(void) { #ifndef MAC_STATIC if (!mac_late) return; rm_wunlock(&mac_policy_rm); sx_xunlock(&mac_policy_sx); #endif } static void mac_policy_xlock_assert(void) { #ifndef MAC_STATIC if (!mac_late) return; /* XXXRW: rm_assert(&mac_policy_rm, RA_WLOCKED); */ sx_assert(&mac_policy_sx, SA_XLOCKED); #endif } /* * Initialize the MAC subsystem, including appropriate SMP locks. */ static void mac_init(void) { LIST_INIT(&mac_static_policy_list); LIST_INIT(&mac_policy_list); mac_labelzone_init(); #ifndef MAC_STATIC rm_init_flags(&mac_policy_rm, "mac_policy_rm", RM_NOWITNESS | RM_RECURSE); sx_init_flags(&mac_policy_sx, "mac_policy_sx", SX_NOWITNESS); #endif } /* * For the purposes of modules that want to know if they were loaded "early", * set the mac_late flag once we've processed modules either linked into the * kernel, or loaded before the kernel startup. */ static void mac_late_init(void) { mac_late = 1; } /* * Given a policy, derive from its set of non-NULL label init methods what * object types the policy is interested in. */ static uint64_t mac_policy_getlabeled(struct mac_policy_conf *mpc) { uint64_t labeled; #define MPC_FLAG(method, flag) \ if (mpc->mpc_ops->mpo_ ## method != NULL) \ labeled |= (flag); \ labeled = 0; MPC_FLAG(cred_init_label, MPC_OBJECT_CRED); MPC_FLAG(proc_init_label, MPC_OBJECT_PROC); MPC_FLAG(vnode_init_label, MPC_OBJECT_VNODE); MPC_FLAG(inpcb_init_label, MPC_OBJECT_INPCB); MPC_FLAG(socket_init_label, MPC_OBJECT_SOCKET); MPC_FLAG(devfs_init_label, MPC_OBJECT_DEVFS); MPC_FLAG(mbuf_init_label, MPC_OBJECT_MBUF); MPC_FLAG(ipq_init_label, MPC_OBJECT_IPQ); MPC_FLAG(ifnet_init_label, MPC_OBJECT_IFNET); MPC_FLAG(bpfdesc_init_label, MPC_OBJECT_BPFDESC); MPC_FLAG(pipe_init_label, MPC_OBJECT_PIPE); MPC_FLAG(mount_init_label, MPC_OBJECT_MOUNT); MPC_FLAG(posixsem_init_label, MPC_OBJECT_POSIXSEM); MPC_FLAG(posixshm_init_label, MPC_OBJECT_POSIXSHM); MPC_FLAG(sysvmsg_init_label, MPC_OBJECT_SYSVMSG); MPC_FLAG(sysvmsq_init_label, MPC_OBJECT_SYSVMSQ); MPC_FLAG(sysvsem_init_label, MPC_OBJECT_SYSVSEM); MPC_FLAG(sysvshm_init_label, MPC_OBJECT_SYSVSHM); MPC_FLAG(syncache_init_label, MPC_OBJECT_SYNCACHE); MPC_FLAG(ip6q_init_label, MPC_OBJECT_IP6Q); #undef MPC_FLAG return (labeled); } /* * When policies are loaded or unloaded, walk the list of registered policies * and built mac_labeled, a bitmask representing the union of all objects * requiring labels across all policies. */ static void mac_policy_update(void) { struct mac_policy_conf *mpc; mac_policy_xlock_assert(); mac_labeled = 0; mac_policy_count = 0; LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) { mac_labeled |= mac_policy_getlabeled(mpc); mac_policy_count++; } LIST_FOREACH(mpc, &mac_policy_list, mpc_list) { mac_labeled |= mac_policy_getlabeled(mpc); mac_policy_count++; } } static int mac_policy_register(struct mac_policy_conf *mpc) { struct mac_policy_conf *tmpc; int error, slot, static_entry; error = 0; /* * We don't technically need exclusive access while !mac_late, but * hold it for assertion consistency. */ mac_policy_xlock(); /* * If the module can potentially be unloaded, or we're loading late, * we have to stick it in the non-static list and pay an extra * performance overhead. Otherwise, we can pay a light locking cost * and stick it in the static list. */ static_entry = (!mac_late && !(mpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_UNLOADOK)); if (static_entry) { LIST_FOREACH(tmpc, &mac_static_policy_list, mpc_list) { if (strcmp(tmpc->mpc_name, mpc->mpc_name) == 0) { error = EEXIST; goto out; } } } else { LIST_FOREACH(tmpc, &mac_policy_list, mpc_list) { if (strcmp(tmpc->mpc_name, mpc->mpc_name) == 0) { error = EEXIST; goto out; } } } if (mpc->mpc_field_off != NULL) { slot = ffs(mac_slot_offsets_free); if (slot == 0) { error = ENOMEM; goto out; } slot--; mac_slot_offsets_free &= ~(1 << slot); *mpc->mpc_field_off = slot; } mpc->mpc_runtime_flags |= MPC_RUNTIME_FLAG_REGISTERED; /* * If we're loading a MAC module after the framework has initialized, * it has to go into the dynamic list. If we're loading it before * we've finished initializing, it can go into the static list with * weaker locker requirements. */ if (static_entry) LIST_INSERT_HEAD(&mac_static_policy_list, mpc, mpc_list); else LIST_INSERT_HEAD(&mac_policy_list, mpc, mpc_list); /* * Per-policy initialization. Currently, this takes place under the * exclusive lock, so policies must not sleep in their init method. * In the future, we may want to separate "init" from "start", with * "init" occuring without the lock held. Likewise, on tear-down, * breaking out "stop" from "destroy". */ if (mpc->mpc_ops->mpo_init != NULL) (*(mpc->mpc_ops->mpo_init))(mpc); mac_policy_update(); - SDT_PROBE(mac, kernel, policy, register, mpc, 0, 0, 0, 0); + SDT_PROBE1(mac, , policy, register, mpc); printf("Security policy loaded: %s (%s)\n", mpc->mpc_fullname, mpc->mpc_name); out: mac_policy_xunlock(); return (error); } static int mac_policy_unregister(struct mac_policy_conf *mpc) { /* * If we fail the load, we may get a request to unload. Check to see * if we did the run-time registration, and if not, silently succeed. */ mac_policy_xlock(); if ((mpc->mpc_runtime_flags & MPC_RUNTIME_FLAG_REGISTERED) == 0) { mac_policy_xunlock(); return (0); } #if 0 /* * Don't allow unloading modules with private data. */ if (mpc->mpc_field_off != NULL) { mac_policy_xunlock(); return (EBUSY); } #endif /* * Only allow the unload to proceed if the module is unloadable by * its own definition. */ if ((mpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_UNLOADOK) == 0) { mac_policy_xunlock(); return (EBUSY); } if (mpc->mpc_ops->mpo_destroy != NULL) (*(mpc->mpc_ops->mpo_destroy))(mpc); LIST_REMOVE(mpc, mpc_list); mpc->mpc_runtime_flags &= ~MPC_RUNTIME_FLAG_REGISTERED; mac_policy_update(); mac_policy_xunlock(); - SDT_PROBE(mac, kernel, policy, unregister, mpc, 0, 0, 0, 0); + SDT_PROBE1(mac, , policy, unregister, mpc); printf("Security policy unload: %s (%s)\n", mpc->mpc_fullname, mpc->mpc_name); return (0); } /* * Allow MAC policy modules to register during boot, etc. */ int mac_policy_modevent(module_t mod, int type, void *data) { struct mac_policy_conf *mpc; int error; error = 0; mpc = (struct mac_policy_conf *) data; #ifdef MAC_STATIC if (mac_late) { printf("mac_policy_modevent: MAC_STATIC and late\n"); return (EBUSY); } #endif - SDT_PROBE(mac, kernel, policy, modevent, type, mpc, 0, 0, 0); + SDT_PROBE2(mac, , policy, modevent, type, mpc); switch (type) { case MOD_LOAD: if (mpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_NOTLATE && mac_late) { printf("mac_policy_modevent: can't load %s policy " "after booting\n", mpc->mpc_name); error = EBUSY; break; } error = mac_policy_register(mpc); break; case MOD_UNLOAD: /* Don't unregister the module if it was never registered. */ if ((mpc->mpc_runtime_flags & MPC_RUNTIME_FLAG_REGISTERED) != 0) error = mac_policy_unregister(mpc); else error = 0; break; default: error = EOPNOTSUPP; break; } return (error); } /* * Define an error value precedence, and given two arguments, selects the * value with the higher precedence. */ int mac_error_select(int error1, int error2) { /* Certain decision-making errors take top priority. */ if (error1 == EDEADLK || error2 == EDEADLK) return (EDEADLK); /* Invalid arguments should be reported where possible. */ if (error1 == EINVAL || error2 == EINVAL) return (EINVAL); /* Precedence goes to "visibility", with both process and file. */ if (error1 == ESRCH || error2 == ESRCH) return (ESRCH); if (error1 == ENOENT || error2 == ENOENT) return (ENOENT); /* Precedence goes to DAC/MAC protections. */ if (error1 == EACCES || error2 == EACCES) return (EACCES); /* Precedence goes to privilege. */ if (error1 == EPERM || error2 == EPERM) return (EPERM); /* Precedence goes to error over success; otherwise, arbitrary. */ if (error1 != 0) return (error1); return (error2); } int mac_check_structmac_consistent(struct mac *mac) { if (mac->m_buflen > MAC_MAX_LABEL_BUF_LEN) return (EINVAL); return (0); } SYSINIT(mac, SI_SUB_MAC, SI_ORDER_FIRST, mac_init, NULL); SYSINIT(mac_late, SI_SUB_MAC_LATE, SI_ORDER_FIRST, mac_late_init, NULL); Index: head/sys/security/mac/mac_internal.h =================================================================== --- head/sys/security/mac/mac_internal.h (revision 292383) +++ head/sys/security/mac/mac_internal.h (revision 292384) @@ -1,516 +1,516 @@ /*- * Copyright (c) 1999-2002, 2006, 2009 Robert N. M. Watson * Copyright (c) 2001 Ilmar S. Habibulin * Copyright (c) 2001-2004 Networks Associates Technology, Inc. * Copyright (c) 2006 nCircle Network Security, Inc. * Copyright (c) 2006 SPARTA, Inc. * Copyright (c) 2009 Apple, Inc. * All rights reserved. * * This software was developed by Robert Watson and Ilmar Habibulin for the * TrustedBSD Project. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * This software was developed by Robert N. M. Watson for the TrustedBSD * Project under contract to nCircle Network Security, Inc. * * This software was enhanced by SPARTA ISSO under SPAWAR contract * N66001-04-C-6019 ("SEFOS"). * * This software was developed at the University of Cambridge Computer * Laboratory with support from a grant from Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SECURITY_MAC_MAC_INTERNAL_H_ #define _SECURITY_MAC_MAC_INTERNAL_H_ #ifndef _KERNEL #error "no user-serviceable parts inside" #endif #include #include /* * MAC Framework sysctl namespace. */ #ifdef SYSCTL_DECL SYSCTL_DECL(_security_mac); #endif /* SYSCTL_DECL */ /* * MAC Framework SDT DTrace probe namespace, macros for declaring entry * point probes, macros for invoking them. */ #ifdef SDT_PROVIDER_DECLARE SDT_PROVIDER_DECLARE(mac); /* MAC Framework-level events. */ SDT_PROVIDER_DECLARE(mac_framework); /* Entry points to MAC. */ #define MAC_CHECK_PROBE_DEFINE4(name, arg0, arg1, arg2, arg3) \ - SDT_PROBE_DEFINE5(mac_framework, kernel, name, mac__check__err, \ + SDT_PROBE_DEFINE5(mac_framework, , name, mac__check__err, \ "int", arg0, arg1, arg2, arg3); \ - SDT_PROBE_DEFINE5(mac_framework, kernel, name, mac__check__ok, \ + SDT_PROBE_DEFINE5(mac_framework, , name, mac__check__ok, \ "int", arg0, arg1, arg2, arg3); #define MAC_CHECK_PROBE_DEFINE3(name, arg0, arg1, arg2) \ - SDT_PROBE_DEFINE4(mac_framework, kernel, name, mac__check__err, \ + SDT_PROBE_DEFINE4(mac_framework, , name, mac__check__err, \ "int", arg0, arg1, arg2); \ - SDT_PROBE_DEFINE4(mac_framework, kernel, name, mac__check__ok, \ + SDT_PROBE_DEFINE4(mac_framework, , name, mac__check__ok, \ "int", arg0, arg1, arg2); #define MAC_CHECK_PROBE_DEFINE2(name, arg0, arg1) \ - SDT_PROBE_DEFINE3(mac_framework, kernel, name, mac__check__err, \ + SDT_PROBE_DEFINE3(mac_framework, , name, mac__check__err, \ "int", arg0, arg1); \ - SDT_PROBE_DEFINE3(mac_framework, kernel, name, mac__check__ok, \ + SDT_PROBE_DEFINE3(mac_framework, , name, mac__check__ok, \ "int", arg0, arg1); #define MAC_CHECK_PROBE_DEFINE1(name, arg0) \ - SDT_PROBE_DEFINE2(mac_framework, kernel, name, mac__check__err, \ + SDT_PROBE_DEFINE2(mac_framework, , name, mac__check__err, \ "int", arg0); \ - SDT_PROBE_DEFINE2(mac_framework, kernel, name, mac__check__ok, \ + SDT_PROBE_DEFINE2(mac_framework, , name, mac__check__ok, \ "int", arg0); #define MAC_CHECK_PROBE4(name, error, arg0, arg1, arg2, arg3) do { \ if (error) { \ - SDT_PROBE(mac_framework, kernel, name, mac__check__err, \ + SDT_PROBE5(mac_framework, , name, mac__check__err, \ error, arg0, arg1, arg2, arg3); \ } else { \ - SDT_PROBE(mac_framework, kernel, name, mac__check__ok, \ + SDT_PROBE5(mac_framework, , name, mac__check__ok, \ 0, arg0, arg1, arg2, arg3); \ } \ } while (0) #define MAC_CHECK_PROBE3(name, error, arg0, arg1, arg2) \ MAC_CHECK_PROBE4(name, error, arg0, arg1, arg2, 0) #define MAC_CHECK_PROBE2(name, error, arg0, arg1) \ MAC_CHECK_PROBE3(name, error, arg0, arg1, 0) #define MAC_CHECK_PROBE1(name, error, arg0) \ MAC_CHECK_PROBE2(name, error, arg0, 0) #endif #define MAC_GRANT_PROBE_DEFINE2(name, arg0, arg1) \ - SDT_PROBE_DEFINE3(mac_framework, kernel, name, mac__grant__err, \ + SDT_PROBE_DEFINE3(mac_framework, , name, mac__grant__err, \ "int", arg0, arg1); \ - SDT_PROBE_DEFINE3(mac_framework, kernel, name, mac__grant__ok, \ + SDT_PROBE_DEFINE3(mac_framework, , name, mac__grant__ok, \ "int", arg0, arg1); #define MAC_GRANT_PROBE2(name, error, arg0, arg1) do { \ if (error) { \ - SDT_PROBE(mac_framework, kernel, name, mac__grant__err, \ - error, arg0, arg1, 0, 0); \ + SDT_PROBE3(mac_framework, , name, mac__grant__err, \ + error, arg0, arg1); \ } else { \ - SDT_PROBE(mac_framework, kernel, name, mac__grant__ok, \ - error, arg0, arg1, 0, 0); \ + SDT_PROBE3(mac_framework, , name, mac__grant__ok, \ + error, arg0, arg1); \ } \ } while (0) /* * MAC Framework global types and typedefs. */ LIST_HEAD(mac_policy_list_head, mac_policy_conf); #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_MACTEMP); #endif /* * MAC labels -- in-kernel storage format. * * In general, struct label pointers are embedded in kernel data structures * representing objects that may be labeled (and protected). Struct label is * opaque to both kernel services that invoke the MAC Framework and MAC * policy modules. In particular, we do not wish to encode the layout of the * label structure into any ABIs. Historically, the slot array contained * unions of {long, void} but now contains uintptr_t. */ #define MAC_MAX_SLOTS 4 #define MAC_FLAG_INITIALIZED 0x0000001 /* Is initialized for use. */ struct label { int l_flags; intptr_t l_perpolicy[MAC_MAX_SLOTS]; }; /* * Flags for mac_labeled, a bitmask of object types need across the union of * all policies currently registered with the MAC Framework, used to key * whether or not labels are allocated and constructors for the type are * invoked. */ #define MPC_OBJECT_CRED 0x0000000000000001 #define MPC_OBJECT_PROC 0x0000000000000002 #define MPC_OBJECT_VNODE 0x0000000000000004 #define MPC_OBJECT_INPCB 0x0000000000000008 #define MPC_OBJECT_SOCKET 0x0000000000000010 #define MPC_OBJECT_DEVFS 0x0000000000000020 #define MPC_OBJECT_MBUF 0x0000000000000040 #define MPC_OBJECT_IPQ 0x0000000000000080 #define MPC_OBJECT_IFNET 0x0000000000000100 #define MPC_OBJECT_BPFDESC 0x0000000000000200 #define MPC_OBJECT_PIPE 0x0000000000000400 #define MPC_OBJECT_MOUNT 0x0000000000000800 #define MPC_OBJECT_POSIXSEM 0x0000000000001000 #define MPC_OBJECT_POSIXSHM 0x0000000000002000 #define MPC_OBJECT_SYSVMSG 0x0000000000004000 #define MPC_OBJECT_SYSVMSQ 0x0000000000008000 #define MPC_OBJECT_SYSVSEM 0x0000000000010000 #define MPC_OBJECT_SYSVSHM 0x0000000000020000 #define MPC_OBJECT_SYNCACHE 0x0000000000040000 #define MPC_OBJECT_IP6Q 0x0000000000080000 /* * MAC Framework global variables. */ extern struct mac_policy_list_head mac_policy_list; extern struct mac_policy_list_head mac_static_policy_list; extern u_int mac_policy_count; extern uint64_t mac_labeled; extern struct mtx mac_ifnet_mtx; /* * MAC Framework infrastructure functions. */ int mac_error_select(int error1, int error2); void mac_policy_slock_nosleep(struct rm_priotracker *tracker); void mac_policy_slock_sleep(void); void mac_policy_sunlock_nosleep(struct rm_priotracker *tracker); void mac_policy_sunlock_sleep(void); struct label *mac_labelzone_alloc(int flags); void mac_labelzone_free(struct label *label); void mac_labelzone_init(void); void mac_init_label(struct label *label); void mac_destroy_label(struct label *label); int mac_check_structmac_consistent(struct mac *mac); int mac_allocate_slot(void); #define MAC_IFNET_LOCK(ifp) mtx_lock(&mac_ifnet_mtx) #define MAC_IFNET_UNLOCK(ifp) mtx_unlock(&mac_ifnet_mtx) /* * MAC Framework per-object type functions. It's not yet clear how the * namespaces, etc, should work for these, so for now, sort by object type. */ struct label *mac_cred_label_alloc(void); void mac_cred_label_free(struct label *label); struct label *mac_pipe_label_alloc(void); void mac_pipe_label_free(struct label *label); struct label *mac_socket_label_alloc(int flag); void mac_socket_label_free(struct label *label); struct label *mac_vnode_label_alloc(void); void mac_vnode_label_free(struct label *label); int mac_cred_check_relabel(struct ucred *cred, struct label *newlabel); int mac_cred_externalize_label(struct label *label, char *elements, char *outbuf, size_t outbuflen); int mac_cred_internalize_label(struct label *label, char *string); void mac_cred_relabel(struct ucred *cred, struct label *newlabel); struct label *mac_mbuf_to_label(struct mbuf *m); void mac_pipe_copy_label(struct label *src, struct label *dest); int mac_pipe_externalize_label(struct label *label, char *elements, char *outbuf, size_t outbuflen); int mac_pipe_internalize_label(struct label *label, char *string); int mac_socket_label_set(struct ucred *cred, struct socket *so, struct label *label); void mac_socket_copy_label(struct label *src, struct label *dest); int mac_socket_externalize_label(struct label *label, char *elements, char *outbuf, size_t outbuflen); int mac_socket_internalize_label(struct label *label, char *string); int mac_vnode_externalize_label(struct label *label, char *elements, char *outbuf, size_t outbuflen); int mac_vnode_internalize_label(struct label *label, char *string); void mac_vnode_check_mmap_downgrade(struct ucred *cred, struct vnode *vp, int *prot); int vn_setlabel(struct vnode *vp, struct label *intlabel, struct ucred *cred); /* * MAC Framework composition macros invoke all registered MAC policies for a * specific entry point. They come in two forms: one which permits policies * to sleep/block, and another that does not. * * MAC_POLICY_CHECK performs the designated check by walking the policy * module list and checking with each as to how it feels about the request. * Note that it returns its value via 'error' in the scope of the caller. */ #define MAC_POLICY_CHECK(check, args...) do { \ struct mac_policy_conf *mpc; \ \ error = 0; \ LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) { \ if (mpc->mpc_ops->mpo_ ## check != NULL) \ error = mac_error_select( \ mpc->mpc_ops->mpo_ ## check (args), \ error); \ } \ if (!LIST_EMPTY(&mac_policy_list)) { \ mac_policy_slock_sleep(); \ LIST_FOREACH(mpc, &mac_policy_list, mpc_list) { \ if (mpc->mpc_ops->mpo_ ## check != NULL) \ error = mac_error_select( \ mpc->mpc_ops->mpo_ ## check (args), \ error); \ } \ mac_policy_sunlock_sleep(); \ } \ } while (0) #define MAC_POLICY_CHECK_NOSLEEP(check, args...) do { \ struct mac_policy_conf *mpc; \ \ error = 0; \ LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) { \ if (mpc->mpc_ops->mpo_ ## check != NULL) \ error = mac_error_select( \ mpc->mpc_ops->mpo_ ## check (args), \ error); \ } \ if (!LIST_EMPTY(&mac_policy_list)) { \ struct rm_priotracker tracker; \ \ mac_policy_slock_nosleep(&tracker); \ LIST_FOREACH(mpc, &mac_policy_list, mpc_list) { \ if (mpc->mpc_ops->mpo_ ## check != NULL) \ error = mac_error_select( \ mpc->mpc_ops->mpo_ ## check (args), \ error); \ } \ mac_policy_sunlock_nosleep(&tracker); \ } \ } while (0) /* * MAC_POLICY_GRANT performs the designated check by walking the policy * module list and checking with each as to how it feels about the request. * Unlike MAC_POLICY_CHECK, it grants if any policies return '0', and * otherwise returns EPERM. Note that it returns its value via 'error' in * the scope of the caller. */ #define MAC_POLICY_GRANT_NOSLEEP(check, args...) do { \ struct mac_policy_conf *mpc; \ \ error = EPERM; \ LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) { \ if (mpc->mpc_ops->mpo_ ## check != NULL) { \ if (mpc->mpc_ops->mpo_ ## check(args) == 0) \ error = 0; \ } \ } \ if (!LIST_EMPTY(&mac_policy_list)) { \ struct rm_priotracker tracker; \ \ mac_policy_slock_nosleep(&tracker); \ LIST_FOREACH(mpc, &mac_policy_list, mpc_list) { \ if (mpc->mpc_ops->mpo_ ## check != NULL) { \ if (mpc->mpc_ops->mpo_ ## check (args) \ == 0) \ error = 0; \ } \ } \ mac_policy_sunlock_nosleep(&tracker); \ } \ } while (0) /* * MAC_POLICY_BOOLEAN performs the designated boolean composition by walking * the module list, invoking each instance of the operation, and combining * the results using the passed C operator. Note that it returns its value * via 'result' in the scope of the caller, which should be initialized by * the caller in a meaningful way to get a meaningful result. */ #define MAC_POLICY_BOOLEAN(operation, composition, args...) do { \ struct mac_policy_conf *mpc; \ \ LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) { \ if (mpc->mpc_ops->mpo_ ## operation != NULL) \ result = result composition \ mpc->mpc_ops->mpo_ ## operation (args); \ } \ if (!LIST_EMPTY(&mac_policy_list)) { \ mac_policy_slock_sleep(); \ LIST_FOREACH(mpc, &mac_policy_list, mpc_list) { \ if (mpc->mpc_ops->mpo_ ## operation != NULL) \ result = result composition \ mpc->mpc_ops->mpo_ ## operation \ (args); \ } \ mac_policy_sunlock_sleep(); \ } \ } while (0) #define MAC_POLICY_BOOLEAN_NOSLEEP(operation, composition, args...) do {\ struct mac_policy_conf *mpc; \ \ LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) { \ if (mpc->mpc_ops->mpo_ ## operation != NULL) \ result = result composition \ mpc->mpc_ops->mpo_ ## operation (args); \ } \ if (!LIST_EMPTY(&mac_policy_list)) { \ struct rm_priotracker tracker; \ \ mac_policy_slock_nosleep(&tracker); \ LIST_FOREACH(mpc, &mac_policy_list, mpc_list) { \ if (mpc->mpc_ops->mpo_ ## operation != NULL) \ result = result composition \ mpc->mpc_ops->mpo_ ## operation \ (args); \ } \ mac_policy_sunlock_nosleep(&tracker); \ } \ } while (0) /* * MAC_POLICY_EXTERNALIZE queries each policy to see if it can generate an * externalized version of a label element by name. Policies declare whether * they have matched a particular element name, parsed from the string by * MAC_POLICY_EXTERNALIZE, and an error is returned if any element is matched * by no policy. */ #define MAC_POLICY_EXTERNALIZE(type, label, elementlist, outbuf, \ outbuflen) do { \ int claimed, first, ignorenotfound, savedlen; \ char *element_name, *element_temp; \ struct sbuf sb; \ \ error = 0; \ first = 1; \ sbuf_new(&sb, outbuf, outbuflen, SBUF_FIXEDLEN); \ element_temp = elementlist; \ while ((element_name = strsep(&element_temp, ",")) != NULL) { \ if (element_name[0] == '?') { \ element_name++; \ ignorenotfound = 1; \ } else \ ignorenotfound = 0; \ savedlen = sbuf_len(&sb); \ if (first) \ error = sbuf_printf(&sb, "%s/", element_name); \ else \ error = sbuf_printf(&sb, ",%s/", element_name); \ if (error == -1) { \ error = EINVAL; /* XXX: E2BIG? */ \ break; \ } \ claimed = 0; \ MAC_POLICY_CHECK(type ## _externalize_label, label, \ element_name, &sb, &claimed); \ if (error) \ break; \ if (claimed == 0 && ignorenotfound) { \ /* Revert last label name. */ \ sbuf_setpos(&sb, savedlen); \ } else if (claimed != 1) { \ error = EINVAL; /* XXX: ENOLABEL? */ \ break; \ } else { \ first = 0; \ } \ } \ sbuf_finish(&sb); \ } while (0) /* * MAC_POLICY_INTERNALIZE presents parsed element names and data to each * policy to see if any is willing to claim it and internalize the label * data. If no policies match, an error is returned. */ #define MAC_POLICY_INTERNALIZE(type, label, instring) do { \ char *element, *element_name, *element_data; \ int claimed; \ \ error = 0; \ element = instring; \ while ((element_name = strsep(&element, ",")) != NULL) { \ element_data = element_name; \ element_name = strsep(&element_data, "/"); \ if (element_data == NULL) { \ error = EINVAL; \ break; \ } \ claimed = 0; \ MAC_POLICY_CHECK(type ## _internalize_label, label, \ element_name, element_data, &claimed); \ if (error) \ break; \ if (claimed != 1) { \ /* XXXMAC: Another error here? */ \ error = EINVAL; \ break; \ } \ } \ } while (0) /* * MAC_POLICY_PERFORM performs the designated operation by walking the policy * module list and invoking that operation for each policy. */ #define MAC_POLICY_PERFORM(operation, args...) do { \ struct mac_policy_conf *mpc; \ \ LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) { \ if (mpc->mpc_ops->mpo_ ## operation != NULL) \ mpc->mpc_ops->mpo_ ## operation (args); \ } \ if (!LIST_EMPTY(&mac_policy_list)) { \ mac_policy_slock_sleep(); \ LIST_FOREACH(mpc, &mac_policy_list, mpc_list) { \ if (mpc->mpc_ops->mpo_ ## operation != NULL) \ mpc->mpc_ops->mpo_ ## operation (args); \ } \ mac_policy_sunlock_sleep(); \ } \ } while (0) #define MAC_POLICY_PERFORM_NOSLEEP(operation, args...) do { \ struct mac_policy_conf *mpc; \ \ LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) { \ if (mpc->mpc_ops->mpo_ ## operation != NULL) \ mpc->mpc_ops->mpo_ ## operation (args); \ } \ if (!LIST_EMPTY(&mac_policy_list)) { \ struct rm_priotracker tracker; \ \ mac_policy_slock_nosleep(&tracker); \ LIST_FOREACH(mpc, &mac_policy_list, mpc_list) { \ if (mpc->mpc_ops->mpo_ ## operation != NULL) \ mpc->mpc_ops->mpo_ ## operation (args); \ } \ mac_policy_sunlock_nosleep(&tracker); \ } \ } while (0) #endif /* !_SECURITY_MAC_MAC_INTERNAL_H_ */