Index: projects/hps_head/sys/compat/linuxkpi/common/src/linux_compat.c =================================================================== --- projects/hps_head/sys/compat/linuxkpi/common/src/linux_compat.c (revision 303925) +++ projects/hps_head/sys/compat/linuxkpi/common/src/linux_compat.c (revision 303926) @@ -1,1479 +1,1477 @@ /*- * Copyright (c) 2010 Isilon Systems, Inc. * Copyright (c) 2010 iX Systems, Inc. * Copyright (c) 2010 Panasas, Inc. * Copyright (c) 2013-2016 Mellanox Technologies, Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_NODE(_compat, OID_AUTO, linuxkpi, CTLFLAG_RW, 0, "LinuxKPI parameters"); MALLOC_DEFINE(M_KMALLOC, "linux", "Linux kmalloc compat"); #include /* Undo Linux compat changes. */ #undef RB_ROOT #undef file #undef cdev #define RB_ROOT(head) (head)->rbh_root struct kobject linux_class_root; struct device linux_root_device; struct class linux_class_misc; struct list_head pci_drivers; struct list_head pci_devices; struct net init_net; spinlock_t pci_lock; struct sx linux_global_rcu_lock; unsigned long linux_timer_hz_mask; int panic_cmp(struct rb_node *one, struct rb_node *two) { panic("no cmp"); } RB_GENERATE(linux_root, rb_node, __entry, panic_cmp); int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list args) { va_list tmp_va; int len; char *old; char *name; char dummy; old = kobj->name; if (old && fmt == NULL) return (0); /* compute length of string */ va_copy(tmp_va, args); len = vsnprintf(&dummy, 0, fmt, tmp_va); va_end(tmp_va); /* account for zero termination */ len++; /* check for error */ if (len < 1) return (-EINVAL); /* allocate memory for string */ name = kzalloc(len, GFP_KERNEL); if (name == NULL) return (-ENOMEM); vsnprintf(name, len, fmt, args); kobj->name = name; /* free old string */ kfree(old); /* filter new string */ for (; *name != '\0'; name++) if (*name == '/') *name = '!'; return (0); } int kobject_set_name(struct kobject *kobj, const char *fmt, ...) { va_list args; int error; va_start(args, fmt); error = kobject_set_name_vargs(kobj, fmt, args); va_end(args); return (error); } static int kobject_add_complete(struct kobject *kobj, struct kobject *parent) { const struct kobj_type *t; int error; kobj->parent = parent; error = sysfs_create_dir(kobj); if (error == 0 && kobj->ktype && kobj->ktype->default_attrs) { struct attribute **attr; t = kobj->ktype; for (attr = t->default_attrs; *attr != NULL; attr++) { error = sysfs_create_file(kobj, *attr); if (error) break; } if (error) sysfs_remove_dir(kobj); } return (error); } int kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...) { va_list args; int error; va_start(args, fmt); error = kobject_set_name_vargs(kobj, fmt, args); va_end(args); if (error) return (error); return kobject_add_complete(kobj, parent); } void linux_kobject_release(struct kref *kref) { struct kobject *kobj; char *name; kobj = container_of(kref, struct kobject, kref); sysfs_remove_dir(kobj); name = kobj->name; if (kobj->ktype && kobj->ktype->release) kobj->ktype->release(kobj); kfree(name); } static void linux_kobject_kfree(struct kobject *kobj) { kfree(kobj); } static void linux_kobject_kfree_name(struct kobject *kobj) { if (kobj) { kfree(kobj->name); } } const struct kobj_type linux_kfree_type = { .release = linux_kobject_kfree }; static void linux_device_release(struct device *dev) { pr_debug("linux_device_release: %s\n", dev_name(dev)); kfree(dev); } static ssize_t linux_class_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct class_attribute *dattr; ssize_t error; dattr = container_of(attr, struct class_attribute, attr); error = -EIO; if (dattr->show) error = dattr->show(container_of(kobj, struct class, kobj), dattr, buf); return (error); } static ssize_t linux_class_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct class_attribute *dattr; ssize_t error; dattr = container_of(attr, struct class_attribute, attr); error = -EIO; if (dattr->store) error = dattr->store(container_of(kobj, struct class, kobj), dattr, buf, count); return (error); } static void linux_class_release(struct kobject *kobj) { struct class *class; class = container_of(kobj, struct class, kobj); if (class->class_release) class->class_release(class); } static const struct sysfs_ops linux_class_sysfs = { .show = linux_class_show, .store = linux_class_store, }; const struct kobj_type linux_class_ktype = { .release = linux_class_release, .sysfs_ops = &linux_class_sysfs }; static void linux_dev_release(struct kobject *kobj) { struct device *dev; dev = container_of(kobj, struct device, kobj); /* This is the precedence defined by linux. */ if (dev->release) dev->release(dev); else if (dev->class && dev->class->dev_release) dev->class->dev_release(dev); } static ssize_t linux_dev_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct device_attribute *dattr; ssize_t error; dattr = container_of(attr, struct device_attribute, attr); error = -EIO; if (dattr->show) error = dattr->show(container_of(kobj, struct device, kobj), dattr, buf); return (error); } static ssize_t linux_dev_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct device_attribute *dattr; ssize_t error; dattr = container_of(attr, struct device_attribute, attr); error = -EIO; if (dattr->store) error = dattr->store(container_of(kobj, struct device, kobj), dattr, buf, count); return (error); } static const struct sysfs_ops linux_dev_sysfs = { .show = linux_dev_show, .store = linux_dev_store, }; const struct kobj_type linux_dev_ktype = { .release = linux_dev_release, .sysfs_ops = &linux_dev_sysfs }; struct device * device_create(struct class *class, struct device *parent, dev_t devt, void *drvdata, const char *fmt, ...) { struct device *dev; va_list args; dev = kzalloc(sizeof(*dev), M_WAITOK); dev->parent = parent; dev->class = class; dev->devt = devt; dev->driver_data = drvdata; dev->release = linux_device_release; va_start(args, fmt); kobject_set_name_vargs(&dev->kobj, fmt, args); va_end(args); device_register(dev); return (dev); } int kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype, struct kobject *parent, const char *fmt, ...) { va_list args; int error; kobject_init(kobj, ktype); kobj->ktype = ktype; kobj->parent = parent; kobj->name = NULL; va_start(args, fmt); error = kobject_set_name_vargs(kobj, fmt, args); va_end(args); if (error) return (error); return kobject_add_complete(kobj, parent); } void linux_set_current(struct thread *td, struct task_struct *t) { memset(t, 0, sizeof(*t)); task_struct_fill(td, t); task_struct_set(td, t); } void linux_clear_current(struct thread *td) { task_struct_set(td, NULL); } static void linux_file_dtor(void *cdp) { struct linux_file *filp; struct task_struct t; struct thread *td; td = curthread; filp = cdp; linux_set_current(td, &t); filp->f_op->release(filp->f_vnode, filp); linux_clear_current(td); vdrop(filp->f_vnode); kfree(filp); } static int linux_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct linux_cdev *ldev; struct linux_file *filp; struct task_struct t; struct file *file; int error; file = td->td_fpop; ldev = dev->si_drv1; if (ldev == NULL) return (ENODEV); filp = kzalloc(sizeof(*filp), GFP_KERNEL); filp->f_dentry = &filp->f_dentry_store; filp->f_op = ldev->ops; filp->f_flags = file->f_flag; vhold(file->f_vnode); filp->f_vnode = file->f_vnode; linux_set_current(td, &t); if (filp->f_op->open) { error = -filp->f_op->open(file->f_vnode, filp); if (error) { kfree(filp); goto done; } } error = devfs_set_cdevpriv(filp, linux_file_dtor); if (error) { filp->f_op->release(file->f_vnode, filp); kfree(filp); } done: linux_clear_current(td); return (error); } static int linux_dev_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { struct linux_cdev *ldev; struct linux_file *filp; struct file *file; int error; file = td->td_fpop; ldev = dev->si_drv1; if (ldev == NULL) return (0); if ((error = devfs_get_cdevpriv((void **)&filp)) != 0) return (error); filp->f_flags = file->f_flag; devfs_clear_cdevpriv(); return (0); } #define LINUX_IOCTL_MIN_PTR 0x10000UL #define LINUX_IOCTL_MAX_PTR (LINUX_IOCTL_MIN_PTR + IOCPARM_MAX) static inline int linux_remap_address(void **uaddr, size_t len) { uintptr_t uaddr_val = (uintptr_t)(*uaddr); if (unlikely(uaddr_val >= LINUX_IOCTL_MIN_PTR && uaddr_val < LINUX_IOCTL_MAX_PTR)) { struct task_struct *pts = current; if (pts == NULL) { *uaddr = NULL; return (1); } /* compute data offset */ uaddr_val -= LINUX_IOCTL_MIN_PTR; /* check that length is within bounds */ if ((len > IOCPARM_MAX) || (uaddr_val + len) > pts->bsd_ioctl_len) { *uaddr = NULL; return (1); } /* re-add kernel buffer address */ uaddr_val += (uintptr_t)pts->bsd_ioctl_data; /* update address location */ *uaddr = (void *)uaddr_val; return (1); } return (0); } int linux_copyin(const void *uaddr, void *kaddr, size_t len) { if (linux_remap_address(__DECONST(void **, &uaddr), len)) { if (uaddr == NULL) return (-EFAULT); memcpy(kaddr, uaddr, len); return (0); } return (-copyin(uaddr, kaddr, len)); } int linux_copyout(const void *kaddr, void *uaddr, size_t len) { if (linux_remap_address(&uaddr, len)) { if (uaddr == NULL) return (-EFAULT); memcpy(uaddr, kaddr, len); return (0); } return (-copyout(kaddr, uaddr, len)); } static int linux_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct linux_cdev *ldev; struct linux_file *filp; struct task_struct t; struct file *file; unsigned size; int error; file = td->td_fpop; ldev = dev->si_drv1; if (ldev == NULL) return (0); if ((error = devfs_get_cdevpriv((void **)&filp)) != 0) return (error); filp->f_flags = file->f_flag; linux_set_current(td, &t); size = IOCPARM_LEN(cmd); /* refer to logic in sys_ioctl() */ if (size > 0) { /* * Setup hint for linux_copyin() and linux_copyout(). * * Background: Linux code expects a user-space address * while FreeBSD supplies a kernel-space address. */ t.bsd_ioctl_data = data; t.bsd_ioctl_len = size; data = (void *)LINUX_IOCTL_MIN_PTR; } else { /* fetch user-space pointer */ data = *(void **)data; } if (filp->f_op->unlocked_ioctl) error = -filp->f_op->unlocked_ioctl(filp, cmd, (u_long)data); else error = ENOTTY; linux_clear_current(td); return (error); } static int linux_dev_read(struct cdev *dev, struct uio *uio, int ioflag) { struct linux_cdev *ldev; struct linux_file *filp; struct task_struct t; struct thread *td; struct file *file; ssize_t bytes; int error; td = curthread; file = td->td_fpop; ldev = dev->si_drv1; if (ldev == NULL) return (0); if ((error = devfs_get_cdevpriv((void **)&filp)) != 0) return (error); filp->f_flags = file->f_flag; /* XXX no support for I/O vectors currently */ if (uio->uio_iovcnt != 1) return (EOPNOTSUPP); linux_set_current(td, &t); if (filp->f_op->read) { bytes = filp->f_op->read(filp, uio->uio_iov->iov_base, uio->uio_iov->iov_len, &uio->uio_offset); if (bytes >= 0) { uio->uio_iov->iov_base = ((uint8_t *)uio->uio_iov->iov_base) + bytes; uio->uio_iov->iov_len -= bytes; uio->uio_resid -= bytes; } else error = -bytes; } else error = ENXIO; linux_clear_current(td); return (error); } static int linux_dev_write(struct cdev *dev, struct uio *uio, int ioflag) { struct linux_cdev *ldev; struct linux_file *filp; struct task_struct t; struct thread *td; struct file *file; ssize_t bytes; int error; td = curthread; file = td->td_fpop; ldev = dev->si_drv1; if (ldev == NULL) return (0); if ((error = devfs_get_cdevpriv((void **)&filp)) != 0) return (error); filp->f_flags = file->f_flag; /* XXX no support for I/O vectors currently */ if (uio->uio_iovcnt != 1) return (EOPNOTSUPP); linux_set_current(td, &t); if (filp->f_op->write) { bytes = filp->f_op->write(filp, uio->uio_iov->iov_base, uio->uio_iov->iov_len, &uio->uio_offset); if (bytes >= 0) { uio->uio_iov->iov_base = ((uint8_t *)uio->uio_iov->iov_base) + bytes; uio->uio_iov->iov_len -= bytes; uio->uio_resid -= bytes; } else error = -bytes; } else error = ENXIO; linux_clear_current(td); return (error); } static int linux_dev_poll(struct cdev *dev, int events, struct thread *td) { struct linux_cdev *ldev; struct linux_file *filp; struct task_struct t; struct file *file; int revents; int error; file = td->td_fpop; ldev = dev->si_drv1; if (ldev == NULL) return (0); if ((error = devfs_get_cdevpriv((void **)&filp)) != 0) return (error); filp->f_flags = file->f_flag; linux_set_current(td, &t); if (filp->f_op->poll) revents = filp->f_op->poll(filp, NULL) & events; else revents = 0; linux_clear_current(td); return (revents); } static int linux_dev_mmap_single(struct cdev *dev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, int nprot) { struct linux_cdev *ldev; struct linux_file *filp; struct thread *td; struct task_struct t; struct file *file; struct vm_area_struct vma; int error; td = curthread; file = td->td_fpop; ldev = dev->si_drv1; if (ldev == NULL) return (ENODEV); if ((error = devfs_get_cdevpriv((void **)&filp)) != 0) return (error); filp->f_flags = file->f_flag; linux_set_current(td, &t); vma.vm_start = 0; vma.vm_end = size; vma.vm_pgoff = *offset / PAGE_SIZE; vma.vm_pfn = 0; vma.vm_page_prot = VM_MEMATTR_DEFAULT; if (filp->f_op->mmap) { error = -filp->f_op->mmap(filp, &vma); if (error == 0) { struct sglist *sg; sg = sglist_alloc(1, M_WAITOK); sglist_append_phys(sg, (vm_paddr_t)vma.vm_pfn << PAGE_SHIFT, vma.vm_len); *object = vm_pager_allocate(OBJT_SG, sg, vma.vm_len, nprot, 0, td->td_ucred); if (*object == NULL) { sglist_free(sg); error = EINVAL; goto done; } *offset = 0; if (vma.vm_page_prot != VM_MEMATTR_DEFAULT) { VM_OBJECT_WLOCK(*object); vm_object_set_memattr(*object, vma.vm_page_prot); VM_OBJECT_WUNLOCK(*object); } } } else error = ENODEV; done: linux_clear_current(td); return (error); } struct cdevsw linuxcdevsw = { .d_version = D_VERSION, .d_flags = D_TRACKCLOSE, .d_open = linux_dev_open, .d_close = linux_dev_close, .d_read = linux_dev_read, .d_write = linux_dev_write, .d_ioctl = linux_dev_ioctl, .d_mmap_single = linux_dev_mmap_single, .d_poll = linux_dev_poll, }; static int linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct linux_file *filp; struct task_struct t; ssize_t bytes; int error; error = 0; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; /* XXX no support for I/O vectors currently */ if (uio->uio_iovcnt != 1) return (EOPNOTSUPP); linux_set_current(td, &t); if (filp->f_op->read) { bytes = filp->f_op->read(filp, uio->uio_iov->iov_base, uio->uio_iov->iov_len, &uio->uio_offset); if (bytes >= 0) { uio->uio_iov->iov_base = ((uint8_t *)uio->uio_iov->iov_base) + bytes; uio->uio_iov->iov_len -= bytes; uio->uio_resid -= bytes; } else error = -bytes; } else error = ENXIO; linux_clear_current(td); return (error); } static int linux_file_poll(struct file *file, int events, struct ucred *active_cred, struct thread *td) { struct linux_file *filp; struct task_struct t; int revents; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; linux_set_current(td, &t); if (filp->f_op->poll) revents = filp->f_op->poll(filp, NULL) & events; else revents = 0; linux_clear_current(td); return (revents); } static int linux_file_close(struct file *file, struct thread *td) { struct linux_file *filp; struct task_struct t; int error; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; linux_set_current(td, &t); error = -filp->f_op->release(NULL, filp); linux_clear_current(td); funsetown(&filp->f_sigio); kfree(filp); return (error); } static int linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred, struct thread *td) { struct linux_file *filp; struct task_struct t; int error; filp = (struct linux_file *)fp->f_data; filp->f_flags = fp->f_flag; error = 0; linux_set_current(td, &t); switch (cmd) { case FIONBIO: break; case FIOASYNC: if (filp->f_op->fasync == NULL) break; error = filp->f_op->fasync(0, filp, fp->f_flag & FASYNC); break; case FIOSETOWN: error = fsetown(*(int *)data, &filp->f_sigio); if (error == 0) error = filp->f_op->fasync(0, filp, fp->f_flag & FASYNC); break; case FIOGETOWN: *(int *)data = fgetown(&filp->f_sigio); break; default: error = ENOTTY; break; } linux_clear_current(td); return (error); } static int linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { return (EOPNOTSUPP); } static int linux_file_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { return (0); } struct fileops linuxfileops = { .fo_read = linux_file_read, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_kqfilter = invfo_kqfilter, .fo_stat = linux_file_stat, .fo_fill_kinfo = linux_file_fill_kinfo, .fo_poll = linux_file_poll, .fo_close = linux_file_close, .fo_ioctl = linux_file_ioctl, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, }; /* * Hash of vmmap addresses. This is infrequently accessed and does not * need to be particularly large. This is done because we must store the * caller's idea of the map size to properly unmap. */ struct vmmap { LIST_ENTRY(vmmap) vm_next; void *vm_addr; unsigned long vm_size; }; struct vmmaphd { struct vmmap *lh_first; }; #define VMMAP_HASH_SIZE 64 #define VMMAP_HASH_MASK (VMMAP_HASH_SIZE - 1) #define VM_HASH(addr) ((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK static struct vmmaphd vmmaphead[VMMAP_HASH_SIZE]; static struct mtx vmmaplock; static void vmmap_add(void *addr, unsigned long size) { struct vmmap *vmmap; vmmap = kmalloc(sizeof(*vmmap), GFP_KERNEL); mtx_lock(&vmmaplock); vmmap->vm_size = size; vmmap->vm_addr = addr; LIST_INSERT_HEAD(&vmmaphead[VM_HASH(addr)], vmmap, vm_next); mtx_unlock(&vmmaplock); } static struct vmmap * vmmap_remove(void *addr) { struct vmmap *vmmap; mtx_lock(&vmmaplock); LIST_FOREACH(vmmap, &vmmaphead[VM_HASH(addr)], vm_next) if (vmmap->vm_addr == addr) break; if (vmmap) LIST_REMOVE(vmmap, vm_next); mtx_unlock(&vmmaplock); return (vmmap); } #if defined(__i386__) || defined(__amd64__) void * _ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr) { void *addr; addr = pmap_mapdev_attr(phys_addr, size, attr); if (addr == NULL) return (NULL); vmmap_add(addr, size); return (addr); } #endif void iounmap(void *addr) { struct vmmap *vmmap; vmmap = vmmap_remove(addr); if (vmmap == NULL) return; #if defined(__i386__) || defined(__amd64__) pmap_unmapdev((vm_offset_t)addr, vmmap->vm_size); #endif kfree(vmmap); } void * vmap(struct page **pages, unsigned int count, unsigned long flags, int prot) { vm_offset_t off; size_t size; size = count * PAGE_SIZE; off = kva_alloc(size); if (off == 0) return (NULL); vmmap_add((void *)off, size); pmap_qenter(off, pages, count); return ((void *)off); } void vunmap(void *addr) { struct vmmap *vmmap; vmmap = vmmap_remove(addr); if (vmmap == NULL) return; pmap_qremove((vm_offset_t)addr, vmmap->vm_size / PAGE_SIZE); kva_free((vm_offset_t)addr, vmmap->vm_size); kfree(vmmap); } char * kvasprintf(gfp_t gfp, const char *fmt, va_list ap) { unsigned int len; char *p; va_list aq; va_copy(aq, ap); len = vsnprintf(NULL, 0, fmt, aq); va_end(aq); p = kmalloc(len + 1, gfp); if (p != NULL) vsnprintf(p, len + 1, fmt, ap); return (p); } char * kasprintf(gfp_t gfp, const char *fmt, ...) { va_list ap; char *p; va_start(ap, fmt); p = kvasprintf(gfp, fmt, ap); va_end(ap); return (p); } static void linux_timer_callback_wrapper(void *context) { struct timer_list *timer; timer = context; timer->function(timer->data); } void mod_timer(struct timer_list *timer, unsigned long expires) { timer->expires = expires; callout_reset(&timer->timer_callout, linux_timer_jiffies_until(expires), &linux_timer_callback_wrapper, timer); } void add_timer(struct timer_list *timer) { callout_reset(&timer->timer_callout, linux_timer_jiffies_until(timer->expires), &linux_timer_callback_wrapper, timer); } static void linux_timer_init(void *arg) { /* * Compute an internal HZ value which can divide 2**32 to * avoid timer rounding problems when the tick value wraps * around 2**32: */ linux_timer_hz_mask = 1; while (linux_timer_hz_mask < (unsigned long)hz) linux_timer_hz_mask *= 2; linux_timer_hz_mask--; } SYSINIT(linux_timer, SI_SUB_DRIVERS, SI_ORDER_FIRST, linux_timer_init, NULL); void linux_complete_common(struct completion *c, int all) { int wakeup_swapper; sleepq_lock(c); c->done++; if (all) wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0); else wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0); sleepq_release(c); if (wakeup_swapper) kick_proc0(); } /* * Indefinite wait for done != 0 with or without signals. */ long linux_wait_for_common(struct completion *c, int flags) { if (SCHEDULER_STOPPED()) return (0); if (flags != 0) flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP; else flags = SLEEPQ_SLEEP; for (;;) { sleepq_lock(c); if (c->done) break; sleepq_add(c, NULL, "completion", flags, 0); if (flags & SLEEPQ_INTERRUPTIBLE) { if (sleepq_wait_sig(c, 0) != 0) return (-ERESTARTSYS); } else sleepq_wait(c, 0); } c->done--; sleepq_release(c); return (0); } /* * Time limited wait for done != 0 with or without signals. */ long linux_wait_for_timeout_common(struct completion *c, long timeout, int flags) { long end = jiffies + timeout; if (SCHEDULER_STOPPED()) return (0); if (flags != 0) flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP; else flags = SLEEPQ_SLEEP; for (;;) { int ret; sleepq_lock(c); if (c->done) break; sleepq_add(c, NULL, "completion", flags, 0); - sleepq_release(c); sleepq_set_timeout(c, linux_timer_jiffies_until(end)); - sleepq_lock(c); if (flags & SLEEPQ_INTERRUPTIBLE) ret = sleepq_timedwait_sig(c, 0); else ret = sleepq_timedwait(c, 0); if (ret != 0) { /* check for timeout or signal */ if (ret == EWOULDBLOCK) return (0); else return (-ERESTARTSYS); } } c->done--; sleepq_release(c); /* return how many jiffies are left */ return (linux_timer_jiffies_until(end)); } int linux_try_wait_for_completion(struct completion *c) { int isdone; isdone = 1; sleepq_lock(c); if (c->done) c->done--; else isdone = 0; sleepq_release(c); return (isdone); } int linux_completion_done(struct completion *c) { int isdone; isdone = 1; sleepq_lock(c); if (c->done == 0) isdone = 0; sleepq_release(c); return (isdone); } void linux_delayed_work_fn(void *arg) { struct delayed_work *work; work = arg; taskqueue_enqueue(work->work.taskqueue, &work->work.work_task); } void linux_work_fn(void *context, int pending) { struct work_struct *work; work = context; work->fn(work); } void linux_flush_fn(void *context, int pending) { } struct workqueue_struct * linux_create_workqueue_common(const char *name, int cpus) { struct workqueue_struct *wq; wq = kmalloc(sizeof(*wq), M_WAITOK); wq->taskqueue = taskqueue_create(name, M_WAITOK, taskqueue_thread_enqueue, &wq->taskqueue); atomic_set(&wq->draining, 0); taskqueue_start_threads(&wq->taskqueue, cpus, PWAIT, "%s", name); return (wq); } void destroy_workqueue(struct workqueue_struct *wq) { taskqueue_free(wq->taskqueue); kfree(wq); } static void linux_cdev_release(struct kobject *kobj) { struct linux_cdev *cdev; struct kobject *parent; cdev = container_of(kobj, struct linux_cdev, kobj); parent = kobj->parent; if (cdev->cdev) destroy_dev(cdev->cdev); kfree(cdev); kobject_put(parent); } static void linux_cdev_static_release(struct kobject *kobj) { struct linux_cdev *cdev; struct kobject *parent; cdev = container_of(kobj, struct linux_cdev, kobj); parent = kobj->parent; if (cdev->cdev) destroy_dev(cdev->cdev); kobject_put(parent); } const struct kobj_type linux_cdev_ktype = { .release = linux_cdev_release, }; const struct kobj_type linux_cdev_static_ktype = { .release = linux_cdev_static_release, }; static void linux_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate) { struct notifier_block *nb; nb = arg; if (linkstate == LINK_STATE_UP) nb->notifier_call(nb, NETDEV_UP, ifp); else nb->notifier_call(nb, NETDEV_DOWN, ifp); } static void linux_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; nb = arg; nb->notifier_call(nb, NETDEV_REGISTER, ifp); } static void linux_handle_ifnet_departure_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; nb = arg; nb->notifier_call(nb, NETDEV_UNREGISTER, ifp); } static void linux_handle_iflladdr_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; nb = arg; nb->notifier_call(nb, NETDEV_CHANGEADDR, ifp); } static void linux_handle_ifaddr_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; nb = arg; nb->notifier_call(nb, NETDEV_CHANGEIFADDR, ifp); } int register_netdevice_notifier(struct notifier_block *nb) { nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER( ifnet_link_event, linux_handle_ifnet_link_event, nb, 0); nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER( ifnet_arrival_event, linux_handle_ifnet_arrival_event, nb, 0); nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER( ifnet_departure_event, linux_handle_ifnet_departure_event, nb, 0); nb->tags[NETDEV_CHANGEADDR] = EVENTHANDLER_REGISTER( iflladdr_event, linux_handle_iflladdr_event, nb, 0); return (0); } int register_inetaddr_notifier(struct notifier_block *nb) { nb->tags[NETDEV_CHANGEIFADDR] = EVENTHANDLER_REGISTER( ifaddr_event, linux_handle_ifaddr_event, nb, 0); return (0); } int unregister_netdevice_notifier(struct notifier_block *nb) { EVENTHANDLER_DEREGISTER(ifnet_link_event, nb->tags[NETDEV_UP]); EVENTHANDLER_DEREGISTER(ifnet_arrival_event, nb->tags[NETDEV_REGISTER]); EVENTHANDLER_DEREGISTER(ifnet_departure_event, nb->tags[NETDEV_UNREGISTER]); EVENTHANDLER_DEREGISTER(iflladdr_event, nb->tags[NETDEV_CHANGEADDR]); return (0); } int unregister_inetaddr_notifier(struct notifier_block *nb) { EVENTHANDLER_DEREGISTER(ifaddr_event, nb->tags[NETDEV_CHANGEIFADDR]); return (0); } struct list_sort_thunk { int (*cmp)(void *, struct list_head *, struct list_head *); void *priv; }; static inline int linux_le_cmp(void *priv, const void *d1, const void *d2) { struct list_head *le1, *le2; struct list_sort_thunk *thunk; thunk = priv; le1 = *(__DECONST(struct list_head **, d1)); le2 = *(__DECONST(struct list_head **, d2)); return ((thunk->cmp)(thunk->priv, le1, le2)); } void list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv, struct list_head *a, struct list_head *b)) { struct list_sort_thunk thunk; struct list_head **ar, *le; size_t count, i; count = 0; list_for_each(le, head) count++; ar = malloc(sizeof(struct list_head *) * count, M_KMALLOC, M_WAITOK); i = 0; list_for_each(le, head) ar[i++] = le; thunk.cmp = cmp; thunk.priv = priv; qsort_r(ar, count, sizeof(struct list_head *), &thunk, linux_le_cmp); INIT_LIST_HEAD(head); for (i = 0; i < count; i++) list_add_tail(ar[i], head); free(ar, M_KMALLOC); } void linux_irq_handler(void *ent) { struct irq_ent *irqe; irqe = ent; irqe->handler(irqe->irq, irqe->arg); } #if defined(__i386__) || defined(__amd64__) bool linux_cpu_has_clflush; #endif static void linux_compat_init(void *arg) { struct sysctl_oid *rootoid; int i; #if defined(__i386__) || defined(__amd64__) linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH); #endif sx_init(&linux_global_rcu_lock, "LinuxGlobalRCU"); rootoid = SYSCTL_ADD_ROOT_NODE(NULL, OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys"); kobject_init(&linux_class_root, &linux_class_ktype); kobject_set_name(&linux_class_root, "class"); linux_class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid), OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class"); kobject_init(&linux_root_device.kobj, &linux_dev_ktype); kobject_set_name(&linux_root_device.kobj, "device"); linux_root_device.kobj.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid), OID_AUTO, "device", CTLFLAG_RD, NULL, "device"); linux_root_device.bsddev = root_bus; linux_class_misc.name = "misc"; class_register(&linux_class_misc); INIT_LIST_HEAD(&pci_drivers); INIT_LIST_HEAD(&pci_devices); spin_lock_init(&pci_lock); mtx_init(&vmmaplock, "IO Map lock", NULL, MTX_DEF); for (i = 0; i < VMMAP_HASH_SIZE; i++) LIST_INIT(&vmmaphead[i]); } SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL); static void linux_compat_uninit(void *arg) { linux_kobject_kfree_name(&linux_class_root); linux_kobject_kfree_name(&linux_root_device.kobj); linux_kobject_kfree_name(&linux_class_misc.kobj); synchronize_rcu(); sx_destroy(&linux_global_rcu_lock); } SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL); /* * NOTE: Linux frequently uses "unsigned long" for pointer to integer * conversion and vice versa, where in FreeBSD "uintptr_t" would be * used. Assert these types have the same size, else some parts of the * LinuxKPI may not work like expected: */ CTASSERT(sizeof(unsigned long) == sizeof(uintptr_t)); Index: projects/hps_head/sys/ddb/db_ps.c =================================================================== --- projects/hps_head/sys/ddb/db_ps.c (revision 303925) +++ projects/hps_head/sys/ddb/db_ps.c (revision 303926) @@ -1,483 +1,488 @@ /*- * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include static void dumpthread(volatile struct proc *p, volatile struct thread *td, int all); /* * At least one non-optional show-command must be implemented using * DB_SHOW_ALL_COMMAND() so that db_show_all_cmd_set gets created. * Here is one. */ DB_SHOW_ALL_COMMAND(procs, db_procs_cmd) { db_ps(addr, have_addr, count, modif); } /* * Layout: * - column counts * - header * - single-threaded process * - multi-threaded process * - thread in a MT process * * 1 2 3 4 5 6 7 * 1234567890123456789012345678901234567890123456789012345678901234567890 * pid ppid pgrp uid state wmesg wchan cmd * < wmesg > < wchan > * (threaded) * < wmesg > < wchan > * * For machines with 64-bit pointers, we expand the wchan field 8 more * characters. */ void db_ps(db_expr_t addr, bool hasaddr, db_expr_t count, char *modif) { volatile struct proc *p, *pp; volatile struct thread *td; struct ucred *cred; struct pgrp *pgrp; char state[9]; int np, rflag, sflag, dflag, lflag, wflag; np = nprocs; if (!LIST_EMPTY(&allproc)) p = LIST_FIRST(&allproc); else p = &proc0; #ifdef __LP64__ db_printf(" pid ppid pgrp uid state wmesg wchan cmd\n"); #else db_printf(" pid ppid pgrp uid state wmesg wchan cmd\n"); #endif while (--np >= 0 && !db_pager_quit) { if (p == NULL) { db_printf("oops, ran out of processes early!\n"); break; } pp = p->p_pptr; if (pp == NULL) pp = p; cred = p->p_ucred; pgrp = p->p_pgrp; db_printf("%5d %5d %5d %5d ", p->p_pid, pp->p_pid, pgrp != NULL ? pgrp->pg_id : 0, cred != NULL ? cred->cr_ruid : 0); /* Determine our primary process state. */ switch (p->p_state) { case PRS_NORMAL: if (P_SHOULDSTOP(p)) state[0] = 'T'; else { /* * One of D, L, R, S, W. For a * multithreaded process we will use * the state of the thread with the * highest precedence. The * precendence order from high to low * is R, L, D, S, W. If no thread is * in a sane state we use '?' for our * primary state. */ rflag = sflag = dflag = lflag = wflag = 0; FOREACH_THREAD_IN_PROC(p, td) { if (td->td_state == TDS_RUNNING || td->td_state == TDS_RUNQ || td->td_state == TDS_CAN_RUN) rflag++; if (TD_ON_LOCK(td)) lflag++; if (TD_IS_SLEEPING(td)) { if (!(td->td_flags & TDF_SINTR)) dflag++; else sflag++; } if (TD_AWAITING_INTR(td)) wflag++; } if (rflag) state[0] = 'R'; else if (lflag) state[0] = 'L'; else if (dflag) state[0] = 'D'; else if (sflag) state[0] = 'S'; else if (wflag) state[0] = 'W'; else state[0] = '?'; } break; case PRS_NEW: state[0] = 'N'; break; case PRS_ZOMBIE: state[0] = 'Z'; break; default: state[0] = 'U'; break; } state[1] = '\0'; /* Additional process state flags. */ if (!(p->p_flag & P_INMEM)) strlcat(state, "W", sizeof(state)); if (p->p_flag & P_TRACED) strlcat(state, "X", sizeof(state)); if (p->p_flag & P_WEXIT && p->p_state != PRS_ZOMBIE) strlcat(state, "E", sizeof(state)); if (p->p_flag & P_PPWAIT) strlcat(state, "V", sizeof(state)); if (p->p_flag & P_SYSTEM || p->p_lock > 0) strlcat(state, "L", sizeof(state)); if (p->p_pgrp != NULL && p->p_session != NULL && SESS_LEADER(p)) strlcat(state, "s", sizeof(state)); /* Cheated here and didn't compare pgid's. */ if (p->p_flag & P_CONTROLT) strlcat(state, "+", sizeof(state)); if (cred != NULL && jailed(cred)) strlcat(state, "J", sizeof(state)); db_printf(" %-6.6s ", state); if (p->p_flag & P_HADTHREADS) { #ifdef __LP64__ db_printf(" (threaded) "); #else db_printf(" (threaded) "); #endif if (p->p_flag & P_SYSTEM) db_printf("["); db_printf("%s", p->p_comm); if (p->p_flag & P_SYSTEM) db_printf("]"); db_printf("\n"); } FOREACH_THREAD_IN_PROC(p, td) { dumpthread(p, td, p->p_flag & P_HADTHREADS); if (db_pager_quit) break; } p = LIST_NEXT(p, p_list); if (p == NULL && np > 0) p = LIST_FIRST(&zombproc); } } static void dumpthread(volatile struct proc *p, volatile struct thread *td, int all) { char state[9], wprefix; const char *wmesg; void *wchan; if (all) { db_printf("%6d ", td->td_tid); switch (td->td_state) { case TDS_RUNNING: snprintf(state, sizeof(state), "Run"); break; case TDS_RUNQ: snprintf(state, sizeof(state), "RunQ"); break; case TDS_CAN_RUN: snprintf(state, sizeof(state), "CanRun"); break; case TDS_INACTIVE: snprintf(state, sizeof(state), "Inactv"); break; case TDS_INHIBITED: state[0] = '\0'; if (TD_ON_LOCK(td)) strlcat(state, "L", sizeof(state)); if (TD_IS_SLEEPING(td)) { if (td->td_flags & TDF_SINTR) strlcat(state, "S", sizeof(state)); else strlcat(state, "D", sizeof(state)); } if (TD_IS_SWAPPED(td)) strlcat(state, "W", sizeof(state)); if (TD_AWAITING_INTR(td)) strlcat(state, "I", sizeof(state)); if (TD_IS_SUSPENDED(td)) strlcat(state, "s", sizeof(state)); if (state[0] != '\0') break; default: snprintf(state, sizeof(state), "???"); } db_printf(" %-6.6s ", state); } wprefix = ' '; if (TD_ON_LOCK(td)) { wprefix = '*'; wmesg = td->td_lockname; wchan = td->td_blocked; } else if (TD_ON_SLEEPQ(td)) { wmesg = td->td_wmesg; wchan = td->td_wchan; } else if (TD_IS_RUNNING(td)) { snprintf(state, sizeof(state), "CPU %d", td->td_oncpu); wmesg = state; wchan = NULL; } else { wmesg = ""; wchan = NULL; } db_printf("%c%-8.8s ", wprefix, wmesg); if (wchan == NULL) #ifdef __LP64__ db_printf("%18s ", ""); #else db_printf("%10s ", ""); #endif else db_printf("%p ", wchan); if (p->p_flag & P_SYSTEM) db_printf("["); if (td->td_name[0] != '\0') db_printf("%s", td->td_name); else db_printf("%s", td->td_proc->p_comm); if (p->p_flag & P_SYSTEM) db_printf("]"); db_printf("\n"); } DB_SHOW_COMMAND(thread, db_show_thread) { struct thread *td; struct lock_object *lock; bool comma; int delta; /* Determine which thread to examine. */ if (have_addr) td = db_lookup_thread(addr, false); else td = kdb_thread; lock = (struct lock_object *)td->td_lock; db_printf("Thread %d at %p:\n", td->td_tid, td); db_printf(" proc (pid %d): %p\n", td->td_proc->p_pid, td->td_proc); if (td->td_name[0] != '\0') db_printf(" name: %s\n", td->td_name); db_printf(" stack: %p-%p\n", (void *)td->td_kstack, (void *)(td->td_kstack + td->td_kstack_pages * PAGE_SIZE - 1)); db_printf(" flags: %#x ", td->td_flags); db_printf(" pflags: %#x\n", td->td_pflags); db_printf(" state: "); switch (td->td_state) { case TDS_INACTIVE: db_printf("INACTIVE\n"); break; case TDS_CAN_RUN: db_printf("CAN RUN\n"); break; case TDS_RUNQ: db_printf("RUNQ\n"); break; case TDS_RUNNING: db_printf("RUNNING (CPU %d)\n", td->td_oncpu); break; case TDS_INHIBITED: db_printf("INHIBITED: {"); comma = false; if (TD_IS_SLEEPING(td)) { db_printf("SLEEPING"); comma = true; } if (TD_IS_SUSPENDED(td)) { if (comma) db_printf(", "); db_printf("SUSPENDED"); comma = true; } if (TD_IS_SWAPPED(td)) { if (comma) db_printf(", "); db_printf("SWAPPED"); comma = true; } if (TD_ON_LOCK(td)) { if (comma) db_printf(", "); db_printf("LOCK"); comma = true; } if (TD_AWAITING_INTR(td)) { if (comma) db_printf(", "); db_printf("IWAIT"); } db_printf("}\n"); break; default: db_printf("??? (%#x)\n", td->td_state); break; } if (TD_ON_LOCK(td)) db_printf(" lock: %s turnstile: %p\n", td->td_lockname, td->td_blocked); if (TD_ON_SLEEPQ(td)) - db_printf(" wmesg: %s wchan: %p\n", td->td_wmesg, - td->td_wchan); + db_printf( + " wmesg: %s wchan: %p sleeptimo %lx. %jx (curr %lx. %jx)\n", + td->td_wmesg, td->td_wchan, + (long)sbttobt(td->td_sleeptimo).sec, + (uintmax_t)sbttobt(td->td_sleeptimo).frac, + (long)sbttobt(sbinuptime()).sec, + (uintmax_t)sbttobt(sbinuptime()).frac); db_printf(" priority: %d\n", td->td_priority); db_printf(" container lock: %s (%p)\n", lock->lo_name, lock); if (td->td_swvoltick != 0) { delta = (u_int)ticks - (u_int)td->td_swvoltick; db_printf(" last voluntary switch: %d ms ago\n", 1000 * delta / hz); } if (td->td_swinvoltick != 0) { delta = (u_int)ticks - (u_int)td->td_swinvoltick; db_printf(" last involuntary switch: %d ms ago\n", 1000 * delta / hz); } } DB_SHOW_COMMAND(proc, db_show_proc) { struct thread *td; struct proc *p; int i; /* Determine which process to examine. */ if (have_addr) p = db_lookup_proc(addr); else p = kdb_thread->td_proc; db_printf("Process %d (%s) at %p:\n", p->p_pid, p->p_comm, p); db_printf(" state: "); switch (p->p_state) { case PRS_NEW: db_printf("NEW\n"); break; case PRS_NORMAL: db_printf("NORMAL\n"); break; case PRS_ZOMBIE: db_printf("ZOMBIE\n"); break; default: db_printf("??? (%#x)\n", p->p_state); } if (p->p_ucred != NULL) { db_printf(" uid: %d gids: ", p->p_ucred->cr_uid); for (i = 0; i < p->p_ucred->cr_ngroups; i++) { db_printf("%d", p->p_ucred->cr_groups[i]); if (i < (p->p_ucred->cr_ngroups - 1)) db_printf(", "); } db_printf("\n"); } if (p->p_pptr != NULL) db_printf(" parent: pid %d at %p\n", p->p_pptr->p_pid, p->p_pptr); if (p->p_leader != NULL && p->p_leader != p) db_printf(" leader: pid %d at %p\n", p->p_leader->p_pid, p->p_leader); if (p->p_sysent != NULL) db_printf(" ABI: %s\n", p->p_sysent->sv_name); if (p->p_args != NULL) db_printf(" arguments: %.*s\n", (int)p->p_args->ar_length, p->p_args->ar_args); db_printf(" threads: %d\n", p->p_numthreads); FOREACH_THREAD_IN_PROC(p, td) { dumpthread(p, td, 1); if (db_pager_quit) break; } } void db_findstack_cmd(db_expr_t addr, bool have_addr, db_expr_t dummy3 __unused, char *dummy4 __unused) { struct proc *p; struct thread *td; struct kstack_cache_entry *ks_ce; vm_offset_t saddr; if (have_addr) saddr = addr; else { db_printf("Usage: findstack
\n"); return; } FOREACH_PROC_IN_SYSTEM(p) { FOREACH_THREAD_IN_PROC(p, td) { if (td->td_kstack <= saddr && saddr < td->td_kstack + PAGE_SIZE * td->td_kstack_pages) { db_printf("Thread %p\n", td); return; } } } for (ks_ce = kstack_cache; ks_ce != NULL; ks_ce = ks_ce->next_ks_entry) { if ((vm_offset_t)ks_ce <= saddr && saddr < (vm_offset_t)ks_ce + PAGE_SIZE * kstack_pages) { db_printf("Cached stack %p\n", ks_ce); return; } } } Index: projects/hps_head/sys/kern/init_main.c =================================================================== --- projects/hps_head/sys/kern/init_main.c (revision 303925) +++ projects/hps_head/sys/kern/init_main.c (revision 303926) @@ -1,885 +1,884 @@ /*- * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_init_path.h" #include "opt_verbose_sysinit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void mi_startup(void); /* Should be elsewhere */ /* Components of the first process -- never freed. */ static struct session session0; static struct pgrp pgrp0; struct proc proc0; struct thread0_storage thread0_st __aligned(16); struct vmspace vmspace0; struct proc *initproc; #ifndef BOOTHOWTO #define BOOTHOWTO 0 #endif int boothowto = BOOTHOWTO; /* initialized so that it can be patched */ SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, "Boot control flags, passed from loader"); #ifndef BOOTVERBOSE #define BOOTVERBOSE 0 #endif int bootverbose = BOOTVERBOSE; SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, "Control the output of verbose kernel messages"); #ifdef INVARIANTS FEATURE(invariants, "Kernel compiled with INVARIANTS, may affect performance"); #endif /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never * executed. */ SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL); /* * The sysinit table itself. Items are checked off as the are run. * If we want to register new sysinit types, add them to newsysinit. */ SET_DECLARE(sysinit_set, struct sysinit); struct sysinit **sysinit, **sysinit_end; struct sysinit **newsysinit, **newsysinit_end; /* * Merge a new sysinit set into the current set, reallocating it if * necessary. This can only be called after malloc is running. */ void sysinit_add(struct sysinit **set, struct sysinit **set_end) { struct sysinit **newset; struct sysinit **sipp; struct sysinit **xipp; int count; count = set_end - set; if (newsysinit) count += newsysinit_end - newsysinit; else count += sysinit_end - sysinit; newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); if (newset == NULL) panic("cannot malloc for sysinit"); xipp = newset; if (newsysinit) for (sipp = newsysinit; sipp < newsysinit_end; sipp++) *xipp++ = *sipp; else for (sipp = sysinit; sipp < sysinit_end; sipp++) *xipp++ = *sipp; for (sipp = set; sipp < set_end; sipp++) *xipp++ = *sipp; if (newsysinit) free(newsysinit, M_TEMP); newsysinit = newset; newsysinit_end = newset + count; } #if defined (DDB) && defined(VERBOSE_SYSINIT) static const char * symbol_name(vm_offset_t va, db_strategy_t strategy) { const char *name; c_db_sym_t sym; db_expr_t offset; if (va == 0) return (NULL); sym = db_search_symbol(va, strategy, &offset); if (offset != 0) return (NULL); db_symbol_values(sym, &name, NULL); return (name); } #endif /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads". */ void mi_startup(void) { register struct sysinit **sipp; /* system initialization*/ register struct sysinit **xipp; /* interior loop of sort*/ register struct sysinit *save; /* bubble*/ #if defined(VERBOSE_SYSINIT) int last; int verbose; #endif if (boothowto & RB_VERBOSE) bootverbose++; if (sysinit == NULL) { sysinit = SET_BEGIN(sysinit_set); sysinit_end = SET_LIMIT(sysinit_set); } restart: /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { for (xipp = sipp + 1; xipp < sysinit_end; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } #if defined(VERBOSE_SYSINIT) last = SI_SUB_COPYRIGHT; verbose = 0; #if !defined(DDB) printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n"); #endif #endif /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ if ((*sipp)->subsystem == SI_SUB_DONE) continue; #if defined(VERBOSE_SYSINIT) if ((*sipp)->subsystem > last) { verbose = 1; last = (*sipp)->subsystem; printf("subsystem %x\n", last); } if (verbose) { #if defined(DDB) const char *func, *data; func = symbol_name((vm_offset_t)(*sipp)->func, DB_STGY_PROC); data = symbol_name((vm_offset_t)(*sipp)->udata, DB_STGY_ANY); if (func != NULL && data != NULL) printf(" %s(&%s)... ", func, data); else if (func != NULL) printf(" %s(%p)... ", func, (*sipp)->udata); else #endif printf(" %p(%p)... ", (*sipp)->func, (*sipp)->udata); } #endif /* Call function */ (*((*sipp)->func))((*sipp)->udata); #if defined(VERBOSE_SYSINIT) if (verbose) printf("done.\n"); #endif /* Check off the one we're just done */ (*sipp)->subsystem = SI_SUB_DONE; /* Check if we've installed more sysinit items via KLD */ if (newsysinit != NULL) { if (sysinit != SET_BEGIN(sysinit_set)) free(sysinit, M_TEMP); sysinit = newsysinit; sysinit_end = newsysinit_end; newsysinit = NULL; newsysinit_end = NULL; goto restart; } } mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED); mtx_unlock(&Giant); /* * Now hand over this thread to swapper. */ swapper(); /* NOTREACHED*/ } /* *************************************************************************** **** **** The following SYSINIT's belong elsewhere, but have not yet **** been moved. **** *************************************************************************** */ static void print_caddr_t(void *data) { printf("%s", (char *)data); } static void print_version(void *data __unused) { int len; /* Strip a trailing newline from version. */ len = strlen(version); while (len > 0 && version[len - 1] == '\n') len--; printf("%.*s %s\n", len, version, machine); printf("%s\n", compiler_version); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright); SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, trademark); SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_version, NULL); #ifdef WITNESS static char wit_warn[] = "WARNING: WITNESS option enabled, expect reduced performance.\n"; SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1, print_caddr_t, wit_warn); SYSINIT(witwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 1, print_caddr_t, wit_warn); #endif #ifdef DIAGNOSTIC static char diag_warn[] = "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n"; SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 2, print_caddr_t, diag_warn); SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 2, print_caddr_t, diag_warn); #endif static int null_fetch_syscall_args(struct thread *td __unused, struct syscall_args *sa __unused) { panic("null_fetch_syscall_args"); } static void null_set_syscall_retval(struct thread *td __unused, int error __unused) { panic("null_set_syscall_retval"); } struct sysentvec null_sysvec = { .sv_size = 0, .sv_table = NULL, .sv_mask = 0, .sv_errsize = 0, .sv_errtbl = NULL, .sv_transtrap = NULL, .sv_fixup = NULL, .sv_sendsig = NULL, .sv_sigcode = NULL, .sv_szsigcode = NULL, .sv_name = "null", .sv_coredump = NULL, .sv_imgact_try = NULL, .sv_minsigstksz = 0, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = USRSTACK, .sv_psstrings = PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = NULL, .sv_setregs = NULL, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = 0, .sv_set_syscall_retval = null_set_syscall_retval, .sv_fetch_syscall_args = null_fetch_syscall_args, .sv_syscallnames = NULL, .sv_schedtail = NULL, .sv_thread_detach = NULL, .sv_trap = NULL, }; /* *************************************************************************** **** **** The two following SYSINIT's are proc0 specific glue code. I am not **** convinced that they can not be safely combined, but their order of **** operation has been maintained as the same as the original init_main.c **** for right now. **** **** These probably belong in init_proc.c or kern_proc.c, since they **** deal with proc0 (the fork template process). **** *************************************************************************** */ /* ARGSUSED*/ static void proc0_init(void *dummy __unused) { struct proc *p; struct thread *td; struct ucred *newcred; vm_paddr_t pageablemem; int i; GIANT_REQUIRED; p = &proc0; td = &thread0; /* * Initialize magic number and osrel. */ p->p_magic = P_MAGIC; p->p_osrel = osreldate; /* * Initialize thread and process structures. */ procinit(); /* set up proc zone */ threadinit(); /* set up UMA zones */ /* * Initialise scheduler resources. * Add scheduler specific parts to proc, thread as needed. */ schedinit(); /* scheduler gets its house in order */ /* * Create process 0 (the swapper). */ LIST_INSERT_HEAD(&allproc, p, p_list); LIST_INSERT_HEAD(PIDHASH(0), p, p_hash); mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); p->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); pgrp0.pg_session = &session0; mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF); refcount_init(&session0.s_count, 1); session0.s_leader = p; p->p_sysent = &null_sysvec; p->p_flag = P_SYSTEM | P_INMEM | P_KPROC; p->p_flag2 = 0; p->p_state = PRS_NORMAL; p->p_klist = knlist_alloc(&p->p_mtx); STAILQ_INIT(&p->p_ktr); p->p_nice = NZERO; /* pid_max cannot be greater than PID_MAX */ td->td_tid = PID_MAX + 1; LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); td->td_state = TDS_RUNNING; td->td_pri_class = PRI_TIMESHARE; td->td_user_pri = PUSER; td->td_base_user_pri = PUSER; td->td_lend_user_pri = PRI_MAX; td->td_priority = PVM; td->td_base_pri = PVM; td->td_oncpu = curcpu; td->td_flags = TDF_INMEM; td->td_pflags = TDP_KTHREAD; td->td_cpuset = cpuset_thread0(); vm_domain_policy_init(&td->td_vm_dom_policy); vm_domain_policy_set(&td->td_vm_dom_policy, VM_POLICY_NONE, -1); vm_domain_policy_init(&p->p_vm_dom_policy); vm_domain_policy_set(&p->p_vm_dom_policy, VM_POLICY_NONE, -1); prison0_init(); p->p_peers = 0; p->p_leader = p; p->p_reaper = p; LIST_INIT(&p->p_reaplist); strncpy(p->p_comm, "kernel", sizeof (p->p_comm)); strncpy(td->td_name, "swapper", sizeof (td->td_name)); callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0); callout_init_mtx(&p->p_limco, &p->p_mtx, 0); - mtx_init(&td->td_slpmutex, "td_slpmutex", NULL, MTX_SPIN); - callout_init_mtx(&td->td_slpcallout, &td->td_slpmutex, 0); + callout_init(&td->td_slpcallout, 1); /* Create credentials. */ newcred = crget(); newcred->cr_ngroups = 1; /* group 0 */ newcred->cr_uidinfo = uifind(0); newcred->cr_ruidinfo = uifind(0); newcred->cr_prison = &prison0; newcred->cr_loginclass = loginclass_find("default"); proc_set_cred_init(p, newcred); #ifdef AUDIT audit_cred_kproc0(newcred); #endif #ifdef MAC mac_cred_create_swapper(newcred); #endif /* Create sigacts. */ p->p_sigacts = sigacts_alloc(); /* Initialize signal state for process 0. */ siginit(&proc0); /* Create the file descriptor table. */ p->p_fd = fdinit(NULL, false); p->p_fdtol = NULL; /* Create the limits structures. */ p->p_limit = lim_alloc(); for (i = 0; i < RLIM_NLIMITS; i++) p->p_limit->pl_rlimit[i].rlim_cur = p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY; p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz; /* Cast to avoid overflow on i386/PAE. */ pageablemem = ptoa((vm_paddr_t)vm_cnt.v_free_count); p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem; p->p_cpulimit = RLIM_INFINITY; PROC_LOCK(p); thread_cow_get_proc(td, p); PROC_UNLOCK(p); /* Initialize resource accounting structures. */ racct_create(&p->p_racct); p->p_stats = pstats_alloc(); /* Allocate a prototype map so we have something to fork. */ p->p_vmspace = &vmspace0; vmspace0.vm_refcnt = 1; pmap_pinit0(vmspace_pmap(&vmspace0)); /* * proc0 is not expected to enter usermode, so there is no special * handling for sv_minuser here, like is done for exec_new_vmspace(). */ vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0), p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser); /* * Call the init and ctor for the new thread and proc. We wait * to do this until all other structures are fairly sane. */ EVENTHANDLER_INVOKE(process_init, p); EVENTHANDLER_INVOKE(thread_init, td); EVENTHANDLER_INVOKE(process_ctor, p); EVENTHANDLER_INVOKE(thread_ctor, td); /* * Charge root for one process. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0); PROC_LOCK(p); racct_add_force(p, RACCT_NPROC, 1); PROC_UNLOCK(p); } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL); /* ARGSUSED*/ static void proc0_post(void *dummy __unused) { struct timespec ts; struct proc *p; struct rusage ru; struct thread *td; /* * Now we can look at the time, having had a chance to verify the * time from the filesystem. Pretend that proc0 started now. */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { microuptime(&p->p_stats->p_start); PROC_STATLOCK(p); rufetch(p, &ru); /* Clears thread stats */ PROC_STATUNLOCK(p); p->p_rux.rux_runtime = 0; p->p_rux.rux_uticks = 0; p->p_rux.rux_sticks = 0; p->p_rux.rux_iticks = 0; FOREACH_THREAD_IN_PROC(p, td) { td->td_runtime = 0; } } sx_sunlock(&allproc_lock); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); /* * Give the ``random'' number generator a thump. */ nanotime(&ts); srandom(ts.tv_sec ^ ts.tv_nsec); } SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL); static void random_init(void *dummy __unused) { /* * After CPU has been started we have some randomness on most * platforms via get_cyclecount(). For platforms that don't * we will reseed random(9) in proc0_post() as well. */ srandom(get_cyclecount()); } SYSINIT(random, SI_SUB_RANDOM, SI_ORDER_FIRST, random_init, NULL); /* *************************************************************************** **** **** The following SYSINIT's and glue code should be moved to the **** respective files on a per subsystem basis. **** *************************************************************************** */ /* *************************************************************************** **** **** The following code probably belongs in another file, like **** kern/init_init.c. **** *************************************************************************** */ /* * List of paths to try when searching for "init". */ static char init_path[MAXPATHLEN] = #ifdef INIT_PATH __XSTRING(INIT_PATH); #else "/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init"; #endif SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0, "Path used to search the init process"); /* * Shutdown timeout of init(8). * Unused within kernel, but used to control init(8), hence do not remove. */ #ifndef INIT_SHUTDOWN_TIMEOUT #define INIT_SHUTDOWN_TIMEOUT 120 #endif static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT; SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout, CTLFLAG_RW, &init_shutdown_timeout, 0, "Shutdown timeout of init(8). " "Unused within kernel, but used to control init(8)"); /* * Start the initial user process; try exec'ing each pathname in init_path. * The program is invoked with one argument containing the boot flags. */ static void start_init(void *dummy) { vm_offset_t addr; struct execve_args args; int options, error; char *var, *path, *next, *s; char *ucp, **uap, *arg0, *arg1; struct thread *td; struct proc *p; mtx_lock(&Giant); GIANT_REQUIRED; td = curthread; p = td->td_proc; vfs_mountroot(); /* Wipe GELI passphrase from the environment. */ kern_unsetenv("kern.geom.eli.passphrase"); /* * Need just enough stack to hold the faked-up "execve()" arguments. */ addr = p->p_sysent->sv_usrstack - PAGE_SIZE; if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; p->p_vmspace->vm_ssize = 1; if ((var = kern_getenv("init_path")) != NULL) { strlcpy(init_path, var, sizeof(init_path)); freeenv(var); } for (path = init_path; *path != '\0'; path = next) { while (*path == ':') path++; if (*path == '\0') break; for (next = path; *next != '\0' && *next != ':'; next++) /* nothing */ ; if (bootverbose) printf("start_init: trying %.*s\n", (int)(next - path), path); /* * Move out the boot flag argument. */ options = 0; ucp = (char *)p->p_sysent->sv_usrstack; (void)subyte(--ucp, 0); /* trailing zero */ if (boothowto & RB_SINGLE) { (void)subyte(--ucp, 's'); options = 1; } #ifdef notyet if (boothowto & RB_FASTBOOT) { (void)subyte(--ucp, 'f'); options = 1; } #endif #ifdef BOOTCDROM (void)subyte(--ucp, 'C'); options = 1; #endif if (options == 0) (void)subyte(--ucp, '-'); (void)subyte(--ucp, '-'); /* leading hyphen */ arg1 = ucp; /* * Move out the file name (also arg 0). */ (void)subyte(--ucp, 0); for (s = next - 1; s >= path; s--) (void)subyte(--ucp, *s); arg0 = ucp; /* * Move out the arg pointers. */ uap = (char **)rounddown2((intptr_t)ucp, sizeof(intptr_t)); (void)suword((caddr_t)--uap, (long)0); /* terminator */ (void)suword((caddr_t)--uap, (long)(intptr_t)arg1); (void)suword((caddr_t)--uap, (long)(intptr_t)arg0); /* * Point at the arguments. */ args.fname = arg0; args.argv = uap; args.envv = NULL; /* * Now try to exec the program. If can't for any reason * other than it doesn't exist, complain. * * Otherwise, return via fork_trampoline() all the way * to user mode as init! */ if ((error = sys_execve(td, &args)) == 0) { mtx_unlock(&Giant); return; } if (error != ENOENT) printf("exec %.*s: error %d\n", (int)(next - path), path, error); } printf("init: not found in path %s\n", init_path); panic("no init"); } /* * Like kproc_create(), but runs in it's own address space. * We do this early to reserve pid 1. * * Note special case - do not make it runnable yet. Other work * in progress will change this more. */ static void create_init(const void *udata __unused) { struct fork_req fr; struct ucred *newcred, *oldcred; struct thread *td; int error; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFSTOPPED; fr.fr_procp = &initproc; error = fork1(&thread0, &fr); if (error) panic("cannot fork init: %d\n", error); KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1")); /* divorce init's credentials from the kernel's */ newcred = crget(); sx_xlock(&proctree_lock); PROC_LOCK(initproc); initproc->p_flag |= P_SYSTEM | P_INMEM; initproc->p_treeflag |= P_TREE_REAPER; LIST_INSERT_HEAD(&initproc->p_reaplist, &proc0, p_reapsibling); oldcred = initproc->p_ucred; crcopy(newcred, oldcred); #ifdef MAC mac_cred_create_init(newcred); #endif #ifdef AUDIT audit_cred_proc1(newcred); #endif proc_set_cred(initproc, newcred); td = FIRST_THREAD_IN_PROC(initproc); crfree(td->td_ucred); td->td_ucred = crhold(initproc->p_ucred); PROC_UNLOCK(initproc); sx_xunlock(&proctree_lock); crfree(oldcred); cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL); } SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL); /* * Make it runnable now. */ static void kick_init(const void *udata __unused) { struct thread *td; td = FIRST_THREAD_IN_PROC(initproc); thread_lock(td); TD_SET_CAN_RUN(td); sched_add(td, SRQ_BORING); thread_unlock(td); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL); Index: projects/hps_head/sys/kern/kern_condvar.c =================================================================== --- projects/hps_head/sys/kern/kern_condvar.c (revision 303925) +++ projects/hps_head/sys/kern/kern_condvar.c (revision 303926) @@ -1,438 +1,442 @@ /*- * Copyright (c) 2000 Jake Burkholder . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif /* * A bound below which cv_waiters is valid. Once cv_waiters reaches this bound, * cv_signal must manually check the wait queue for threads. */ #define CV_WAITERS_BOUND INT_MAX #define CV_WAITERS_INC(cvp) do { \ if ((cvp)->cv_waiters < CV_WAITERS_BOUND) \ (cvp)->cv_waiters++; \ } while (0) /* * Common sanity checks for cv_wait* functions. */ #define CV_ASSERT(cvp, lock, td) do { \ KASSERT((td) != NULL, ("%s: td NULL", __func__)); \ KASSERT(TD_IS_RUNNING(td), ("%s: not TDS_RUNNING", __func__)); \ KASSERT((cvp) != NULL, ("%s: cvp NULL", __func__)); \ KASSERT((lock) != NULL, ("%s: lock NULL", __func__)); \ } while (0) /* * Initialize a condition variable. Must be called before use. */ void cv_init(struct cv *cvp, const char *desc) { cvp->cv_description = desc; cvp->cv_waiters = 0; } /* * Destroy a condition variable. The condition variable must be re-initialized * in order to be re-used. */ void cv_destroy(struct cv *cvp) { #ifdef INVARIANTS struct sleepqueue *sq; sleepq_lock(cvp); sq = sleepq_lookup(cvp); sleepq_release(cvp); KASSERT(sq == NULL, ("%s: associated sleep queue non-empty", __func__)); #endif } /* * Wait on a condition variable. The current thread is placed on the condition * variable's wait queue and suspended. A cv_signal or cv_broadcast on the same * condition variable will resume the thread. The mutex is released before * sleeping and will be held on return. It is recommended that the mutex be * held when cv_signal or cv_broadcast are called. */ void _cv_wait(struct cv *cvp, struct lock_object *lock) { WITNESS_SAVE_DECL(lock_witness); struct lock_class *class; struct thread *td; uintptr_t lock_state; td = curthread; lock_state = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, cv_wmesg(cvp)); #endif CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); class = LOCK_CLASS(lock); if (SCHEDULER_STOPPED()) return; sleepq_lock(cvp); CV_WAITERS_INC(cvp); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); } sleepq_wait(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, cv_wmesg(cvp)); #endif PICKUP_GIANT(); if (lock != &Giant.lock_object) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } } /* * Wait on a condition variable. This function differs from cv_wait by * not acquiring the mutex after condition variable was signaled. */ void _cv_wait_unlock(struct cv *cvp, struct lock_object *lock) { struct lock_class *class; struct thread *td; td = curthread; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, cv_wmesg(cvp)); #endif CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); KASSERT(lock != &Giant.lock_object, ("cv_wait_unlock cannot be used with Giant")); class = LOCK_CLASS(lock); if (SCHEDULER_STOPPED()) { class->lc_unlock(lock); return; } sleepq_lock(cvp); CV_WAITERS_INC(cvp); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); sleepq_wait(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, cv_wmesg(cvp)); #endif PICKUP_GIANT(); } /* * Wait on a condition variable, allowing interruption by signals. Return 0 if * the thread was resumed with cv_signal or cv_broadcast, EINTR or ERESTART if * a signal was caught. If ERESTART is returned the system call should be * restarted if possible. */ int _cv_wait_sig(struct cv *cvp, struct lock_object *lock) { WITNESS_SAVE_DECL(lock_witness); struct lock_class *class; struct thread *td; uintptr_t lock_state; int rval; td = curthread; lock_state = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, cv_wmesg(cvp)); #endif CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); class = LOCK_CLASS(lock); if (SCHEDULER_STOPPED()) return (0); sleepq_lock(cvp); CV_WAITERS_INC(cvp); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR | SLEEPQ_INTERRUPTIBLE, 0); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); } rval = sleepq_wait_sig(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, cv_wmesg(cvp)); #endif PICKUP_GIANT(); if (lock != &Giant.lock_object) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } /* * Wait on a condition variable for (at most) the value specified in sbt * argument. Returns 0 if the process was resumed by cv_signal or cv_broadcast, * EWOULDBLOCK if the timeout expires. */ int _cv_timedwait_sbt(struct cv *cvp, struct lock_object *lock, sbintime_t sbt, sbintime_t pr, int flags) { WITNESS_SAVE_DECL(lock_witness); struct lock_class *class; struct thread *td; int lock_state, rval; td = curthread; lock_state = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, cv_wmesg(cvp)); #endif CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); class = LOCK_CLASS(lock); if (SCHEDULER_STOPPED()) return (0); sleepq_lock(cvp); CV_WAITERS_INC(cvp); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); - sleepq_release(cvp); sleepq_set_timeout_sbt(cvp, sbt, pr, flags); if (lock != &Giant.lock_object) { + if (class->lc_flags & LC_SLEEPABLE) + sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); + if (class->lc_flags & LC_SLEEPABLE) + sleepq_lock(cvp); } - sleepq_lock(cvp); rval = sleepq_timedwait(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, cv_wmesg(cvp)); #endif PICKUP_GIANT(); if (lock != &Giant.lock_object) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } /* * Wait on a condition variable for (at most) the value specified in sbt * argument, allowing interruption by signals. * Returns 0 if the thread was resumed by cv_signal or cv_broadcast, * EWOULDBLOCK if the timeout expires, and EINTR or ERESTART if a signal * was caught. */ int _cv_timedwait_sig_sbt(struct cv *cvp, struct lock_object *lock, sbintime_t sbt, sbintime_t pr, int flags) { WITNESS_SAVE_DECL(lock_witness); struct lock_class *class; struct thread *td; int lock_state, rval; td = curthread; lock_state = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, cv_wmesg(cvp)); #endif CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); class = LOCK_CLASS(lock); if (SCHEDULER_STOPPED()) return (0); sleepq_lock(cvp); CV_WAITERS_INC(cvp); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR | SLEEPQ_INTERRUPTIBLE, 0); - sleepq_release(cvp); sleepq_set_timeout_sbt(cvp, sbt, pr, flags); if (lock != &Giant.lock_object) { + if (class->lc_flags & LC_SLEEPABLE) + sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); + if (class->lc_flags & LC_SLEEPABLE) + sleepq_lock(cvp); } - sleepq_lock(cvp); rval = sleepq_timedwait_sig(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, cv_wmesg(cvp)); #endif PICKUP_GIANT(); if (lock != &Giant.lock_object) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } /* * Signal a condition variable, wakes up one waiting thread. Will also wakeup * the swapper if the process is not in memory, so that it can bring the * sleeping process in. Note that this may also result in additional threads * being made runnable. Should be called with the same mutex as was passed to * cv_wait held. */ void cv_signal(struct cv *cvp) { int wakeup_swapper; wakeup_swapper = 0; sleepq_lock(cvp); if (cvp->cv_waiters > 0) { if (cvp->cv_waiters == CV_WAITERS_BOUND && sleepq_lookup(cvp) == NULL) { cvp->cv_waiters = 0; } else { if (cvp->cv_waiters < CV_WAITERS_BOUND) cvp->cv_waiters--; wakeup_swapper = sleepq_signal(cvp, SLEEPQ_CONDVAR, 0, 0); } } sleepq_release(cvp); if (wakeup_swapper) kick_proc0(); } /* * Broadcast a signal to a condition variable. Wakes up all waiting threads. * Should be called with the same mutex as was passed to cv_wait held. */ void cv_broadcastpri(struct cv *cvp, int pri) { int wakeup_swapper; /* * XXX sleepq_broadcast pri argument changed from -1 meaning * no pri to 0 meaning no pri. */ wakeup_swapper = 0; if (pri == -1) pri = 0; sleepq_lock(cvp); if (cvp->cv_waiters > 0) { cvp->cv_waiters = 0; wakeup_swapper = sleepq_broadcast(cvp, SLEEPQ_CONDVAR, pri, 0); } sleepq_release(cvp); if (wakeup_swapper) kick_proc0(); } Index: projects/hps_head/sys/kern/kern_lock.c =================================================================== --- projects/hps_head/sys/kern/kern_lock.c (revision 303925) +++ projects/hps_head/sys/kern/kern_lock.c (revision 303926) @@ -1,1555 +1,1553 @@ /*- * Copyright (c) 2008 Attilio Rao * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include "opt_adaptive_lockmgrs.h" #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #ifdef DEBUG_LOCKS #include #endif #include #include #include #ifdef DDB #include #endif #ifdef HWPMC_HOOKS #include PMC_SOFT_DECLARE( , , lock, failed); #endif CTASSERT(((LK_ADAPTIVE | LK_NOSHARE) & LO_CLASSFLAGS) == (LK_ADAPTIVE | LK_NOSHARE)); CTASSERT(LK_UNLOCKED == (LK_UNLOCKED & ~(LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS))); #define SQ_EXCLUSIVE_QUEUE 0 #define SQ_SHARED_QUEUE 1 #ifndef INVARIANTS #define _lockmgr_assert(lk, what, file, line) #endif #define TD_SLOCKS_INC(td) ((td)->td_lk_slocks++) #define TD_SLOCKS_DEC(td) ((td)->td_lk_slocks--) #ifndef DEBUG_LOCKS #define STACK_PRINT(lk) #define STACK_SAVE(lk) #define STACK_ZERO(lk) #else #define STACK_PRINT(lk) stack_print_ddb(&(lk)->lk_stack) #define STACK_SAVE(lk) stack_save(&(lk)->lk_stack) #define STACK_ZERO(lk) stack_zero(&(lk)->lk_stack) #endif #define LOCK_LOG2(lk, string, arg1, arg2) \ if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \ CTR2(KTR_LOCK, (string), (arg1), (arg2)) #define LOCK_LOG3(lk, string, arg1, arg2, arg3) \ if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \ CTR3(KTR_LOCK, (string), (arg1), (arg2), (arg3)) #define GIANT_DECLARE \ int _i = 0; \ WITNESS_SAVE_DECL(Giant) #define GIANT_RESTORE() do { \ if (_i > 0) { \ while (_i--) \ mtx_lock(&Giant); \ WITNESS_RESTORE(&Giant.lock_object, Giant); \ } \ } while (0) #define GIANT_SAVE() do { \ if (mtx_owned(&Giant)) { \ WITNESS_SAVE(&Giant.lock_object, Giant); \ while (mtx_owned(&Giant)) { \ _i++; \ mtx_unlock(&Giant); \ } \ } \ } while (0) #define LK_CAN_SHARE(x, flags) \ (((x) & LK_SHARE) && \ (((x) & (LK_EXCLUSIVE_WAITERS | LK_EXCLUSIVE_SPINNERS)) == 0 || \ (curthread->td_lk_slocks != 0 && !(flags & LK_NODDLKTREAT)) || \ (curthread->td_pflags & TDP_DEADLKTREAT))) #define LK_TRYOP(x) \ ((x) & LK_NOWAIT) #define LK_CAN_WITNESS(x) \ (((x) & LK_NOWITNESS) == 0 && !LK_TRYOP(x)) #define LK_TRYWIT(x) \ (LK_TRYOP(x) ? LOP_TRYLOCK : 0) #define LK_CAN_ADAPT(lk, f) \ (((lk)->lock_object.lo_flags & LK_ADAPTIVE) != 0 && \ ((f) & LK_SLEEPFAIL) == 0) #define lockmgr_disowned(lk) \ (((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == LK_KERNPROC) #define lockmgr_xlocked(lk) \ (((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == (uintptr_t)curthread) static void assert_lockmgr(const struct lock_object *lock, int how); #ifdef DDB static void db_show_lockmgr(const struct lock_object *lock); #endif static void lock_lockmgr(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_lockmgr(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_lockmgr(struct lock_object *lock); struct lock_class lock_class_lockmgr = { .lc_name = "lockmgr", .lc_flags = LC_RECURSABLE | LC_SLEEPABLE | LC_SLEEPLOCK | LC_UPGRADABLE, .lc_assert = assert_lockmgr, #ifdef DDB .lc_ddb_show = db_show_lockmgr, #endif .lc_lock = lock_lockmgr, .lc_unlock = unlock_lockmgr, #ifdef KDTRACE_HOOKS .lc_owner = owner_lockmgr, #endif }; #ifdef ADAPTIVE_LOCKMGRS static u_int alk_retries = 10; static u_int alk_loops = 10000; static SYSCTL_NODE(_debug, OID_AUTO, lockmgr, CTLFLAG_RD, NULL, "lockmgr debugging"); SYSCTL_UINT(_debug_lockmgr, OID_AUTO, retries, CTLFLAG_RW, &alk_retries, 0, ""); SYSCTL_UINT(_debug_lockmgr, OID_AUTO, loops, CTLFLAG_RW, &alk_loops, 0, ""); #endif static __inline struct thread * lockmgr_xholder(const struct lock *lk) { uintptr_t x; x = lk->lk_lock; return ((x & LK_SHARE) ? NULL : (struct thread *)LK_HOLDER(x)); } /* * It assumes sleepq_lock held and returns with this one unheld. * It also assumes the generic interlock is sane and previously checked. * If LK_INTERLOCK is specified the interlock is not reacquired after the * sleep. */ static __inline int sleeplk(struct lock *lk, u_int flags, struct lock_object *ilk, const char *wmesg, int pri, int timo, int queue) { GIANT_DECLARE; struct lock_class *class; int catch, error; class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL; catch = pri & PCATCH; pri &= PRIMASK; error = 0; LOCK_LOG3(lk, "%s: %p blocking on the %s sleepqueue", __func__, lk, (queue == SQ_EXCLUSIVE_QUEUE) ? "exclusive" : "shared"); if (flags & LK_INTERLOCK) class->lc_unlock(ilk); if (queue == SQ_EXCLUSIVE_QUEUE && (flags & LK_SLEEPFAIL) != 0) lk->lk_exslpfail++; GIANT_SAVE(); sleepq_add(&lk->lock_object, NULL, wmesg, SLEEPQ_LK | (catch ? SLEEPQ_INTERRUPTIBLE : 0), queue); - if ((flags & LK_TIMELOCK) && timo) { - sleepq_release(&lk->lock_object); + if ((flags & LK_TIMELOCK) && timo) sleepq_set_timeout(&lk->lock_object, timo); - sleepq_lock(&lk->lock_object); - } + /* * Decisional switch for real sleeping. */ if ((flags & LK_TIMELOCK) && timo && catch) error = sleepq_timedwait_sig(&lk->lock_object, pri); else if ((flags & LK_TIMELOCK) && timo) error = sleepq_timedwait(&lk->lock_object, pri); else if (catch) error = sleepq_wait_sig(&lk->lock_object, pri); else sleepq_wait(&lk->lock_object, pri); GIANT_RESTORE(); if ((flags & LK_SLEEPFAIL) && error == 0) error = ENOLCK; return (error); } static __inline int wakeupshlk(struct lock *lk, const char *file, int line) { uintptr_t v, x; u_int realexslp; int queue, wakeup_swapper; WITNESS_UNLOCK(&lk->lock_object, 0, file, line); LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, file, line); wakeup_swapper = 0; for (;;) { x = lk->lk_lock; /* * If there is more than one shared lock held, just drop one * and return. */ if (LK_SHARERS(x) > 1) { if (atomic_cmpset_rel_ptr(&lk->lk_lock, x, x - LK_ONE_SHARER)) break; continue; } /* * If there are not waiters on the exclusive queue, drop the * lock quickly. */ if ((x & LK_ALL_WAITERS) == 0) { MPASS((x & ~LK_EXCLUSIVE_SPINNERS) == LK_SHARERS_LOCK(1)); if (atomic_cmpset_rel_ptr(&lk->lk_lock, x, LK_UNLOCKED)) break; continue; } /* * We should have a sharer with waiters, so enter the hard * path in order to handle wakeups correctly. */ sleepq_lock(&lk->lock_object); x = lk->lk_lock & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); v = LK_UNLOCKED; /* * If the lock has exclusive waiters, give them preference in * order to avoid deadlock with shared runners up. * If interruptible sleeps left the exclusive queue empty * avoid a starvation for the threads sleeping on the shared * queue by giving them precedence and cleaning up the * exclusive waiters bit anyway. * Please note that lk_exslpfail count may be lying about * the real number of waiters with the LK_SLEEPFAIL flag on * because they may be used in conjunction with interruptible * sleeps so lk_exslpfail might be considered an 'upper limit' * bound, including the edge cases. */ realexslp = sleepq_sleepcnt(&lk->lock_object, SQ_EXCLUSIVE_QUEUE); if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) { if (lk->lk_exslpfail < realexslp) { lk->lk_exslpfail = 0; queue = SQ_EXCLUSIVE_QUEUE; v |= (x & LK_SHARED_WAITERS); } else { lk->lk_exslpfail = 0; LOCK_LOG2(lk, "%s: %p has only LK_SLEEPFAIL sleepers", __func__, lk); LOCK_LOG2(lk, "%s: %p waking up threads on the exclusive queue", __func__, lk); wakeup_swapper = sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); queue = SQ_SHARED_QUEUE; } } else { /* * Exclusive waiters sleeping with LK_SLEEPFAIL on * and using interruptible sleeps/timeout may have * left spourious lk_exslpfail counts on, so clean * it up anyway. */ lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; } if (!atomic_cmpset_rel_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x, v)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue", __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" : "exclusive"); wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, queue); sleepq_release(&lk->lock_object); break; } lock_profile_release_lock(&lk->lock_object); TD_LOCKS_DEC(curthread); TD_SLOCKS_DEC(curthread); return (wakeup_swapper); } static void assert_lockmgr(const struct lock_object *lock, int what) { panic("lockmgr locks do not support assertions"); } static void lock_lockmgr(struct lock_object *lock, uintptr_t how) { panic("lockmgr locks do not support sleep interlocking"); } static uintptr_t unlock_lockmgr(struct lock_object *lock) { panic("lockmgr locks do not support sleep interlocking"); } #ifdef KDTRACE_HOOKS static int owner_lockmgr(const struct lock_object *lock, struct thread **owner) { panic("lockmgr locks do not support owner inquiring"); } #endif void lockinit(struct lock *lk, int pri, const char *wmesg, int timo, int flags) { int iflags; MPASS((flags & ~LK_INIT_MASK) == 0); ASSERT_ATOMIC_LOAD_PTR(lk->lk_lock, ("%s: lockmgr not aligned for %s: %p", __func__, wmesg, &lk->lk_lock)); iflags = LO_SLEEPABLE | LO_UPGRADABLE; if (flags & LK_CANRECURSE) iflags |= LO_RECURSABLE; if ((flags & LK_NODUP) == 0) iflags |= LO_DUPOK; if (flags & LK_NOPROFILE) iflags |= LO_NOPROFILE; if ((flags & LK_NOWITNESS) == 0) iflags |= LO_WITNESS; if (flags & LK_QUIET) iflags |= LO_QUIET; if (flags & LK_IS_VNODE) iflags |= LO_IS_VNODE; iflags |= flags & (LK_ADAPTIVE | LK_NOSHARE); lock_init(&lk->lock_object, &lock_class_lockmgr, wmesg, NULL, iflags); lk->lk_lock = LK_UNLOCKED; lk->lk_recurse = 0; lk->lk_exslpfail = 0; lk->lk_timo = timo; lk->lk_pri = pri; STACK_ZERO(lk); } /* * XXX: Gross hacks to manipulate external lock flags after * initialization. Used for certain vnode and buf locks. */ void lockallowshare(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags &= ~LK_NOSHARE; } void lockdisableshare(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags |= LK_NOSHARE; } void lockallowrecurse(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags |= LO_RECURSABLE; } void lockdisablerecurse(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags &= ~LO_RECURSABLE; } void lockdestroy(struct lock *lk) { KASSERT(lk->lk_lock == LK_UNLOCKED, ("lockmgr still held")); KASSERT(lk->lk_recurse == 0, ("lockmgr still recursed")); KASSERT(lk->lk_exslpfail == 0, ("lockmgr still exclusive waiters")); lock_destroy(&lk->lock_object); } int __lockmgr_args(struct lock *lk, u_int flags, struct lock_object *ilk, const char *wmesg, int pri, int timo, const char *file, int line) { GIANT_DECLARE; struct lock_class *class; const char *iwmesg; uintptr_t tid, v, x; u_int op, realexslp; int error, ipri, itimo, queue, wakeup_swapper; #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif #ifdef ADAPTIVE_LOCKMGRS volatile struct thread *owner; u_int i, spintries = 0; #endif error = 0; tid = (uintptr_t)curthread; op = (flags & LK_TYPE_MASK); iwmesg = (wmesg == LK_WMESG_DEFAULT) ? lk->lock_object.lo_name : wmesg; ipri = (pri == LK_PRIO_DEFAULT) ? lk->lk_pri : pri; itimo = (timo == LK_TIMO_DEFAULT) ? lk->lk_timo : timo; MPASS((flags & ~LK_TOTAL_MASK) == 0); KASSERT((op & (op - 1)) == 0, ("%s: Invalid requested operation @ %s:%d", __func__, file, line)); KASSERT((flags & (LK_NOWAIT | LK_SLEEPFAIL)) == 0 || (op != LK_DOWNGRADE && op != LK_RELEASE), ("%s: Invalid flags in regard of the operation desired @ %s:%d", __func__, file, line)); KASSERT((flags & LK_INTERLOCK) == 0 || ilk != NULL, ("%s: LK_INTERLOCK passed without valid interlock @ %s:%d", __func__, file, line)); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("%s: idle thread %p on lockmgr %s @ %s:%d", __func__, curthread, lk->lock_object.lo_name, file, line)); class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL; if (panicstr != NULL) { if (flags & LK_INTERLOCK) class->lc_unlock(ilk); return (0); } if (lk->lock_object.lo_flags & LK_NOSHARE) { switch (op) { case LK_SHARED: op = LK_EXCLUSIVE; break; case LK_UPGRADE: case LK_TRYUPGRADE: case LK_DOWNGRADE: _lockmgr_assert(lk, KA_XLOCKED | KA_NOTRECURSED, file, line); if (flags & LK_INTERLOCK) class->lc_unlock(ilk); return (0); } } wakeup_swapper = 0; switch (op) { case LK_SHARED: if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, file, line, flags & LK_INTERLOCK ? ilk : NULL); for (;;) { x = lk->lk_lock; /* * If no other thread has an exclusive lock, or * no exclusive waiter is present, bump the count of * sharers. Since we have to preserve the state of * waiters, if we fail to acquire the shared lock * loop back and retry. */ if (LK_CAN_SHARE(x, flags)) { if (atomic_cmpset_acq_ptr(&lk->lk_lock, x, x + LK_ONE_SHARER)) break; continue; } #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&lk->lock_object, &contested, &waittime); /* * If the lock is already held by curthread in * exclusive way avoid a deadlock. */ if (LK_HOLDER(x) == tid) { LOCK_LOG2(lk, "%s: %p already held in exclusive mode", __func__, lk); error = EDEADLK; break; } /* * If the lock is expected to not sleep just give up * and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; break; } #ifdef ADAPTIVE_LOCKMGRS /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. We need a double-state handle here * because for a failed acquisition the lock can be * either held in exclusive mode or shared mode * (for the writer starvation avoidance technique). */ if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 && LK_HOLDER(x) != LK_KERNPROC) { owner = (struct thread *)LK_HOLDER(x); if (LOCK_LOG_TEST(&lk->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, lk, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "spinning", "lockname:\"%s\"", lk->lock_object.lo_name); /* * If we are holding also an interlock drop it * in order to avoid a deadlock if the lockmgr * owner is adaptively spinning on the * interlock itself. */ if (flags & LK_INTERLOCK) { class->lc_unlock(ilk); flags &= ~LK_INTERLOCK; } GIANT_SAVE(); while (LK_HOLDER(lk->lk_lock) == (uintptr_t)owner && TD_IS_RUNNING(owner)) cpu_spinwait(); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(td), "running"); GIANT_RESTORE(); continue; } else if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) != 0 && LK_SHARERS(x) && spintries < alk_retries) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "spinning", "lockname:\"%s\"", lk->lock_object.lo_name); if (flags & LK_INTERLOCK) { class->lc_unlock(ilk); flags &= ~LK_INTERLOCK; } GIANT_SAVE(); spintries++; for (i = 0; i < alk_loops; i++) { if (LOCK_LOG_TEST(&lk->lock_object, 0)) CTR4(KTR_LOCK, "%s: shared spinning on %p with %u and %u", __func__, lk, spintries, i); x = lk->lk_lock; if ((x & LK_SHARE) == 0 || LK_CAN_SHARE(x, flags) != 0) break; cpu_spinwait(); } KTR_STATE0(KTR_SCHED, "thread", sched_tdname(td), "running"); GIANT_RESTORE(); if (i != alk_loops) continue; } #endif /* * Acquire the sleepqueue chain lock because we * probabilly will need to manipulate waiters flags. */ sleepq_lock(&lk->lock_object); x = lk->lk_lock; /* * if the lock can be acquired in shared mode, try * again. */ if (LK_CAN_SHARE(x, flags)) { sleepq_release(&lk->lock_object); continue; } #ifdef ADAPTIVE_LOCKMGRS /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owner) while we were waiting on the turnstile * chain lock. If so, drop the turnstile lock and try * again. */ if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 && LK_HOLDER(x) != LK_KERNPROC) { owner = (struct thread *)LK_HOLDER(x); if (TD_IS_RUNNING(owner)) { sleepq_release(&lk->lock_object); continue; } } #endif /* * Try to set the LK_SHARED_WAITERS flag. If we fail, * loop back and retry. */ if ((x & LK_SHARED_WAITERS) == 0) { if (!atomic_cmpset_acq_ptr(&lk->lk_lock, x, x | LK_SHARED_WAITERS)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG2(lk, "%s: %p set shared waiters flag", __func__, lk); } /* * As far as we have been unable to acquire the * shared lock and the shared waiters flag is set, * we will sleep. */ error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo, SQ_SHARED_QUEUE); flags &= ~LK_INTERLOCK; if (error) { LOCK_LOG3(lk, "%s: interrupted sleep for %p with %d", __func__, lk, error); break; } LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", __func__, lk); } if (error == 0) { lock_profile_obtain_lock_success(&lk->lock_object, contested, waittime, file, line); LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, file, line); WITNESS_LOCK(&lk->lock_object, LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); TD_SLOCKS_INC(curthread); STACK_SAVE(lk); } break; case LK_UPGRADE: case LK_TRYUPGRADE: _lockmgr_assert(lk, KA_SLOCKED, file, line); v = lk->lk_lock; x = v & LK_ALL_WAITERS; v &= LK_EXCLUSIVE_SPINNERS; /* * Try to switch from one shared lock to an exclusive one. * We need to preserve waiters flags during the operation. */ if (atomic_cmpset_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x | v, tid | x)) { LOCK_LOG_LOCK("XUPGRADE", &lk->lock_object, 0, 0, file, line); WITNESS_UPGRADE(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_SLOCKS_DEC(curthread); break; } /* * In LK_TRYUPGRADE mode, do not drop the lock, * returning EBUSY instead. */ if (op == LK_TRYUPGRADE) { LOCK_LOG2(lk, "%s: %p failed the nowait upgrade", __func__, lk); error = EBUSY; break; } /* * We have been unable to succeed in upgrading, so just * give up the shared lock. */ wakeup_swapper |= wakeupshlk(lk, file, line); /* FALLTHROUGH */ case LK_EXCLUSIVE: if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ? ilk : NULL); /* * If curthread already holds the lock and this one is * allowed to recurse, simply recurse on it. */ if (lockmgr_xlocked(lk)) { if ((flags & LK_CANRECURSE) == 0 && (lk->lock_object.lo_flags & LO_RECURSABLE) == 0) { /* * If the lock is expected to not panic just * give up and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; break; } if (flags & LK_INTERLOCK) class->lc_unlock(ilk); panic("%s: recursing on non recursive lockmgr %s @ %s:%d\n", __func__, iwmesg, file, line); } lk->lk_recurse++; LOCK_LOG2(lk, "%s: %p recursing", __func__, lk); LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); break; } for (;;) { if (lk->lk_lock == LK_UNLOCKED && atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) break; #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&lk->lock_object, &contested, &waittime); /* * If the lock is expected to not sleep just give up * and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; break; } #ifdef ADAPTIVE_LOCKMGRS /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. */ x = lk->lk_lock; if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 && LK_HOLDER(x) != LK_KERNPROC) { owner = (struct thread *)LK_HOLDER(x); if (LOCK_LOG_TEST(&lk->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, lk, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "spinning", "lockname:\"%s\"", lk->lock_object.lo_name); /* * If we are holding also an interlock drop it * in order to avoid a deadlock if the lockmgr * owner is adaptively spinning on the * interlock itself. */ if (flags & LK_INTERLOCK) { class->lc_unlock(ilk); flags &= ~LK_INTERLOCK; } GIANT_SAVE(); while (LK_HOLDER(lk->lk_lock) == (uintptr_t)owner && TD_IS_RUNNING(owner)) cpu_spinwait(); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(td), "running"); GIANT_RESTORE(); continue; } else if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) != 0 && LK_SHARERS(x) && spintries < alk_retries) { if ((x & LK_EXCLUSIVE_SPINNERS) == 0 && !atomic_cmpset_ptr(&lk->lk_lock, x, x | LK_EXCLUSIVE_SPINNERS)) continue; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "spinning", "lockname:\"%s\"", lk->lock_object.lo_name); if (flags & LK_INTERLOCK) { class->lc_unlock(ilk); flags &= ~LK_INTERLOCK; } GIANT_SAVE(); spintries++; for (i = 0; i < alk_loops; i++) { if (LOCK_LOG_TEST(&lk->lock_object, 0)) CTR4(KTR_LOCK, "%s: shared spinning on %p with %u and %u", __func__, lk, spintries, i); if ((lk->lk_lock & LK_EXCLUSIVE_SPINNERS) == 0) break; cpu_spinwait(); } KTR_STATE0(KTR_SCHED, "thread", sched_tdname(td), "running"); GIANT_RESTORE(); if (i != alk_loops) continue; } #endif /* * Acquire the sleepqueue chain lock because we * probabilly will need to manipulate waiters flags. */ sleepq_lock(&lk->lock_object); x = lk->lk_lock; /* * if the lock has been released while we spun on * the sleepqueue chain lock just try again. */ if (x == LK_UNLOCKED) { sleepq_release(&lk->lock_object); continue; } #ifdef ADAPTIVE_LOCKMGRS /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owner) while we were waiting on the turnstile * chain lock. If so, drop the turnstile lock and try * again. */ if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 && LK_HOLDER(x) != LK_KERNPROC) { owner = (struct thread *)LK_HOLDER(x); if (TD_IS_RUNNING(owner)) { sleepq_release(&lk->lock_object); continue; } } #endif /* * The lock can be in the state where there is a * pending queue of waiters, but still no owner. * This happens when the lock is contested and an * owner is going to claim the lock. * If curthread is the one successfully acquiring it * claim lock ownership and return, preserving waiters * flags. */ v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); if ((x & ~v) == LK_UNLOCKED) { v &= ~LK_EXCLUSIVE_SPINNERS; if (atomic_cmpset_acq_ptr(&lk->lk_lock, x, tid | v)) { sleepq_release(&lk->lock_object); LOCK_LOG2(lk, "%s: %p claimed by a new writer", __func__, lk); break; } sleepq_release(&lk->lock_object); continue; } /* * Try to set the LK_EXCLUSIVE_WAITERS flag. If we * fail, loop back and retry. */ if ((x & LK_EXCLUSIVE_WAITERS) == 0) { if (!atomic_cmpset_ptr(&lk->lk_lock, x, x | LK_EXCLUSIVE_WAITERS)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG2(lk, "%s: %p set excl waiters flag", __func__, lk); } /* * As far as we have been unable to acquire the * exclusive lock and the exclusive waiters flag * is set, we will sleep. */ error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo, SQ_EXCLUSIVE_QUEUE); flags &= ~LK_INTERLOCK; if (error) { LOCK_LOG3(lk, "%s: interrupted sleep for %p with %d", __func__, lk, error); break; } LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", __func__, lk); } if (error == 0) { lock_profile_obtain_lock_success(&lk->lock_object, contested, waittime, file, line); LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); STACK_SAVE(lk); } break; case LK_DOWNGRADE: _lockmgr_assert(lk, KA_XLOCKED, file, line); LOCK_LOG_LOCK("XDOWNGRADE", &lk->lock_object, 0, 0, file, line); WITNESS_DOWNGRADE(&lk->lock_object, 0, file, line); /* * Panic if the lock is recursed. */ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) { if (flags & LK_INTERLOCK) class->lc_unlock(ilk); panic("%s: downgrade a recursed lockmgr %s @ %s:%d\n", __func__, iwmesg, file, line); } TD_SLOCKS_INC(curthread); /* * In order to preserve waiters flags, just spin. */ for (;;) { x = lk->lk_lock; MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); x &= LK_ALL_WAITERS; if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x, LK_SHARERS_LOCK(1) | x)) break; cpu_spinwait(); } break; case LK_RELEASE: _lockmgr_assert(lk, KA_LOCKED, file, line); x = lk->lk_lock; if ((x & LK_SHARE) == 0) { /* * As first option, treact the lock as if it has not * any waiter. * Fix-up the tid var if the lock has been disowned. */ if (LK_HOLDER(x) == LK_KERNPROC) tid = LK_KERNPROC; else { WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_DEC(curthread); } LOCK_LOG_LOCK("XUNLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); /* * The lock is held in exclusive mode. * If the lock is recursed also, then unrecurse it. */ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) { LOCK_LOG2(lk, "%s: %p unrecursing", __func__, lk); lk->lk_recurse--; break; } if (tid != LK_KERNPROC) lock_profile_release_lock(&lk->lock_object); if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid, LK_UNLOCKED)) break; sleepq_lock(&lk->lock_object); x = lk->lk_lock; v = LK_UNLOCKED; /* * If the lock has exclusive waiters, give them * preference in order to avoid deadlock with * shared runners up. * If interruptible sleeps left the exclusive queue * empty avoid a starvation for the threads sleeping * on the shared queue by giving them precedence * and cleaning up the exclusive waiters bit anyway. * Please note that lk_exslpfail count may be lying * about the real number of waiters with the * LK_SLEEPFAIL flag on because they may be used in * conjunction with interruptible sleeps so * lk_exslpfail might be considered an 'upper limit' * bound, including the edge cases. */ MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); realexslp = sleepq_sleepcnt(&lk->lock_object, SQ_EXCLUSIVE_QUEUE); if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) { if (lk->lk_exslpfail < realexslp) { lk->lk_exslpfail = 0; queue = SQ_EXCLUSIVE_QUEUE; v |= (x & LK_SHARED_WAITERS); } else { lk->lk_exslpfail = 0; LOCK_LOG2(lk, "%s: %p has only LK_SLEEPFAIL sleepers", __func__, lk); LOCK_LOG2(lk, "%s: %p waking up threads on the exclusive queue", __func__, lk); wakeup_swapper = sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); queue = SQ_SHARED_QUEUE; } } else { /* * Exclusive waiters sleeping with LK_SLEEPFAIL * on and using interruptible sleeps/timeout * may have left spourious lk_exslpfail counts * on, so clean it up anyway. */ lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; } LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue", __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" : "exclusive"); atomic_store_rel_ptr(&lk->lk_lock, v); wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, queue); sleepq_release(&lk->lock_object); break; } else wakeup_swapper = wakeupshlk(lk, file, line); break; case LK_DRAIN: if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ? ilk : NULL); /* * Trying to drain a lock we already own will result in a * deadlock. */ if (lockmgr_xlocked(lk)) { if (flags & LK_INTERLOCK) class->lc_unlock(ilk); panic("%s: draining %s with the lock held @ %s:%d\n", __func__, iwmesg, file, line); } for (;;) { if (lk->lk_lock == LK_UNLOCKED && atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) break; #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&lk->lock_object, &contested, &waittime); /* * If the lock is expected to not sleep just give up * and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; break; } /* * Acquire the sleepqueue chain lock because we * probabilly will need to manipulate waiters flags. */ sleepq_lock(&lk->lock_object); x = lk->lk_lock; /* * if the lock has been released while we spun on * the sleepqueue chain lock just try again. */ if (x == LK_UNLOCKED) { sleepq_release(&lk->lock_object); continue; } v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); if ((x & ~v) == LK_UNLOCKED) { v = (x & ~LK_EXCLUSIVE_SPINNERS); /* * If interruptible sleeps left the exclusive * queue empty avoid a starvation for the * threads sleeping on the shared queue by * giving them precedence and cleaning up the * exclusive waiters bit anyway. * Please note that lk_exslpfail count may be * lying about the real number of waiters with * the LK_SLEEPFAIL flag on because they may * be used in conjunction with interruptible * sleeps so lk_exslpfail might be considered * an 'upper limit' bound, including the edge * cases. */ if (v & LK_EXCLUSIVE_WAITERS) { queue = SQ_EXCLUSIVE_QUEUE; v &= ~LK_EXCLUSIVE_WAITERS; } else { /* * Exclusive waiters sleeping with * LK_SLEEPFAIL on and using * interruptible sleeps/timeout may * have left spourious lk_exslpfail * counts on, so clean it up anyway. */ MPASS(v & LK_SHARED_WAITERS); lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; v &= ~LK_SHARED_WAITERS; } if (queue == SQ_EXCLUSIVE_QUEUE) { realexslp = sleepq_sleepcnt(&lk->lock_object, SQ_EXCLUSIVE_QUEUE); if (lk->lk_exslpfail >= realexslp) { lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; v &= ~LK_SHARED_WAITERS; if (realexslp != 0) { LOCK_LOG2(lk, "%s: %p has only LK_SLEEPFAIL sleepers", __func__, lk); LOCK_LOG2(lk, "%s: %p waking up threads on the exclusive queue", __func__, lk); wakeup_swapper = sleepq_broadcast( &lk->lock_object, SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); } } else lk->lk_exslpfail = 0; } if (!atomic_cmpset_ptr(&lk->lk_lock, x, v)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG3(lk, "%s: %p waking up all threads on the %s queue", __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" : "exclusive"); wakeup_swapper |= sleepq_broadcast( &lk->lock_object, SLEEPQ_LK, 0, queue); /* * If shared waiters have been woken up we need * to wait for one of them to acquire the lock * before to set the exclusive waiters in * order to avoid a deadlock. */ if (queue == SQ_SHARED_QUEUE) { for (v = lk->lk_lock; (v & LK_SHARE) && !LK_SHARERS(v); v = lk->lk_lock) cpu_spinwait(); } } /* * Try to set the LK_EXCLUSIVE_WAITERS flag. If we * fail, loop back and retry. */ if ((x & LK_EXCLUSIVE_WAITERS) == 0) { if (!atomic_cmpset_ptr(&lk->lk_lock, x, x | LK_EXCLUSIVE_WAITERS)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG2(lk, "%s: %p set drain waiters flag", __func__, lk); } /* * As far as we have been unable to acquire the * exclusive lock and the exclusive waiters flag * is set, we will sleep. */ if (flags & LK_INTERLOCK) { class->lc_unlock(ilk); flags &= ~LK_INTERLOCK; } GIANT_SAVE(); sleepq_add(&lk->lock_object, NULL, iwmesg, SLEEPQ_LK, SQ_EXCLUSIVE_QUEUE); sleepq_wait(&lk->lock_object, ipri & PRIMASK); GIANT_RESTORE(); LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", __func__, lk); } if (error == 0) { lock_profile_obtain_lock_success(&lk->lock_object, contested, waittime, file, line); LOCK_LOG_LOCK("DRAIN", &lk->lock_object, 0, lk->lk_recurse, file, line); WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); STACK_SAVE(lk); } break; default: if (flags & LK_INTERLOCK) class->lc_unlock(ilk); panic("%s: unknown lockmgr request 0x%x\n", __func__, op); } if (flags & LK_INTERLOCK) class->lc_unlock(ilk); if (wakeup_swapper) kick_proc0(); return (error); } void _lockmgr_disown(struct lock *lk, const char *file, int line) { uintptr_t tid, x; if (SCHEDULER_STOPPED()) return; tid = (uintptr_t)curthread; _lockmgr_assert(lk, KA_XLOCKED, file, line); /* * Panic if the lock is recursed. */ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) panic("%s: disown a recursed lockmgr @ %s:%d\n", __func__, file, line); /* * If the owner is already LK_KERNPROC just skip the whole operation. */ if (LK_HOLDER(lk->lk_lock) != tid) return; lock_profile_release_lock(&lk->lock_object); LOCK_LOG_LOCK("XDISOWN", &lk->lock_object, 0, 0, file, line); WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_DEC(curthread); STACK_SAVE(lk); /* * In order to preserve waiters flags, just spin. */ for (;;) { x = lk->lk_lock; MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); x &= LK_ALL_WAITERS; if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x, LK_KERNPROC | x)) return; cpu_spinwait(); } } void lockmgr_printinfo(const struct lock *lk) { struct thread *td; uintptr_t x; if (lk->lk_lock == LK_UNLOCKED) printf("lock type %s: UNLOCKED\n", lk->lock_object.lo_name); else if (lk->lk_lock & LK_SHARE) printf("lock type %s: SHARED (count %ju)\n", lk->lock_object.lo_name, (uintmax_t)LK_SHARERS(lk->lk_lock)); else { td = lockmgr_xholder(lk); if (td == (struct thread *)LK_KERNPROC) printf("lock type %s: EXCL by KERNPROC\n", lk->lock_object.lo_name); else printf("lock type %s: EXCL by thread %p " "(pid %d, %s, tid %d)\n", lk->lock_object.lo_name, td, td->td_proc->p_pid, td->td_proc->p_comm, td->td_tid); } x = lk->lk_lock; if (x & LK_EXCLUSIVE_WAITERS) printf(" with exclusive waiters pending\n"); if (x & LK_SHARED_WAITERS) printf(" with shared waiters pending\n"); if (x & LK_EXCLUSIVE_SPINNERS) printf(" with exclusive spinners pending\n"); STACK_PRINT(lk); } int lockstatus(const struct lock *lk) { uintptr_t v, x; int ret; ret = LK_SHARED; x = lk->lk_lock; v = LK_HOLDER(x); if ((x & LK_SHARE) == 0) { if (v == (uintptr_t)curthread || v == LK_KERNPROC) ret = LK_EXCLUSIVE; else ret = LK_EXCLOTHER; } else if (x == LK_UNLOCKED) ret = 0; return (ret); } #ifdef INVARIANT_SUPPORT FEATURE(invariant_support, "Support for modules compiled with INVARIANTS option"); #ifndef INVARIANTS #undef _lockmgr_assert #endif void _lockmgr_assert(const struct lock *lk, int what, const char *file, int line) { int slocked = 0; if (panicstr != NULL) return; switch (what) { case KA_SLOCKED: case KA_SLOCKED | KA_NOTRECURSED: case KA_SLOCKED | KA_RECURSED: slocked = 1; case KA_LOCKED: case KA_LOCKED | KA_NOTRECURSED: case KA_LOCKED | KA_RECURSED: #ifdef WITNESS /* * We cannot trust WITNESS if the lock is held in exclusive * mode and a call to lockmgr_disown() happened. * Workaround this skipping the check if the lock is held in * exclusive mode even for the KA_LOCKED case. */ if (slocked || (lk->lk_lock & LK_SHARE)) { witness_assert(&lk->lock_object, what, file, line); break; } #endif if (lk->lk_lock == LK_UNLOCKED || ((lk->lk_lock & LK_SHARE) == 0 && (slocked || (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk))))) panic("Lock %s not %slocked @ %s:%d\n", lk->lock_object.lo_name, slocked ? "share" : "", file, line); if ((lk->lk_lock & LK_SHARE) == 0) { if (lockmgr_recursed(lk)) { if (what & KA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); } else if (what & KA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); } break; case KA_XLOCKED: case KA_XLOCKED | KA_NOTRECURSED: case KA_XLOCKED | KA_RECURSED: if (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk)) panic("Lock %s not exclusively locked @ %s:%d\n", lk->lock_object.lo_name, file, line); if (lockmgr_recursed(lk)) { if (what & KA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); } else if (what & KA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); break; case KA_UNLOCKED: if (lockmgr_xlocked(lk) || lockmgr_disowned(lk)) panic("Lock %s exclusively locked @ %s:%d\n", lk->lock_object.lo_name, file, line); break; default: panic("Unknown lockmgr assertion: %d @ %s:%d\n", what, file, line); } } #endif #ifdef DDB int lockmgr_chain(struct thread *td, struct thread **ownerp) { struct lock *lk; lk = td->td_wchan; if (LOCK_CLASS(&lk->lock_object) != &lock_class_lockmgr) return (0); db_printf("blocked on lockmgr %s", lk->lock_object.lo_name); if (lk->lk_lock & LK_SHARE) db_printf("SHARED (count %ju)\n", (uintmax_t)LK_SHARERS(lk->lk_lock)); else db_printf("EXCL\n"); *ownerp = lockmgr_xholder(lk); return (1); } static void db_show_lockmgr(const struct lock_object *lock) { struct thread *td; const struct lock *lk; lk = (const struct lock *)lock; db_printf(" state: "); if (lk->lk_lock == LK_UNLOCKED) db_printf("UNLOCKED\n"); else if (lk->lk_lock & LK_SHARE) db_printf("SLOCK: %ju\n", (uintmax_t)LK_SHARERS(lk->lk_lock)); else { td = lockmgr_xholder(lk); if (td == (struct thread *)LK_KERNPROC) db_printf("XLOCK: LK_KERNPROC\n"); else db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_proc->p_comm); if (lockmgr_recursed(lk)) db_printf(" recursed: %d\n", lk->lk_recurse); } db_printf(" waiters: "); switch (lk->lk_lock & LK_ALL_WAITERS) { case LK_SHARED_WAITERS: db_printf("shared\n"); break; case LK_EXCLUSIVE_WAITERS: db_printf("exclusive\n"); break; case LK_ALL_WAITERS: db_printf("shared and exclusive\n"); break; default: db_printf("none\n"); } db_printf(" spinners: "); if (lk->lk_lock & LK_EXCLUSIVE_SPINNERS) db_printf("exclusive\n"); else db_printf("none\n"); } #endif Index: projects/hps_head/sys/kern/kern_synch.c =================================================================== --- projects/hps_head/sys/kern/kern_synch.c (revision 303925) +++ projects/hps_head/sys/kern/kern_synch.c (revision 303926) @@ -1,601 +1,594 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include #define KTDSTATE(td) \ (((td)->td_inhibitors & TDI_SLEEPING) != 0 ? "sleep" : \ ((td)->td_inhibitors & TDI_SUSPENDED) != 0 ? "suspended" : \ ((td)->td_inhibitors & TDI_SWAPPED) != 0 ? "swapped" : \ ((td)->td_inhibitors & TDI_LOCK) != 0 ? "blocked" : \ ((td)->td_inhibitors & TDI_IWAIT) != 0 ? "iwait" : "yielding") static void synch_setup(void *dummy); SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup, NULL); int hogticks; static uint8_t pause_wchan[MAXCPU]; static struct callout loadav_callout; struct loadavg averunnable = { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ /* * Constants for averages over 1, 5, and 15 minutes * when sampling at 5 second intervals. */ static fixpt_t cexp[3] = { 0.9200444146293232 * FSCALE, /* exp(-1/12) */ 0.9834714538216174 * FSCALE, /* exp(-1/60) */ 0.9944598480048967 * FSCALE, /* exp(-1/180) */ }; /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */ SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, FSCALE, ""); static void loadav(void *arg); SDT_PROVIDER_DECLARE(sched); SDT_PROBE_DEFINE(sched, , , preempt); static void sleepinit(void *unused) { hogticks = (hz / 10) * 2; /* Default only. */ init_sleepqueues(); } /* * vmem tries to lock the sleepq mutexes when free'ing kva, so make sure * it is available. */ SYSINIT(sleepinit, SI_SUB_KMEM, SI_ORDER_ANY, sleepinit, 0); /* * General sleep call. Suspends the current thread until a wakeup is * performed on the specified identifier. The thread will then be made * runnable with the specified priority. Sleeps at most sbt units of time * (0 means no timeout). If pri includes the PCATCH flag, let signals * interrupt the sleep, otherwise ignore them while sleeping. Returns 0 if * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a * signal becomes pending, ERESTART is returned if the current system * call should be restarted if possible, and EINTR is returned if the system * call should be interrupted by the signal (return EINTR). * * The lock argument is unlocked before the caller is suspended, and * re-locked before _sleep() returns. If priority includes the PDROP * flag the lock is not re-locked before returning. */ int _sleep(void *ident, struct lock_object *lock, int priority, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) { struct thread *td; struct proc *p; struct lock_class *class; uintptr_t lock_state; int catch, pri, rval, sleepq_flags; WITNESS_SAVE_DECL(lock_witness); td = curthread; p = td->td_proc; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, wmesg); #endif WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Sleeping on \"%s\"", wmesg); KASSERT(sbt != 0 || mtx_owned(&Giant) || lock != NULL, ("sleeping without a lock")); KASSERT(p != NULL, ("msleep1")); KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep")); if (priority & PDROP) KASSERT(lock != NULL && lock != &Giant.lock_object, ("PDROP requires a non-Giant lock")); if (lock != NULL) class = LOCK_CLASS(lock); else class = NULL; if (SCHEDULER_STOPPED()) { if (lock != NULL && priority & PDROP) class->lc_unlock(lock); return (0); } catch = priority & PCATCH; pri = priority & PRIMASK; /* * If we are already on a sleep queue, then remove us from that * sleep queue first. We have to do this to handle recursive * sleeps. */ if (TD_ON_SLEEPQ(td)) sleepq_remove(td, td->td_wchan); if ((uint8_t *)ident >= &pause_wchan[0] && (uint8_t *)ident <= &pause_wchan[MAXCPU - 1]) sleepq_flags = SLEEPQ_PAUSE; else sleepq_flags = SLEEPQ_SLEEP; if (catch) sleepq_flags |= SLEEPQ_INTERRUPTIBLE; sleepq_lock(ident); CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)", td->td_tid, p->p_pid, td->td_name, wmesg, ident); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); if (lock != NULL && lock != &Giant.lock_object && !(class->lc_flags & LC_SLEEPABLE)) { WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); } else /* GCC needs to follow the Yellow Brick Road */ lock_state = -1; /* * We put ourselves on the sleep queue and start our timeout * before calling thread_suspend_check, as we could stop there, * and a wakeup or a SIGCONT (or both) could occur while we were * stopped without resuming us. Thus, we must be ready for sleep * when cursig() is called. If the wakeup happens while we're * stopped, then td will no longer be on a sleep queue upon * return from cursig(). */ sleepq_add(ident, lock, wmesg, sleepq_flags, 0); + if (sbt != 0) + sleepq_set_timeout_sbt(ident, sbt, pr, flags); if (lock != NULL && class->lc_flags & LC_SLEEPABLE) { sleepq_release(ident); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); - if (sbt != 0) - sleepq_set_timeout_sbt(ident, sbt, pr, flags); sleepq_lock(ident); - } else if (sbt != 0) { - sleepq_release(ident); - sleepq_set_timeout_sbt(ident, sbt, pr, flags); - sleepq_lock(ident); } if (sbt != 0 && catch) rval = sleepq_timedwait_sig(ident, pri); else if (sbt != 0) rval = sleepq_timedwait(ident, pri); else if (catch) rval = sleepq_wait_sig(ident, pri); else { sleepq_wait(ident, pri); rval = 0; } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, wmesg); #endif PICKUP_GIANT(); if (lock != NULL && lock != &Giant.lock_object && !(priority & PDROP)) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } int msleep_spin_sbt(void *ident, struct mtx *mtx, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) { struct thread *td; struct proc *p; int rval; WITNESS_SAVE_DECL(mtx); td = curthread; p = td->td_proc; KASSERT(mtx != NULL, ("sleeping without a mutex")); KASSERT(p != NULL, ("msleep1")); KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep")); if (SCHEDULER_STOPPED()) return (0); sleepq_lock(ident); CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)", td->td_tid, p->p_pid, td->td_name, wmesg, ident); DROP_GIANT(); mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED); WITNESS_SAVE(&mtx->lock_object, mtx); mtx_unlock_spin(mtx); /* * We put ourselves on the sleep queue and start our timeout. */ sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0); - if (sbt != 0) { - sleepq_release(ident); + if (sbt != 0) sleepq_set_timeout_sbt(ident, sbt, pr, flags); - sleepq_lock(ident); - } /* * Can't call ktrace with any spin locks held so it can lock the * ktrace_mtx lock, and WITNESS_WARN considers it an error to hold * any spin lock. Thus, we have to drop the sleepq spin lock while * we handle those requests. This is safe since we have placed our * thread on the sleep queue already. */ #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) { sleepq_release(ident); ktrcsw(1, 0, wmesg); sleepq_lock(ident); } #endif #ifdef WITNESS sleepq_release(ident); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Sleeping on \"%s\"", wmesg); sleepq_lock(ident); #endif if (sbt != 0) rval = sleepq_timedwait(ident, 0); else { sleepq_wait(ident, 0); rval = 0; } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, wmesg); #endif PICKUP_GIANT(); mtx_lock_spin(mtx); WITNESS_RESTORE(&mtx->lock_object, mtx); return (rval); } /* * pause() delays the calling thread by the given number of system ticks. * During cold bootup, pause() uses the DELAY() function instead of * the tsleep() function to do the waiting. The "timo" argument must be * greater than or equal to zero. A "timo" value of zero is equivalent * to a "timo" value of one. */ int pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) { KASSERT(sbt >= 0, ("pause: timeout must be >= 0")); /* silently convert invalid timeouts */ if (sbt == 0) sbt = tick_sbt; if (cold || kdb_active || SCHEDULER_STOPPED()) { /* * We delay one second at a time to avoid overflowing the * system specific DELAY() function(s): */ while (sbt >= SBT_1S) { DELAY(1000000); sbt -= SBT_1S; } /* Do the delay remainder, if any */ sbt = howmany(sbt, SBT_1US); if (sbt > 0) DELAY(sbt); return (0); } return (_sleep(&pause_wchan[curcpu], NULL, 0, wmesg, sbt, pr, flags)); } /* * Make all threads sleeping on the specified identifier runnable. */ void wakeup(void *ident) { int wakeup_swapper; sleepq_lock(ident); wakeup_swapper = sleepq_broadcast(ident, SLEEPQ_SLEEP, 0, 0); sleepq_release(ident); if (wakeup_swapper) { KASSERT(ident != &proc0, ("wakeup and wakeup_swapper and proc0")); kick_proc0(); } } /* * Make a thread sleeping on the specified identifier runnable. * May wake more than one thread if a target thread is currently * swapped out. */ void wakeup_one(void *ident) { int wakeup_swapper; sleepq_lock(ident); wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP, 0, 0); sleepq_release(ident); if (wakeup_swapper) kick_proc0(); } static void kdb_switch(void) { thread_unlock(curthread); kdb_backtrace(); kdb_reenter(); panic("%s: did not reenter debugger", __func__); } /* * The machine independent parts of context switching. */ void mi_switch(int flags, struct thread *newtd) { uint64_t runtime, new_switchtime; struct thread *td; td = curthread; /* XXX */ THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED); KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code")); #ifdef INVARIANTS if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td)) mtx_assert(&Giant, MA_NOTOWNED); #endif KASSERT(td->td_critnest == 1 || panicstr, ("mi_switch: switch in a critical section")); KASSERT((flags & (SW_INVOL | SW_VOL)) != 0, ("mi_switch: switch must be voluntary or involuntary")); KASSERT(newtd != curthread, ("mi_switch: preempting back to ourself")); /* * Don't perform context switches from the debugger. */ if (kdb_active) kdb_switch(); if (SCHEDULER_STOPPED()) return; if (flags & SW_VOL) { td->td_ru.ru_nvcsw++; td->td_swvoltick = ticks; } else { td->td_ru.ru_nivcsw++; td->td_swinvoltick = ticks; } #ifdef SCHED_STATS SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]); #endif /* * Compute the amount of time during which the current * thread was running, and add that to its total so far. */ new_switchtime = cpu_ticks(); runtime = new_switchtime - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, new_switchtime); td->td_generation++; /* bump preempt-detect counter */ PCPU_INC(cnt.v_swtch); PCPU_SET(switchticks, ticks); CTR4(KTR_PROC, "mi_switch: old thread %ld (td_sched %p, pid %ld, %s)", td->td_tid, td_get_sched(td), td->td_proc->p_pid, td->td_name); #if (KTR_COMPILE & KTR_SCHED) != 0 if (TD_IS_IDLETHREAD(td)) KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle", "prio:%d", td->td_priority); else KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td), "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg, "lockname:\"%s\"", td->td_lockname); #endif SDT_PROBE0(sched, , , preempt); sched_switch(td, newtd, flags); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); CTR4(KTR_PROC, "mi_switch: new thread %ld (td_sched %p, pid %ld, %s)", td->td_tid, td_get_sched(td), td->td_proc->p_pid, td->td_name); /* * If the last thread was exiting, finish cleaning it up. */ if ((td = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(td); } } /* * Change thread state to be runnable, placing it on the run queue if * it is in memory. If it is swapped out, return true so our caller * will know to awaken the swapper. */ int setrunnable(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(td->td_proc->p_state != PRS_ZOMBIE, ("setrunnable: pid %d is a zombie", td->td_proc->p_pid)); switch (td->td_state) { case TDS_RUNNING: case TDS_RUNQ: return (0); case TDS_INHIBITED: /* * If we are only inhibited because we are swapped out * then arange to swap in this process. Otherwise just return. */ if (td->td_inhibitors != TDI_SWAPPED) return (0); /* FALLTHROUGH */ case TDS_CAN_RUN: break; default: printf("state is 0x%x", td->td_state); panic("setrunnable(2)"); } if ((td->td_flags & TDF_INMEM) == 0) { if ((td->td_flags & TDF_SWAPINREQ) == 0) { td->td_flags |= TDF_SWAPINREQ; return (1); } } else sched_wakeup(td); return (0); } /* * Compute a tenex style load average of a quantity on * 1, 5 and 15 minute intervals. */ static void loadav(void *arg) { int i, nrun; struct loadavg *avg; nrun = sched_load(); avg = &averunnable; for (i = 0; i < 3; i++) avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; /* * Schedule the next update to occur after 5 seconds, but add a * random variation to avoid synchronisation with processes that * run at regular intervals. */ callout_reset_sbt(&loadav_callout, SBT_1US * (4000000 + (int)(random() % 2000001)), SBT_1US, loadav, NULL, C_DIRECT_EXEC | C_PREL(32)); } /* ARGSUSED */ static void synch_setup(void *dummy) { callout_init(&loadav_callout, 1); /* Kick off timeout driven events by calling first time. */ loadav(NULL); } int should_yield(void) { return ((u_int)ticks - (u_int)curthread->td_swvoltick >= hogticks); } void maybe_yield(void) { if (should_yield()) kern_yield(PRI_USER); } void kern_yield(int prio) { struct thread *td; td = curthread; DROP_GIANT(); thread_lock(td); if (prio == PRI_USER) prio = td->td_user_pri; if (prio >= 0) sched_prio(td, prio); mi_switch(SW_VOL | SWT_RELINQUISH, NULL); thread_unlock(td); PICKUP_GIANT(); } /* * General purpose yield system call. */ int sys_yield(struct thread *td, struct yield_args *uap) { thread_lock(td); if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_prio(td, PRI_MAX_TIMESHARE); mi_switch(SW_VOL | SWT_RELINQUISH, NULL); thread_unlock(td); td->td_retval[0] = 0; return (0); } Index: projects/hps_head/sys/kern/kern_thread.c =================================================================== --- projects/hps_head/sys/kern/kern_thread.c (revision 303925) +++ projects/hps_head/sys/kern/kern_thread.c (revision 303926) @@ -1,1209 +1,1205 @@ /*- * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include "opt_witness.h" #include "opt_hwpmc_hooks.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE(proc, , , lwp__exit); /* * thread related storage. */ static uma_zone_t thread_zone; TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); static struct mtx zombie_lock; MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN); static void thread_zombie(struct thread *); static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary); #define TID_BUFFER_SIZE 1024 struct mtx tid_lock; static struct unrhdr *tid_unrhdr; static lwpid_t tid_buffer[TID_BUFFER_SIZE]; static int tid_head, tid_tail; static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash"); struct tidhashhead *tidhashtbl; u_long tidhash; struct rwlock tidhash_lock; static lwpid_t tid_alloc(void) { lwpid_t tid; tid = alloc_unr(tid_unrhdr); if (tid != -1) return (tid); mtx_lock(&tid_lock); if (tid_head == tid_tail) { mtx_unlock(&tid_lock); return (-1); } tid = tid_buffer[tid_head]; tid_head = (tid_head + 1) % TID_BUFFER_SIZE; mtx_unlock(&tid_lock); return (tid); } static void tid_free(lwpid_t tid) { lwpid_t tmp_tid = -1; mtx_lock(&tid_lock); if ((tid_tail + 1) % TID_BUFFER_SIZE == tid_head) { tmp_tid = tid_buffer[tid_head]; tid_head = (tid_head + 1) % TID_BUFFER_SIZE; } tid_buffer[tid_tail] = tid; tid_tail = (tid_tail + 1) % TID_BUFFER_SIZE; mtx_unlock(&tid_lock); if (tmp_tid != -1) free_unr(tid_unrhdr, tmp_tid); } /* * Prepare a thread for use. */ static int thread_ctor(void *mem, int size, void *arg, int flags) { struct thread *td; td = (struct thread *)mem; td->td_state = TDS_INACTIVE; td->td_oncpu = NOCPU; td->td_tid = tid_alloc(); /* * Note that td_critnest begins life as 1 because the thread is not * running and is thereby implicitly waiting to be on the receiving * end of a context switch. */ td->td_critnest = 1; td->td_lend_user_pri = PRI_MAX; EVENTHANDLER_INVOKE(thread_ctor, td); #ifdef AUDIT audit_thread_alloc(td); #endif umtx_thread_alloc(td); - - mtx_init(&td->td_slpmutex, "td_slpmutex", NULL, MTX_SPIN); - callout_init_mtx(&td->td_slpcallout, &td->td_slpmutex, 0); return (0); } /* * Reclaim a thread after use. */ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; - /* make sure to drain any use of the "td->td_slpcallout" */ - callout_drain(&td->td_slpcallout); - mtx_destroy(&td->td_slpmutex); - #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (td->td_state) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. */ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif #ifdef AUDIT audit_thread_free(td); #endif /* Free all OSD associated to this thread. */ osd_thread_exit(td); EVENTHANDLER_INVOKE(thread_dtor, td); tid_free(td->td_tid); } /* * Initialize type-stable parts of a thread (when newly created). */ static int thread_init(void *mem, int size, int flags) { struct thread *td; td = (struct thread *)mem; td->td_sleepqueue = sleepq_alloc(); td->td_turnstile = turnstile_alloc(); td->td_rlqe = NULL; EVENTHANDLER_INVOKE(thread_init, td); umtx_thread_init(td); td->td_kstack = 0; td->td_sel = NULL; return (0); } /* * Tear down type-stable parts of a thread (just before being discarded). */ static void thread_fini(void *mem, int size) { struct thread *td; td = (struct thread *)mem; EVENTHANDLER_INVOKE(thread_fini, td); rlqentry_free(td->td_rlqe); turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); seltdfini(td); } /* * For a newly created process, * link up all the structures and its initial threads etc. * called from: * {arch}/{arch}/machdep.c {arch}_init(), init386() etc. * proc_dtor() (should go away) * proc_init() */ void proc_linkup0(struct proc *p, struct thread *td) { TAILQ_INIT(&p->p_threads); /* all threads in proc */ proc_linkup(p, td); } void proc_linkup(struct proc *p, struct thread *td) { sigqueue_init(&p->p_sigqueue, p); p->p_ksi = ksiginfo_alloc(1); if (p->p_ksi != NULL) { /* XXX p_ksi may be null if ksiginfo zone is not ready */ p->p_ksi->ksi_flags = KSI_EXT | KSI_INS; } LIST_INIT(&p->p_mqnotifier); p->p_numthreads = 0; thread_link(td, p); } /* * Initialize global thread allocation resources. */ void threadinit(void) { mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF); /* * pid_max cannot be greater than PID_MAX. * leave one number for thread0. */ tid_unrhdr = new_unrhdr(PID_MAX + 2, INT_MAX, &tid_lock); thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(), thread_ctor, thread_dtor, thread_init, thread_fini, 16 - 1, UMA_ZONE_NOFREE); tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash); rw_init(&tidhash_lock, "tidhash"); } /* * Place an unused thread on the zombie list. * Use the slpq as that must be unused by now. */ void thread_zombie(struct thread *td) { mtx_lock_spin(&zombie_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq); mtx_unlock_spin(&zombie_lock); } /* * Release a thread that has exited after cpu_throw(). */ void thread_stash(struct thread *td) { atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1); thread_zombie(td); } /* * Reap zombie resources. */ void thread_reap(void) { struct thread *td_first, *td_next; /* * Don't even bother to lock if none at this instant, - * we really don't care about the next instant.. + * we really don't care about the next instant. */ if (!TAILQ_EMPTY(&zombie_threads)) { mtx_lock_spin(&zombie_lock); td_first = TAILQ_FIRST(&zombie_threads); if (td_first) TAILQ_INIT(&zombie_threads); mtx_unlock_spin(&zombie_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_slpq); thread_cow_free(td_first); thread_free(td_first); td_first = td_next; } } } /* * Allocate a thread. */ struct thread * thread_alloc(int pages) { struct thread *td; thread_reap(); /* check if any zombies to get */ td = (struct thread *)uma_zalloc(thread_zone, M_WAITOK); KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack")); if (!vm_thread_new(td, pages)) { uma_zfree(thread_zone, td); return (NULL); } cpu_thread_alloc(td); vm_domain_policy_init(&td->td_vm_dom_policy); return (td); } int thread_alloc_stack(struct thread *td, int pages) { KASSERT(td->td_kstack == 0, ("thread_alloc_stack called on a thread with kstack")); if (!vm_thread_new(td, pages)) return (0); cpu_thread_alloc(td); return (1); } /* * Deallocate a thread. */ void thread_free(struct thread *td) { lock_profile_thread_exit(td); if (td->td_cpuset) cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_free(td); if (td->td_kstack != 0) vm_thread_dispose(td); vm_domain_policy_cleanup(&td->td_vm_dom_policy); + callout_drain(&td->td_slpcallout); uma_zfree(thread_zone, td); } void thread_cow_get_proc(struct thread *newtd, struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); newtd->td_ucred = crhold(p->p_ucred); newtd->td_limit = lim_hold(p->p_limit); newtd->td_cowgen = p->p_cowgen; } void thread_cow_get(struct thread *newtd, struct thread *td) { newtd->td_ucred = crhold(td->td_ucred); newtd->td_limit = lim_hold(td->td_limit); newtd->td_cowgen = td->td_cowgen; } void thread_cow_free(struct thread *td) { if (td->td_ucred != NULL) crfree(td->td_ucred); if (td->td_limit != NULL) lim_free(td->td_limit); } void thread_cow_update(struct thread *td) { struct proc *p; struct ucred *oldcred; struct plimit *oldlimit; p = td->td_proc; oldcred = NULL; oldlimit = NULL; PROC_LOCK(p); if (td->td_ucred != p->p_ucred) { oldcred = td->td_ucred; td->td_ucred = crhold(p->p_ucred); } if (td->td_limit != p->p_limit) { oldlimit = td->td_limit; td->td_limit = lim_hold(p->p_limit); } td->td_cowgen = p->p_cowgen; PROC_UNLOCK(p); if (oldcred != NULL) crfree(oldcred); if (oldlimit != NULL) lim_free(oldlimit); } /* * Discard the current thread and exit from its context. * Always called with scheduler locked. * * Because we can't free a thread while we're operating under its context, * push the current thread into our CPU's deadthread holder. This means * we needn't worry about someone else grabbing our context before we * do a cpu_throw(). */ void thread_exit(void) { uint64_t runtime, new_switchtime; struct thread *td; struct thread *td2; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td, (long)p->p_pid, td->td_name); KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending")); #ifdef AUDIT AUDIT_SYSCALL_EXIT(0, td); #endif /* * drop FPU & debug register state storage, or any other * architecture specific resources that * would not be on a new untouched process. */ cpu_thread_exit(td); /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff if we never had threads. * EXIT clears all sign of other threads when * it goes to single threading, so the last thread always * takes the short path. */ if (p->p_flag & P_HADTHREADS) { if (p->p_numthreads > 1) { atomic_add_int(&td->td_proc->p_exitthreads, 1); thread_unlink(td); td2 = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td2, td); /* * The test below is NOT true if we are the * sole exiting thread. P_STOPPED_SINGLE is unset * in exit1() after it is the only survivor. */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); if (wakeup_swapper) kick_proc0(); } } PCPU_SET(deadthread, td); } else { /* * The last thread is exiting.. but not through exit() */ panic ("thread_exit: Last thread exiting on its own"); } } #ifdef HWPMC_HOOKS /* * If this thread is part of a process that is being tracked by hwpmc(4), * inform the module of the thread's impending exit. */ if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif PROC_UNLOCK(p); PROC_STATLOCK(p); thread_lock(td); PROC_SUNLOCK(p); /* Do the same timestamp bookkeeping that mi_switch() would do. */ new_switchtime = cpu_ticks(); runtime = new_switchtime - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); PCPU_INC(cnt.v_swtch); /* Save our resource usage in our process. */ td->td_ru.ru_nvcsw++; ruxagg(p, td); rucollect(&p->p_ru, &td->td_ru); PROC_STATUNLOCK(p); td->td_state = TDS_INACTIVE; #ifdef WITNESS witness_thread_exit(td); #endif CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td); sched_throw(td); panic("I'm a teapot!"); /* NOTREACHED */ } /* * Do any thread specific cleanups that may be needed in wait() * called with Giant, proc and schedlock not held. */ void thread_wait(struct proc *p) { struct thread *td; mtx_assert(&Giant, MA_NOTOWNED); KASSERT(p->p_numthreads == 1, ("multiple threads in thread_wait()")); KASSERT(p->p_exitthreads == 0, ("p_exitthreads leaking")); td = FIRST_THREAD_IN_PROC(p); /* Lock the last thread so we spin until it exits cpu_throw(). */ thread_lock(td); thread_unlock(td); lock_profile_thread_exit(td); cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_clean(td); thread_cow_free(td); + callout_drain(&td->td_slpcallout); thread_reap(); /* check for zombie threads etc. */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. */ void thread_link(struct thread *td, struct proc *p) { /* * XXX This can't be enabled because it's called for proc0 before * its lock has been created. * PROC_LOCK_ASSERT(p, MA_OWNED); */ td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_flags = TDF_INMEM; LIST_INIT(&td->td_contested); LIST_INIT(&td->td_lprof[0]); LIST_INIT(&td->td_lprof[1]); sigqueue_init(&td->td_sigqueue, p); + callout_init(&td->td_slpcallout, 1); TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist); p->p_numthreads++; } /* * Called from: * thread_exit() */ void thread_unlink(struct thread *td) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; /* could clear a few other things here */ /* Must NOT clear links to proc! */ } static int calc_remaining(struct proc *p, int mode) { int remaining; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) remaining = p->p_numthreads - p->p_boundary_count; else if (mode == SINGLE_NO_EXIT || mode == SINGLE_ALLPROC) remaining = p->p_numthreads - p->p_suspcount; else panic("calc_remaining: wrong mode %d", mode); return (remaining); } static int remain_for_mode(int mode) { return (mode == SINGLE_ALLPROC ? 0 : 1); } static int weed_inhib(int mode, struct thread *td2, struct proc *p) { int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td2, MA_OWNED); wakeup_swapper = 0; switch (mode) { case SINGLE_EXIT: if (TD_IS_SUSPENDED(td2)) wakeup_swapper |= thread_unsuspend_one(td2, p, true); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, EINTR); break; case SINGLE_BOUNDARY: case SINGLE_NO_EXIT: if (TD_IS_SUSPENDED(td2) && (td2->td_flags & TDF_BOUNDARY) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, ERESTART); break; case SINGLE_ALLPROC: /* * ALLPROC suspend tries to avoid spurious EINTR for * threads sleeping interruptable, by suspending the * thread directly, similarly to sig_suspend_threads(). * Since such sleep is not performed at the user * boundary, TDF_BOUNDARY flag is not set, and TDF_ALLPROCSUSP * is used to avoid immediate un-suspend. */ if (TD_IS_SUSPENDED(td2) && (td2->td_flags & (TDF_BOUNDARY | TDF_ALLPROCSUSP)) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) { if ((td2->td_flags & TDF_SBDRY) == 0) { thread_suspend_one(td2); td2->td_flags |= TDF_ALLPROCSUSP; } else { wakeup_swapper |= sleepq_abort(td2, ERESTART); } } break; } return (wakeup_swapper); } /* * Enforce single-threading. * * Returns 1 if the caller must abort (another thread is waiting to * exit the process or similar). Process is locked! * Returns 0 when you are successfully the only thread running. * A process has successfully single threaded in the suspend mode when * There are no threads in user mode. Threads in the kernel must be * allowed to continue until they get to the user boundary. They may even * copy out their return values and data before suspending. They may however be * accelerated in reaching the user boundary as we will wake up * any sleeping threads that are interruptable. (PCATCH). */ int thread_single(struct proc *p, int mode) { struct thread *td; struct thread *td2; int remaining, wakeup_swapper; td = curthread; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); /* * If allowing non-ALLPROC singlethreading for non-curproc * callers, calc_remaining() and remain_for_mode() should be * adjusted to also account for td->td_proc != p. For now * this is not implemented because it is not used. */ KASSERT((mode == SINGLE_ALLPROC && td->td_proc != p) || (mode != SINGLE_ALLPROC && td->td_proc == p), ("mode %d proc %p curproc %p", mode, p, td->td_proc)); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_HADTHREADS) == 0 && mode != SINGLE_ALLPROC) return (0); /* Is someone already single threading? */ if (p->p_singlethread != NULL && p->p_singlethread != td) return (1); if (mode == SINGLE_EXIT) { p->p_flag |= P_SINGLE_EXIT; p->p_flag &= ~P_SINGLE_BOUNDARY; } else { p->p_flag &= ~P_SINGLE_EXIT; if (mode == SINGLE_BOUNDARY) p->p_flag |= P_SINGLE_BOUNDARY; else p->p_flag &= ~P_SINGLE_BOUNDARY; } if (mode == SINGLE_ALLPROC) p->p_flag |= P_TOTAL_STOP; p->p_flag |= P_STOPPED_SINGLE; PROC_SLOCK(p); p->p_singlethread = td; remaining = calc_remaining(p, mode); while (remaining != remain_for_mode(mode)) { if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE) goto stopme; wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if (TD_IS_INHIBITED(td2)) { wakeup_swapper |= weed_inhib(mode, td2, p); #ifdef SMP } else if (TD_IS_RUNNING(td2) && td != td2) { forward_signal(td2); #endif } thread_unlock(td2); } if (wakeup_swapper) kick_proc0(); remaining = calc_remaining(p, mode); /* * Maybe we suspended some threads.. was it enough? */ if (remaining == remain_for_mode(mode)) break; stopme: /* * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ thread_suspend_switch(td, p); remaining = calc_remaining(p, mode); } if (mode == SINGLE_EXIT) { /* * Convert the process to an unthreaded process. The * SINGLE_EXIT is called by exit1() or execve(), in * both cases other threads must be retired. */ KASSERT(p->p_numthreads == 1, ("Unthreading with >1 threads")); p->p_singlethread = NULL; p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_HADTHREADS); /* * Wait for any remaining threads to exit cpu_throw(). */ while (p->p_exitthreads != 0) { PROC_SUNLOCK(p); PROC_UNLOCK(p); sched_relinquish(td); PROC_LOCK(p); PROC_SLOCK(p); } } else if (mode == SINGLE_BOUNDARY) { /* * Wait until all suspended threads are removed from * the processors. The thread_suspend_check() * increments p_boundary_count while it is still * running, which makes it possible for the execve() * to destroy vmspace while our other threads are * still using the address space. * * We lock the thread, which is only allowed to * succeed after context switch code finished using * the address space. */ FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); KASSERT((td2->td_flags & TDF_BOUNDARY) != 0, ("td %p not on boundary", td2)); KASSERT(TD_IS_SUSPENDED(td2), ("td %p is not suspended", td2)); thread_unlock(td2); } } PROC_SUNLOCK(p); return (0); } bool thread_suspend_check_needed(void) { struct proc *p; struct thread *td; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); return (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) != 0 && (td->td_dbgflags & TDB_SUSPEND) != 0)); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). * If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. * * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. * * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediately *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediately * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); while (thread_suspend_check_needed()) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * It is safe to access p->p_singlethread unlocked * because it can only be set to our address by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if ((p->p_flag & P_SINGLE_EXIT) && return_instead) return (EINTR); /* Should we goto user boundary if we didn't come from there? */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && (p->p_flag & P_SINGLE_BOUNDARY) && return_instead) return (ERESTART); /* * Ignore suspend requests if they are deferred. */ if ((td->td_flags & TDF_SBDRY) != 0) { KASSERT(return_instead, ("TDF_SBDRY set for unsafe thread_suspend_check")); KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) != (TDF_SEINTR | TDF_SERESTART), ("both TDF_SEINTR and TDF_SERESTART")); return (TD_SBDRY_INTR(td) ? TD_SBDRY_ERRNO(td) : 0); } /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { PROC_UNLOCK(p); /* * Allow Linux emulation layer to do some work * before thread suicide. */ if (__predict_false(p->p_sysent->sv_thread_detach != NULL)) (p->p_sysent->sv_thread_detach)(td); umtx_thread_exit(td); kern_thr_exit(td); panic("stopped thread did not exit"); } PROC_SLOCK(p); thread_stopped(p); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount + 1) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); if (wakeup_swapper) kick_proc0(); } } PROC_UNLOCK(p); thread_lock(td); /* * When a thread suspends, it just * gets taken off all queues. */ thread_suspend_one(td); if (return_instead == 0) { p->p_boundary_count++; td->td_flags |= TDF_BOUNDARY; } PROC_SUNLOCK(p); mi_switch(SW_INVOL | SWT_SUSPEND, NULL); thread_unlock(td); PROC_LOCK(p); } return (0); } void thread_suspend_switch(struct thread *td, struct proc *p) { KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); /* * We implement thread_suspend_one in stages here to avoid * dropping the proc lock while the thread lock is owned. */ if (p == td->td_proc) { thread_stopped(p); p->p_suspcount++; } PROC_UNLOCK(p); thread_lock(td); td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); PROC_SUNLOCK(p); DROP_GIANT(); mi_switch(SW_VOL | SWT_SUSPEND, NULL); thread_unlock(td); PICKUP_GIANT(); PROC_LOCK(p); PROC_SLOCK(p); } void thread_suspend_one(struct thread *td) { struct proc *p; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); } static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended")); TD_CLR_SUSPENDED(td); td->td_flags &= ~TDF_ALLPROCSUSP; if (td->td_proc == p) { PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_suspcount--; if (boundary && (td->td_flags & TDF_BOUNDARY) != 0) { td->td_flags &= ~TDF_BOUNDARY; p->p_boundary_count--; } } return (setrunnable(td)); } /* * Allow all threads blocked by single threading to continue running. */ void thread_unsuspend(struct proc *p) { struct thread *td; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); wakeup_swapper = 0; if (!P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, true); } thread_unlock(td); } } else if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && p->p_numthreads == p->p_suspcount) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ if (p->p_singlethread->td_proc == p) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); } } if (wakeup_swapper) kick_proc0(); } /* * End the single threading mode.. */ void thread_single_end(struct proc *p, int mode) { struct thread *td; int wakeup_swapper; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((mode == SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) != 0) || (mode != SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) == 0), ("mode %d does not match P_TOTAL_STOP", mode)); KASSERT(mode == SINGLE_ALLPROC || p->p_singlethread == curthread, ("thread_single_end from other thread %p %p", curthread, p->p_singlethread)); KASSERT(mode != SINGLE_BOUNDARY || (p->p_flag & P_SINGLE_BOUNDARY) != 0, ("mis-matched SINGLE_BOUNDARY flags %x", p->p_flag)); p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY | P_TOTAL_STOP); PROC_SLOCK(p); p->p_singlethread = NULL; wakeup_swapper = 0; /* * If there are other threads they may now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. */ if (p->p_numthreads != remain_for_mode(mode) && !P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, mode == SINGLE_BOUNDARY); } thread_unlock(td); } } KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0, ("inconsistent boundary count %d", p->p_boundary_count)); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); } struct thread * thread_find(struct proc *p, lwpid_t tid) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td) { if (td->td_tid == tid) break; } return (td); } /* Locate a thread by number; return with proc lock held. */ struct thread * tdfind(lwpid_t tid, pid_t pid) { #define RUN_THRESH 16 struct thread *td; int run = 0; rw_rlock(&tidhash_lock); LIST_FOREACH(td, TIDHASH(tid), td_hash) { if (td->td_tid == tid) { if (pid != -1 && td->td_proc->p_pid != pid) { td = NULL; break; } PROC_LOCK(td->td_proc); if (td->td_proc->p_state == PRS_NEW) { PROC_UNLOCK(td->td_proc); td = NULL; break; } if (run > RUN_THRESH) { if (rw_try_upgrade(&tidhash_lock)) { LIST_REMOVE(td, td_hash); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(&tidhash_lock); return (td); } } break; } run++; } rw_runlock(&tidhash_lock); return (td); } void tidhash_add(struct thread *td) { rw_wlock(&tidhash_lock); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(&tidhash_lock); } void tidhash_remove(struct thread *td) { rw_wlock(&tidhash_lock); LIST_REMOVE(td, td_hash); rw_wunlock(&tidhash_lock); } Index: projects/hps_head/sys/kern/subr_sleepqueue.c =================================================================== --- projects/hps_head/sys/kern/subr_sleepqueue.c (revision 303925) +++ projects/hps_head/sys/kern/subr_sleepqueue.c (revision 303926) @@ -1,1357 +1,1364 @@ /*- * Copyright (c) 2004 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Implementation of sleep queues used to hold queue of threads blocked on * a wait channel. Sleep queues different from turnstiles in that wait * channels are not owned by anyone, so there is no priority propagation. * Sleep queues can also provide a timeout and can also be interrupted by * signals. That said, there are several similarities between the turnstile * and sleep queue implementations. (Note: turnstiles were implemented * first.) For example, both use a hash table of the same size where each * bucket is referred to as a "chain" that contains both a spin lock and * a linked list of queues. An individual queue is located by using a hash * to pick a chain, locking the chain, and then walking the chain searching * for the queue. This means that a wait channel object does not need to * embed it's queue head just as locks do not embed their turnstile queue * head. Threads also carry around a sleep queue that they lend to the * wait channel when blocking. Just as in turnstiles, the queue includes * a free list of the sleep queues of other threads blocked on the same * wait channel in the case of multiple waiters. * * Some additional functionality provided by sleep queues include the * ability to set a timeout. The timeout is managed using a per-thread * callout that resumes a thread if it is asleep. A thread may also * catch signals while it is asleep (aka an interruptible sleep). The * signal code uses sleepq_abort() to interrupt a sleeping thread. Finally, * sleep queues also provide some extra assertions. One is not allowed to * mix the sleep/wakeup and cv APIs for a given wait channel. Also, one * must consistently use the same lock to synchronize with a wait channel, * though this check is currently only a warning for sleep/wakeup due to * pre-existing abuse of that API. The same lock must also be held when * awakening threads, though that is currently only enforced for condition * variables. */ #include __FBSDID("$FreeBSD$"); #include "opt_sleepqueue_profiling.h" #include "opt_ddb.h" #include "opt_sched.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* * Constants for the hash table of sleep queue chains. * SC_TABLESIZE must be a power of two for SC_MASK to work properly. */ #define SC_TABLESIZE 256 /* Must be power of 2. */ #define SC_MASK (SC_TABLESIZE - 1) #define SC_SHIFT 8 #define SC_HASH(wc) ((((uintptr_t)(wc) >> SC_SHIFT) ^ (uintptr_t)(wc)) & \ SC_MASK) #define SC_LOOKUP(wc) &sleepq_chains[SC_HASH(wc)] #define NR_SLEEPQS 2 /* * There two different lists of sleep queues. Both lists are connected * via the sq_hash entries. The first list is the sleep queue chain list * that a sleep queue is on when it is attached to a wait channel. The * second list is the free list hung off of a sleep queue that is attached * to a wait channel. * * Each sleep queue also contains the wait channel it is attached to, the * list of threads blocked on that wait channel, flags specific to the * wait channel, and the lock used to synchronize with a wait channel. * The flags are used to catch mismatches between the various consumers * of the sleep queue API (e.g. sleep/wakeup and condition variables). * The lock pointer is only used when invariants are enabled for various * debugging checks. * * Locking key: * c - sleep queue chain lock */ struct sleepqueue { TAILQ_HEAD(, thread) sq_blocked[NR_SLEEPQS]; /* (c) Blocked threads. */ u_int sq_blockedcnt[NR_SLEEPQS]; /* (c) N. of blocked threads. */ LIST_ENTRY(sleepqueue) sq_hash; /* (c) Chain and free list. */ LIST_HEAD(, sleepqueue) sq_free; /* (c) Free queues. */ void *sq_wchan; /* (c) Wait channel. */ int sq_type; /* (c) Queue type. */ #ifdef INVARIANTS struct lock_object *sq_lock; /* (c) Associated lock. */ #endif }; struct sleepqueue_chain { LIST_HEAD(, sleepqueue) sc_queues; /* List of sleep queues. */ struct mtx sc_lock; /* Spin lock for this chain. */ #ifdef SLEEPQUEUE_PROFILING u_int sc_depth; /* Length of sc_queues. */ u_int sc_max_depth; /* Max length of sc_queues. */ #endif }; #ifdef SLEEPQUEUE_PROFILING u_int sleepq_max_depth; static SYSCTL_NODE(_debug, OID_AUTO, sleepq, CTLFLAG_RD, 0, "sleepq profiling"); static SYSCTL_NODE(_debug_sleepq, OID_AUTO, chains, CTLFLAG_RD, 0, "sleepq chain stats"); SYSCTL_UINT(_debug_sleepq, OID_AUTO, max_depth, CTLFLAG_RD, &sleepq_max_depth, 0, "maxmimum depth achieved of a single chain"); static void sleepq_profile(const char *wmesg); static int prof_enabled; #endif static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE]; static uma_zone_t sleepq_zone; /* * Prototypes for non-exported routines. */ static int sleepq_catch_signals(void *wchan, int pri); static int sleepq_check_signals(void); -static int sleepq_check_timeout(struct thread *); -static void sleepq_stop_timeout(struct thread *); +static int sleepq_check_timeout(void); #ifdef INVARIANTS static void sleepq_dtor(void *mem, int size, void *arg); #endif static int sleepq_init(void *mem, int size, int flags); static int sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri); static void sleepq_switch(void *wchan, int pri); static void sleepq_timeout(void *arg); SDT_PROBE_DECLARE(sched, , , sleep); SDT_PROBE_DECLARE(sched, , , wakeup); /* * Initialize SLEEPQUEUE_PROFILING specific sysctl nodes. * Note that it must happen after sleepinit() has been fully executed, so * it must happen after SI_SUB_KMEM SYSINIT() subsystem setup. */ #ifdef SLEEPQUEUE_PROFILING static void init_sleepqueue_profiling(void) { char chain_name[10]; struct sysctl_oid *chain_oid; u_int i; for (i = 0; i < SC_TABLESIZE; i++) { snprintf(chain_name, sizeof(chain_name), "%u", i); chain_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_debug_sleepq_chains), OID_AUTO, chain_name, CTLFLAG_RD, NULL, "sleepq chain stats"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "depth", CTLFLAG_RD, &sleepq_chains[i].sc_depth, 0, NULL); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_depth", CTLFLAG_RD, &sleepq_chains[i].sc_max_depth, 0, NULL); } } SYSINIT(sleepqueue_profiling, SI_SUB_LOCK, SI_ORDER_ANY, init_sleepqueue_profiling, NULL); #endif /* * Early initialization of sleep queues that is called from the sleepinit() * SYSINIT. */ void init_sleepqueues(void) { int i; for (i = 0; i < SC_TABLESIZE; i++) { LIST_INIT(&sleepq_chains[i].sc_queues); mtx_init(&sleepq_chains[i].sc_lock, "sleepq chain", NULL, MTX_SPIN | MTX_RECURSE); } sleepq_zone = uma_zcreate("SLEEPQUEUE", sizeof(struct sleepqueue), #ifdef INVARIANTS NULL, sleepq_dtor, sleepq_init, NULL, UMA_ALIGN_CACHE, 0); #else NULL, NULL, sleepq_init, NULL, UMA_ALIGN_CACHE, 0); #endif thread0.td_sleepqueue = sleepq_alloc(); } /* * Get a sleep queue for a new thread. */ struct sleepqueue * sleepq_alloc(void) { return (uma_zalloc(sleepq_zone, M_WAITOK)); } /* * Free a sleep queue when a thread is destroyed. */ void sleepq_free(struct sleepqueue *sq) { uma_zfree(sleepq_zone, sq); } /* * Lock the sleep queue chain associated with the specified wait channel. */ void sleepq_lock(void *wchan) { struct sleepqueue_chain *sc; sc = SC_LOOKUP(wchan); mtx_lock_spin(&sc->sc_lock); } /* * Look up the sleep queue associated with a given wait channel in the hash * table locking the associated sleep queue chain. If no queue is found in * the table, NULL is returned. */ struct sleepqueue * sleepq_lookup(void *wchan) { struct sleepqueue_chain *sc; struct sleepqueue *sq; KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); LIST_FOREACH(sq, &sc->sc_queues, sq_hash) if (sq->sq_wchan == wchan) return (sq); return (NULL); } /* * Unlock the sleep queue chain associated with a given wait channel. */ void sleepq_release(void *wchan) { struct sleepqueue_chain *sc; sc = SC_LOOKUP(wchan); mtx_unlock_spin(&sc->sc_lock); } /* * Places the current thread on the sleep queue for the specified wait * channel. If INVARIANTS is enabled, then it associates the passed in * lock with the sleepq to make sure it is held when that sleep queue is * woken up. */ void sleepq_add(void *wchan, struct lock_object *lock, const char *wmesg, int flags, int queue) { struct sleepqueue_chain *sc; struct sleepqueue *sq; struct thread *td; td = curthread; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); MPASS(td->td_sleepqueue != NULL); MPASS(wchan != NULL); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); /* If this thread is not allowed to sleep, die a horrible death. */ KASSERT(td->td_no_sleeping == 0, ("%s: td %p to sleep on wchan %p with sleeping prohibited", __func__, td, wchan)); /* Look up the sleep queue associated with the wait channel 'wchan'. */ sq = sleepq_lookup(wchan); /* * If the wait channel does not already have a sleep queue, use * this thread's sleep queue. Otherwise, insert the current thread * into the sleep queue already in use by this wait channel. */ if (sq == NULL) { #ifdef INVARIANTS int i; sq = td->td_sleepqueue; for (i = 0; i < NR_SLEEPQS; i++) { KASSERT(TAILQ_EMPTY(&sq->sq_blocked[i]), ("thread's sleep queue %d is not empty", i)); KASSERT(sq->sq_blockedcnt[i] == 0, ("thread's sleep queue %d count mismatches", i)); } KASSERT(LIST_EMPTY(&sq->sq_free), ("thread's sleep queue has a non-empty free list")); KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer")); sq->sq_lock = lock; #endif #ifdef SLEEPQUEUE_PROFILING sc->sc_depth++; if (sc->sc_depth > sc->sc_max_depth) { sc->sc_max_depth = sc->sc_depth; if (sc->sc_max_depth > sleepq_max_depth) sleepq_max_depth = sc->sc_max_depth; } #endif sq = td->td_sleepqueue; LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash); sq->sq_wchan = wchan; sq->sq_type = flags & SLEEPQ_TYPE; } else { MPASS(wchan == sq->sq_wchan); MPASS(lock == sq->sq_lock); MPASS((flags & SLEEPQ_TYPE) == sq->sq_type); LIST_INSERT_HEAD(&sq->sq_free, td->td_sleepqueue, sq_hash); } thread_lock(td); TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq); sq->sq_blockedcnt[queue]++; td->td_sleepqueue = NULL; td->td_sqqueue = queue; td->td_wchan = wchan; td->td_wmesg = wmesg; if (flags & SLEEPQ_INTERRUPTIBLE) { td->td_flags |= TDF_SINTR; td->td_flags &= ~TDF_SLEEPABORT; } thread_unlock(td); } /* * Sets a timeout that will remove the current thread from the specified * sleep queue after timo ticks if the thread has not already been awakened. */ void sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr, int flags) { + struct sleepqueue_chain *sc; struct thread *td; + sbintime_t pr1; td = curthread; - - mtx_lock_spin(&td->td_slpmutex); + sc = SC_LOOKUP(wchan); + mtx_assert(&sc->sc_lock, MA_OWNED); + MPASS(TD_ON_SLEEPQ(td)); + MPASS(td->td_sleepqueue == NULL); + MPASS(wchan != NULL); if (cold) panic("timed sleep before timers are working"); - callout_reset_sbt_on(&td->td_slpcallout, sbt, pr, - sleepq_timeout, td, PCPU_GET(cpuid), flags | C_DIRECT_EXEC); - mtx_unlock_spin(&td->td_slpmutex); + KASSERT(td->td_sleeptimo == 0, ("td %d %p td_sleeptimo %jx", + td->td_tid, td, (uintmax_t)td->td_sleeptimo)); + thread_lock(td); + callout_when(sbt, pr, flags, &td->td_sleeptimo, &pr1); + thread_unlock(td); + callout_reset_sbt_on(&td->td_slpcallout, td->td_sleeptimo, pr1, + sleepq_timeout, td, PCPU_GET(cpuid), flags | C_PRECALC | + C_DIRECT_EXEC); } /* * Return the number of actual sleepers for the specified queue. */ u_int sleepq_sleepcnt(void *wchan, int queue) { struct sleepqueue *sq; KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); sq = sleepq_lookup(wchan); if (sq == NULL) return (0); return (sq->sq_blockedcnt[queue]); } /* * Marks the pending sleep of the current thread as interruptible and * makes an initial check for pending signals before putting a thread * to sleep. Enters and exits with the thread lock held. Thread lock * may have transitioned from the sleepq lock to a run lock. */ static int sleepq_catch_signals(void *wchan, int pri) { struct sleepqueue_chain *sc; struct sleepqueue *sq; struct thread *td; struct proc *p; struct sigacts *ps; int sig, ret; td = curthread; p = curproc; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); MPASS(wchan != NULL); if ((td->td_pflags & TDP_WAKEUP) != 0) { td->td_pflags &= ~TDP_WAKEUP; ret = EINTR; thread_lock(td); goto out; } /* * See if there are any pending signals for this thread. If not * we can switch immediately. Otherwise do the signal processing * directly. */ thread_lock(td); if ((td->td_flags & (TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK)) == 0) { sleepq_switch(wchan, pri); return (0); } thread_unlock(td); mtx_unlock_spin(&sc->sc_lock); CTR3(KTR_PROC, "sleepq catching signals: thread %p (pid %ld, %s)", (void *)td, (long)p->p_pid, td->td_name); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sig = cursig(td); if (sig == -1) { mtx_unlock(&ps->ps_mtx); KASSERT((td->td_flags & TDF_SBDRY) != 0, ("lost TDF_SBDRY")); KASSERT(TD_SBDRY_INTR(td), ("lost TDF_SERESTART of TDF_SEINTR")); KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) != (TDF_SEINTR | TDF_SERESTART), ("both TDF_SEINTR and TDF_SERESTART")); ret = TD_SBDRY_ERRNO(td); } else if (sig == 0) { mtx_unlock(&ps->ps_mtx); ret = thread_suspend_check(1); MPASS(ret == 0 || ret == EINTR || ret == ERESTART); } else { if (SIGISMEMBER(ps->ps_sigintr, sig)) ret = EINTR; else ret = ERESTART; mtx_unlock(&ps->ps_mtx); } /* * Lock the per-process spinlock prior to dropping the PROC_LOCK * to avoid a signal delivery race. PROC_LOCK, PROC_SLOCK, and * thread_lock() are currently held in tdsendsignal(). */ PROC_SLOCK(p); mtx_lock_spin(&sc->sc_lock); PROC_UNLOCK(p); thread_lock(td); PROC_SUNLOCK(p); if (ret == 0) { sleepq_switch(wchan, pri); return (0); } out: /* * There were pending signals and this thread is still * on the sleep queue, remove it from the sleep queue. */ if (TD_ON_SLEEPQ(td)) { sq = sleepq_lookup(wchan); if (sleepq_resume_thread(sq, td, 0)) { #ifdef INVARIANTS /* * This thread hasn't gone to sleep yet, so it * should not be swapped out. */ panic("not waking up swapper"); #endif } } mtx_unlock_spin(&sc->sc_lock); MPASS(td->td_lock != &sc->sc_lock); return (ret); } /* * Switches to another thread if we are still asleep on a sleep queue. * Returns with thread lock. */ static void sleepq_switch(void *wchan, int pri) { struct sleepqueue_chain *sc; struct sleepqueue *sq; struct thread *td; td = curthread; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * If we have a sleep queue, then we've already been woken up, so * just return. */ if (td->td_sleepqueue != NULL) { mtx_unlock_spin(&sc->sc_lock); return; } /* * If TDF_TIMEOUT is set, then our sleep has been timed out * already but we are still on the sleep queue, so dequeue the * thread and return. */ if (td->td_flags & TDF_TIMEOUT) { MPASS(TD_ON_SLEEPQ(td)); sq = sleepq_lookup(wchan); if (sleepq_resume_thread(sq, td, 0)) { #ifdef INVARIANTS /* * This thread hasn't gone to sleep yet, so it * should not be swapped out. */ panic("not waking up swapper"); #endif } mtx_unlock_spin(&sc->sc_lock); return; } #ifdef SLEEPQUEUE_PROFILING if (prof_enabled) sleepq_profile(td->td_wmesg); #endif MPASS(td->td_sleepqueue == NULL); sched_sleep(td, pri); thread_lock_set(td, &sc->sc_lock); SDT_PROBE0(sched, , , sleep); TD_SET_SLEEPING(td); mi_switch(SW_VOL | SWT_SLEEPQ, NULL); KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING")); CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); } /* * Check to see if we timed out. */ static int -sleepq_check_timeout(struct thread *td) +sleepq_check_timeout(void) { + struct thread *td; + int res; + + td = curthread; THREAD_LOCK_ASSERT(td, MA_OWNED); /* - * If TDF_TIMEOUT is set, we timed out. + * If TDF_TIMEOUT is set, we timed out. But recheck + * td_sleeptimo anyway. */ - if (td->td_flags & TDF_TIMEOUT) { - td->td_flags &= ~TDF_TIMEOUT; - return (EWOULDBLOCK); + res = 0; + if (td->td_sleeptimo != 0) { + if (td->td_sleeptimo <= sbinuptime()) + res = EWOULDBLOCK; + td->td_sleeptimo = 0; } - return (0); + if (td->td_flags & TDF_TIMEOUT) + td->td_flags &= ~TDF_TIMEOUT; + else + /* + * We ignore the situation where timeout subsystem was + * unable to stop our callout. The struct thread is + * type-stable, the callout will use the correct + * memory when running. The checks of the + * td_sleeptimo value in this function and in + * sleepq_timeout() ensure that the thread does not + * get spurious wakeups, even if the callout was reset + * or thread reused. + */ + callout_stop(&td->td_slpcallout); + return (res); } /* - * Atomically stop the timeout by using a mutex. - */ -static void -sleepq_stop_timeout(struct thread *td) -{ - mtx_lock_spin(&td->td_slpmutex); - callout_stop(&td->td_slpcallout); - mtx_unlock_spin(&td->td_slpmutex); -} - -/* * Check to see if we were awoken by a signal. */ static int sleepq_check_signals(void) { struct thread *td; td = curthread; THREAD_LOCK_ASSERT(td, MA_OWNED); /* We are no longer in an interruptible sleep. */ if (td->td_flags & TDF_SINTR) td->td_flags &= ~TDF_SINTR; if (td->td_flags & TDF_SLEEPABORT) { td->td_flags &= ~TDF_SLEEPABORT; return (td->td_intrval); } return (0); } /* * Block the current thread until it is awakened from its sleep queue. */ void sleepq_wait(void *wchan, int pri) { struct thread *td; td = curthread; MPASS(!(td->td_flags & TDF_SINTR)); thread_lock(td); sleepq_switch(wchan, pri); thread_unlock(td); } /* * Block the current thread until it is awakened from its sleep queue * or it is interrupted by a signal. */ int sleepq_wait_sig(void *wchan, int pri) { int rcatch; int rval; rcatch = sleepq_catch_signals(wchan, pri); rval = sleepq_check_signals(); thread_unlock(curthread); if (rcatch) return (rcatch); return (rval); } /* * Block the current thread until it is awakened from its sleep queue * or it times out while waiting. */ int sleepq_timedwait(void *wchan, int pri) { struct thread *td; int rval; td = curthread; MPASS(!(td->td_flags & TDF_SINTR)); thread_lock(td); sleepq_switch(wchan, pri); - rval = sleepq_check_timeout(td); + rval = sleepq_check_timeout(); thread_unlock(td); - sleepq_stop_timeout(td); - return (rval); } /* * Block the current thread until it is awakened from its sleep queue, * it is interrupted by a signal, or it times out waiting to be awakened. */ int sleepq_timedwait_sig(void *wchan, int pri) { - struct thread *td; int rcatch, rvalt, rvals; - td = curthread; - rcatch = sleepq_catch_signals(wchan, pri); - rvalt = sleepq_check_timeout(td); + rvalt = sleepq_check_timeout(); rvals = sleepq_check_signals(); - thread_unlock(td); - - sleepq_stop_timeout(td); - + thread_unlock(curthread); if (rcatch) return (rcatch); if (rvals) return (rvals); return (rvalt); } /* * Returns the type of sleepqueue given a waitchannel. */ int sleepq_type(void *wchan) { struct sleepqueue *sq; int type; MPASS(wchan != NULL); sleepq_lock(wchan); sq = sleepq_lookup(wchan); if (sq == NULL) { sleepq_release(wchan); return (-1); } type = sq->sq_type; sleepq_release(wchan); return (type); } /* * Removes a thread from a sleep queue and makes it * runnable. */ static int sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri) { struct sleepqueue_chain *sc; MPASS(td != NULL); MPASS(sq->sq_wchan != NULL); MPASS(td->td_wchan == sq->sq_wchan); MPASS(td->td_sqqueue < NR_SLEEPQS && td->td_sqqueue >= 0); THREAD_LOCK_ASSERT(td, MA_OWNED); sc = SC_LOOKUP(sq->sq_wchan); mtx_assert(&sc->sc_lock, MA_OWNED); SDT_PROBE2(sched, , , wakeup, td, td->td_proc); /* Remove the thread from the queue. */ sq->sq_blockedcnt[td->td_sqqueue]--; TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq); /* * Get a sleep queue for this thread. If this is the last waiter, * use the queue itself and take it out of the chain, otherwise, * remove a queue from the free list. */ if (LIST_EMPTY(&sq->sq_free)) { td->td_sleepqueue = sq; #ifdef INVARIANTS sq->sq_wchan = NULL; #endif #ifdef SLEEPQUEUE_PROFILING sc->sc_depth--; #endif } else td->td_sleepqueue = LIST_FIRST(&sq->sq_free); LIST_REMOVE(td->td_sleepqueue, sq_hash); td->td_wmesg = NULL; td->td_wchan = NULL; td->td_flags &= ~TDF_SINTR; CTR3(KTR_PROC, "sleepq_wakeup: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, td->td_name); /* Adjust priority if requested. */ MPASS(pri == 0 || (pri >= PRI_MIN && pri <= PRI_MAX)); if (pri != 0 && td->td_priority > pri && PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_prio(td, pri); /* * Note that thread td might not be sleeping if it is running * sleepq_catch_signals() on another CPU or is blocked on its * proc lock to check signals. There's no need to mark the * thread runnable in that case. */ if (TD_IS_SLEEPING(td)) { TD_CLR_SLEEPING(td); return (setrunnable(td)); } return (0); } #ifdef INVARIANTS /* * UMA zone item deallocator. */ static void sleepq_dtor(void *mem, int size, void *arg) { struct sleepqueue *sq; int i; sq = mem; for (i = 0; i < NR_SLEEPQS; i++) { MPASS(TAILQ_EMPTY(&sq->sq_blocked[i])); MPASS(sq->sq_blockedcnt[i] == 0); } } #endif /* * UMA zone item initializer. */ static int sleepq_init(void *mem, int size, int flags) { struct sleepqueue *sq; int i; bzero(mem, size); sq = mem; for (i = 0; i < NR_SLEEPQS; i++) { TAILQ_INIT(&sq->sq_blocked[i]); sq->sq_blockedcnt[i] = 0; } LIST_INIT(&sq->sq_free); return (0); } /* * Find the highest priority thread sleeping on a wait channel and resume it. */ int sleepq_signal(void *wchan, int flags, int pri, int queue) { struct sleepqueue *sq; struct thread *td, *besttd; int wakeup_swapper; CTR2(KTR_PROC, "sleepq_signal(%p, %d)", wchan, flags); KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); sq = sleepq_lookup(wchan); if (sq == NULL) return (0); KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE), ("%s: mismatch between sleep/wakeup and cv_*", __func__)); /* * Find the highest priority thread on the queue. If there is a * tie, use the thread that first appears in the queue as it has * been sleeping the longest since threads are always added to * the tail of sleep queues. */ besttd = NULL; TAILQ_FOREACH(td, &sq->sq_blocked[queue], td_slpq) { if (besttd == NULL || td->td_priority < besttd->td_priority) besttd = td; } MPASS(besttd != NULL); thread_lock(besttd); wakeup_swapper = sleepq_resume_thread(sq, besttd, pri); thread_unlock(besttd); return (wakeup_swapper); } /* * Resume all threads sleeping on a specified wait channel. */ int sleepq_broadcast(void *wchan, int flags, int pri, int queue) { struct sleepqueue *sq; struct thread *td; int wakeup_swapper; CTR2(KTR_PROC, "sleepq_broadcast(%p, %d)", wchan, flags); KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); sq = sleepq_lookup(wchan); if (sq == NULL) return (0); KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE), ("%s: mismatch between sleep/wakeup and cv_*", __func__)); /* Resume all blocked threads on the sleep queue. */ wakeup_swapper = 0; while ((td = TAILQ_FIRST(&sq->sq_blocked[queue])) != NULL) { thread_lock(td); wakeup_swapper |= sleepq_resume_thread(sq, td, pri); thread_unlock(td); } return (wakeup_swapper); } /* * Time sleeping threads out. When the timeout expires, the thread is * removed from the sleep queue and made runnable if it is still asleep. */ static void sleepq_timeout(void *arg) { - struct thread *td = arg; - int wakeup_swapper = 0; + struct sleepqueue_chain *sc; + struct sleepqueue *sq; + struct thread *td; + void *wchan; + int wakeup_swapper; + td = arg; + wakeup_swapper = 0; CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); - /* Handle the three cases which can happen */ - thread_lock(td); - if (TD_ON_SLEEPQ(td)) { - if (TD_IS_SLEEPING(td)) { - struct sleepqueue_chain *sc; - struct sleepqueue *sq; - void *wchan; - /* - * Case I - thread is asleep and needs to be - * awoken: - */ - wchan = td->td_wchan; - sc = SC_LOOKUP(wchan); - THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock); - sq = sleepq_lookup(wchan); - MPASS(sq != NULL); - td->td_flags |= TDF_TIMEOUT; - wakeup_swapper = sleepq_resume_thread(sq, td, 0); - } else { - /* - * Case II - cancel going to sleep by setting - * the timeout flag because the target thread - * is not asleep yet. It can be on another CPU - * in between sleepq_add() and one of the - * sleepq_*wait*() routines or it can be in - * sleepq_catch_signals(). - */ - td->td_flags |= TDF_TIMEOUT; - } - } else { + if (td->td_sleeptimo > sbinuptime() || td->td_sleeptimo == 0) { /* - * Case III - thread is already woken up by a wakeup - * call and should not timeout. Nothing to do! + * The thread does not want a timeout (yet). */ + } else if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) { + /* + * See if the thread is asleep and get the wait + * channel if it is. + */ + wchan = td->td_wchan; + sc = SC_LOOKUP(wchan); + THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock); + sq = sleepq_lookup(wchan); + MPASS(sq != NULL); + td->td_flags |= TDF_TIMEOUT; + wakeup_swapper = sleepq_resume_thread(sq, td, 0); + } else if (TD_ON_SLEEPQ(td)) { + /* + * If the thread is on the SLEEPQ but isn't sleeping + * yet, it can either be on another CPU in between + * sleepq_add() and one of the sleepq_*wait*() + * routines or it can be in sleepq_catch_signals(). + */ + td->td_flags |= TDF_TIMEOUT; } + thread_unlock(td); if (wakeup_swapper) kick_proc0(); } /* * Resumes a specific thread from the sleep queue associated with a specific * wait channel if it is on that queue. */ void sleepq_remove(struct thread *td, void *wchan) { struct sleepqueue *sq; int wakeup_swapper; /* * Look up the sleep queue for this wait channel, then re-check * that the thread is asleep on that channel, if it is not, then * bail. */ MPASS(wchan != NULL); sleepq_lock(wchan); sq = sleepq_lookup(wchan); /* * We can not lock the thread here as it may be sleeping on a * different sleepq. However, holding the sleepq lock for this * wchan can guarantee that we do not miss a wakeup for this * channel. The asserts below will catch any false positives. */ if (!TD_ON_SLEEPQ(td) || td->td_wchan != wchan) { sleepq_release(wchan); return; } /* Thread is asleep on sleep queue sq, so wake it up. */ thread_lock(td); MPASS(sq != NULL); MPASS(td->td_wchan == wchan); wakeup_swapper = sleepq_resume_thread(sq, td, 0); thread_unlock(td); sleepq_release(wchan); if (wakeup_swapper) kick_proc0(); } /* * Abort a thread as if an interrupt had occurred. Only abort * interruptible waits (unfortunately it isn't safe to abort others). */ int sleepq_abort(struct thread *td, int intrval) { struct sleepqueue *sq; void *wchan; THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(TD_ON_SLEEPQ(td)); MPASS(td->td_flags & TDF_SINTR); MPASS(intrval == EINTR || intrval == ERESTART); /* * If the TDF_TIMEOUT flag is set, just leave. A * timeout is scheduled anyhow. */ if (td->td_flags & TDF_TIMEOUT) return (0); CTR3(KTR_PROC, "sleepq_abort: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); td->td_intrval = intrval; td->td_flags |= TDF_SLEEPABORT; /* * If the thread has not slept yet it will find the signal in * sleepq_catch_signals() and call sleepq_resume_thread. Otherwise * we have to do it here. */ if (!TD_IS_SLEEPING(td)) return (0); wchan = td->td_wchan; MPASS(wchan != NULL); sq = sleepq_lookup(wchan); MPASS(sq != NULL); /* Thread is asleep on sleep queue sq, so wake it up. */ return (sleepq_resume_thread(sq, td, 0)); } /* * Prints the stacks of all threads presently sleeping on wchan/queue to * the sbuf sb. Sets count_stacks_printed to the number of stacks actually * printed. Typically, this will equal the number of threads sleeping on the * queue, but may be less if sb overflowed before all stacks were printed. */ #ifdef STACK int sleepq_sbuf_print_stacks(struct sbuf *sb, void *wchan, int queue, int *count_stacks_printed) { struct thread *td, *td_next; struct sleepqueue *sq; struct stack **st; struct sbuf **td_infos; int i, stack_idx, error, stacks_to_allocate; bool finished, partial_print; error = 0; finished = false; partial_print = false; KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); stacks_to_allocate = 10; for (i = 0; i < 3 && !finished ; i++) { /* We cannot malloc while holding the queue's spinlock, so * we do our mallocs now, and hope it is enough. If it * isn't, we will free these, drop the lock, malloc more, * and try again, up to a point. After that point we will * give up and report ENOMEM. We also cannot write to sb * during this time since the client may have set the * SBUF_AUTOEXTEND flag on their sbuf, which could cause a * malloc as we print to it. So we defer actually printing * to sb until after we drop the spinlock. */ /* Where we will store the stacks. */ st = malloc(sizeof(struct stack *) * stacks_to_allocate, M_TEMP, M_WAITOK); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) st[stack_idx] = stack_create(); /* Where we will store the td name, tid, etc. */ td_infos = malloc(sizeof(struct sbuf *) * stacks_to_allocate, M_TEMP, M_WAITOK); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) td_infos[stack_idx] = sbuf_new(NULL, NULL, MAXCOMLEN + sizeof(struct thread *) * 2 + 40, SBUF_FIXEDLEN); sleepq_lock(wchan); sq = sleepq_lookup(wchan); if (sq == NULL) { /* This sleepq does not exist; exit and return ENOENT. */ error = ENOENT; finished = true; sleepq_release(wchan); goto loop_end; } stack_idx = 0; /* Save thread info */ TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq, td_next) { if (stack_idx >= stacks_to_allocate) goto loop_end; /* Note the td_lock is equal to the sleepq_lock here. */ stack_save_td(st[stack_idx], td); sbuf_printf(td_infos[stack_idx], "%d: %s %p", td->td_tid, td->td_name, td); ++stack_idx; } finished = true; sleepq_release(wchan); /* Print the stacks */ for (i = 0; i < stack_idx; i++) { sbuf_finish(td_infos[i]); sbuf_printf(sb, "--- thread %s: ---\n", sbuf_data(td_infos[i])); stack_sbuf_print(sb, st[i]); sbuf_printf(sb, "\n"); error = sbuf_error(sb); if (error == 0) *count_stacks_printed = stack_idx; } loop_end: if (!finished) sleepq_release(wchan); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) stack_destroy(st[stack_idx]); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) sbuf_delete(td_infos[stack_idx]); free(st, M_TEMP); free(td_infos, M_TEMP); stacks_to_allocate *= 10; } if (!finished && error == 0) error = ENOMEM; return (error); } #endif #ifdef SLEEPQUEUE_PROFILING #define SLEEPQ_PROF_LOCATIONS 1024 #define SLEEPQ_SBUFSIZE 512 struct sleepq_prof { LIST_ENTRY(sleepq_prof) sp_link; const char *sp_wmesg; long sp_count; }; LIST_HEAD(sqphead, sleepq_prof); struct sqphead sleepq_prof_free; struct sqphead sleepq_hash[SC_TABLESIZE]; static struct sleepq_prof sleepq_profent[SLEEPQ_PROF_LOCATIONS]; static struct mtx sleepq_prof_lock; MTX_SYSINIT(sleepq_prof_lock, &sleepq_prof_lock, "sleepq_prof", MTX_SPIN); static void sleepq_profile(const char *wmesg) { struct sleepq_prof *sp; mtx_lock_spin(&sleepq_prof_lock); if (prof_enabled == 0) goto unlock; LIST_FOREACH(sp, &sleepq_hash[SC_HASH(wmesg)], sp_link) if (sp->sp_wmesg == wmesg) goto done; sp = LIST_FIRST(&sleepq_prof_free); if (sp == NULL) goto unlock; sp->sp_wmesg = wmesg; LIST_REMOVE(sp, sp_link); LIST_INSERT_HEAD(&sleepq_hash[SC_HASH(wmesg)], sp, sp_link); done: sp->sp_count++; unlock: mtx_unlock_spin(&sleepq_prof_lock); return; } static void sleepq_prof_reset(void) { struct sleepq_prof *sp; int enabled; int i; mtx_lock_spin(&sleepq_prof_lock); enabled = prof_enabled; prof_enabled = 0; for (i = 0; i < SC_TABLESIZE; i++) LIST_INIT(&sleepq_hash[i]); LIST_INIT(&sleepq_prof_free); for (i = 0; i < SLEEPQ_PROF_LOCATIONS; i++) { sp = &sleepq_profent[i]; sp->sp_wmesg = NULL; sp->sp_count = 0; LIST_INSERT_HEAD(&sleepq_prof_free, sp, sp_link); } prof_enabled = enabled; mtx_unlock_spin(&sleepq_prof_lock); } static int enable_sleepq_prof(SYSCTL_HANDLER_ARGS) { int error, v; v = prof_enabled; error = sysctl_handle_int(oidp, &v, v, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == prof_enabled) return (0); if (v == 1) sleepq_prof_reset(); mtx_lock_spin(&sleepq_prof_lock); prof_enabled = !!v; mtx_unlock_spin(&sleepq_prof_lock); return (0); } static int reset_sleepq_prof_stats(SYSCTL_HANDLER_ARGS) { int error, v; v = 0; error = sysctl_handle_int(oidp, &v, 0, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == 0) return (0); sleepq_prof_reset(); return (0); } static int dump_sleepq_prof_stats(SYSCTL_HANDLER_ARGS) { struct sleepq_prof *sp; struct sbuf *sb; int enabled; int error; int i; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, SLEEPQ_SBUFSIZE, req); sbuf_printf(sb, "\nwmesg\tcount\n"); enabled = prof_enabled; mtx_lock_spin(&sleepq_prof_lock); prof_enabled = 0; mtx_unlock_spin(&sleepq_prof_lock); for (i = 0; i < SC_TABLESIZE; i++) { LIST_FOREACH(sp, &sleepq_hash[i], sp_link) { sbuf_printf(sb, "%s\t%ld\n", sp->sp_wmesg, sp->sp_count); } } mtx_lock_spin(&sleepq_prof_lock); prof_enabled = enabled; mtx_unlock_spin(&sleepq_prof_lock); error = sbuf_finish(sb); sbuf_delete(sb); return (error); } SYSCTL_PROC(_debug_sleepq, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, dump_sleepq_prof_stats, "A", "Sleepqueue profiling statistics"); SYSCTL_PROC(_debug_sleepq, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, reset_sleepq_prof_stats, "I", "Reset sleepqueue profiling statistics"); SYSCTL_PROC(_debug_sleepq, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, enable_sleepq_prof, "I", "Enable sleepqueue profiling"); #endif #ifdef DDB DB_SHOW_COMMAND(sleepq, db_show_sleepqueue) { struct sleepqueue_chain *sc; struct sleepqueue *sq; #ifdef INVARIANTS struct lock_object *lock; #endif struct thread *td; void *wchan; int i; if (!have_addr) return; /* * First, see if there is an active sleep queue for the wait channel * indicated by the address. */ wchan = (void *)addr; sc = SC_LOOKUP(wchan); LIST_FOREACH(sq, &sc->sc_queues, sq_hash) if (sq->sq_wchan == wchan) goto found; /* * Second, see if there is an active sleep queue at the address * indicated. */ for (i = 0; i < SC_TABLESIZE; i++) LIST_FOREACH(sq, &sleepq_chains[i].sc_queues, sq_hash) { if (sq == (struct sleepqueue *)addr) goto found; } db_printf("Unable to locate a sleep queue via %p\n", (void *)addr); return; found: db_printf("Wait channel: %p\n", sq->sq_wchan); db_printf("Queue type: %d\n", sq->sq_type); #ifdef INVARIANTS if (sq->sq_lock) { lock = sq->sq_lock; db_printf("Associated Interlock: %p - (%s) %s\n", lock, LOCK_CLASS(lock)->lc_name, lock->lo_name); } #endif db_printf("Blocked threads:\n"); for (i = 0; i < NR_SLEEPQS; i++) { db_printf("\nQueue[%d]:\n", i); if (TAILQ_EMPTY(&sq->sq_blocked[i])) db_printf("\tempty\n"); else TAILQ_FOREACH(td, &sq->sq_blocked[0], td_slpq) { db_printf("\t%p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); } db_printf("(expected: %u)\n", sq->sq_blockedcnt[i]); } } /* Alias 'show sleepqueue' to 'show sleepq'. */ DB_SHOW_ALIAS(sleepqueue, db_show_sleepqueue); #endif Index: projects/hps_head/sys/sys/proc.h =================================================================== --- projects/hps_head/sys/sys/proc.h (revision 303925) +++ projects/hps_head/sys/sys/proc.h (revision 303926) @@ -1,1106 +1,1106 @@ /*- * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.15 (Berkeley) 5/19/95 * $FreeBSD$ */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include /* For struct callout. */ #include /* For struct klist. */ #include #ifndef _KERNEL #include #endif #include #include #include #include #include #include #include /* XXX. */ #include #include #include #include #include #ifndef _KERNEL #include /* For structs itimerval, timeval. */ #else #include #endif #include #include #include #include /* Machine-dependent proc substruct. */ /* * One structure allocated per session. * * List of locks * (m) locked by s_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct session { u_int s_count; /* Ref cnt; pgrps in session - atomic. */ struct proc *s_leader; /* (m + e) Session leader. */ struct vnode *s_ttyvp; /* (m) Vnode of controlling tty. */ struct cdev_priv *s_ttydp; /* (m) Device of controlling tty. */ struct tty *s_ttyp; /* (e) Controlling tty. */ pid_t s_sid; /* (c) Session ID. */ /* (m) Setlogin() name: */ char s_login[roundup(MAXLOGNAME, sizeof(long))]; struct mtx s_mtx; /* Mutex to protect members. */ }; /* * One structure allocated per process group. * * List of locks * (m) locked by pg_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct pgrp { LIST_ENTRY(pgrp) pg_hash; /* (e) Hash chain. */ LIST_HEAD(, proc) pg_members; /* (m + e) Pointer to pgrp members. */ struct session *pg_session; /* (c) Pointer to session. */ struct sigiolst pg_sigiolst; /* (m) List of sigio sources. */ pid_t pg_id; /* (c) Process group id. */ int pg_jobc; /* (m) Job control process count. */ struct mtx pg_mtx; /* Mutex to protect members */ }; /* * pargs, used to hold a copy of the command line, if it had a sane length. */ struct pargs { u_int ar_ref; /* Reference count. */ u_int ar_length; /* Length. */ u_char ar_args[1]; /* Arguments. */ }; /*- * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. The process structure and the substructures * are always addressable except for those marked "(CPU)" below, * which might be addressable only on a processor on which the process * is running. * * Below is a key of locks used to protect each member of struct proc. The * lock is indicated by a reference to a specific character in parens in the * associated comment. * * - not yet protected * a - only touched by curproc or parent during fork/wait * b - created at fork, never changes * (exception aiods switch vmspaces, but they are also * marked 'P_SYSTEM' so hopefully it will be left alone) * c - locked by proc mtx * d - locked by allproc_lock lock * e - locked by proctree_lock lock * f - session mtx * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx * j - locked by proc slock * k - only accessed by curthread * k*- only accessed by curthread and from an interrupt * l - the attaching proc or attaching proc parent * m - Giant * n - not locked, lazy * o - ktrace lock * q - td_contested lock * r - p_peers lock * t - thread lock * u - process stat lock * w - process timer lock * x - created at fork, only changes during single threading in exec * y - created at first aio, doesn't change until exit or exec at which * point we are single-threaded and only curthread changes it * z - zombie threads lock * * If the locking key specifies two identifiers (for example, p_pptr) then * either lock is sufficient for read access, but both locks must be held * for write access. */ struct cpuset; struct filecaps; struct filemon; struct kaioinfo; struct kaudit_record; struct kdtrace_proc; struct kdtrace_thread; struct mqueue_notifier; struct nlminfo; struct p_sched; struct proc; struct procdesc; struct racct; struct sbuf; struct sleepqueue; struct syscall_args; struct td_sched; struct thread; struct trapframe; struct turnstile; /* * XXX: Does this belong in resource.h or resourcevar.h instead? * Resource usage extension. The times in rusage structs in the kernel are * never up to date. The actual times are kept as runtimes and tick counts * (with control info in the "previous" times), and are converted when * userland asks for rusage info. Backwards compatibility prevents putting * this directly in the user-visible rusage struct. * * Locking for p_rux: (cu) means (u) for p_rux and (c) for p_crux. * Locking for td_rux: (t) for all fields. */ struct rusage_ext { uint64_t rux_runtime; /* (cu) Real time. */ uint64_t rux_uticks; /* (cu) Statclock hits in user mode. */ uint64_t rux_sticks; /* (cu) Statclock hits in sys mode. */ uint64_t rux_iticks; /* (cu) Statclock hits in intr mode. */ uint64_t rux_uu; /* (c) Previous user time in usec. */ uint64_t rux_su; /* (c) Previous sys time in usec. */ uint64_t rux_tu; /* (c) Previous total time in usec. */ }; /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * Thread context. Processes may have multiple threads. */ struct thread { struct mtx *volatile td_lock; /* replaces sched lock */ struct proc *td_proc; /* (*) Associated process. */ TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */ TAILQ_ENTRY(thread) td_runq; /* (t) Run queue. */ TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ LIST_ENTRY(thread) td_hash; /* (d) Hash chain. */ struct cpuset *td_cpuset; /* (t) CPU affinity mask. */ struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ struct rl_q_entry *td_rlqe; /* (k) Associated range lock entry. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ struct vm_domain_policy td_vm_dom_policy; /* (c) current numa domain policy */ lwpid_t td_tid; /* (b) Thread ID. */ sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. */ #define td_siglist td_sigqueue.sq_signals u_char td_lend_user_pri; /* (t) Lend user pri. */ /* Cleared during fork1() */ #define td_startzero td_flags int td_flags; /* (t) TDF_* flags. */ int td_inhibitors; /* (t) Why can not run. */ int td_pflags; /* (k) Private thread (TDP_*) flags. */ int td_dupfd; /* (k) Ret value from fdopen. XXX */ int td_sqqueue; /* (t) Sleepqueue queue blocked on. */ void *td_wchan; /* (t) Sleep address. */ const char *td_wmesg; /* (t) Reason for sleep. */ volatile u_char td_owepreempt; /* (k*) Preempt on last critical_exit */ u_char td_tsqueue; /* (t) Turnstile queue blocked on. */ short td_locks; /* (k) Debug: count of non-spin locks */ short td_rw_rlocks; /* (k) Count of rwlock read locks. */ short td_lk_slocks; /* (k) Count of lockmgr shared locks. */ short td_stopsched; /* (k) Scheduler stopped. */ struct turnstile *td_blocked; /* (t) Lock thread is blocked on. */ const char *td_lockname; /* (t) Name of lock blocked on. */ LIST_HEAD(, turnstile) td_contested; /* (q) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ int td_pinned; /* (k) Temporary cpu pin count. */ struct ucred *td_ucred; /* (k) Reference to credentials. */ struct plimit *td_limit; /* (k) Resource limits. */ int td_slptick; /* (t) Time at sleep. */ int td_blktick; /* (t) Time spent blocked. */ int td_swvoltick; /* (t) Time at last SW_VOL switch. */ int td_swinvoltick; /* (t) Time at last SW_INVOL switch. */ u_int td_cow; /* (*) Number of copy-on-write faults */ struct rusage td_ru; /* (t) rusage information. */ struct rusage_ext td_rux; /* (t) Internal rusage information. */ uint64_t td_incruntime; /* (t) Cpu ticks to transfer to proc. */ uint64_t td_runtime; /* (t) How many cpu ticks we've run. */ u_int td_pticks; /* (t) Statclock hits for profiling */ u_int td_sticks; /* (t) Statclock hits in system mode. */ u_int td_iticks; /* (t) Statclock hits in intr mode. */ u_int td_uticks; /* (t) Statclock hits in user mode. */ int td_intrval; /* (t) Return value for sleepq. */ sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. */ volatile u_int td_generation; /* (k) For detection of preemption */ stack_t td_sigstk; /* (k) Stack ptr and on-stack flag. */ int td_xsig; /* (c) Signal for ptrace */ u_long td_profil_addr; /* (k) Temporary addr until AST. */ u_int td_profil_ticks; /* (k) Temporary ticks until AST. */ char td_name[MAXCOMLEN + 1]; /* (*) Thread name. */ struct file *td_fpop; /* (k) file referencing cdev under op */ int td_dbgflags; /* (c) Userland debugger flags */ struct ksiginfo td_dbgksi; /* (c) ksi reflected to debugger. */ int td_ng_outbound; /* (k) Thread entered ng from above. */ struct osd td_osd; /* (k) Object specific data. */ struct vm_map_entry *td_map_def_user; /* (k) Deferred entries. */ pid_t td_dbg_forked; /* (c) Child pid for debugger. */ u_int td_vp_reserv; /* (k) Count of reserved vnodes. */ int td_no_sleeping; /* (k) Sleeping disabled count. */ int td_dom_rr_idx; /* (k) RR Numa domain selection. */ void *td_su; /* (k) FFS SU private */ + sbintime_t td_sleeptimo; /* (t) Sleep timeout. */ #define td_endzero td_sigmask /* Copied during fork1() or create_thread(). */ #define td_startcopy td_endzero sigset_t td_sigmask; /* (c) Current signal mask. */ u_char td_rqindex; /* (t) Run queue index. */ u_char td_base_pri; /* (t) Thread base kernel priority. */ u_char td_priority; /* (t) Thread active priority. */ u_char td_pri_class; /* (t) Scheduling class. */ u_char td_user_pri; /* (t) User pri from estcpu and nice. */ u_char td_base_user_pri; /* (t) Base user pri */ u_int td_dbg_sc_code; /* (c) Syscall code to debugger. */ u_int td_dbg_sc_narg; /* (c) Syscall arg count to debugger.*/ uintptr_t td_rb_list; /* (k) Robust list head. */ uintptr_t td_rbp_list; /* (k) Robust priv list head. */ uintptr_t td_rb_inact; /* (k) Current in-action mutex loc. */ #define td_endcopy td_pcb /* * Fields that must be manually set in fork1() or create_thread() * or already have been set in the allocator, constructor, etc. */ struct pcb *td_pcb; /* (k) Kernel VA of pcb and kstack. */ enum { TDS_INACTIVE = 0x0, TDS_INHIBITED, TDS_CAN_RUN, TDS_RUNQ, TDS_RUNNING } td_state; /* (t) thread state */ union { register_t tdu_retval[2]; off_t tdu_off; } td_uretoff; /* (k) Syscall aux returns. */ #define td_retval td_uretoff.tdu_retval u_int td_cowgen; /* (k) Generation of COW pointers. */ struct callout td_slpcallout; /* (h) Callout for sleep. */ - struct mtx td_slpmutex; /* (h) Mutex for sleep callout */ struct trapframe *td_frame; /* (k) */ struct vm_object *td_kstack_obj;/* (a) Kstack object. */ vm_offset_t td_kstack; /* (a) Kernel VA of kstack. */ int td_kstack_pages; /* (a) Size of the kstack. */ volatile u_int td_critnest; /* (k*) Critical section nest level. */ struct mdthread td_md; /* (k) Any machine-dependent fields. */ struct kaudit_record *td_ar; /* (k) Active audit record, if any. */ struct lpohead td_lprof[2]; /* (a) lock profiling objects. */ struct kdtrace_thread *td_dtrace; /* (*) DTrace-specific data. */ int td_errno; /* Error returned by last syscall. */ struct vnet *td_vnet; /* (k) Effective vnet. */ const char *td_vnet_lpush; /* (k) Debugging vnet push / pop. */ struct trapframe *td_intr_frame;/* (k) Frame of the current irq */ struct proc *td_rfppwait_p; /* (k) The vforked child */ struct vm_page **td_ma; /* (k) uio pages held */ int td_ma_cnt; /* (k) size of *td_ma */ void *td_emuldata; /* Emulator state data */ int td_lastcpu; /* (t) Last cpu we were on. */ int td_oncpu; /* (t) Which cpu we are on. */ }; struct thread0_storage { struct thread t0st_thread; uint64_t t0st_sched[10]; }; struct mtx *thread_lock_block(struct thread *); void thread_lock_unblock(struct thread *, struct mtx *); void thread_lock_set(struct thread *, struct mtx *); #define THREAD_LOCK_ASSERT(td, type) \ do { \ struct mtx *__m = (td)->td_lock; \ if (__m != &blocked_lock) \ mtx_assert(__m, (type)); \ } while (0) #ifdef INVARIANTS #define THREAD_LOCKPTR_ASSERT(td, lock) \ do { \ struct mtx *__m = (td)->td_lock; \ KASSERT((__m == &blocked_lock || __m == (lock)), \ ("Thread %p lock %p does not match %p", td, __m, (lock))); \ } while (0) #define TD_LOCKS_INC(td) ((td)->td_locks++) #define TD_LOCKS_DEC(td) ((td)->td_locks--) #else #define THREAD_LOCKPTR_ASSERT(td, lock) #define TD_LOCKS_INC(td) #define TD_LOCKS_DEC(td) #endif /* * Flags kept in td_flags: * To change these you MUST have the scheduler lock. */ #define TDF_BORROWING 0x00000001 /* Thread is borrowing pri from another. */ #define TDF_INPANIC 0x00000002 /* Caused a panic, let it drive crashdump. */ #define TDF_INMEM 0x00000004 /* Thread's stack is in memory. */ #define TDF_SINTR 0x00000008 /* Sleep is interruptible. */ #define TDF_TIMEOUT 0x00000010 /* Timing out during sleep. */ #define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */ #define TDF_CANSWAP 0x00000040 /* Thread can be swapped. */ #define TDF_SLEEPABORT 0x00000080 /* sleepq_abort was called. */ #define TDF_KTH_SUSP 0x00000100 /* kthread is suspended */ #define TDF_ALLPROCSUSP 0x00000200 /* suspended by SINGLE_ALLPROC */ #define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */ #define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. */ #define TDF_UNUSED12 0x00001000 /* --available-- */ #define TDF_SBDRY 0x00002000 /* Stop only on usermode boundary. */ #define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */ #define TDF_NEEDSUSPCHK 0x00008000 /* Thread may need to suspend. */ #define TDF_NEEDRESCHED 0x00010000 /* Thread needs to yield. */ #define TDF_NEEDSIGCHK 0x00020000 /* Thread may need signal delivery. */ #define TDF_NOLOAD 0x00040000 /* Ignore during load avg calculations. */ #define TDF_SERESTART 0x00080000 /* ERESTART on stop attempts. */ #define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */ #define TDF_SEINTR 0x00200000 /* EINTR on stop attempts. */ #define TDF_SWAPINREQ 0x00400000 /* Swapin request due to wakeup. */ #define TDF_UNUSED23 0x00800000 /* --available-- */ #define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */ #define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */ #define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */ #define TDF_SCHED3 0x08000000 /* Reserved for scheduler private use */ #define TDF_ALRMPEND 0x10000000 /* Pending SIGVTALRM needs to be posted. */ #define TDF_PROFPEND 0x20000000 /* Pending SIGPROF needs to be posted. */ #define TDF_MACPEND 0x40000000 /* AST-based MAC event pending. */ /* Userland debug flags */ #define TDB_SUSPEND 0x00000001 /* Thread is suspended by debugger */ #define TDB_XSIG 0x00000002 /* Thread is exchanging signal under trace */ #define TDB_USERWR 0x00000004 /* Debugger modified memory or registers */ #define TDB_SCE 0x00000008 /* Thread performs syscall enter */ #define TDB_SCX 0x00000010 /* Thread performs syscall exit */ #define TDB_EXEC 0x00000020 /* TDB_SCX from exec(2) family */ #define TDB_FORK 0x00000040 /* TDB_SCX from fork(2) that created new process */ #define TDB_STOPATFORK 0x00000080 /* Stop at the return from fork (child only) */ #define TDB_CHILD 0x00000100 /* New child indicator for ptrace() */ #define TDB_BORN 0x00000200 /* New LWP indicator for ptrace() */ #define TDB_EXIT 0x00000400 /* Exiting LWP indicator for ptrace() */ /* * "Private" flags kept in td_pflags: * These are only written by curthread and thus need no locking. */ #define TDP_OLDMASK 0x00000001 /* Need to restore mask after suspend. */ #define TDP_INKTR 0x00000002 /* Thread is currently in KTR code. */ #define TDP_INKTRACE 0x00000004 /* Thread is currently in KTRACE code. */ #define TDP_BUFNEED 0x00000008 /* Do not recurse into the buf flush */ #define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */ #define TDP_ALTSTACK 0x00000020 /* Have alternate signal stack. */ #define TDP_DEADLKTREAT 0x00000040 /* Lock acquisition - deadlock treatment. */ #define TDP_NOFAULTING 0x00000080 /* Do not handle page faults. */ #define TDP_UNUSED9 0x00000100 /* --available-- */ #define TDP_OWEUPC 0x00000200 /* Call addupc() at next AST. */ #define TDP_ITHREAD 0x00000400 /* Thread is an interrupt thread. */ #define TDP_SYNCIO 0x00000800 /* Local override, disable async i/o. */ #define TDP_SCHED1 0x00001000 /* Reserved for scheduler private use */ #define TDP_SCHED2 0x00002000 /* Reserved for scheduler private use */ #define TDP_SCHED3 0x00004000 /* Reserved for scheduler private use */ #define TDP_SCHED4 0x00008000 /* Reserved for scheduler private use */ #define TDP_GEOM 0x00010000 /* Settle GEOM before finishing syscall */ #define TDP_SOFTDEP 0x00020000 /* Stuck processing softdep worklist */ #define TDP_NORUNNINGBUF 0x00040000 /* Ignore runningbufspace check */ #define TDP_WAKEUP 0x00080000 /* Don't sleep in umtx cond_wait */ #define TDP_INBDFLUSH 0x00100000 /* Already in BO_BDFLUSH, do not recurse */ #define TDP_KTHREAD 0x00200000 /* This is an official kernel thread */ #define TDP_CALLCHAIN 0x00400000 /* Capture thread's callchain */ #define TDP_IGNSUSP 0x00800000 /* Permission to ignore the MNTK_SUSPEND* */ #define TDP_AUDITREC 0x01000000 /* Audit record pending on thread */ #define TDP_RFPPWAIT 0x02000000 /* Handle RFPPWAIT on syscall exit */ #define TDP_RESETSPUR 0x04000000 /* Reset spurious page fault history. */ #define TDP_NERRNO 0x08000000 /* Last errno is already in td_errno */ #define TDP_UIOHELD 0x10000000 /* Current uio has pages held in td_ma */ #define TDP_FORKING 0x20000000 /* Thread is being created through fork() */ #define TDP_EXECVMSPC 0x40000000 /* Execve destroyed old vmspace */ /* * Reasons that the current thread can not be run yet. * More than one may apply. */ #define TDI_SUSPENDED 0x0001 /* On suspension queue. */ #define TDI_SLEEPING 0x0002 /* Actually asleep! (tricky). */ #define TDI_SWAPPED 0x0004 /* Stack not in mem. Bad juju if run. */ #define TDI_LOCK 0x0008 /* Stopped on a lock. */ #define TDI_IWAIT 0x0010 /* Awaiting interrupt. */ #define TD_IS_SLEEPING(td) ((td)->td_inhibitors & TDI_SLEEPING) #define TD_ON_SLEEPQ(td) ((td)->td_wchan != NULL) #define TD_IS_SUSPENDED(td) ((td)->td_inhibitors & TDI_SUSPENDED) #define TD_IS_SWAPPED(td) ((td)->td_inhibitors & TDI_SWAPPED) #define TD_ON_LOCK(td) ((td)->td_inhibitors & TDI_LOCK) #define TD_AWAITING_INTR(td) ((td)->td_inhibitors & TDI_IWAIT) #define TD_IS_RUNNING(td) ((td)->td_state == TDS_RUNNING) #define TD_ON_RUNQ(td) ((td)->td_state == TDS_RUNQ) #define TD_CAN_RUN(td) ((td)->td_state == TDS_CAN_RUN) #define TD_IS_INHIBITED(td) ((td)->td_state == TDS_INHIBITED) #define TD_ON_UPILOCK(td) ((td)->td_flags & TDF_UPIBLOCKED) #define TD_IS_IDLETHREAD(td) ((td)->td_flags & TDF_IDLETD) #define TD_SET_INHIB(td, inhib) do { \ (td)->td_state = TDS_INHIBITED; \ (td)->td_inhibitors |= (inhib); \ } while (0) #define TD_CLR_INHIB(td, inhib) do { \ if (((td)->td_inhibitors & (inhib)) && \ (((td)->td_inhibitors &= ~(inhib)) == 0)) \ (td)->td_state = TDS_CAN_RUN; \ } while (0) #define TD_SET_SLEEPING(td) TD_SET_INHIB((td), TDI_SLEEPING) #define TD_SET_SWAPPED(td) TD_SET_INHIB((td), TDI_SWAPPED) #define TD_SET_LOCK(td) TD_SET_INHIB((td), TDI_LOCK) #define TD_SET_SUSPENDED(td) TD_SET_INHIB((td), TDI_SUSPENDED) #define TD_SET_IWAIT(td) TD_SET_INHIB((td), TDI_IWAIT) #define TD_SET_EXITING(td) TD_SET_INHIB((td), TDI_EXITING) #define TD_CLR_SLEEPING(td) TD_CLR_INHIB((td), TDI_SLEEPING) #define TD_CLR_SWAPPED(td) TD_CLR_INHIB((td), TDI_SWAPPED) #define TD_CLR_LOCK(td) TD_CLR_INHIB((td), TDI_LOCK) #define TD_CLR_SUSPENDED(td) TD_CLR_INHIB((td), TDI_SUSPENDED) #define TD_CLR_IWAIT(td) TD_CLR_INHIB((td), TDI_IWAIT) #define TD_SET_RUNNING(td) (td)->td_state = TDS_RUNNING #define TD_SET_RUNQ(td) (td)->td_state = TDS_RUNQ #define TD_SET_CAN_RUN(td) (td)->td_state = TDS_CAN_RUN #define TD_SBDRY_INTR(td) \ (((td)->td_flags & (TDF_SEINTR | TDF_SERESTART)) != 0) #define TD_SBDRY_ERRNO(td) \ (((td)->td_flags & TDF_SEINTR) != 0 ? EINTR : ERESTART) /* * Process structure. */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ TAILQ_HEAD(, thread) p_threads; /* (c) all threads. */ struct mtx p_slock; /* process spin lock */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Open files. */ struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (c) Resource limits. */ struct callout p_limco; /* (c) Limit callout handle */ struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ int p_flag; /* (c) P_* flags. */ int p_flag2; /* (c) P2_* flags. */ enum { PRS_NEW = 0, /* In creation */ PRS_NORMAL, /* threads can be run. */ PRS_ZOMBIE } p_state; /* (j/c) Process status. */ pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ struct proc *p_pptr; /* (c + e) Pointer to parent process. */ LIST_ENTRY(proc) p_sibling; /* (e) List of sibling processes. */ LIST_HEAD(, proc) p_children; /* (e) Pointer to list of children. */ struct proc *p_reaper; /* (e) My reaper. */ LIST_HEAD(, proc) p_reaplist; /* (e) List of my descendants (if I am reaper). */ LIST_ENTRY(proc) p_reapsibling; /* (e) List of siblings - descendants of the same reaper. */ struct mtx p_mtx; /* (n) Lock for this struct. */ struct mtx p_statmtx; /* Lock for the stats */ struct mtx p_itimmtx; /* Lock for the virt/prof timers */ struct mtx p_profmtx; /* Lock for the profiling */ struct ksiginfo *p_ksi; /* Locked by parent proc lock */ sigqueue_t p_sigqueue; /* (c) Sigs not delivered to a td. */ #define p_siglist p_sigqueue.sq_signals /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_oppid pid_t p_oppid; /* (c + e) Save ppid in ptrace. XXX */ struct vmspace *p_vmspace; /* (b) Address space. */ u_int p_swtick; /* (c) Tick when swapped in or out. */ u_int p_cowgen; /* (c) Generation of COW pointers. */ struct itimerval p_realtimer; /* (c) Alarm timer. */ struct rusage p_ru; /* (a) Exit information. */ struct rusage_ext p_rux; /* (cu) Internal resource usage. */ struct rusage_ext p_crux; /* (c) Internal child resource usage. */ int p_profthreads; /* (c) Num threads in addupc_task. */ volatile int p_exitthreads; /* (j) Number of threads exiting */ int p_traceflag; /* (o) Kernel trace points. */ struct vnode *p_tracevp; /* (c + o) Trace to vnode. */ struct ucred *p_tracecred; /* (o) Credentials to trace with. */ struct vnode *p_textvp; /* (b) Vnode of executable. */ u_int p_lock; /* (c) Proclock (prevent swap) count. */ struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ int p_sigparent; /* (c) Signal to parent on exit. */ int p_sig; /* (n) For core dump/debugger XXX. */ u_long p_code; /* (n) For core dump/debugger XXX. */ u_int p_stops; /* (c) Stop event bitmask. */ u_int p_stype; /* (c) Stop event type. */ char p_step; /* (c) Process is stopped. */ u_char p_pfsflags; /* (c) Procfs flags. */ struct nlminfo *p_nlminfo; /* (?) Only used by/for lockd. */ struct kaioinfo *p_aioinfo; /* (y) ASYNC I/O info. */ struct thread *p_singlethread;/* (c + j) If single threading this is it */ int p_suspcount; /* (j) Num threads in suspended mode. */ struct thread *p_xthread; /* (c) Trap thread */ int p_boundary_count;/* (j) Num threads at user boundary */ int p_pendingcnt; /* how many signals are pending */ struct itimers *p_itimers; /* (c) POSIX interval timers. */ struct procdesc *p_procdesc; /* (e) Process descriptor, if any. */ u_int p_treeflag; /* (e) P_TREE flags */ int p_pendingexits; /* (c) Count of pending thread exits. */ struct filemon *p_filemon; /* (c) filemon-specific data. */ /* End area that is zeroed on creation. */ #define p_endzero p_magic /* The following fields are all copied upon creation in fork. */ #define p_startcopy p_endzero u_int p_magic; /* (b) Magic number. */ int p_osrel; /* (x) osreldate for the binary (from ELF note, if any) */ char p_comm[MAXCOMLEN + 1]; /* (x) Process name. */ struct sysentvec *p_sysent; /* (b) Syscall dispatch info. */ struct pargs *p_args; /* (c) Process arguments. */ rlim_t p_cpulimit; /* (c) Current CPU limit in seconds. */ signed char p_nice; /* (c) Process "nice" value. */ int p_fibnum; /* in this routing domain XXX MRT */ pid_t p_reapsubtree; /* (e) Pid of the direct child of the reaper which spawned our subtree. */ u_int p_xexit; /* (c) Exit code. */ u_int p_xsig; /* (c) Stop/kill sig. */ /* End area that is copied on creation. */ #define p_endcopy p_xsig struct pgrp *p_pgrp; /* (c + e) Pointer to process group. */ struct knlist *p_klist; /* (c) Knotes attached to this proc. */ int p_numthreads; /* (c) Number of threads. */ struct mdproc p_md; /* Any machine-dependent fields. */ struct callout p_itcallout; /* (h + c) Interval timer callout. */ u_short p_acflag; /* (c) Accounting flags. */ struct proc *p_peers; /* (r) */ struct proc *p_leader; /* (b) */ void *p_emuldata; /* (c) Emulator state data. */ struct label *p_label; /* (*) Proc (not subject) MAC label. */ STAILQ_HEAD(, ktr_request) p_ktr; /* (o) KTR event queue. */ LIST_HEAD(, mqueue_notifier) p_mqnotifier; /* (c) mqueue notifiers.*/ struct kdtrace_proc *p_dtrace; /* (*) DTrace-specific data. */ struct cv p_pwait; /* (*) wait cv for exit/exec. */ struct cv p_dbgwait; /* (*) wait cv for debugger attach after fork. */ uint64_t p_prev_runtime; /* (c) Resource usage accounting. */ struct racct *p_racct; /* (b) Resource accounting. */ int p_throttled; /* (c) Flag for racct pcpu throttling */ struct vm_domain_policy p_vm_dom_policy; /* (c) process default VM domain, or -1 */ /* * An orphan is the child that has beed re-parented to the * debugger as a result of attaching to it. Need to keep * track of them for parent to be able to collect the exit * status of what used to be children. */ LIST_ENTRY(proc) p_orphan; /* (e) List of orphan processes. */ LIST_HEAD(, proc) p_orphans; /* (e) Pointer to list of orphans. */ }; #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id #define NOCPU (-1) /* For when we aren't on a CPU. */ #define NOCPU_OLD (255) #define MAXCPU_OLD (254) #define PROC_SLOCK(p) mtx_lock_spin(&(p)->p_slock) #define PROC_SUNLOCK(p) mtx_unlock_spin(&(p)->p_slock) #define PROC_SLOCK_ASSERT(p, type) mtx_assert(&(p)->p_slock, (type)) #define PROC_STATLOCK(p) mtx_lock_spin(&(p)->p_statmtx) #define PROC_STATUNLOCK(p) mtx_unlock_spin(&(p)->p_statmtx) #define PROC_STATLOCK_ASSERT(p, type) mtx_assert(&(p)->p_statmtx, (type)) #define PROC_ITIMLOCK(p) mtx_lock_spin(&(p)->p_itimmtx) #define PROC_ITIMUNLOCK(p) mtx_unlock_spin(&(p)->p_itimmtx) #define PROC_ITIMLOCK_ASSERT(p, type) mtx_assert(&(p)->p_itimmtx, (type)) #define PROC_PROFLOCK(p) mtx_lock_spin(&(p)->p_profmtx) #define PROC_PROFUNLOCK(p) mtx_unlock_spin(&(p)->p_profmtx) #define PROC_PROFLOCK_ASSERT(p, type) mtx_assert(&(p)->p_profmtx, (type)) /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ #define P_CONTROLT 0x00002 /* Has a controlling terminal. */ #define P_KPROC 0x00004 /* Kernel process. */ #define P_FOLLOWFORK 0x00008 /* Attach parent debugger to children. */ #define P_PPWAIT 0x00010 /* Parent is waiting for child to exec/exit. */ #define P_PROFIL 0x00020 /* Has started profiling. */ #define P_STOPPROF 0x00040 /* Has thread requesting to stop profiling. */ #define P_HADTHREADS 0x00080 /* Has had threads (no cleanup shortcuts) */ #define P_SUGID 0x00100 /* Had set id privileges since last exec. */ #define P_SYSTEM 0x00200 /* System proc: no sigs, stats or swapping. */ #define P_SINGLE_EXIT 0x00400 /* Threads suspending should exit, not wait. */ #define P_TRACED 0x00800 /* Debugged process being traced. */ #define P_WAITED 0x01000 /* Someone is waiting for us. */ #define P_WEXIT 0x02000 /* Working on exiting. */ #define P_EXEC 0x04000 /* Process called exec. */ #define P_WKILLED 0x08000 /* Killed, go to kernel/user boundary ASAP. */ #define P_CONTINUED 0x10000 /* Proc has continued from a stopped state. */ #define P_STOPPED_SIG 0x20000 /* Stopped due to SIGSTOP/SIGTSTP. */ #define P_STOPPED_TRACE 0x40000 /* Stopped because of tracing. */ #define P_STOPPED_SINGLE 0x80000 /* Only 1 thread can continue (not to user). */ #define P_PROTECTED 0x100000 /* Do not kill on memory overcommit. */ #define P_SIGEVENT 0x200000 /* Process pending signals changed. */ #define P_SINGLE_BOUNDARY 0x400000 /* Threads should suspend at user boundary. */ #define P_HWPMC 0x800000 /* Process is using HWPMCs */ #define P_JAILED 0x1000000 /* Process is in jail. */ #define P_TOTAL_STOP 0x2000000 /* Stopped in stop_all_proc. */ #define P_INEXEC 0x4000000 /* Process is in execve(). */ #define P_STATCHILD 0x8000000 /* Child process stopped or exited. */ #define P_INMEM 0x10000000 /* Loaded into memory. */ #define P_SWAPPINGOUT 0x20000000 /* Process is being swapped out. */ #define P_SWAPPINGIN 0x40000000 /* Process is being swapped in. */ #define P_PPTRACE 0x80000000 /* PT_TRACEME by vforked child. */ #define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE) #define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED) #define P_KILLED(p) ((p)->p_flag & P_WKILLED) /* These flags are kept in p_flag2. */ #define P2_INHERIT_PROTECTED 0x00000001 /* New children get P_PROTECTED. */ #define P2_NOTRACE 0x00000002 /* No ptrace(2) attach or coredumps. */ #define P2_NOTRACE_EXEC 0x00000004 /* Keep P2_NOPTRACE on exec(2). */ #define P2_AST_SU 0x00000008 /* Handles SU ast for kthreads. */ #define P2_LWP_EVENTS 0x00000010 /* Report LWP events via ptrace(2). */ /* Flags protected by proctree_lock, kept in p_treeflags. */ #define P_TREE_ORPHANED 0x00000001 /* Reparented, on orphan list */ #define P_TREE_FIRST_ORPHAN 0x00000002 /* First element of orphan list */ #define P_TREE_REAPER 0x00000004 /* Reaper of subtree */ /* * These were process status values (p_stat), now they are only used in * legacy conversion code. */ #define SIDL 1 /* Process being created by fork. */ #define SRUN 2 /* Currently runnable. */ #define SSLEEP 3 /* Sleeping on an address. */ #define SSTOP 4 /* Process debugging or suspension. */ #define SZOMB 5 /* Awaiting collection by parent. */ #define SWAIT 6 /* Waiting for interrupt. */ #define SLOCK 7 /* Blocked on a lock. */ #define P_MAGIC 0xbeefface #ifdef _KERNEL /* Types and flags for mi_switch(). */ #define SW_TYPE_MASK 0xff /* First 8 bits are switch type */ #define SWT_NONE 0 /* Unspecified switch. */ #define SWT_PREEMPT 1 /* Switching due to preemption. */ #define SWT_OWEPREEMPT 2 /* Switching due to opepreempt. */ #define SWT_TURNSTILE 3 /* Turnstile contention. */ #define SWT_SLEEPQ 4 /* Sleepq wait. */ #define SWT_UNUSED5 5 /* --available-- */ #define SWT_RELINQUISH 6 /* yield call. */ #define SWT_NEEDRESCHED 7 /* NEEDRESCHED was set. */ #define SWT_IDLE 8 /* Switching from the idle thread. */ #define SWT_IWAIT 9 /* Waiting for interrupts. */ #define SWT_SUSPEND 10 /* Thread suspended. */ #define SWT_REMOTEPREEMPT 11 /* Remote processor preempted. */ #define SWT_REMOTEWAKEIDLE 12 /* Remote processor preempted idle. */ #define SWT_COUNT 13 /* Number of switch types. */ /* Flags */ #define SW_VOL 0x0100 /* Voluntary switch. */ #define SW_INVOL 0x0200 /* Involuntary switch. */ #define SW_PREEMPT 0x0400 /* The invol switch is a preemption */ /* How values for thread_single(). */ #define SINGLE_NO_EXIT 0 #define SINGLE_EXIT 1 #define SINGLE_BOUNDARY 2 #define SINGLE_ALLPROC 3 #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_PARGS); MALLOC_DECLARE(M_PGRP); MALLOC_DECLARE(M_SESSION); MALLOC_DECLARE(M_SUBPROC); #endif #define FOREACH_PROC_IN_SYSTEM(p) \ LIST_FOREACH((p), &allproc, p_list) #define FOREACH_THREAD_IN_PROC(p, td) \ TAILQ_FOREACH((td), &(p)->p_threads, td_plist) #define FIRST_THREAD_IN_PROC(p) TAILQ_FIRST(&(p)->p_threads) /* * We use process IDs <= pid_max <= PID_MAX; PID_MAX + 1 must also fit * in a pid_t, as it is used to represent "no process group". */ #define PID_MAX 99999 #define NO_PID 100000 extern pid_t pid_max; #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) #define STOPEVENT(p, e, v) do { \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, \ "checking stopevent %d", (e)); \ if ((p)->p_stops & (e)) { \ PROC_LOCK(p); \ stopevent((p), (e), (v)); \ PROC_UNLOCK(p); \ } \ } while (0) #define _STOPEVENT(p, e, v) do { \ PROC_LOCK_ASSERT(p, MA_OWNED); \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, \ "checking stopevent %d", (e)); \ if ((p)->p_stops & (e)) \ stopevent((p), (e), (v)); \ } while (0) /* Lock and unlock a process. */ #define PROC_LOCK(p) mtx_lock(&(p)->p_mtx) #define PROC_TRYLOCK(p) mtx_trylock(&(p)->p_mtx) #define PROC_UNLOCK(p) mtx_unlock(&(p)->p_mtx) #define PROC_LOCKED(p) mtx_owned(&(p)->p_mtx) #define PROC_LOCK_ASSERT(p, type) mtx_assert(&(p)->p_mtx, (type)) /* Lock and unlock a process group. */ #define PGRP_LOCK(pg) mtx_lock(&(pg)->pg_mtx) #define PGRP_UNLOCK(pg) mtx_unlock(&(pg)->pg_mtx) #define PGRP_LOCKED(pg) mtx_owned(&(pg)->pg_mtx) #define PGRP_LOCK_ASSERT(pg, type) mtx_assert(&(pg)->pg_mtx, (type)) #define PGRP_LOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_LOCK(pg); \ } while (0) #define PGRP_UNLOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_UNLOCK(pg); \ } while (0) /* Lock and unlock a session. */ #define SESS_LOCK(s) mtx_lock(&(s)->s_mtx) #define SESS_UNLOCK(s) mtx_unlock(&(s)->s_mtx) #define SESS_LOCKED(s) mtx_owned(&(s)->s_mtx) #define SESS_LOCK_ASSERT(s, type) mtx_assert(&(s)->s_mtx, (type)) /* * Non-zero p_lock ensures that: * - exit1() is not performed until p_lock reaches zero; * - the process' threads stack are not swapped out if they are currently * not (P_INMEM). * * PHOLD() asserts that the process (except the current process) is * not exiting, increments p_lock and swaps threads stacks into memory, * if needed. * _PHOLD() is same as PHOLD(), it takes the process locked. * _PHOLD_LITE() also takes the process locked, but comparing with * _PHOLD(), it only guarantees that exit1() is not executed, * faultin() is not called. */ #define PHOLD(p) do { \ PROC_LOCK(p); \ _PHOLD(p); \ PROC_UNLOCK(p); \ } while (0) #define _PHOLD(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc, \ ("PHOLD of exiting process %p", p)); \ (p)->p_lock++; \ if (((p)->p_flag & P_INMEM) == 0) \ faultin((p)); \ } while (0) #define _PHOLD_LITE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc, \ ("PHOLD of exiting process %p", p)); \ (p)->p_lock++; \ } while (0) #define PROC_ASSERT_HELD(p) do { \ KASSERT((p)->p_lock > 0, ("process %p not held", p)); \ } while (0) #define PRELE(p) do { \ PROC_LOCK((p)); \ _PRELE((p)); \ PROC_UNLOCK((p)); \ } while (0) #define _PRELE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ PROC_ASSERT_HELD(p); \ (--(p)->p_lock); \ if (((p)->p_flag & P_WEXIT) && (p)->p_lock == 0) \ wakeup(&(p)->p_lock); \ } while (0) #define PROC_ASSERT_NOT_HELD(p) do { \ KASSERT((p)->p_lock == 0, ("process %p held", p)); \ } while (0) #define PROC_UPDATE_COW(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ (p)->p_cowgen++; \ } while (0) /* Check whether a thread is safe to be swapped out. */ #define thread_safetoswapout(td) ((td)->td_flags & TDF_CANSWAP) /* Control whether or not it is safe for curthread to sleep. */ #define THREAD_NO_SLEEPING() ((curthread)->td_no_sleeping++) #define THREAD_SLEEPING_OK() ((curthread)->td_no_sleeping--) #define THREAD_CAN_SLEEP() ((curthread)->td_no_sleeping == 0) #define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; extern u_long pidhash; #define TIDHASH(tid) (&tidhashtbl[(tid) & tidhash]) extern LIST_HEAD(tidhashhead, thread) *tidhashtbl; extern u_long tidhash; extern struct rwlock tidhash_lock; #define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; extern u_long pgrphash; extern struct sx allproc_lock; extern int allproc_gen; extern struct sx proctree_lock; extern struct mtx ppeers_lock; extern struct proc proc0; /* Process slot for swapper. */ extern struct thread0_storage thread0_st; /* Primary thread in proc0. */ #define thread0 (thread0_st.t0st_thread) extern struct vmspace vmspace0; /* VM space for proc0. */ extern int hogticks; /* Limit on kernel cpu hogs. */ extern int lastpid; extern int nprocs, maxproc; /* Current and max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern u_long ps_arg_cache_limit; LIST_HEAD(proclist, proc); TAILQ_HEAD(procqueue, proc); TAILQ_HEAD(threadqueue, thread); extern struct proclist allproc; /* List of all processes. */ extern struct proclist zombproc; /* List of zombie processes. */ extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */ extern struct uma_zone *proc_zone; struct proc *pfind(pid_t); /* Find process by id. */ struct proc *pfind_locked(pid_t pid); struct pgrp *pgfind(pid_t); /* Find process group by id. */ struct proc *zpfind(pid_t); /* Find zombie process by id. */ struct fork_req { int fr_flags; int fr_pages; int *fr_pidp; struct proc **fr_procp; int *fr_pd_fd; int fr_pd_flags; struct filecaps *fr_pd_fcaps; }; /* * pget() flags. */ #define PGET_HOLD 0x00001 /* Hold the process. */ #define PGET_CANSEE 0x00002 /* Check against p_cansee(). */ #define PGET_CANDEBUG 0x00004 /* Check against p_candebug(). */ #define PGET_ISCURRENT 0x00008 /* Check that the found process is current. */ #define PGET_NOTWEXIT 0x00010 /* Check that the process is not in P_WEXIT. */ #define PGET_NOTINEXEC 0x00020 /* Check that the process is not in P_INEXEC. */ #define PGET_NOTID 0x00040 /* Do not assume tid if pid > PID_MAX. */ #define PGET_WANTREAD (PGET_HOLD | PGET_CANDEBUG | PGET_NOTWEXIT) int pget(pid_t pid, int flags, struct proc **pp); void ast(struct trapframe *framep); struct thread *choosethread(void); int cr_cansignal(struct ucred *cred, struct proc *proc, int signum); int enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess); int enterthispgrp(struct proc *p, struct pgrp *pgrp); void faultin(struct proc *p); void fixjobc(struct proc *p, struct pgrp *pgrp, int entering); int fork1(struct thread *, struct fork_req *); void fork_exit(void (*)(void *, struct trapframe *), void *, struct trapframe *); void fork_return(struct thread *, struct trapframe *); int inferior(struct proc *p); void kern_yield(int); void kick_proc0(void); void killjobc(void); int leavepgrp(struct proc *p); int maybe_preempt(struct thread *td); void maybe_yield(void); void mi_switch(int flags, struct thread *newtd); int p_candebug(struct thread *td, struct proc *p); int p_cansee(struct thread *td, struct proc *p); int p_cansched(struct thread *td, struct proc *p); int p_cansignal(struct thread *td, struct proc *p, int signum); int p_canwait(struct thread *td, struct proc *p); struct pargs *pargs_alloc(int len); void pargs_drop(struct pargs *pa); void pargs_hold(struct pargs *pa); int proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb); void procinit(void); void proc_linkup0(struct proc *p, struct thread *td); void proc_linkup(struct proc *p, struct thread *td); struct proc *proc_realparent(struct proc *child); void proc_reap(struct thread *td, struct proc *p, int *status, int options); void proc_reparent(struct proc *child, struct proc *newparent); struct pstats *pstats_alloc(void); void pstats_fork(struct pstats *src, struct pstats *dst); void pstats_free(struct pstats *ps); void reaper_abandon_children(struct proc *p, bool exiting); int securelevel_ge(struct ucred *cr, int level); int securelevel_gt(struct ucred *cr, int level); void sess_hold(struct session *); void sess_release(struct session *); int setrunnable(struct thread *); void setsugid(struct proc *p); int should_yield(void); int sigonstack(size_t sp); void stopevent(struct proc *, u_int, u_int); struct thread *tdfind(lwpid_t, pid_t); void threadinit(void); void tidhash_add(struct thread *); void tidhash_remove(struct thread *); void cpu_idle(int); int cpu_idle_wakeup(int); extern void (*cpu_idle_hook)(sbintime_t); /* Hook to machdep CPU idler. */ void cpu_switch(struct thread *, struct thread *, struct mtx *); void cpu_throw(struct thread *, struct thread *) __dead2; void unsleep(struct thread *); void userret(struct thread *, struct trapframe *); void cpu_exit(struct thread *); void exit1(struct thread *, int, int) __dead2; void cpu_copy_thread(struct thread *td, struct thread *td0); int cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa); void cpu_fork(struct thread *, struct proc *, struct thread *, int); void cpu_fork_kthread_handler(struct thread *, void (*)(void *), void *); void cpu_set_syscall_retval(struct thread *, int); void cpu_set_upcall(struct thread *, void (*)(void *), void *, stack_t *); int cpu_set_user_tls(struct thread *, void *tls_base); void cpu_thread_alloc(struct thread *); void cpu_thread_clean(struct thread *); void cpu_thread_exit(struct thread *); void cpu_thread_free(struct thread *); void cpu_thread_swapin(struct thread *); void cpu_thread_swapout(struct thread *); struct thread *thread_alloc(int pages); int thread_alloc_stack(struct thread *, int pages); void thread_cow_get_proc(struct thread *newtd, struct proc *p); void thread_cow_get(struct thread *newtd, struct thread *td); void thread_cow_free(struct thread *td); void thread_cow_update(struct thread *td); int thread_create(struct thread *td, struct rtprio *rtp, int (*initialize_thread)(struct thread *, void *), void *thunk); void thread_exit(void) __dead2; void thread_free(struct thread *td); void thread_link(struct thread *td, struct proc *p); void thread_reap(void); int thread_single(struct proc *p, int how); void thread_single_end(struct proc *p, int how); void thread_stash(struct thread *td); void thread_stopped(struct proc *p); void childproc_stopped(struct proc *child, int reason); void childproc_continued(struct proc *child); void childproc_exited(struct proc *child); int thread_suspend_check(int how); bool thread_suspend_check_needed(void); void thread_suspend_switch(struct thread *, struct proc *p); void thread_suspend_one(struct thread *td); void thread_unlink(struct thread *td); void thread_unsuspend(struct proc *p); void thread_wait(struct proc *p); struct thread *thread_find(struct proc *p, lwpid_t tid); void stop_all_proc(void); void resume_all_proc(void); static __inline int curthread_pflags_set(int flags) { struct thread *td; int save; td = curthread; save = ~flags | (td->td_pflags & flags); td->td_pflags |= flags; return (save); } static __inline void curthread_pflags_restore(int save) { curthread->td_pflags &= save; } static __inline __pure2 struct td_sched * td_get_sched(struct thread *td) { return ((struct td_sched *)&td[1]); } #endif /* _KERNEL */ #endif /* !_SYS_PROC_H_ */