Index: head/sys/kern/imgact_aout.c
===================================================================
--- head/sys/kern/imgact_aout.c	(revision 326270)
+++ head/sys/kern/imgact_aout.c	(revision 326271)
@@ -1,339 +1,341 @@
/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
 * Copyright (c) 1993, David Greenman
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#ifdef __amd64__
#include
#include
#include
#include
#include
#endif

static int	exec_aout_imgact(struct image_params *imgp);
static int	aout_fixup(register_t **stack_base, struct image_params *imgp);

#if defined(__i386__)

struct sysentvec aout_sysvec = {
	.sv_size	= SYS_MAXSYSCALL,
	.sv_table	= sysent,
	.sv_mask	= 0,
	.sv_errsize	= 0,
	.sv_errtbl	= NULL,
	.sv_transtrap	= NULL,
	.sv_fixup	= aout_fixup,
	.sv_sendsig	= sendsig,
	.sv_sigcode	= sigcode,
	.sv_szsigcode	= &szsigcode,
	.sv_name	= "FreeBSD a.out",
	.sv_coredump	= NULL,
	.sv_imgact_try	= NULL,
	.sv_minsigstksz	= MINSIGSTKSZ,
	.sv_pagesize	= PAGE_SIZE,
	.sv_minuser	= VM_MIN_ADDRESS,
	.sv_maxuser	= VM_MAXUSER_ADDRESS,
	.sv_usrstack	= USRSTACK,
	.sv_psstrings	= PS_STRINGS,
	.sv_stackprot	= VM_PROT_ALL,
	.sv_copyout_strings = exec_copyout_strings,
	.sv_setregs	= exec_setregs,
	.sv_fixlimit	= NULL,
	.sv_maxssiz	= NULL,
	.sv_flags	= SV_ABI_FREEBSD | SV_AOUT | SV_IA32 | SV_ILP32,
	.sv_set_syscall_retval = cpu_set_syscall_retval,
	.sv_fetch_syscall_args = cpu_fetch_syscall_args,
	.sv_syscallnames = syscallnames,
	.sv_schedtail	= NULL,
	.sv_thread_detach = NULL,
	.sv_trap	= NULL,
};

#elif defined(__amd64__)

#define	AOUT32_USRSTACK	0xbfc00000
#define	AOUT32_PS_STRINGS \
	(AOUT32_USRSTACK - sizeof(struct freebsd32_ps_strings))
#define	AOUT32_MINUSER	FREEBSD32_MINUSER

extern const char *freebsd32_syscallnames[];
extern u_long ia32_maxssiz;

struct sysentvec aout_sysvec = {
	.sv_size	= FREEBSD32_SYS_MAXSYSCALL,
	.sv_table	= freebsd32_sysent,
	.sv_mask	= 0,
	.sv_errsize	= 0,
	.sv_errtbl	= NULL,
	.sv_transtrap	= NULL,
	.sv_fixup	= aout_fixup,
	.sv_sendsig	= ia32_sendsig,
	.sv_sigcode	= ia32_sigcode,
	.sv_szsigcode	= &sz_ia32_sigcode,
	.sv_name	= "FreeBSD a.out",
	.sv_coredump	= NULL,
	.sv_imgact_try	= NULL,
	.sv_minsigstksz	= MINSIGSTKSZ,
	.sv_pagesize	= IA32_PAGE_SIZE,
	.sv_minuser	= AOUT32_MINUSER,
	.sv_maxuser	= AOUT32_USRSTACK,
	.sv_usrstack	= AOUT32_USRSTACK,
	.sv_psstrings	= AOUT32_PS_STRINGS,
	.sv_stackprot	= VM_PROT_ALL,
	.sv_copyout_strings = freebsd32_copyout_strings,
	.sv_setregs	= ia32_setregs,
	.sv_fixlimit	= ia32_fixlimit,
	.sv_maxssiz	= &ia32_maxssiz,
	.sv_flags	= SV_ABI_FREEBSD | SV_AOUT | SV_IA32 | SV_ILP32,
	.sv_set_syscall_retval = ia32_set_syscall_retval,
	.sv_fetch_syscall_args = ia32_fetch_syscall_args,
	.sv_syscallnames = freebsd32_syscallnames,
};

#else
#error "Port me"
#endif

static int
aout_fixup(register_t **stack_base, struct image_params *imgp)
{

	*(char **)stack_base -= sizeof(uint32_t);
	return (suword32(*stack_base, imgp->args->argc));
}

static int
exec_aout_imgact(struct image_params *imgp)
{
	const struct exec *a_out = (const struct exec *) imgp->image_header;
	struct vmspace *vmspace;
	vm_map_t map;
	vm_object_t object;
	vm_offset_t text_end, data_end;
	unsigned long virtual_offset;
	unsigned long file_offset;
	unsigned long bss_size;
	int error;

	/*
	 * Linux and *BSD binaries look very much alike,
	 * only the machine id is different:
	 * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI.
	 * NetBSD is in network byte order.. ugh.
	 */
	if (((a_out->a_midmag >> 16) & 0xff) != 0x86 &&
	    ((a_out->a_midmag >> 16) & 0xff) != 0 &&
	    ((((int)ntohl(a_out->a_midmag)) >> 16) & 0xff) != 0x86)
		return -1;

	/*
	 * Set file/virtual offset based on a.out variant.
	 * We do two cases: host byte order and network byte order
	 * (for NetBSD compatibility).
	 */
	switch ((int)(a_out->a_midmag & 0xffff)) {
	case ZMAGIC:
		virtual_offset = 0;
		if (a_out->a_text) {
			file_offset = PAGE_SIZE;
		} else {
			/* Bill's "screwball mode" */
			file_offset = 0;
		}
		break;
	case QMAGIC:
		virtual_offset = PAGE_SIZE;
		file_offset = 0;
		/* Pass PS_STRINGS for BSD/OS binaries only. */
		if (N_GETMID(*a_out) == MID_ZERO)
			imgp->ps_strings = aout_sysvec.sv_psstrings;
		break;
	default:
		/* NetBSD compatibility */
		switch ((int)(ntohl(a_out->a_midmag) & 0xffff)) {
		case ZMAGIC:
		case QMAGIC:
			virtual_offset = PAGE_SIZE;
			file_offset = 0;
			break;
		default:
			return (-1);
		}
	}

	bss_size = roundup(a_out->a_bss, PAGE_SIZE);

	/*
	 * Check various fields in header for validity/bounds.
	 */
	if (/* entry point must lie within the text region */
	    a_out->a_entry < virtual_offset ||
	    a_out->a_entry >= virtual_offset + a_out->a_text ||

	    /* text and data size must each be page rounded */
	    a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK

#ifdef __amd64__
	    ||
	    /* overflows */
	    virtual_offset + a_out->a_text + a_out->a_data + bss_size > UINT_MAX
#endif
	    )
		return (-1);

	/* text + data can't exceed file size */
	if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
		return (EFAULT);

	/*
	 * text/data/bss must not exceed limits
	 */
	PROC_LOCK(imgp->proc);
	if (/* text can't exceed maximum text size */
	    a_out->a_text > maxtsiz ||

	    /* data + bss can't exceed rlimit */
	    a_out->a_data + bss_size > lim_cur_proc(imgp->proc, RLIMIT_DATA) ||
	    racct_set(imgp->proc, RACCT_DATA, a_out->a_data + bss_size) != 0) {
		PROC_UNLOCK(imgp->proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(imgp->proc);

	/*
	 * Avoid a possible deadlock if the current address space is destroyed
	 * and that address space maps the locked vnode.  In the common case,
	 * the locked vnode's v_usecount is decremented but remains greater
	 * than zero.  Consequently, the vnode lock is not needed by vrele().
	 * However, in cases where the vnode lock is external, such as nullfs,
	 * v_usecount may become zero.
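	 * For that reason the vnode is unlocked across the call to
	 * exec_new_vmspace() below and relocked immediately afterwards.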
	 */
	VOP_UNLOCK(imgp->vp, 0);

	/*
	 * Destroy old process VM and create a new one (with a new stack)
	 */
	error = exec_new_vmspace(imgp, &aout_sysvec);
	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
	if (error)
		return (error);

	/*
	 * The vm space can be changed by exec_new_vmspace
	 */
	vmspace = imgp->proc->p_vmspace;

	object = imgp->object;
	map = &vmspace->vm_map;
	vm_map_lock(map);
	vm_object_reference(object);

	text_end = virtual_offset + a_out->a_text;
	error = vm_map_insert(map, object,
	    file_offset,
	    virtual_offset, text_end,
	    VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL,
	    MAP_COPY_ON_WRITE | MAP_PREFAULT);
	if (error) {
		vm_map_unlock(map);
		vm_object_deallocate(object);
		return (error);
	}
	data_end = text_end + a_out->a_data;
	if (a_out->a_data) {
		vm_object_reference(object);
		error = vm_map_insert(map, object,
		    file_offset + a_out->a_text,
		    text_end, data_end,
		    VM_PROT_ALL, VM_PROT_ALL,
		    MAP_COPY_ON_WRITE | MAP_PREFAULT);
		if (error) {
			vm_map_unlock(map);
			vm_object_deallocate(object);
			return (error);
		}
	}

	if (bss_size) {
		error = vm_map_insert(map, NULL, 0,
		    data_end, data_end + bss_size,
		    VM_PROT_ALL, VM_PROT_ALL, 0);
		if (error) {
			vm_map_unlock(map);
			return (error);
		}
	}
	vm_map_unlock(map);

	/* Fill in process VM information */
	vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT;
	vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT;
	vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset;
	vmspace->vm_daddr = (caddr_t) (uintptr_t)
	    (virtual_offset + a_out->a_text);

	/* Fill in image_params */
	imgp->interpreted = 0;
	imgp->entry_addr = a_out->a_entry;

	imgp->proc->p_sysent = &aout_sysvec;

	return (0);
}

/*
 * Tell kern_execve.c about it, with a little help from the linker.
 */
static struct execsw aout_execsw = { exec_aout_imgact, "a.out" };
EXEC_SET(aout, aout_execsw);

Index: head/sys/kern/imgact_elf.c
===================================================================
--- head/sys/kern/imgact_elf.c	(revision 326270)
+++ head/sys/kern/imgact_elf.c	(revision 326271)
@@ -1,2481 +1,2483 @@
/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
 * Copyright (c) 2017 Dell EMC
 * Copyright (c) 2000 David O'Brien
 * Copyright (c) 1995-1996 Søren Schmidt
 * Copyright (c) 1996 Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer
 *    in this position and unchanged.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_capsicum.h"
#include "opt_compat.h"
#include "opt_gzio.h"

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#define	ELF_NOTE_ROUNDSIZE	4
#define	OLD_EI_BRAND	8

static int __elfN(check_header)(const Elf_Ehdr *hdr);
static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp,
    const char *interp, int interp_name_len, int32_t *osrel);
static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
    u_long *entry, size_t pagesize);
static int __elfN(load_section)(struct image_params *imgp, vm_ooffset_t offset,
    caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot,
    size_t pagesize);
static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp);
static boolean_t __elfN(freebsd_trans_osrel)(const Elf_Note *note,
    int32_t *osrel);
static boolean_t kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel);
static boolean_t __elfN(check_note)(struct image_params *imgp,
    Elf_Brandnote *checknote, int32_t *osrel);
static vm_prot_t __elfN(trans_prot)(Elf_Word);
static Elf_Word __elfN(untrans_prot)(vm_prot_t);

SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), CTLFLAG_RW, 0,
    "");

#define	CORE_BUF_SIZE	(16 * 1024)

int __elfN(fallback_brand) = -1;
SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
    fallback_brand, CTLFLAG_RWTUN, &__elfN(fallback_brand), 0,
    __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort");

static int elf_legacy_coredump = 0;
SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW,
    &elf_legacy_coredump, 0,
    "include all and only RW pages in core dumps");

int __elfN(nxstack) =
#if defined(__amd64__) || defined(__powerpc64__) /* both 64 and 32 bit */ || \
    (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__)
	1;
#else
	0;
#endif
SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
    nxstack, CTLFLAG_RW, &__elfN(nxstack), 0,
    __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable non-executable stack");

#if __ELF_WORD_SIZE == 32
#if defined(__amd64__)
int i386_read_exec = 0;
SYSCTL_INT(_kern_elf32, OID_AUTO, read_exec, CTLFLAG_RW, &i386_read_exec, 0,
    "enable execution from readable segments");
#endif
#endif

static Elf_Brandinfo *elf_brand_list[MAX_BRANDS];

#define	trunc_page_ps(va, ps)	rounddown2(va, ps)
#define	round_page_ps(va, ps)	roundup2(va, ps)
#define	aligned(a, t)	(trunc_page_ps((u_long)(a), sizeof(t)) == (u_long)(a))

static const char FREEBSD_ABI_VENDOR[] = "FreeBSD";

Elf_Brandnote __elfN(freebsd_brandnote) = {
	.hdr.n_namesz	= sizeof(FREEBSD_ABI_VENDOR),
	.hdr.n_descsz	= sizeof(int32_t),
	.hdr.n_type	= NT_FREEBSD_ABI_TAG,
	.vendor		= FREEBSD_ABI_VENDOR,
	.flags		= BN_TRANSLATE_OSREL,
	.trans_osrel	= __elfN(freebsd_trans_osrel)
};
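/*
 * A sketch of the on-disk note this descriptor matches, with each field
 * padded to ELF_NOTE_ROUNDSIZE: n_namesz = 8, n_descsz = 4,
 * n_type = NT_FREEBSD_ABI_TAG, the name "FreeBSD\0", then a 32-bit
 * __FreeBSD_version value that the trans_osrel hook below extracts.
 */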
static boolean_t
__elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel)
{
	uintptr_t p;

	p = (uintptr_t)(note + 1);
	p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE);
	*osrel = *(const int32_t *)(p);

	return (TRUE);
}

static const char GNU_ABI_VENDOR[] = "GNU";
static int GNU_KFREEBSD_ABI_DESC = 3;

Elf_Brandnote __elfN(kfreebsd_brandnote) = {
	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
	.hdr.n_descsz	= 16,	/* XXX at least 16 */
	.hdr.n_type	= 1,
	.vendor		= GNU_ABI_VENDOR,
	.flags		= BN_TRANSLATE_OSREL,
	.trans_osrel	= kfreebsd_trans_osrel
};

static boolean_t
kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel)
{
	const Elf32_Word *desc;
	uintptr_t p;

	p = (uintptr_t)(note + 1);
	p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE);

	desc = (const Elf32_Word *)p;
	if (desc[0] != GNU_KFREEBSD_ABI_DESC)
		return (FALSE);

	/*
	 * Debian GNU/kFreeBSD embeds the earliest compatible kernel version
	 * (__FreeBSD_version: <major><two digit minor>Rxx) in the LSB way.
	 */
	*osrel = desc[1] * 100000 + desc[2] * 1000 + desc[3];

	return (TRUE);
}

int
__elfN(insert_brand_entry)(Elf_Brandinfo *entry)
{
	int i;

	for (i = 0; i < MAX_BRANDS; i++) {
		if (elf_brand_list[i] == NULL) {
			elf_brand_list[i] = entry;
			break;
		}
	}
	if (i == MAX_BRANDS) {
		printf("WARNING: %s: could not insert brandinfo entry: %p\n",
		    __func__, entry);
		return (-1);
	}
	return (0);
}

int
__elfN(remove_brand_entry)(Elf_Brandinfo *entry)
{
	int i;

	for (i = 0; i < MAX_BRANDS; i++) {
		if (elf_brand_list[i] == entry) {
			elf_brand_list[i] = NULL;
			break;
		}
	}
	if (i == MAX_BRANDS)
		return (-1);
	return (0);
}

int
__elfN(brand_inuse)(Elf_Brandinfo *entry)
{
	struct proc *p;
	int rval = FALSE;

	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		if (p->p_sysent == entry->sysvec) {
			rval = TRUE;
			break;
		}
	}
	sx_sunlock(&allproc_lock);

	return (rval);
}

static Elf_Brandinfo *
__elfN(get_brandinfo)(struct image_params *imgp, const char *interp,
    int interp_name_len, int32_t *osrel)
{
	const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
	Elf_Brandinfo *bi, *bi_m;
	boolean_t ret;
	int i;

	/*
	 * We support four types of branding -- (1) the ELF EI_OSABI field
	 * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string
	 * branding within the ELF header, (3) path of the `interp_path'
	 * field, and (4) the ".note.ABI-tag" ELF section.
	 */

	/* Look for an ".note.ABI-tag" ELF section */
	bi_m = NULL;
	for (i = 0; i < MAX_BRANDS; i++) {
		bi = elf_brand_list[i];
		if (bi == NULL)
			continue;
		if (interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0)
			continue;
		if (hdr->e_machine == bi->machine && (bi->flags &
		    (BI_BRAND_NOTE|BI_BRAND_NOTE_MANDATORY)) != 0) {
			ret = __elfN(check_note)(imgp, bi->brand_note, osrel);
			/* Give brand a chance to veto check_note's guess */
			if (ret && bi->header_supported)
				ret = bi->header_supported(imgp);
			/*
			 * If the note checker claimed the binary, but the
			 * interpreter path in the image does not
			 * match the default one for the brand, try to
			 * search for other brands with the same
			 * interpreter.  Either there is a better brand
			 * with the right interpreter, or, failing
			 * this, we return the first brand which accepted
			 * our note and, optionally, header.
			 */
			if (ret && bi_m == NULL && interp != NULL &&
			    (bi->interp_path == NULL ||
			    (strlen(bi->interp_path) + 1 != interp_name_len ||
			    strncmp(interp, bi->interp_path,
			    interp_name_len) != 0))) {
				bi_m = bi;
				ret = 0;
			}
			if (ret)
				return (bi);
		}
	}
	if (bi_m != NULL)
		return (bi_m);

	/*
	 * If the executable has a brand, search for it in the brand list.
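	 * This covers both the modern EI_OSABI byte and FreeBSD 3.x's
	 * string branding stored at offset OLD_EI_BRAND in e_ident.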
	 */
	for (i = 0; i < MAX_BRANDS; i++) {
		bi = elf_brand_list[i];
		if (bi == NULL || (bi->flags & BI_BRAND_NOTE_MANDATORY) != 0 ||
		    (interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0))
			continue;
		if (hdr->e_machine == bi->machine &&
		    (hdr->e_ident[EI_OSABI] == bi->brand ||
		    (bi->compat_3_brand != NULL &&
		    strcmp((const char *)&hdr->e_ident[OLD_EI_BRAND],
		    bi->compat_3_brand) == 0))) {
			/* Looks good, but give brand a chance to veto */
			if (!bi->header_supported ||
			    bi->header_supported(imgp)) {
				/*
				 * Again, prefer strictly matching
				 * interpreter path.
				 */
				if (interp_name_len == 0 &&
				    bi->interp_path == NULL)
					return (bi);
				if (bi->interp_path != NULL &&
				    strlen(bi->interp_path) + 1 ==
				    interp_name_len && strncmp(interp,
				    bi->interp_path, interp_name_len) == 0)
					return (bi);
				if (bi_m == NULL)
					bi_m = bi;
			}
		}
	}
	if (bi_m != NULL)
		return (bi_m);

	/* No known brand, see if the header is recognized by any brand */
	for (i = 0; i < MAX_BRANDS; i++) {
		bi = elf_brand_list[i];
		if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY ||
		    bi->header_supported == NULL)
			continue;
		if (hdr->e_machine == bi->machine) {
			ret = bi->header_supported(imgp);
			if (ret)
				return (bi);
		}
	}

	/* Lacking a known brand, search for a recognized interpreter. */
	if (interp != NULL) {
		for (i = 0; i < MAX_BRANDS; i++) {
			bi = elf_brand_list[i];
			if (bi == NULL || (bi->flags &
			    (BI_BRAND_NOTE_MANDATORY | BI_BRAND_ONLY_STATIC))
			    != 0)
				continue;
			if (hdr->e_machine == bi->machine &&
			    bi->interp_path != NULL &&
			    /* ELF image p_filesz includes terminating zero */
			    strlen(bi->interp_path) + 1 == interp_name_len &&
			    strncmp(interp, bi->interp_path, interp_name_len)
			    == 0)
				return (bi);
		}
	}

	/* Lacking a recognized interpreter, try the default brand */
	for (i = 0; i < MAX_BRANDS; i++) {
		bi = elf_brand_list[i];
		if (bi == NULL || (bi->flags & BI_BRAND_NOTE_MANDATORY) != 0 ||
		    (interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0))
			continue;
		if (hdr->e_machine == bi->machine &&
		    __elfN(fallback_brand) == bi->brand)
			return (bi);
	}
	return (NULL);
}

static int
__elfN(check_header)(const Elf_Ehdr *hdr)
{
	Elf_Brandinfo *bi;
	int i;

	if (!IS_ELF(*hdr) ||
	    hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
	    hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
	    hdr->e_ident[EI_VERSION] != EV_CURRENT ||
	    hdr->e_phentsize != sizeof(Elf_Phdr) ||
	    hdr->e_version != ELF_TARG_VER)
		return (ENOEXEC);

	/*
	 * Make sure we have at least one brand for this machine.
	 */
	for (i = 0; i < MAX_BRANDS; i++) {
		bi = elf_brand_list[i];
		if (bi != NULL && bi->machine == hdr->e_machine)
			break;
	}
	if (i == MAX_BRANDS)
		return (ENOEXEC);

	return (0);
}

static int
__elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t start, vm_offset_t end, vm_prot_t prot)
{
	struct sf_buf *sf;
	int error;
	vm_offset_t off;

	/*
	 * Create the page if it doesn't exist yet.  Ignore errors.
	 */
	vm_map_fixed(map, NULL, 0, trunc_page(start), round_page(end) -
	    trunc_page(start), VM_PROT_ALL, VM_PROT_ALL, MAP_CHECK_EXCL);

	/*
	 * Find the page from the underlying object.
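	 * The data is copied out rather than mapped because this helper
	 * only handles ranges that do not start or end on a page boundary.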
	 */
	if (object != NULL) {
		sf = vm_imgact_map_page(object, offset);
		if (sf == NULL)
			return (KERN_FAILURE);
		off = offset - trunc_page(offset);
		error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start,
		    end - start);
		vm_imgact_unmap_page(sf);
		if (error != 0)
			return (KERN_FAILURE);
	}

	return (KERN_SUCCESS);
}

static int
__elfN(map_insert)(struct image_params *imgp, vm_map_t map, vm_object_t object,
    vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot,
    int cow)
{
	struct sf_buf *sf;
	vm_offset_t off;
	vm_size_t sz;
	int error, locked, rv;

	if (start != trunc_page(start)) {
		rv = __elfN(map_partial)(map, object, offset, start,
		    round_page(start), prot);
		if (rv != KERN_SUCCESS)
			return (rv);
		offset += round_page(start) - start;
		start = round_page(start);
	}
	if (end != round_page(end)) {
		rv = __elfN(map_partial)(map, object, offset +
		    trunc_page(end) - start, trunc_page(end), end, prot);
		if (rv != KERN_SUCCESS)
			return (rv);
		end = trunc_page(end);
	}
	if (start >= end)
		return (KERN_SUCCESS);
	if ((offset & PAGE_MASK) != 0) {
		/*
		 * The mapping is not page aligned.  This means that we have
		 * to copy the data.
		 */
		rv = vm_map_fixed(map, NULL, 0, start, end - start,
		    prot | VM_PROT_WRITE, VM_PROT_ALL, MAP_CHECK_EXCL);
		if (rv != KERN_SUCCESS)
			return (rv);
		if (object == NULL)
			return (KERN_SUCCESS);
		for (; start < end; start += sz) {
			sf = vm_imgact_map_page(object, offset);
			if (sf == NULL)
				return (KERN_FAILURE);
			off = offset - trunc_page(offset);
			sz = end - start;
			if (sz > PAGE_SIZE - off)
				sz = PAGE_SIZE - off;
			error = copyout((caddr_t)sf_buf_kva(sf) + off,
			    (caddr_t)start, sz);
			vm_imgact_unmap_page(sf);
			if (error != 0)
				return (KERN_FAILURE);
			offset += sz;
		}
	} else {
		vm_object_reference(object);
		rv = vm_map_fixed(map, object, offset, start, end - start,
		    prot, VM_PROT_ALL, cow | MAP_CHECK_EXCL);
		if (rv != KERN_SUCCESS) {
			locked = VOP_ISLOCKED(imgp->vp);
			VOP_UNLOCK(imgp->vp, 0);
			vm_object_deallocate(object);
			vn_lock(imgp->vp, locked | LK_RETRY);
			return (rv);
		}
	}
	return (KERN_SUCCESS);
}

static int
__elfN(load_section)(struct image_params *imgp, vm_ooffset_t offset,
    caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot,
    size_t pagesize)
{
	struct sf_buf *sf;
	size_t map_len;
	vm_map_t map;
	vm_object_t object;
	vm_offset_t off, map_addr;
	int error, rv, cow;
	size_t copy_len;
	vm_ooffset_t file_addr;

	/*
	 * It's necessary to fail if the filsz + offset taken from the
	 * header is greater than the actual file pager object's size.
	 * If we were to allow this, then the vm_map_find() below would
	 * walk right off the end of the file object and into the ether.
	 *
	 * While I'm here, might as well check for something else that
	 * is invalid: filsz cannot be greater than memsz.
	 */
	if ((filsz != 0 && (off_t)filsz + offset > imgp->attr->va_size) ||
	    filsz > memsz) {
		uprintf("elf_load_section: truncated ELF file\n");
		return (ENOEXEC);
	}

	object = imgp->object;
	map = &imgp->proc->p_vmspace->vm_map;
	map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize);
	file_addr = trunc_page_ps(offset, pagesize);

	/*
	 * We have two choices.  We can either clear the data in the last page
	 * of an oversized mapping, or we can start the anon mapping a page
	 * early and copy the initialized data into that first page.  We
	 * choose the second.
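	 * For example, with 4 KB pages, offset = 0x1000, filsz = 0x1234 and
	 * memsz = 0x4000 at vmaddr 0x1000: the file backs [0x1000, 0x2000),
	 * anonymous memory backs [0x2000, 0x5000), and the trailing 0x234
	 * bytes of file data are copied into the first anonymous page.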
	 */
	if (filsz == 0)
		map_len = 0;
	else if (memsz > filsz)
		map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr;
	else
		map_len = round_page_ps(offset + filsz, pagesize) - file_addr;

	if (map_len != 0) {
		/* cow flags: don't dump readonly sections in core */
		cow = MAP_COPY_ON_WRITE | MAP_PREFAULT |
		    (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP);

		rv = __elfN(map_insert)(imgp, map,
		    object,
		    file_addr,		/* file offset */
		    map_addr,		/* virtual start */
		    map_addr + map_len,	/* virtual end */
		    prot,
		    cow);
		if (rv != KERN_SUCCESS)
			return (EINVAL);

		/* we can stop now if we've covered it all */
		if (memsz == filsz)
			return (0);
	}

	/*
	 * We have to get the remaining bit of the file into the first part
	 * of the oversized map segment.  This is normally because the .data
	 * segment in the file is extended to provide bss.  It's a neat idea
	 * to try and save a page, but it's a pain in the behind to implement.
	 */
	copy_len = filsz == 0 ? 0 : (offset + filsz) -
	    trunc_page_ps(offset + filsz, pagesize);
	map_addr = trunc_page_ps((vm_offset_t)vmaddr + filsz, pagesize);
	map_len = round_page_ps((vm_offset_t)vmaddr + memsz, pagesize) -
	    map_addr;

	/* This had damn well better be true! */
	if (map_len != 0) {
		rv = __elfN(map_insert)(imgp, map, NULL, 0, map_addr,
		    map_addr + map_len, prot, 0);
		if (rv != KERN_SUCCESS)
			return (EINVAL);
	}

	if (copy_len != 0) {
		sf = vm_imgact_map_page(object, offset + filsz);
		if (sf == NULL)
			return (EIO);

		/* send the page fragment to user space */
		off = trunc_page_ps(offset + filsz, pagesize) -
		    trunc_page(offset + filsz);
		error = copyout((caddr_t)sf_buf_kva(sf) + off,
		    (caddr_t)map_addr, copy_len);
		vm_imgact_unmap_page(sf);
		if (error != 0)
			return (error);
	}

	/*
	 * Remove write access to the page if it was only granted by
	 * map_insert to allow copyout.
	 */
	if ((prot & VM_PROT_WRITE) == 0)
		vm_map_protect(map, trunc_page(map_addr), round_page(map_addr +
		    map_len), prot, FALSE);

	return (0);
}

/*
 * Load the file "file" into memory.  It may be either a shared object
 * or an executable.
 *
 * The "addr" reference parameter is in/out.  On entry, it specifies
 * the address where a shared object should be loaded.  If the file is
 * an executable, this value is ignored.  On exit, "addr" specifies
 * where the file was actually loaded.
 *
 * The "entry" reference parameter is out only.  On exit, it specifies
 * the entry point for the loaded file.
 */
static int
__elfN(load_file)(struct proc *p, const char *file, u_long *addr,
    u_long *entry, size_t pagesize)
{
	struct {
		struct nameidata nd;
		struct vattr attr;
		struct image_params image_params;
	} *tempdata;
	const Elf_Ehdr *hdr = NULL;
	const Elf_Phdr *phdr = NULL;
	struct nameidata *nd;
	struct vattr *attr;
	struct image_params *imgp;
	vm_prot_t prot;
	u_long rbase;
	u_long base_addr = 0;
	int error, i, numsegs;

#ifdef CAPABILITY_MODE
	/*
	 * XXXJA: This check can go away once we are sufficiently confident
	 * that the checks in namei() are correct.
	 */
	if (IN_CAPABILITY_MODE(curthread))
		return (ECAPMODE);
#endif

	tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK);
	nd = &tempdata->nd;
	attr = &tempdata->attr;
	imgp = &tempdata->image_params;

	/*
	 * Initialize part of the common data
	 */
	imgp->proc = p;
	imgp->attr = attr;
	imgp->firstpage = NULL;
	imgp->image_header = NULL;
	imgp->object = NULL;
	imgp->execlabel = NULL;

	NDINIT(nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, file, curthread);
	if ((error = namei(nd)) != 0) {
		nd->ni_vp = NULL;
		goto fail;
	}
	NDFREE(nd, NDF_ONLY_PNBUF);
	imgp->vp = nd->ni_vp;

	/*
	 * Check permissions, modes, uid, etc on the file, and "open" it.
	 */
	error = exec_check_permissions(imgp);
	if (error)
		goto fail;

	error = exec_map_first_page(imgp);
	if (error)
		goto fail;

	/*
	 * Also make certain that the interpreter stays the same, so set
	 * its VV_TEXT flag, too.
	 */
	VOP_SET_TEXT(nd->ni_vp);

	imgp->object = nd->ni_vp->v_object;

	hdr = (const Elf_Ehdr *)imgp->image_header;
	if ((error = __elfN(check_header)(hdr)) != 0)
		goto fail;
	if (hdr->e_type == ET_DYN)
		rbase = *addr;
	else if (hdr->e_type == ET_EXEC)
		rbase = 0;
	else {
		error = ENOEXEC;
		goto fail;
	}

	/* Only support headers that fit within first page for now */
	if ((hdr->e_phoff > PAGE_SIZE) ||
	    (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) {
		error = ENOEXEC;
		goto fail;
	}

	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
	if (!aligned(phdr, Elf_Addr)) {
		error = ENOEXEC;
		goto fail;
	}

	for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) {
		if (phdr[i].p_type == PT_LOAD && phdr[i].p_memsz != 0) {
			/* Loadable segment */
			prot = __elfN(trans_prot)(phdr[i].p_flags);
			error = __elfN(load_section)(imgp, phdr[i].p_offset,
			    (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase,
			    phdr[i].p_memsz, phdr[i].p_filesz, prot, pagesize);
			if (error != 0)
				goto fail;
			/*
			 * Establish the base address if this is the
			 * first segment.
			 */
			if (numsegs == 0)
				base_addr = trunc_page(phdr[i].p_vaddr +
				    rbase);
			numsegs++;
		}
	}
	*addr = base_addr;
	*entry = (unsigned long)hdr->e_entry + rbase;

fail:
	if (imgp->firstpage)
		exec_unmap_first_page(imgp);

	if (nd->ni_vp)
		vput(nd->ni_vp);

	free(tempdata, M_TEMP);

	return (error);
}

static int
__CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
{
	struct thread *td;
	const Elf_Ehdr *hdr;
	const Elf_Phdr *phdr;
	Elf_Auxargs *elf_auxargs;
	struct vmspace *vmspace;
	const char *err_str, *newinterp;
	char *interp, *interp_buf, *path;
	Elf_Brandinfo *brand_info;
	struct sysentvec *sv;
	vm_prot_t prot;
	u_long text_size, data_size, total_size, text_addr, data_addr;
	u_long seg_size, seg_addr, addr, baddr, et_dyn_addr, entry, proghdr;
	int32_t osrel;
	int error, i, n, interp_name_len, have_interp;

	hdr = (const Elf_Ehdr *)imgp->image_header;

	/*
	 * Do we have a valid ELF header ?
	 *
	 * Only allow ET_EXEC & ET_DYN here, reject ET_DYN later
	 * if particular brand doesn't support it.
	 */
	if (__elfN(check_header)(hdr) != 0 ||
	    (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN))
		return (-1);

	/*
	 * From here on down, we return an errno, not -1, as we've
	 * detected an ELF file.
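	 * Returning -1 lets kern_execve try the next image activator in
	 * execsw[], whereas an errno aborts the whole exec with that error.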
	 */
	if ((hdr->e_phoff > PAGE_SIZE) ||
	    (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) {
		/* Only support headers in first page for now */
		uprintf("Program headers not in the first page\n");
		return (ENOEXEC);
	}
	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
	if (!aligned(phdr, Elf_Addr)) {
		uprintf("Unaligned program headers\n");
		return (ENOEXEC);
	}

	n = error = 0;
	baddr = 0;
	osrel = 0;
	text_size = data_size = total_size = text_addr = data_addr = 0;
	entry = proghdr = 0;
	interp_name_len = 0;
	err_str = newinterp = NULL;
	interp = interp_buf = NULL;
	td = curthread;

	for (i = 0; i < hdr->e_phnum; i++) {
		switch (phdr[i].p_type) {
		case PT_LOAD:
			if (n == 0)
				baddr = phdr[i].p_vaddr;
			n++;
			break;
		case PT_INTERP:
			/* Path to interpreter */
			if (phdr[i].p_filesz > MAXPATHLEN) {
				uprintf("Invalid PT_INTERP\n");
				error = ENOEXEC;
				goto ret;
			}
			if (interp != NULL) {
				uprintf("Multiple PT_INTERP headers\n");
				error = ENOEXEC;
				goto ret;
			}
			interp_name_len = phdr[i].p_filesz;
			if (phdr[i].p_offset > PAGE_SIZE ||
			    interp_name_len > PAGE_SIZE - phdr[i].p_offset) {
				VOP_UNLOCK(imgp->vp, 0);
				interp_buf = malloc(interp_name_len + 1, M_TEMP,
				    M_WAITOK);
				vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
				error = vn_rdwr(UIO_READ, imgp->vp, interp_buf,
				    interp_name_len, phdr[i].p_offset,
				    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
				    NOCRED, NULL, td);
				if (error != 0) {
					uprintf("i/o error PT_INTERP\n");
					goto ret;
				}
				interp_buf[interp_name_len] = '\0';
				interp = interp_buf;
			} else {
				interp = __DECONST(char *,
				    imgp->image_header) + phdr[i].p_offset;
			}
			break;
		case PT_GNU_STACK:
			if (__elfN(nxstack))
				imgp->stack_prot =
				    __elfN(trans_prot)(phdr[i].p_flags);
			imgp->stack_sz = phdr[i].p_memsz;
			break;
		}
	}

	brand_info = __elfN(get_brandinfo)(imgp, interp, interp_name_len,
	    &osrel);
	if (brand_info == NULL) {
		uprintf("ELF binary type \"%u\" not known.\n",
		    hdr->e_ident[EI_OSABI]);
		error = ENOEXEC;
		goto ret;
	}
	et_dyn_addr = 0;
	if (hdr->e_type == ET_DYN) {
		if ((brand_info->flags & BI_CAN_EXEC_DYN) == 0) {
			uprintf("Cannot execute shared object\n");
			error = ENOEXEC;
			goto ret;
		}
		/*
		 * Honour the base load address from the dso if it is
		 * non-zero for some reason.
		 */
		if (baddr == 0)
			et_dyn_addr = ET_DYN_LOAD_ADDR;
	}
	sv = brand_info->sysvec;
	if (interp != NULL && brand_info->interp_newpath != NULL)
		newinterp = brand_info->interp_newpath;

	/*
	 * Avoid a possible deadlock if the current address space is destroyed
	 * and that address space maps the locked vnode.  In the common case,
	 * the locked vnode's v_usecount is decremented but remains greater
	 * than zero.  Consequently, the vnode lock is not needed by vrele().
	 * However, in cases where the vnode lock is external, such as nullfs,
	 * v_usecount may become zero.
	 *
	 * The VV_TEXT flag prevents modifications to the executable while
	 * the vnode is unlocked.
	 */
	VOP_UNLOCK(imgp->vp, 0);

	error = exec_new_vmspace(imgp, sv);
	imgp->proc->p_sysent = sv;

	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
	if (error != 0)
		goto ret;

	for (i = 0; i < hdr->e_phnum; i++) {
		switch (phdr[i].p_type) {
		case PT_LOAD:	/* Loadable segment */
			if (phdr[i].p_memsz == 0)
				break;
			prot = __elfN(trans_prot)(phdr[i].p_flags);
			error = __elfN(load_section)(imgp, phdr[i].p_offset,
			    (caddr_t)(uintptr_t)phdr[i].p_vaddr + et_dyn_addr,
			    phdr[i].p_memsz, phdr[i].p_filesz, prot,
			    sv->sv_pagesize);
			if (error != 0)
				goto ret;

			/*
			 * If this segment contains the program headers,
			 * remember their virtual address for the AT_PHDR
			 * aux entry.  Static binaries don't usually include
			 * a PT_PHDR entry.
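			 * In that case the headers live inside the first
			 * PT_LOAD segment, and their address is derived
			 * from its mapping below.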
			 */
			if (phdr[i].p_offset == 0 &&
			    hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize
			    <= phdr[i].p_filesz)
				proghdr = phdr[i].p_vaddr + hdr->e_phoff +
				    et_dyn_addr;

			seg_addr = trunc_page(phdr[i].p_vaddr + et_dyn_addr);
			seg_size = round_page(phdr[i].p_memsz +
			    phdr[i].p_vaddr + et_dyn_addr - seg_addr);

			/*
			 * Make the largest executable segment the official
			 * text segment and all others data.
			 *
			 * Note that obreak() assumes that data_addr +
			 * data_size == end of data load area, and the ELF
			 * file format expects segments to be sorted by
			 * address.  If multiple data segments exist, the
			 * last one will be used.
			 */

			if (phdr[i].p_flags & PF_X && text_size < seg_size) {
				text_size = seg_size;
				text_addr = seg_addr;
			} else {
				data_size = seg_size;
				data_addr = seg_addr;
			}
			total_size += seg_size;
			break;
		case PT_PHDR:	/* Program header table info */
			proghdr = phdr[i].p_vaddr + et_dyn_addr;
			break;
		default:
			break;
		}
	}

	if (data_addr == 0 && data_size == 0) {
		data_addr = text_addr;
		data_size = text_size;
	}

	entry = (u_long)hdr->e_entry + et_dyn_addr;

	/*
	 * Check limits.  It should be safe to check the
	 * limits after loading the segments since we do
	 * not actually fault in all the segments pages.
	 */
	PROC_LOCK(imgp->proc);
	if (data_size > lim_cur_proc(imgp->proc, RLIMIT_DATA))
		err_str = "Data segment size exceeds process limit";
	else if (text_size > maxtsiz)
		err_str = "Text segment size exceeds system limit";
	else if (total_size > lim_cur_proc(imgp->proc, RLIMIT_VMEM))
		err_str = "Total segment size exceeds process limit";
	else if (racct_set(imgp->proc, RACCT_DATA, data_size) != 0)
		err_str = "Data segment size exceeds resource limit";
	else if (racct_set(imgp->proc, RACCT_VMEM, total_size) != 0)
		err_str = "Total segment size exceeds resource limit";
	if (err_str != NULL) {
		PROC_UNLOCK(imgp->proc);
		uprintf("%s\n", err_str);
		error = ENOMEM;
		goto ret;
	}

	vmspace = imgp->proc->p_vmspace;
	vmspace->vm_tsize = text_size >> PAGE_SHIFT;
	vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr;
	vmspace->vm_dsize = data_size >> PAGE_SHIFT;
	vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr;

	/*
	 * We load the dynamic linker where a userland call
	 * to mmap(0, ...) would put it.  The rationale behind this
	 * calculation is that it leaves room for the heap to grow to
	 * its maximum allowed size.
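	 * Concretely, the interpreter goes just above vm_daddr plus the
	 * hard RLIMIT_DATA limit, so brk() can consume its entire rlimit
	 * before running into the interpreter mapping.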
	 */
	addr = round_page((vm_offset_t)vmspace->vm_daddr +
	    lim_max(td, RLIMIT_DATA));
	PROC_UNLOCK(imgp->proc);

	imgp->entry_addr = entry;

	if (interp != NULL) {
		have_interp = FALSE;
		VOP_UNLOCK(imgp->vp, 0);
		if (brand_info->emul_path != NULL &&
		    brand_info->emul_path[0] != '\0') {
			path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
			snprintf(path, MAXPATHLEN, "%s%s",
			    brand_info->emul_path, interp);
			error = __elfN(load_file)(imgp->proc, path, &addr,
			    &imgp->entry_addr, sv->sv_pagesize);
			free(path, M_TEMP);
			if (error == 0)
				have_interp = TRUE;
		}
		if (!have_interp && newinterp != NULL &&
		    (brand_info->interp_path == NULL ||
		    strcmp(interp, brand_info->interp_path) == 0)) {
			error = __elfN(load_file)(imgp->proc, newinterp, &addr,
			    &imgp->entry_addr, sv->sv_pagesize);
			if (error == 0)
				have_interp = TRUE;
		}
		if (!have_interp) {
			error = __elfN(load_file)(imgp->proc, interp, &addr,
			    &imgp->entry_addr, sv->sv_pagesize);
		}
		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
		if (error != 0) {
			uprintf("ELF interpreter %s not found, error %d\n",
			    interp, error);
			goto ret;
		}
	} else
		addr = et_dyn_addr;

	/*
	 * Construct auxargs table (used by the fixup routine)
	 */
	elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK);
	elf_auxargs->execfd = -1;
	elf_auxargs->phdr = proghdr;
	elf_auxargs->phent = hdr->e_phentsize;
	elf_auxargs->phnum = hdr->e_phnum;
	elf_auxargs->pagesz = PAGE_SIZE;
	elf_auxargs->base = addr;
	elf_auxargs->flags = 0;
	elf_auxargs->entry = entry;
	elf_auxargs->hdr_eflags = hdr->e_flags;

	imgp->auxargs = elf_auxargs;
	imgp->interpreted = 0;
	imgp->reloc_base = addr;
	imgp->proc->p_osrel = osrel;
	imgp->proc->p_elf_machine = hdr->e_machine;
	imgp->proc->p_elf_flags = hdr->e_flags;

ret:
	free(interp_buf, M_TEMP);
	return (error);
}

#define	suword	__CONCAT(suword, __ELF_WORD_SIZE)

int
__elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp)
{
	Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs;
	Elf_Addr *base;
	Elf_Addr *pos;

	base = (Elf_Addr *)*stack_base;
	pos = base + (imgp->args->argc + imgp->args->envc + 2);

	if (args->execfd != -1)
		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
	AUXARGS_ENTRY(pos, AT_BASE, args->base);
	AUXARGS_ENTRY(pos, AT_EHDRFLAGS, args->hdr_eflags);
	if (imgp->execpathp != 0)
		AUXARGS_ENTRY(pos, AT_EXECPATH, imgp->execpathp);
	AUXARGS_ENTRY(pos, AT_OSRELDATE,
	    imgp->proc->p_ucred->cr_prison->pr_osreldate);
	if (imgp->canary != 0) {
		AUXARGS_ENTRY(pos, AT_CANARY, imgp->canary);
		AUXARGS_ENTRY(pos, AT_CANARYLEN, imgp->canarylen);
	}
	AUXARGS_ENTRY(pos, AT_NCPUS, mp_ncpus);
	if (imgp->pagesizes != 0) {
		AUXARGS_ENTRY(pos, AT_PAGESIZES, imgp->pagesizes);
		AUXARGS_ENTRY(pos, AT_PAGESIZESLEN, imgp->pagesizeslen);
	}
	if (imgp->sysent->sv_timekeep_base != 0) {
		AUXARGS_ENTRY(pos, AT_TIMEKEEP,
		    imgp->sysent->sv_timekeep_base);
	}
	AUXARGS_ENTRY(pos, AT_STACKPROT, imgp->sysent->sv_shared_page_obj
	    != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
	    imgp->sysent->sv_stackprot);
	if (imgp->sysent->sv_hwcap != NULL)
		AUXARGS_ENTRY(pos, AT_HWCAP, *imgp->sysent->sv_hwcap);
	if (imgp->sysent->sv_hwcap2 != NULL)
		AUXARGS_ENTRY(pos, AT_HWCAP2, *imgp->sysent->sv_hwcap2);
	AUXARGS_ENTRY(pos, AT_NULL, 0);
	free(imgp->auxargs, M_TEMP);
	imgp->auxargs = NULL;

	base--;
	suword(base, (long)imgp->args->argc);
	*stack_base = (register_t *)base;
	return (0);
}

/*
 * Code for generating ELF core dumps.
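 *
 * The resulting file layout is: ELF header, program headers (a PT_NOTE
 * entry first, then one PT_LOAD per dumped segment), the notes themselves,
 * and finally the segment contents, starting at a page boundary.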
 */

typedef void (*segment_callback)(vm_map_entry_t, void *);

/* Closure for cb_put_phdr(). */
struct phdr_closure {
	Elf_Phdr *phdr;		/* Program header to fill in */
	Elf_Off offset;		/* Offset of segment in core file */
};

/* Closure for cb_size_segment(). */
struct sseg_closure {
	int count;		/* Count of writable segments. */
	size_t size;		/* Total size of all writable segments. */
};

typedef void (*outfunc_t)(void *, struct sbuf *, size_t *);

struct note_info {
	int		type;		/* Note type. */
	outfunc_t	outfunc;	/* Output function. */
	void		*outarg;	/* Argument for the output function. */
	size_t		outsize;	/* Output size. */
	TAILQ_ENTRY(note_info) link;	/* Link to the next note info. */
};

TAILQ_HEAD(note_info_list, note_info);

/* Coredump output parameters. */
struct coredump_params {
	off_t		offset;
	struct ucred	*active_cred;
	struct ucred	*file_cred;
	struct thread	*td;
	struct vnode	*vp;
	struct gzio_stream *gzs;
};

static void cb_put_phdr(vm_map_entry_t, void *);
static void cb_size_segment(vm_map_entry_t, void *);
static int core_write(struct coredump_params *, const void *, size_t, off_t,
    enum uio_seg);
static void each_dumpable_segment(struct thread *, segment_callback, void *);
static int __elfN(corehdr)(struct coredump_params *, int, void *, size_t,
    struct note_info_list *, size_t);
static void __elfN(prepare_notes)(struct thread *, struct note_info_list *,
    size_t *);
static void __elfN(puthdr)(struct thread *, void *, size_t, int, size_t);
static void __elfN(putnote)(struct note_info *, struct sbuf *);
static size_t register_note(struct note_info_list *, int, outfunc_t, void *);
static int sbuf_drain_core_output(void *, const char *, int);
static int sbuf_drain_count(void *arg, const char *data, int len);

static void __elfN(note_fpregset)(void *, struct sbuf *, size_t *);
static void __elfN(note_prpsinfo)(void *, struct sbuf *, size_t *);
static void __elfN(note_prstatus)(void *, struct sbuf *, size_t *);
static void __elfN(note_threadmd)(void *, struct sbuf *, size_t *);
static void __elfN(note_thrmisc)(void *, struct sbuf *, size_t *);
static void __elfN(note_ptlwpinfo)(void *, struct sbuf *, size_t *);
static void __elfN(note_procstat_auxv)(void *, struct sbuf *, size_t *);
static void __elfN(note_procstat_proc)(void *, struct sbuf *, size_t *);
static void __elfN(note_procstat_psstrings)(void *, struct sbuf *, size_t *);
static void note_procstat_files(void *, struct sbuf *, size_t *);
static void note_procstat_groups(void *, struct sbuf *, size_t *);
static void note_procstat_osrel(void *, struct sbuf *, size_t *);
static void note_procstat_rlimit(void *, struct sbuf *, size_t *);
static void note_procstat_umask(void *, struct sbuf *, size_t *);
static void note_procstat_vmmap(void *, struct sbuf *, size_t *);

#ifdef GZIO
extern int compress_user_cores_gzlevel;

/*
 * Write out a core segment to the compression stream.
 */
static int
compress_chunk(struct coredump_params *p, char *base, char *buf, u_int len)
{
	u_int chunk_len;
	int error;

	while (len > 0) {
		chunk_len = MIN(len, CORE_BUF_SIZE);

		/*
		 * We can get EFAULT error here.
		 * In that case zero out the current chunk of the segment.
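		 * The uncompressed path in core_output() handles the same
		 * situation by leaving the unreadable range implicitly
		 * zero-filled in the sparse core file.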
		 */
		error = copyin(base, buf, chunk_len);
		if (error != 0)
			bzero(buf, chunk_len);
		error = gzio_write(p->gzs, buf, chunk_len);
		if (error != 0)
			break;
		base += chunk_len;
		len -= chunk_len;
	}
	return (error);
}

static int
core_gz_write(void *base, size_t len, off_t offset, void *arg)
{

	return (core_write((struct coredump_params *)arg, base, len, offset,
	    UIO_SYSSPACE));
}
#endif /* GZIO */

static int
core_write(struct coredump_params *p, const void *base, size_t len,
    off_t offset, enum uio_seg seg)
{

	return (vn_rdwr_inchunks(UIO_WRITE, p->vp, __DECONST(void *, base),
	    len, offset, seg, IO_UNIT | IO_DIRECT | IO_RANGELOCKED,
	    p->active_cred, p->file_cred, NULL, p->td));
}

static int
core_output(void *base, size_t len, off_t offset, struct coredump_params *p,
    void *tmpbuf)
{
	int error;

#ifdef GZIO
	if (p->gzs != NULL)
		return (compress_chunk(p, base, tmpbuf, len));
#endif
	/*
	 * EFAULT is a non-fatal error that we can get, for example,
	 * if the segment is backed by a file but extends beyond its
	 * end.
	 */
	error = core_write(p, base, len, offset, UIO_USERSPACE);
	if (error == EFAULT) {
		log(LOG_WARNING, "Failed to fully fault in a core file segment "
		    "at VA %p with size 0x%zx to be written at offset 0x%jx "
		    "for process %s\n", base, len, offset, curproc->p_comm);

		/*
		 * Write a "real" zero byte at the end of the target region
		 * in the case this is the last segment.
		 * The intermediate space will be implicitly zero-filled.
		 */
		error = core_write(p, zero_region, 1, offset + len - 1,
		    UIO_SYSSPACE);
	}
	return (error);
}

/*
 * Drain into a core file.
 */
static int
sbuf_drain_core_output(void *arg, const char *data, int len)
{
	struct coredump_params *p;
	int error, locked;

	p = (struct coredump_params *)arg;

	/*
	 * Some kern_proc out routines that print to this sbuf may
	 * call us with the process lock held.  Draining with the
	 * non-sleepable lock held is unsafe.  The lock is needed for
	 * those routines when dumping a live process.  In our case we
	 * can safely release the lock before draining and acquire
	 * it again after.
	 */
	locked = PROC_LOCKED(p->td->td_proc);
	if (locked)
		PROC_UNLOCK(p->td->td_proc);
#ifdef GZIO
	if (p->gzs != NULL)
		error = gzio_write(p->gzs, __DECONST(char *, data), len);
	else
#endif
		error = core_write(p, __DECONST(void *, data), len,
		    p->offset, UIO_SYSSPACE);
	if (locked)
		PROC_LOCK(p->td->td_proc);
	if (error != 0)
		return (-error);
	p->offset += len;
	return (len);
}

/*
 * Drain into a counter.
 */
static int
sbuf_drain_count(void *arg, const char *data __unused, int len)
{
	size_t *sizep;

	sizep = (size_t *)arg;
	*sizep += len;
	return (len);
}

int
__elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags)
{
	struct ucred *cred = td->td_ucred;
	int error = 0;
	struct sseg_closure seginfo;
	struct note_info_list notelst;
	struct coredump_params params;
	struct note_info *ninfo;
	void *hdr, *tmpbuf;
	size_t hdrsize, notesz, coresize;
#ifdef GZIO
	boolean_t compress;

	compress = (flags & IMGACT_CORE_COMPRESS) != 0;
#endif
	hdr = NULL;
	tmpbuf = NULL;
	TAILQ_INIT(&notelst);

	/* Size the program segments. */
	seginfo.count = 0;
	seginfo.size = 0;
	each_dumpable_segment(td, cb_size_segment, &seginfo);

	/*
	 * Collect info about the core file header area.
	 */
	hdrsize = sizeof(Elf_Ehdr) + sizeof(Elf_Phdr) * (1 + seginfo.count);
	if (seginfo.count + 1 >= PN_XNUM)
		hdrsize += sizeof(Elf_Shdr);
	__elfN(prepare_notes)(td, &notelst, &notesz);
	coresize = round_page(hdrsize + notesz) + seginfo.size;

	/* Set up core dump parameters.
	 */
	params.offset = 0;
	params.active_cred = cred;
	params.file_cred = NOCRED;
	params.td = td;
	params.vp = vp;
	params.gzs = NULL;

#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(td->td_proc);
		error = racct_add(td->td_proc, RACCT_CORE, coresize);
		PROC_UNLOCK(td->td_proc);
		if (error != 0) {
			error = EFAULT;
			goto done;
		}
	}
#endif
	if (coresize >= limit) {
		error = EFAULT;
		goto done;
	}

#ifdef GZIO
	/* Create a compression stream if necessary. */
	if (compress) {
		params.gzs = gzio_init(core_gz_write, GZIO_DEFLATE,
		    CORE_BUF_SIZE, compress_user_cores_gzlevel, &params);
		if (params.gzs == NULL) {
			error = EFAULT;
			goto done;
		}
		tmpbuf = malloc(CORE_BUF_SIZE, M_TEMP, M_WAITOK | M_ZERO);
	}
#endif

	/*
	 * Allocate memory for building the header, fill it up,
	 * and write it out following the notes.
	 */
	hdr = malloc(hdrsize, M_TEMP, M_WAITOK);
	error = __elfN(corehdr)(&params, seginfo.count, hdr, hdrsize, &notelst,
	    notesz);

	/* Write the contents of all of the writable segments. */
	if (error == 0) {
		Elf_Phdr *php;
		off_t offset;
		int i;

		php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1;
		offset = round_page(hdrsize + notesz);
		for (i = 0; i < seginfo.count; i++) {
			error = core_output((caddr_t)(uintptr_t)php->p_vaddr,
			    php->p_filesz, offset, &params, tmpbuf);
			if (error != 0)
				break;
			offset += php->p_filesz;
			php++;
		}
#ifdef GZIO
		if (error == 0 && compress)
			error = gzio_flush(params.gzs);
#endif
	}
	if (error) {
		log(LOG_WARNING,
		    "Failed to write core file for process %s (error %d)\n",
		    curproc->p_comm, error);
	}

done:
#ifdef GZIO
	if (compress) {
		free(tmpbuf, M_TEMP);
		if (params.gzs != NULL)
			gzio_fini(params.gzs);
	}
#endif
	while ((ninfo = TAILQ_FIRST(&notelst)) != NULL) {
		TAILQ_REMOVE(&notelst, ninfo, link);
		free(ninfo, M_TEMP);
	}
	if (hdr != NULL)
		free(hdr, M_TEMP);

	return (error);
}

/*
 * A callback for each_dumpable_segment() to write out the segment's
 * program header entry.
 */
static void
cb_put_phdr(vm_map_entry_t entry, void *closure)
{
	struct phdr_closure *phc = (struct phdr_closure *)closure;
	Elf_Phdr *phdr = phc->phdr;

	phc->offset = round_page(phc->offset);

	phdr->p_type = PT_LOAD;
	phdr->p_offset = phc->offset;
	phdr->p_vaddr = entry->start;
	phdr->p_paddr = 0;
	phdr->p_filesz = phdr->p_memsz = entry->end - entry->start;
	phdr->p_align = PAGE_SIZE;
	phdr->p_flags = __elfN(untrans_prot)(entry->protection);

	phc->offset += phdr->p_filesz;
	phc->phdr++;
}

/*
 * A callback for each_dumpable_segment() to gather information about
 * the number of segments and their total size.
 */
static void
cb_size_segment(vm_map_entry_t entry, void *closure)
{
	struct sseg_closure *ssc = (struct sseg_closure *)closure;

	ssc->count++;
	ssc->size += entry->end - entry->start;
}

/*
 * For each writable segment in the process's memory map, call the given
 * function with a pointer to the map entry and some arbitrary
 * caller-supplied data.
 */
static void
each_dumpable_segment(struct thread *td, segment_callback func, void *closure)
{
	struct proc *p = td->td_proc;
	vm_map_t map = &p->p_vmspace->vm_map;
	vm_map_entry_t entry;
	vm_object_t backing_object, object;
	boolean_t ignore_entry;

	vm_map_lock_read(map);
	for (entry = map->header.next; entry != &map->header;
	    entry = entry->next) {
		/*
		 * Don't dump inaccessible mappings, deal with legacy
		 * coredump mode.
		 *
		 * Note that read-only segments related to the elf binary
		 * are marked MAP_ENTRY_NOCOREDUMP now so we no longer
		 * need to arbitrarily ignore such segments.
		 */
		if (elf_legacy_coredump) {
			if ((entry->protection & VM_PROT_RW) != VM_PROT_RW)
				continue;
		} else {
			if ((entry->protection & VM_PROT_ALL) == 0)
				continue;
		}

		/*
		 * Don't include memory segments in the coredump if
		 * MAP_NOCORE is set in mmap(2) or MADV_NOCORE in
		 * madvise(2).  Do not dump submaps (i.e. parts of the
		 * kernel map).
		 */
		if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP))
			continue;

		if ((object = entry->object.vm_object) == NULL)
			continue;

		/* Ignore memory-mapped devices and such things. */
		VM_OBJECT_RLOCK(object);
		while ((backing_object = object->backing_object) != NULL) {
			VM_OBJECT_RLOCK(backing_object);
			VM_OBJECT_RUNLOCK(object);
			object = backing_object;
		}
		ignore_entry = object->type != OBJT_DEFAULT &&
		    object->type != OBJT_SWAP && object->type != OBJT_VNODE &&
		    object->type != OBJT_PHYS;
		VM_OBJECT_RUNLOCK(object);
		if (ignore_entry)
			continue;

		(*func)(entry, closure);
	}
	vm_map_unlock_read(map);
}

/*
 * Write the core file header to the file, including padding up to
 * the page boundary.
 */
static int
__elfN(corehdr)(struct coredump_params *p, int numsegs, void *hdr,
    size_t hdrsize, struct note_info_list *notelst, size_t notesz)
{
	struct note_info *ninfo;
	struct sbuf *sb;
	int error;

	/* Fill in the header. */
	bzero(hdr, hdrsize);
	__elfN(puthdr)(p->td, hdr, hdrsize, numsegs, notesz);

	sb = sbuf_new(NULL, NULL, CORE_BUF_SIZE, SBUF_FIXEDLEN);
	sbuf_set_drain(sb, sbuf_drain_core_output, p);
	sbuf_start_section(sb, NULL);
	sbuf_bcat(sb, hdr, hdrsize);
	TAILQ_FOREACH(ninfo, notelst, link)
		__elfN(putnote)(ninfo, sb);
	/* Align up to a page boundary for the program segments. */
	sbuf_end_section(sb, -1, PAGE_SIZE, 0);
	error = sbuf_finish(sb);
	sbuf_delete(sb);

	return (error);
}

static void
__elfN(prepare_notes)(struct thread *td, struct note_info_list *list,
    size_t *sizep)
{
	struct proc *p;
	struct thread *thr;
	size_t size;

	p = td->td_proc;
	size = 0;

	size += register_note(list, NT_PRPSINFO, __elfN(note_prpsinfo), p);

	/*
	 * To have the debugger select the right thread (LWP) as the initial
	 * thread, we dump the state of the thread passed to us in td first.
	 * This is the thread that causes the core dump and thus likely to
	 * be the right thread one wants to have selected in the debugger.
	 */
	thr = td;
	while (thr != NULL) {
		size += register_note(list, NT_PRSTATUS,
		    __elfN(note_prstatus), thr);
		size += register_note(list, NT_FPREGSET,
		    __elfN(note_fpregset), thr);
		size += register_note(list, NT_THRMISC,
		    __elfN(note_thrmisc), thr);
		size += register_note(list, NT_PTLWPINFO,
		    __elfN(note_ptlwpinfo), thr);
		size += register_note(list, -1,
		    __elfN(note_threadmd), thr);

		thr = (thr == td) ? TAILQ_FIRST(&p->p_threads) :
		    TAILQ_NEXT(thr, td_plist);
		if (thr == td)
			thr = TAILQ_NEXT(thr, td_plist);
	}

	size += register_note(list, NT_PROCSTAT_PROC,
	    __elfN(note_procstat_proc), p);
	size += register_note(list, NT_PROCSTAT_FILES,
	    note_procstat_files, p);
	size += register_note(list, NT_PROCSTAT_VMMAP,
	    note_procstat_vmmap, p);
	size += register_note(list, NT_PROCSTAT_GROUPS,
	    note_procstat_groups, p);
	size += register_note(list, NT_PROCSTAT_UMASK,
	    note_procstat_umask, p);
	size += register_note(list, NT_PROCSTAT_RLIMIT,
	    note_procstat_rlimit, p);
	size += register_note(list, NT_PROCSTAT_OSREL,
	    note_procstat_osrel, p);
	size += register_note(list, NT_PROCSTAT_PSSTRINGS,
	    __elfN(note_procstat_psstrings), p);
	size += register_note(list, NT_PROCSTAT_AUXV,
	    __elfN(note_procstat_auxv), p);

	*sizep = size;
}

static void
__elfN(puthdr)(struct thread *td, void *hdr, size_t hdrsize, int numsegs,
    size_t notesz)
{
	Elf_Ehdr *ehdr;
	Elf_Phdr *phdr;
	Elf_Shdr *shdr;
	struct phdr_closure phc;

	ehdr = (Elf_Ehdr *)hdr;

	ehdr->e_ident[EI_MAG0] = ELFMAG0;
	ehdr->e_ident[EI_MAG1] = ELFMAG1;
	ehdr->e_ident[EI_MAG2] = ELFMAG2;
	ehdr->e_ident[EI_MAG3] = ELFMAG3;
	ehdr->e_ident[EI_CLASS] = ELF_CLASS;
	ehdr->e_ident[EI_DATA] = ELF_DATA;
	ehdr->e_ident[EI_VERSION] = EV_CURRENT;
	ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD;
	ehdr->e_ident[EI_ABIVERSION] = 0;
	ehdr->e_ident[EI_PAD] = 0;
	ehdr->e_type = ET_CORE;
	ehdr->e_machine = td->td_proc->p_elf_machine;
	ehdr->e_version = EV_CURRENT;
	ehdr->e_entry = 0;
	ehdr->e_phoff = sizeof(Elf_Ehdr);
	ehdr->e_flags = td->td_proc->p_elf_flags;
	ehdr->e_ehsize = sizeof(Elf_Ehdr);
	ehdr->e_phentsize = sizeof(Elf_Phdr);
	ehdr->e_shentsize = sizeof(Elf_Shdr);
	ehdr->e_shstrndx = SHN_UNDEF;
	if (numsegs + 1 < PN_XNUM) {
		ehdr->e_phnum = numsegs + 1;
		ehdr->e_shnum = 0;
	} else {
		ehdr->e_phnum = PN_XNUM;
		ehdr->e_shnum = 1;

		ehdr->e_shoff = ehdr->e_phoff +
		    (numsegs + 1) * ehdr->e_phentsize;
		KASSERT(ehdr->e_shoff == hdrsize - sizeof(Elf_Shdr),
		    ("e_shoff: %zu, hdrsize - shdr: %zu",
		     (size_t)ehdr->e_shoff, hdrsize - sizeof(Elf_Shdr)));

		shdr = (Elf_Shdr *)((char *)hdr + ehdr->e_shoff);
		memset(shdr, 0, sizeof(*shdr));
		/*
		 * A special first section is used to hold large segment and
		 * section counts.  This was proposed by Sun Microsystems in
		 * Solaris and has been adopted by Linux; the standard ELF
		 * tools are already familiar with the technique.
		 *
		 * See table 7-7 of the Solaris "Linker and Libraries Guide"
		 * (or 12-7 depending on the version of the document) for more
		 * details.
		 */
		shdr->sh_type = SHT_NULL;
		shdr->sh_size = ehdr->e_shnum;
		shdr->sh_link = ehdr->e_shstrndx;
		shdr->sh_info = numsegs + 1;
	}

	/*
	 * Fill in the program header entries.
	 */
	phdr = (Elf_Phdr *)((char *)hdr + ehdr->e_phoff);

	/* The note segment. */
	phdr->p_type = PT_NOTE;
	phdr->p_offset = hdrsize;
	phdr->p_vaddr = 0;
	phdr->p_paddr = 0;
	phdr->p_filesz = notesz;
	phdr->p_memsz = 0;
	phdr->p_flags = PF_R;
	phdr->p_align = ELF_NOTE_ROUNDSIZE;
	phdr++;

	/*
	 * All the writable segments from the program.
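	 * Each gets one PT_LOAD entry, with file offsets assigned
	 * sequentially starting at the first page boundary past the notes.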
	 */
	phc.phdr = phdr;
	phc.offset = round_page(hdrsize + notesz);
	each_dumpable_segment(td, cb_put_phdr, &phc);
}

static size_t
register_note(struct note_info_list *list, int type, outfunc_t out, void *arg)
{
	struct note_info *ninfo;
	size_t size, notesize;

	size = 0;
	out(arg, NULL, &size);
	ninfo = malloc(sizeof(*ninfo), M_TEMP, M_ZERO | M_WAITOK);
	ninfo->type = type;
	ninfo->outfunc = out;
	ninfo->outarg = arg;
	ninfo->outsize = size;
	TAILQ_INSERT_TAIL(list, ninfo, link);

	if (type == -1)
		return (size);

	notesize = sizeof(Elf_Note) +		/* note header */
	    roundup2(sizeof(FREEBSD_ABI_VENDOR), ELF_NOTE_ROUNDSIZE) +
						/* note name */
	    roundup2(size, ELF_NOTE_ROUNDSIZE);	/* note description */

	return (notesize);
}

static size_t
append_note_data(const void *src, void *dst, size_t len)
{
	size_t padded_len;

	padded_len = roundup2(len, ELF_NOTE_ROUNDSIZE);
	if (dst != NULL) {
		bcopy(src, dst, len);
		bzero((char *)dst + len, padded_len - len);
	}
	return (padded_len);
}

size_t
__elfN(populate_note)(int type, void *src, void *dst, size_t size,
    void **descp)
{
	Elf_Note *note;
	char *buf;
	size_t notesize;

	buf = dst;
	if (buf != NULL) {
		note = (Elf_Note *)buf;
		note->n_namesz = sizeof(FREEBSD_ABI_VENDOR);
		note->n_descsz = size;
		note->n_type = type;
		buf += sizeof(*note);
		buf += append_note_data(FREEBSD_ABI_VENDOR, buf,
		    sizeof(FREEBSD_ABI_VENDOR));
		append_note_data(src, buf, size);
		if (descp != NULL)
			*descp = buf;
	}

	notesize = sizeof(Elf_Note) +		/* note header */
	    roundup2(sizeof(FREEBSD_ABI_VENDOR), ELF_NOTE_ROUNDSIZE) +
						/* note name */
	    roundup2(size, ELF_NOTE_ROUNDSIZE);	/* note description */

	return (notesize);
}

static void
__elfN(putnote)(struct note_info *ninfo, struct sbuf *sb)
{
	Elf_Note note;
	ssize_t old_len, sect_len;
	size_t new_len, descsz, i;

	if (ninfo->type == -1) {
		ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize);
		return;
	}

	note.n_namesz = sizeof(FREEBSD_ABI_VENDOR);
	note.n_descsz = ninfo->outsize;
	note.n_type = ninfo->type;

	sbuf_bcat(sb, &note, sizeof(note));
	sbuf_start_section(sb, &old_len);
	sbuf_bcat(sb, FREEBSD_ABI_VENDOR, sizeof(FREEBSD_ABI_VENDOR));
	sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0);
	if (note.n_descsz == 0)
		return;
	sbuf_start_section(sb, &old_len);
	ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize);
	sect_len = sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0);
	if (sect_len < 0)
		return;

	new_len = (size_t)sect_len;
	descsz = roundup(note.n_descsz, ELF_NOTE_ROUNDSIZE);
	if (new_len < descsz) {
		/*
		 * It is expected that individual note emitters will correctly
		 * predict their expected output size and fill up to that size
		 * themselves, padding in a format-specific way if needed.
		 * However, in case they don't, just do it here with zeros.
		 */
		for (i = 0; i < descsz - new_len; i++)
			sbuf_putc(sb, 0);
	} else if (new_len > descsz) {
		/*
		 * We can't always truncate sb -- we may have drained some
		 * of it already.
		 */
		KASSERT(new_len == descsz, ("%s: Note type %u changed as we "
		    "read it (%zu > %zu).  Since it is longer than "
		    "expected, this coredump's notes are corrupt.  THIS "
		    "IS A BUG in the note_procstat routine for type %u.\n",
		    __func__, (unsigned)note.n_type, new_len, descsz,
		    (unsigned)note.n_type));
	}
}

/*
 * Miscellaneous note out functions.
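 *
 * Each out function is invoked twice: first with sb == NULL to report
 * its size through *sizep, then with a real sbuf to emit the payload.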
*/ #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 #include #include typedef struct prstatus32 elf_prstatus_t; typedef struct prpsinfo32 elf_prpsinfo_t; typedef struct fpreg32 elf_prfpregset_t; typedef struct fpreg32 elf_fpregset_t; typedef struct reg32 elf_gregset_t; typedef struct thrmisc32 elf_thrmisc_t; #define ELF_KERN_PROC_MASK KERN_PROC_MASK32 typedef struct kinfo_proc32 elf_kinfo_proc_t; typedef uint32_t elf_ps_strings_t; #else typedef prstatus_t elf_prstatus_t; typedef prpsinfo_t elf_prpsinfo_t; typedef prfpregset_t elf_prfpregset_t; typedef prfpregset_t elf_fpregset_t; typedef gregset_t elf_gregset_t; typedef thrmisc_t elf_thrmisc_t; #define ELF_KERN_PROC_MASK 0 typedef struct kinfo_proc elf_kinfo_proc_t; typedef vm_offset_t elf_ps_strings_t; #endif static void __elfN(note_prpsinfo)(void *arg, struct sbuf *sb, size_t *sizep) { struct sbuf sbarg; size_t len; char *cp, *end; struct proc *p; elf_prpsinfo_t *psinfo; int error; p = (struct proc *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(*psinfo), ("invalid size")); psinfo = malloc(sizeof(*psinfo), M_TEMP, M_ZERO | M_WAITOK); psinfo->pr_version = PRPSINFO_VERSION; psinfo->pr_psinfosz = sizeof(elf_prpsinfo_t); strlcpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname)); PROC_LOCK(p); if (p->p_args != NULL) { len = sizeof(psinfo->pr_psargs) - 1; if (len > p->p_args->ar_length) len = p->p_args->ar_length; memcpy(psinfo->pr_psargs, p->p_args->ar_args, len); PROC_UNLOCK(p); error = 0; } else { _PHOLD(p); PROC_UNLOCK(p); sbuf_new(&sbarg, psinfo->pr_psargs, sizeof(psinfo->pr_psargs), SBUF_FIXEDLEN); error = proc_getargv(curthread, p, &sbarg); PRELE(p); if (sbuf_finish(&sbarg) == 0) len = sbuf_len(&sbarg) - 1; else len = sizeof(psinfo->pr_psargs) - 1; sbuf_delete(&sbarg); } if (error || len == 0) strlcpy(psinfo->pr_psargs, p->p_comm, sizeof(psinfo->pr_psargs)); else { KASSERT(len < sizeof(psinfo->pr_psargs), ("len is too long: %zu vs %zu", len, sizeof(psinfo->pr_psargs))); cp = psinfo->pr_psargs; end = cp + len - 1; for (;;) { cp = memchr(cp, '\0', end - cp); if (cp == NULL) break; *cp = ' '; } } psinfo->pr_pid = p->p_pid; sbuf_bcat(sb, psinfo, sizeof(*psinfo)); free(psinfo, M_TEMP); } *sizep = sizeof(*psinfo); } static void __elfN(note_prstatus)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; elf_prstatus_t *status; td = (struct thread *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(*status), ("invalid size")); status = malloc(sizeof(*status), M_TEMP, M_ZERO | M_WAITOK); status->pr_version = PRSTATUS_VERSION; status->pr_statussz = sizeof(elf_prstatus_t); status->pr_gregsetsz = sizeof(elf_gregset_t); status->pr_fpregsetsz = sizeof(elf_fpregset_t); status->pr_osreldate = osreldate; status->pr_cursig = td->td_proc->p_sig; status->pr_pid = td->td_tid; #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 fill_regs32(td, &status->pr_reg); #else fill_regs(td, &status->pr_reg); #endif sbuf_bcat(sb, status, sizeof(*status)); free(status, M_TEMP); } *sizep = sizeof(*status); } static void __elfN(note_fpregset)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; elf_prfpregset_t *fpregset; td = (struct thread *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(*fpregset), ("invalid size")); fpregset = malloc(sizeof(*fpregset), M_TEMP, M_ZERO | M_WAITOK); #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 fill_fpregs32(td, fpregset); #else fill_fpregs(td, fpregset); #endif sbuf_bcat(sb, fpregset, sizeof(*fpregset)); free(fpregset, M_TEMP); } *sizep = sizeof(*fpregset); } static void 
__elfN(note_thrmisc)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; elf_thrmisc_t thrmisc; td = (struct thread *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(thrmisc), ("invalid size")); bzero(&thrmisc._pad, sizeof(thrmisc._pad)); strcpy(thrmisc.pr_tname, td->td_name); sbuf_bcat(sb, &thrmisc, sizeof(thrmisc)); } *sizep = sizeof(thrmisc); } static void __elfN(note_ptlwpinfo)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; size_t size; int structsize; #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 struct ptrace_lwpinfo32 pl; #else struct ptrace_lwpinfo pl; #endif td = (struct thread *)arg; size = sizeof(structsize) + sizeof(pl); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(pl); sbuf_bcat(sb, &structsize, sizeof(structsize)); bzero(&pl, sizeof(pl)); pl.pl_lwpid = td->td_tid; pl.pl_event = PL_EVENT_NONE; pl.pl_sigmask = td->td_sigmask; pl.pl_siglist = td->td_siglist; if (td->td_si.si_signo != 0) { pl.pl_event = PL_EVENT_SIGNAL; pl.pl_flags |= PL_FLAG_SI; #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 siginfo_to_siginfo32(&td->td_si, &pl.pl_siginfo); #else pl.pl_siginfo = td->td_si; #endif } strcpy(pl.pl_tdname, td->td_name); /* XXX TODO: supply more information in struct ptrace_lwpinfo*/ sbuf_bcat(sb, &pl, sizeof(pl)); } *sizep = size; } /* * Allow for MD specific notes, as well as any MD * specific preparations for writing MI notes. */ static void __elfN(note_threadmd)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; void *buf; size_t size; td = (struct thread *)arg; size = *sizep; if (size != 0 && sb != NULL) buf = malloc(size, M_TEMP, M_ZERO | M_WAITOK); else buf = NULL; size = 0; __elfN(dump_thread)(td, buf, &size); KASSERT(sb == NULL || *sizep == size, ("invalid size")); if (size != 0 && sb != NULL) sbuf_bcat(sb, buf, size); free(buf, M_TEMP); *sizep = size; } #ifdef KINFO_PROC_SIZE CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE); #endif static void __elfN(note_procstat_proc)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + p->p_numthreads * sizeof(elf_kinfo_proc_t); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(elf_kinfo_proc_t); sbuf_bcat(sb, &structsize, sizeof(structsize)); sx_slock(&proctree_lock); PROC_LOCK(p); kern_proc_out(p, sb, ELF_KERN_PROC_MASK); sx_sunlock(&proctree_lock); } *sizep = size; } #ifdef KINFO_FILE_SIZE CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); #endif static void note_procstat_files(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size, sect_sz, i; ssize_t start_len, sect_len; int structsize, filedesc_flags; if (coredump_pack_fileinfo) filedesc_flags = KERN_FILEDESC_PACK_KINFO; else filedesc_flags = 0; p = (struct proc *)arg; structsize = sizeof(struct kinfo_file); if (sb == NULL) { size = 0; sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_drain_count, &size); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_filedesc_out(p, sb, -1, filedesc_flags); sbuf_finish(sb); sbuf_delete(sb); *sizep = size; } else { sbuf_start_section(sb, &start_len); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_filedesc_out(p, sb, *sizep - sizeof(structsize), filedesc_flags); sect_len = sbuf_end_section(sb, start_len, 0, 0); if (sect_len < 0) return; sect_sz = sect_len; KASSERT(sect_sz <= *sizep, ("kern_proc_filedesc_out did not respect maxlen; " 
"requested %zu, got %zu", *sizep - sizeof(structsize), sect_sz - sizeof(structsize))); for (i = 0; i < *sizep - sect_sz && sb->s_error == 0; i++) sbuf_putc(sb, 0); } } #ifdef KINFO_VMENTRY_SIZE CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE); #endif static void note_procstat_vmmap(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize, vmmap_flags; if (coredump_pack_vmmapinfo) vmmap_flags = KERN_VMMAP_PACK_KINFO; else vmmap_flags = 0; p = (struct proc *)arg; structsize = sizeof(struct kinfo_vmentry); if (sb == NULL) { size = 0; sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_drain_count, &size); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_vmmap_out(p, sb, -1, vmmap_flags); sbuf_finish(sb); sbuf_delete(sb); *sizep = size; } else { sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_vmmap_out(p, sb, *sizep - sizeof(structsize), vmmap_flags); } } static void note_procstat_groups(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + p->p_ucred->cr_ngroups * sizeof(gid_t); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(gid_t); sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, p->p_ucred->cr_groups, p->p_ucred->cr_ngroups * sizeof(gid_t)); } *sizep = size; } static void note_procstat_umask(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(p->p_fd->fd_cmask); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(p->p_fd->fd_cmask); sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, &p->p_fd->fd_cmask, sizeof(p->p_fd->fd_cmask)); } *sizep = size; } static void note_procstat_rlimit(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; struct rlimit rlim[RLIM_NLIMITS]; size_t size; int structsize, i; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(rlim); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(rlim); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); for (i = 0; i < RLIM_NLIMITS; i++) lim_rlimit_proc(p, i, &rlim[i]); PROC_UNLOCK(p); sbuf_bcat(sb, rlim, sizeof(rlim)); } *sizep = size; } static void note_procstat_osrel(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(p->p_osrel); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(p->p_osrel); sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, &p->p_osrel, sizeof(p->p_osrel)); } *sizep = size; } static void __elfN(note_procstat_psstrings)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; elf_ps_strings_t ps_strings; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(ps_strings); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(ps_strings); #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 ps_strings = PTROUT(p->p_sysent->sv_psstrings); #else ps_strings = p->p_sysent->sv_psstrings; #endif sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, &ps_strings, sizeof(ps_strings)); } *sizep = size; } static void __elfN(note_procstat_auxv)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; if (sb == NULL) { size = 0; sb 
= sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_drain_count, &size); sbuf_bcat(sb, &structsize, sizeof(structsize)); PHOLD(p); proc_getauxv(curthread, p, sb); PRELE(p); sbuf_finish(sb); sbuf_delete(sb); *sizep = size; } else { structsize = sizeof(Elf_Auxinfo); sbuf_bcat(sb, &structsize, sizeof(structsize)); PHOLD(p); proc_getauxv(curthread, p, sb); PRELE(p); } } static boolean_t __elfN(parse_notes)(struct image_params *imgp, Elf_Brandnote *checknote, int32_t *osrel, const Elf_Phdr *pnote) { const Elf_Note *note, *note0, *note_end; const char *note_name; char *buf; int i, error; boolean_t res; /* We need some limit, might as well use PAGE_SIZE. */ if (pnote == NULL || pnote->p_filesz > PAGE_SIZE) return (FALSE); ASSERT_VOP_LOCKED(imgp->vp, "parse_notes"); if (pnote->p_offset > PAGE_SIZE || pnote->p_filesz > PAGE_SIZE - pnote->p_offset) { VOP_UNLOCK(imgp->vp, 0); buf = malloc(pnote->p_filesz, M_TEMP, M_WAITOK); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); error = vn_rdwr(UIO_READ, imgp->vp, buf, pnote->p_filesz, pnote->p_offset, UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred, NOCRED, NULL, curthread); if (error != 0) { uprintf("i/o error PT_NOTE\n"); res = FALSE; goto ret; } note = note0 = (const Elf_Note *)buf; note_end = (const Elf_Note *)(buf + pnote->p_filesz); } else { note = note0 = (const Elf_Note *)(imgp->image_header + pnote->p_offset); note_end = (const Elf_Note *)(imgp->image_header + pnote->p_offset + pnote->p_filesz); buf = NULL; } for (i = 0; i < 100 && note >= note0 && note < note_end; i++) { if (!aligned(note, Elf32_Addr) || (const char *)note_end - (const char *)note < sizeof(Elf_Note)) { res = FALSE; goto ret; } if (note->n_namesz != checknote->hdr.n_namesz || note->n_descsz != checknote->hdr.n_descsz || note->n_type != checknote->hdr.n_type) goto nextnote; note_name = (const char *)(note + 1); if (note_name + checknote->hdr.n_namesz >= (const char *)note_end || strncmp(checknote->vendor, note_name, checknote->hdr.n_namesz) != 0) goto nextnote; /* * Fetch the osreldate for binary * from the ELF OSABI-note if necessary. */ if ((checknote->flags & BN_TRANSLATE_OSREL) != 0 && checknote->trans_osrel != NULL) { res = checknote->trans_osrel(note, osrel); goto ret; } res = TRUE; goto ret; nextnote: note = (const Elf_Note *)((const char *)(note + 1) + roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE) + roundup2(note->n_descsz, ELF_NOTE_ROUNDSIZE)); } res = FALSE; ret: free(buf, M_TEMP); return (res); } /* * Try to find the appropriate ABI-note section for checknote, * fetch the osreldate for binary from the ELF OSABI-note. Only the * first page of the image is searched, the same as for headers. */ static boolean_t __elfN(check_note)(struct image_params *imgp, Elf_Brandnote *checknote, int32_t *osrel) { const Elf_Phdr *phdr; const Elf_Ehdr *hdr; int i; hdr = (const Elf_Ehdr *)imgp->image_header; phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); for (i = 0; i < hdr->e_phnum; i++) { if (phdr[i].p_type == PT_NOTE && __elfN(parse_notes)(imgp, checknote, osrel, &phdr[i])) return (TRUE); } return (FALSE); } /* * Tell kern_execve.c about it, with a little help from the linker. 
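 *
 * EXEC_SET() generates the registration glue: when this file is
 * compiled in or loaded as a module, the execsw entry is registered
 * (via exec_register()) so that kern_execve.c probes this activator
 * along with the others.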
*/ static struct execsw __elfN(execsw) = { __CONCAT(exec_, __elfN(imgact)), __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) }; EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw)); static vm_prot_t __elfN(trans_prot)(Elf_Word flags) { vm_prot_t prot; prot = 0; if (flags & PF_X) prot |= VM_PROT_EXECUTE; if (flags & PF_W) prot |= VM_PROT_WRITE; if (flags & PF_R) prot |= VM_PROT_READ; #if __ELF_WORD_SIZE == 32 #if defined(__amd64__) if (i386_read_exec && (flags & PF_R)) prot |= VM_PROT_EXECUTE; #endif #endif return (prot); } static Elf_Word __elfN(untrans_prot)(vm_prot_t prot) { Elf_Word flags; flags = 0; if (prot & VM_PROT_EXECUTE) flags |= PF_X; if (prot & VM_PROT_READ) flags |= PF_R; if (prot & VM_PROT_WRITE) flags |= PF_W; return (flags); } Index: head/sys/kern/imgact_elf32.c =================================================================== --- head/sys/kern/imgact_elf32.c (revision 326270) +++ head/sys/kern/imgact_elf32.c (revision 326271) @@ -1,31 +1,33 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #define __ELF_WORD_SIZE 32 #include Index: head/sys/kern/imgact_elf64.c =================================================================== --- head/sys/kern/imgact_elf64.c (revision 326270) +++ head/sys/kern/imgact_elf64.c (revision 326271) @@ -1,31 +1,33 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #define __ELF_WORD_SIZE 64 #include Index: head/sys/kern/imgact_shell.c =================================================================== --- head/sys/kern/imgact_shell.c (revision 326270) +++ head/sys/kern/imgact_shell.c (revision 326271) @@ -1,257 +1,259 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #if BYTE_ORDER == LITTLE_ENDIAN #define SHELLMAGIC 0x2123 /* #! */ #else #define SHELLMAGIC 0x2321 #endif /* * At the time of this writing, MAXSHELLCMDLEN == PAGE_SIZE. This is * significant because the caller has only mapped in one page of the * file we're reading. */ #if MAXSHELLCMDLEN > PAGE_SIZE #error "MAXSHELLCMDLEN is larger than a single page!" #endif /* * MAXSHELLCMDLEN must be at least MAXINTERP plus the size of the `#!' * prefix and terminating newline. */ CTASSERT(MAXSHELLCMDLEN >= MAXINTERP + 3); /** * Shell interpreter image activator. An interpreter name beginning at * imgp->args->begin_argv is the minimal successful exit requirement. * * If the given file is a shell-script, then the first line will start * with the two characters `#!' (aka SHELLMAGIC), followed by the name * of the shell-interpreter to run, followed by zero or more tokens. * * The interpreter is then started up such that it will see: * arg[0] -> The name of interpreter as specified after `#!' in the * first line of the script. The interpreter name must * not be longer than MAXSHELLCMDLEN bytes. * arg[1] -> *If* there are any additional tokens on the first line, * then we add a new arg[1], which is a copy of the rest of * that line. 
The copy starts at the first token after the * interpreter name. We leave it to the interpreter to * parse the tokens in that value. * arg[x] -> the full pathname of the script. This will either be * arg[2] or arg[1], depending on whether or not tokens * were found after the interpreter name. * arg[x+1] -> all the arguments that were specified on the original * command line. * * This processing is described in the execve(2) man page. */ /* * HISTORICAL NOTE: From 1993 to mid-2005, FreeBSD parsed out the tokens as * found on the first line of the script, and set up each token as a separate * value in arg[]. This extra processing did not match the behavior of other * OSes, and caused a few subtle problems. For one, it meant the kernel was * deciding how those values should be parsed (wrt characters for quoting or * comments, etc), while the interpreter might have other rules for parsing. * It also meant the interpreter had no way of knowing which arguments came * from the first line of the shell script, and which arguments were specified * by the user on the command line. That extra processing was dropped in the * 6.x branch on May 28, 2005 (matching __FreeBSD_version 600029). */ int exec_shell_imgact(struct image_params *imgp) { const char *image_header = imgp->image_header; const char *ihp, *interpb, *interpe, *maxp, *optb, *opte, *fname; int error, offset; size_t length; struct vattr vattr; struct sbuf *sname; /* a shell script? */ if (((const short *)image_header)[0] != SHELLMAGIC) return (-1); /* * Don't allow a shell script to be the shell for a shell * script. :-) */ if (imgp->interpreted & IMGACT_SHELL) return (ENOEXEC); imgp->interpreted |= IMGACT_SHELL; /* * At this point we have the first page of the file mapped. * However, we don't know how far into the page the contents are * valid -- the actual file might be much shorter than the page. * So find out the file size. */ error = VOP_GETATTR(imgp->vp, &vattr, imgp->proc->p_ucred); if (error) return (error); /* * Copy shell name and arguments from image_header into a string * buffer. */ maxp = &image_header[MIN(vattr.va_size, MAXSHELLCMDLEN)]; ihp = &image_header[2]; /* * Find the beginning and end of the interpreter_name. If the * line does not include any interpreter, or if the name which * was found is too long, we bail out. */ while (ihp < maxp && ((*ihp == ' ') || (*ihp == '\t'))) ihp++; interpb = ihp; while (ihp < maxp && ((*ihp != ' ') && (*ihp != '\t') && (*ihp != '\n') && (*ihp != '\0'))) ihp++; interpe = ihp; if (interpb == interpe) return (ENOEXEC); if (interpe - interpb >= MAXINTERP) return (ENAMETOOLONG); /* * Find the beginning of the options (if any), and the end-of-line. * Then trim the trailing blanks off the value. Note that some * other operating systems do *not* trim the trailing whitespace... */ while (ihp < maxp && ((*ihp == ' ') || (*ihp == '\t'))) ihp++; optb = ihp; while (ihp < maxp && ((*ihp != '\n') && (*ihp != '\0'))) ihp++; opte = ihp; if (opte == maxp) return (ENOEXEC); while (--ihp > optb && ((*ihp == ' ') || (*ihp == '\t'))) opte = ihp; if (imgp->args->fname != NULL) { fname = imgp->args->fname; sname = NULL; } else { sname = sbuf_new_auto(); sbuf_printf(sname, "/dev/fd/%d", imgp->args->fd); sbuf_finish(sname); fname = sbuf_data(sname); } /* * We need to "pop" (remove) the present value of arg[0], and "push" * either two or three new values in the arg[] list. To do this, * we first shift all the other values in the `begin_argv' area to * provide the exact amount of room for the values added.
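 * (For instance, a script "/tmp/s.sh" whose first line is
 * "#!/bin/sh -x", run as "/tmp/s.sh foo", ends up executing with
 * arg[] = { "/bin/sh", "-x", "/tmp/s.sh", "foo" }.)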
Set up * `offset' as the number of bytes to be added to the `begin_argv' * area, and 'length' as the number of bytes being removed. */ offset = interpe - interpb + 1; /* interpreter */ if (opte > optb) /* options (if any) */ offset += opte - optb + 1; offset += strlen(fname) + 1; /* fname of script */ length = (imgp->args->argc == 0) ? 0 : strlen(imgp->args->begin_argv) + 1; /* bytes to delete */ if (offset > imgp->args->stringspace + length) { if (sname != NULL) sbuf_delete(sname); return (E2BIG); } bcopy(imgp->args->begin_argv + length, imgp->args->begin_argv + offset, imgp->args->endp - (imgp->args->begin_argv + length)); offset -= length; /* calculate actual adjustment */ imgp->args->begin_envv += offset; imgp->args->endp += offset; imgp->args->stringspace -= offset; /* * If there was no arg[0] when we started, then the interpreter_name * is adding an argument (instead of replacing the arg[0] we started * with). And we're always adding an argument when we include the * full pathname of the original script. */ if (imgp->args->argc == 0) imgp->args->argc = 1; imgp->args->argc++; /* * The original arg[] list has been shifted appropriately. Copy in * the interpreter name and options-string. */ length = interpe - interpb; bcopy(interpb, imgp->args->begin_argv, length); *(imgp->args->begin_argv + length) = '\0'; offset = length + 1; if (opte > optb) { length = opte - optb; bcopy(optb, imgp->args->begin_argv + offset, length); *(imgp->args->begin_argv + offset + length) = '\0'; offset += length + 1; imgp->args->argc++; } /* * Finally, add the filename onto the end for the interpreter to * use and copy the interpreter's name to imgp->interpreter_name * for exec to use. */ error = copystr(fname, imgp->args->begin_argv + offset, imgp->args->stringspace, NULL); if (error == 0) imgp->interpreter_name = imgp->args->begin_argv; if (sname != NULL) sbuf_delete(sname); return (error); } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw shell_execsw = { exec_shell_imgact, "#!" }; EXEC_SET(shell, shell_execsw); Index: head/sys/kern/kern_alq.c =================================================================== --- head/sys/kern/kern_alq.c (revision 326270) +++ head/sys/kern/kern_alq.c (revision 326271) @@ -1,973 +1,975 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002, Jeffrey Roberson * Copyright (c) 2008-2009, Lawrence Stewart * Copyright (c) 2009-2010, The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Async. Logging Queue */ struct alq { char *aq_entbuf; /* Buffer for stored entries */ int aq_entmax; /* Max entries */ int aq_entlen; /* Entry length */ int aq_freebytes; /* Bytes available in buffer */ int aq_buflen; /* Total length of our buffer */ int aq_writehead; /* Location for next write */ int aq_writetail; /* Flush starts at this location */ int aq_wrapearly; /* # bytes left blank at end of buf */ int aq_flags; /* Queue flags */ int aq_waiters; /* Num threads waiting for resources * NB: Used as a wait channel so must * not be first field in the alq struct */ struct ale aq_getpost; /* ALE for use by get/post */ struct mtx aq_mtx; /* Queue lock */ struct vnode *aq_vp; /* Open vnode handle */ struct ucred *aq_cred; /* Credentials of the opening thread */ LIST_ENTRY(alq) aq_act; /* List of active queues */ LIST_ENTRY(alq) aq_link; /* List of all queues */ }; #define AQ_WANTED 0x0001 /* Wakeup sleeper when io is done */ #define AQ_ACTIVE 0x0002 /* on the active list */ #define AQ_FLUSHING 0x0004 /* doing IO */ #define AQ_SHUTDOWN 0x0008 /* Queue no longer valid */ #define AQ_ORDERED 0x0010 /* Queue enforces ordered writes */ #define AQ_LEGACY 0x0020 /* Legacy queue (fixed length writes) */ #define ALQ_LOCK(alq) mtx_lock_spin(&(alq)->aq_mtx) #define ALQ_UNLOCK(alq) mtx_unlock_spin(&(alq)->aq_mtx) #define HAS_PENDING_DATA(alq) ((alq)->aq_freebytes != (alq)->aq_buflen) static MALLOC_DEFINE(M_ALD, "ALD", "ALD"); /* * The ald_mtx protects the ald_queues list and the ald_active list. */ static struct mtx ald_mtx; static LIST_HEAD(, alq) ald_queues; static LIST_HEAD(, alq) ald_active; static int ald_shutingdown = 0; struct thread *ald_thread; static struct proc *ald_proc; static eventhandler_tag alq_eventhandler_tag = NULL; #define ALD_LOCK() mtx_lock(&ald_mtx) #define ALD_UNLOCK() mtx_unlock(&ald_mtx) /* Daemon functions */ static int ald_add(struct alq *); static int ald_rem(struct alq *); static void ald_startup(void *); static void ald_daemon(void); static void ald_shutdown(void *, int); static void ald_activate(struct alq *); static void ald_deactivate(struct alq *); /* Internal queue functions */ static void alq_shutdown(struct alq *); static void alq_destroy(struct alq *); static int alq_doio(struct alq *); /* * Add a new queue to the global list. Fail if we're shutting down. */ static int ald_add(struct alq *alq) { int error; error = 0; ALD_LOCK(); if (ald_shutingdown) { error = EBUSY; goto done; } LIST_INSERT_HEAD(&ald_queues, alq, aq_link); done: ALD_UNLOCK(); return (error); } /* * Remove a queue from the global list unless we're shutting down. If so, * the ald will take care of cleaning up its resources.
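 *
 * Callers treat a non-zero return as "hands off": alq_close() below
 * only calls alq_destroy() when ald_rem() returns 0, leaving shutdown
 * cleanup to ald_shutdown() otherwise.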
*/ static int ald_rem(struct alq *alq) { int error; error = 0; ALD_LOCK(); if (ald_shutingdown) { error = EBUSY; goto done; } LIST_REMOVE(alq, aq_link); done: ALD_UNLOCK(); return (error); } /* * Put a queue on the active list. This will schedule it for writing. */ static void ald_activate(struct alq *alq) { LIST_INSERT_HEAD(&ald_active, alq, aq_act); wakeup(&ald_active); } static void ald_deactivate(struct alq *alq) { LIST_REMOVE(alq, aq_act); alq->aq_flags &= ~AQ_ACTIVE; } static void ald_startup(void *unused) { mtx_init(&ald_mtx, "ALDmtx", NULL, MTX_DEF|MTX_QUIET); LIST_INIT(&ald_queues); LIST_INIT(&ald_active); } static void ald_daemon(void) { int needwakeup; struct alq *alq; ald_thread = FIRST_THREAD_IN_PROC(ald_proc); alq_eventhandler_tag = EVENTHANDLER_REGISTER(shutdown_pre_sync, ald_shutdown, NULL, SHUTDOWN_PRI_FIRST); ALD_LOCK(); for (;;) { while ((alq = LIST_FIRST(&ald_active)) == NULL && !ald_shutingdown) mtx_sleep(&ald_active, &ald_mtx, PWAIT, "aldslp", 0); /* Don't shutdown until all active ALQs are flushed. */ if (ald_shutingdown && alq == NULL) { ALD_UNLOCK(); break; } ALQ_LOCK(alq); ald_deactivate(alq); ALD_UNLOCK(); needwakeup = alq_doio(alq); ALQ_UNLOCK(alq); if (needwakeup) wakeup_one(alq); ALD_LOCK(); } kproc_exit(0); } static void ald_shutdown(void *arg, int howto) { struct alq *alq; ALD_LOCK(); /* Ensure no new queues can be created. */ ald_shutingdown = 1; /* Shutdown all ALQs prior to terminating the ald_daemon. */ while ((alq = LIST_FIRST(&ald_queues)) != NULL) { LIST_REMOVE(alq, aq_link); ALD_UNLOCK(); alq_shutdown(alq); ALD_LOCK(); } /* At this point, all ALQs are flushed and shutdown. */ /* * Wake ald_daemon so that it exits. It won't be able to do * anything until we mtx_sleep because we hold the ald_mtx. */ wakeup(&ald_active); /* Wait for ald_daemon to exit. */ mtx_sleep(ald_proc, &ald_mtx, PWAIT, "aldslp", 0); ALD_UNLOCK(); } static void alq_shutdown(struct alq *alq) { ALQ_LOCK(alq); /* Stop any new writers. */ alq->aq_flags |= AQ_SHUTDOWN; /* * If the ALQ isn't active but has unwritten data (possible if * the ALQ_NOACTIVATE flag has been used), explicitly activate the * ALQ here so that the pending data gets flushed by the ald_daemon. */ if (!(alq->aq_flags & AQ_ACTIVE) && HAS_PENDING_DATA(alq)) { alq->aq_flags |= AQ_ACTIVE; ALQ_UNLOCK(alq); ALD_LOCK(); ald_activate(alq); ALD_UNLOCK(); ALQ_LOCK(alq); } /* Drain IO */ while (alq->aq_flags & AQ_ACTIVE) { alq->aq_flags |= AQ_WANTED; msleep_spin(alq, &alq->aq_mtx, "aldclose", 0); } ALQ_UNLOCK(alq); vn_close(alq->aq_vp, FWRITE, alq->aq_cred, curthread); crfree(alq->aq_cred); } void alq_destroy(struct alq *alq) { /* Drain all pending IO. */ alq_shutdown(alq); mtx_destroy(&alq->aq_mtx); free(alq->aq_entbuf, M_ALD); free(alq, M_ALD); } /* * Flush all pending data to disk. This operation will block. */ static int alq_doio(struct alq *alq) { struct thread *td; struct mount *mp; struct vnode *vp; struct uio auio; struct iovec aiov[2]; int totlen; int iov; int wrapearly; KASSERT((HAS_PENDING_DATA(alq)), ("%s: queue empty!", __func__)); vp = alq->aq_vp; td = curthread; totlen = 0; iov = 1; wrapearly = alq->aq_wrapearly; bzero(&aiov, sizeof(aiov)); bzero(&auio, sizeof(auio)); /* Start the write from the location of our buffer tail pointer. */ aiov[0].iov_base = alq->aq_entbuf + alq->aq_writetail; if (alq->aq_writetail < alq->aq_writehead) { /* Buffer not wrapped. 
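 * (aq_writetail < aq_writehead, so the pending bytes form a single
 * contiguous region: [....T######H....], where '#' marks unflushed
 * data between the tail and head indexes.)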
*/ totlen = aiov[0].iov_len = alq->aq_writehead - alq->aq_writetail; } else if (alq->aq_writehead == 0) { /* Buffer not wrapped (special case to avoid an empty iov). */ totlen = aiov[0].iov_len = alq->aq_buflen - alq->aq_writetail - wrapearly; } else { /* * Buffer wrapped, requires 2 aiov entries: * - first is from writetail to end of buffer * - second is from start of buffer to writehead */ aiov[0].iov_len = alq->aq_buflen - alq->aq_writetail - wrapearly; iov++; aiov[1].iov_base = alq->aq_entbuf; aiov[1].iov_len = alq->aq_writehead; totlen = aiov[0].iov_len + aiov[1].iov_len; } alq->aq_flags |= AQ_FLUSHING; ALQ_UNLOCK(alq); auio.uio_iov = &aiov[0]; auio.uio_offset = 0; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_iovcnt = iov; auio.uio_resid = totlen; auio.uio_td = td; /* * Do all of the junk required to write now. */ vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* * XXX: VOP_WRITE error checks are ignored. */ #ifdef MAC if (mac_vnode_check_write(alq->aq_cred, NOCRED, vp) == 0) #endif VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, alq->aq_cred); VOP_UNLOCK(vp, 0); vn_finished_write(mp); ALQ_LOCK(alq); alq->aq_flags &= ~AQ_FLUSHING; /* Adjust writetail as required, taking into account wrapping. */ alq->aq_writetail = (alq->aq_writetail + totlen + wrapearly) % alq->aq_buflen; alq->aq_freebytes += totlen + wrapearly; /* * If we just flushed part of the buffer which wrapped, reset the * wrapearly indicator. */ if (wrapearly) alq->aq_wrapearly = 0; /* * If we just flushed the buffer completely, reset indexes to 0 to * minimise buffer wraps. * This is also required to ensure alq_getn() can't wedge itself. */ if (!HAS_PENDING_DATA(alq)) alq->aq_writehead = alq->aq_writetail = 0; KASSERT((alq->aq_writetail >= 0 && alq->aq_writetail < alq->aq_buflen), ("%s: aq_writetail < 0 || aq_writetail >= aq_buflen", __func__)); if (alq->aq_flags & AQ_WANTED) { alq->aq_flags &= ~AQ_WANTED; return (1); } return(0); } static struct kproc_desc ald_kp = { "ALQ Daemon", ald_daemon, &ald_proc }; SYSINIT(aldthread, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &ald_kp); SYSINIT(ald, SI_SUB_LOCK, SI_ORDER_ANY, ald_startup, NULL); /* User visible queue functions */ /* * Create the queue data structure, allocate the buffer, and open the file. 
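 *
 * A minimal (hypothetical) usage sketch of this API, assuming a
 * credential "cred" and a message buffer "buf"/"buflen" are at hand:
 *
 *	struct alq *alq;
 *
 *	if (alq_open_flags(&alq, "/var/log/example.alq", cred, 0600,
 *	    64 * 1024, ALQ_ORDERED) == 0) {
 *		alq_writen(alq, buf, buflen, ALQ_WAITOK);
 *		alq_flush(alq);
 *		alq_close(alq);
 *	}
 */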
*/ int alq_open_flags(struct alq **alqp, const char *file, struct ucred *cred, int cmode, int size, int flags) { struct thread *td; struct nameidata nd; struct alq *alq; int oflags; int error; KASSERT((size > 0), ("%s: size <= 0", __func__)); *alqp = NULL; td = curthread; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, file, td); oflags = FWRITE | O_NOFOLLOW | O_CREAT; error = vn_open_cred(&nd, &oflags, cmode, 0, cred, NULL); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); /* We just unlock so we hold a reference */ VOP_UNLOCK(nd.ni_vp, 0); alq = malloc(sizeof(*alq), M_ALD, M_WAITOK|M_ZERO); alq->aq_vp = nd.ni_vp; alq->aq_cred = crhold(cred); mtx_init(&alq->aq_mtx, "ALD Queue", NULL, MTX_SPIN|MTX_QUIET); alq->aq_buflen = size; alq->aq_entmax = 0; alq->aq_entlen = 0; alq->aq_freebytes = alq->aq_buflen; alq->aq_entbuf = malloc(alq->aq_buflen, M_ALD, M_WAITOK|M_ZERO); alq->aq_writehead = alq->aq_writetail = 0; if (flags & ALQ_ORDERED) alq->aq_flags |= AQ_ORDERED; if ((error = ald_add(alq)) != 0) { alq_destroy(alq); return (error); } *alqp = alq; return (0); } int alq_open(struct alq **alqp, const char *file, struct ucred *cred, int cmode, int size, int count) { int ret; KASSERT((count >= 0), ("%s: count < 0", __func__)); if (count > 0) { if ((ret = alq_open_flags(alqp, file, cred, cmode, size*count, 0)) == 0) { (*alqp)->aq_flags |= AQ_LEGACY; (*alqp)->aq_entmax = count; (*alqp)->aq_entlen = size; } } else ret = alq_open_flags(alqp, file, cred, cmode, size, 0); return (ret); } /* * Copy a new entry into the queue. If the operation would block either * wait or return an error depending on the value of waitok. */ int alq_writen(struct alq *alq, void *data, int len, int flags) { int activate, copy, ret; void *waitchan; KASSERT((len > 0 && len <= alq->aq_buflen), ("%s: len <= 0 || len > aq_buflen", __func__)); activate = ret = 0; copy = len; waitchan = NULL; ALQ_LOCK(alq); /* * Fail to perform the write and return EWOULDBLOCK if: * - The message is larger than our underlying buffer. * - The ALQ is being shutdown. * - There is insufficient free space in our underlying buffer * to accept the message and the user can't wait for space. * - There is insufficient free space in our underlying buffer * to accept the message and the alq is inactive due to prior * use of the ALQ_NOACTIVATE flag (which would lead to deadlock). */ if (len > alq->aq_buflen || alq->aq_flags & AQ_SHUTDOWN || (((flags & ALQ_NOWAIT) || (!(alq->aq_flags & AQ_ACTIVE) && HAS_PENDING_DATA(alq))) && alq->aq_freebytes < len)) { ALQ_UNLOCK(alq); return (EWOULDBLOCK); } /* * If we want ordered writes and there is already at least one thread * waiting for resources to become available, sleep until we're woken. */ if (alq->aq_flags & AQ_ORDERED && alq->aq_waiters > 0) { KASSERT(!(flags & ALQ_NOWAIT), ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__)); alq->aq_waiters++; msleep_spin(&alq->aq_waiters, &alq->aq_mtx, "alqwnord", 0); alq->aq_waiters--; } /* * (ALQ_WAITOK && aq_freebytes < len) or aq_freebytes >= len, either * enter while loop and sleep until we have enough free bytes (former) * or skip (latter). If AQ_ORDERED is set, only 1 thread at a time will * be in this loop. Otherwise, multiple threads may be sleeping here * competing for ALQ resources. 
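 *
 * Note the hand-off below: a woken writer that still lacks space
 * re-raises AQ_WANTED and, if other writers are waiting, nudges the
 * next one via wakeup(), so no wakeup is lost while space trickles in.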
*/ while (alq->aq_freebytes < len && !(alq->aq_flags & AQ_SHUTDOWN)) { KASSERT(!(flags & ALQ_NOWAIT), ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__)); alq->aq_flags |= AQ_WANTED; alq->aq_waiters++; if (waitchan) wakeup(waitchan); msleep_spin(alq, &alq->aq_mtx, "alqwnres", 0); alq->aq_waiters--; /* * If we're the first thread to wake after an AQ_WANTED wakeup * but there isn't enough free space for us, we're going to loop * and sleep again. If there are other threads waiting in this * loop, schedule a wakeup so that they can see if the space * they require is available. */ if (alq->aq_waiters > 0 && !(alq->aq_flags & AQ_ORDERED) && alq->aq_freebytes < len && !(alq->aq_flags & AQ_WANTED)) waitchan = alq; else waitchan = NULL; } /* * If there are waiters, we need to signal the waiting threads after we * complete our work. The alq ptr is used as a wait channel for threads * requiring resources to be freed up. In the AQ_ORDERED case, threads * are not allowed to concurrently compete for resources in the above * while loop, so we use a different wait channel in this case. */ if (alq->aq_waiters > 0) { if (alq->aq_flags & AQ_ORDERED) waitchan = &alq->aq_waiters; else waitchan = alq; } else waitchan = NULL; /* Bail if we're shutting down. */ if (alq->aq_flags & AQ_SHUTDOWN) { ret = EWOULDBLOCK; goto unlock; } /* * If we need to wrap the buffer to accommodate the write, * we'll need 2 calls to bcopy. */ if ((alq->aq_buflen - alq->aq_writehead) < len) copy = alq->aq_buflen - alq->aq_writehead; /* Copy message (or part thereof if wrap required) to the buffer. */ bcopy(data, alq->aq_entbuf + alq->aq_writehead, copy); alq->aq_writehead += copy; if (alq->aq_writehead >= alq->aq_buflen) { KASSERT((alq->aq_writehead == alq->aq_buflen), ("%s: alq->aq_writehead (%d) > alq->aq_buflen (%d)", __func__, alq->aq_writehead, alq->aq_buflen)); alq->aq_writehead = 0; } if (copy != len) { /* * Wrap the buffer by copying the remainder of our message * to the start of the buffer and resetting aq_writehead. */ bcopy(((uint8_t *)data)+copy, alq->aq_entbuf, len - copy); alq->aq_writehead = len - copy; } KASSERT((alq->aq_writehead >= 0 && alq->aq_writehead < alq->aq_buflen), ("%s: aq_writehead < 0 || aq_writehead >= aq_buflen", __func__)); alq->aq_freebytes -= len; if (!(alq->aq_flags & AQ_ACTIVE) && !(flags & ALQ_NOACTIVATE)) { alq->aq_flags |= AQ_ACTIVE; activate = 1; } KASSERT((HAS_PENDING_DATA(alq)), ("%s: queue empty!", __func__)); unlock: ALQ_UNLOCK(alq); if (activate) { ALD_LOCK(); ald_activate(alq); ALD_UNLOCK(); } /* NB: We rely on wakeup_one waking threads in a FIFO manner. */ if (waitchan != NULL) wakeup_one(waitchan); return (ret); } int alq_write(struct alq *alq, void *data, int flags) { /* Should only be called in fixed length message (legacy) mode. */ KASSERT((alq->aq_flags & AQ_LEGACY), ("%s: fixed length write on variable length queue", __func__)); return (alq_writen(alq, data, alq->aq_entlen, flags)); } /* * Retrieve a pointer for the ALQ to write directly into, avoiding bcopy. */ struct ale * alq_getn(struct alq *alq, int len, int flags) { int contigbytes; void *waitchan; KASSERT((len > 0 && len <= alq->aq_buflen), ("%s: len <= 0 || len > alq->aq_buflen", __func__)); waitchan = NULL; ALQ_LOCK(alq); /* * Determine the number of free contiguous bytes. * We ensure elsewhere that if aq_writehead == aq_writetail because * the buffer is empty, they will both be set to 0 and therefore * aq_freebytes == aq_buflen and is fully contiguous. 
* If they are equal and the buffer is not empty, aq_freebytes will * be 0 indicating the buffer is full. */ if (alq->aq_writehead <= alq->aq_writetail) contigbytes = alq->aq_freebytes; else { contigbytes = alq->aq_buflen - alq->aq_writehead; if (contigbytes < len) { /* * Insufficient space at end of buffer to handle a * contiguous write. Wrap early if there's space at * the beginning. This will leave a hole at the end * of the buffer which we will have to skip over when * flushing the buffer to disk. */ if (alq->aq_writetail >= len || flags & ALQ_WAITOK) { /* Keep track of # bytes left blank. */ alq->aq_wrapearly = contigbytes; /* Do the wrap and adjust counters. */ contigbytes = alq->aq_freebytes = alq->aq_writetail; alq->aq_writehead = 0; } } } /* * Return a NULL ALE if: * - The message is larger than our underlying buffer. * - The ALQ is being shutdown. * - There is insufficient free space in our underlying buffer * to accept the message and the user can't wait for space. * - There is insufficient free space in our underlying buffer * to accept the message and the alq is inactive due to prior * use of the ALQ_NOACTIVATE flag (which would lead to deadlock). */ if (len > alq->aq_buflen || alq->aq_flags & AQ_SHUTDOWN || (((flags & ALQ_NOWAIT) || (!(alq->aq_flags & AQ_ACTIVE) && HAS_PENDING_DATA(alq))) && contigbytes < len)) { ALQ_UNLOCK(alq); return (NULL); } /* * If we want ordered writes and there is already at least one thread * waiting for resources to become available, sleep until we're woken. */ if (alq->aq_flags & AQ_ORDERED && alq->aq_waiters > 0) { KASSERT(!(flags & ALQ_NOWAIT), ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__)); alq->aq_waiters++; msleep_spin(&alq->aq_waiters, &alq->aq_mtx, "alqgnord", 0); alq->aq_waiters--; } /* * (ALQ_WAITOK && contigbytes < len) or contigbytes >= len, either enter * while loop and sleep until we have enough contiguous free bytes * (former) or skip (latter). If AQ_ORDERED is set, only 1 thread at a * time will be in this loop. Otherwise, multiple threads may be * sleeping here competing for ALQ resources. */ while (contigbytes < len && !(alq->aq_flags & AQ_SHUTDOWN)) { KASSERT(!(flags & ALQ_NOWAIT), ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__)); alq->aq_flags |= AQ_WANTED; alq->aq_waiters++; if (waitchan) wakeup(waitchan); msleep_spin(alq, &alq->aq_mtx, "alqgnres", 0); alq->aq_waiters--; if (alq->aq_writehead <= alq->aq_writetail) contigbytes = alq->aq_freebytes; else contigbytes = alq->aq_buflen - alq->aq_writehead; /* * If we're the first thread to wake after an AQ_WANTED wakeup * but there isn't enough free space for us, we're going to loop * and sleep again. If there are other threads waiting in this * loop, schedule a wakeup so that they can see if the space * they require is available. */ if (alq->aq_waiters > 0 && !(alq->aq_flags & AQ_ORDERED) && contigbytes < len && !(alq->aq_flags & AQ_WANTED)) waitchan = alq; else waitchan = NULL; } /* * If there are waiters, we need to signal the waiting threads after we * complete our work. The alq ptr is used as a wait channel for threads * requiring resources to be freed up. In the AQ_ORDERED case, threads * are not allowed to concurrently compete for resources in the above * while loop, so we use a different wait channel in this case. */ if (alq->aq_waiters > 0) { if (alq->aq_flags & AQ_ORDERED) waitchan = &alq->aq_waiters; else waitchan = alq; } else waitchan = NULL; /* Bail if we're shutting down. 
*/ if (alq->aq_flags & AQ_SHUTDOWN) { ALQ_UNLOCK(alq); if (waitchan != NULL) wakeup_one(waitchan); return (NULL); } /* * If we are here, we have a contiguous number of bytes >= len * available in our buffer starting at aq_writehead. */ alq->aq_getpost.ae_data = alq->aq_entbuf + alq->aq_writehead; alq->aq_getpost.ae_bytesused = len; return (&alq->aq_getpost); } struct ale * alq_get(struct alq *alq, int flags) { /* Should only be called in fixed length message (legacy) mode. */ KASSERT((alq->aq_flags & AQ_LEGACY), ("%s: fixed length get on variable length queue", __func__)); return (alq_getn(alq, alq->aq_entlen, flags)); } void alq_post_flags(struct alq *alq, struct ale *ale, int flags) { int activate; void *waitchan; activate = 0; if (ale->ae_bytesused > 0) { if (!(alq->aq_flags & AQ_ACTIVE) && !(flags & ALQ_NOACTIVATE)) { alq->aq_flags |= AQ_ACTIVE; activate = 1; } alq->aq_writehead += ale->ae_bytesused; alq->aq_freebytes -= ale->ae_bytesused; /* Wrap aq_writehead if we filled to the end of the buffer. */ if (alq->aq_writehead == alq->aq_buflen) alq->aq_writehead = 0; KASSERT((alq->aq_writehead >= 0 && alq->aq_writehead < alq->aq_buflen), ("%s: aq_writehead < 0 || aq_writehead >= aq_buflen", __func__)); KASSERT((HAS_PENDING_DATA(alq)), ("%s: queue empty!", __func__)); } /* * If there are waiters, we need to signal the waiting threads after we * complete our work. The alq ptr is used as a wait channel for threads * requiring resources to be freed up. In the AQ_ORDERED case, threads * are not allowed to concurrently compete for resources in the * alq_getn() while loop, so we use a different wait channel in this case. */ if (alq->aq_waiters > 0) { if (alq->aq_flags & AQ_ORDERED) waitchan = &alq->aq_waiters; else waitchan = alq; } else waitchan = NULL; ALQ_UNLOCK(alq); if (activate) { ALD_LOCK(); ald_activate(alq); ALD_UNLOCK(); } /* NB: We rely on wakeup_one waking threads in a FIFO manner. */ if (waitchan != NULL) wakeup_one(waitchan); } void alq_flush(struct alq *alq) { int needwakeup = 0; ALD_LOCK(); ALQ_LOCK(alq); /* * Pull the lever iff there is data to flush and we're * not already in the middle of a flush operation. */ if (HAS_PENDING_DATA(alq) && !(alq->aq_flags & AQ_FLUSHING)) { if (alq->aq_flags & AQ_ACTIVE) ald_deactivate(alq); ALD_UNLOCK(); needwakeup = alq_doio(alq); } else ALD_UNLOCK(); ALQ_UNLOCK(alq); if (needwakeup) wakeup_one(alq); } /* * Flush remaining data, close the file and free all resources. */ void alq_close(struct alq *alq) { /* Only flush and destroy alq if not already shutting down. */ if (ald_rem(alq) == 0) alq_destroy(alq); } static int alq_load_handler(module_t mod, int what, void *arg) { int ret; ret = 0; switch (what) { case MOD_LOAD: case MOD_SHUTDOWN: break; case MOD_QUIESCE: ALD_LOCK(); /* Only allow unload if there are no open queues. */ if (LIST_FIRST(&ald_queues) == NULL) { ald_shutingdown = 1; ALD_UNLOCK(); EVENTHANDLER_DEREGISTER(shutdown_pre_sync, alq_eventhandler_tag); ald_shutdown(NULL, 0); mtx_destroy(&ald_mtx); } else { ALD_UNLOCK(); ret = EBUSY; } break; case MOD_UNLOAD: /* If MOD_QUIESCE failed we must fail here too. 
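 * (ald_shutingdown is only set once MOD_QUIESCE has run with no open
 * queues, so finding it clear here means the quiesce was refused and
 * the module must stay loaded.)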
*/ if (ald_shutingdown == 0) ret = EBUSY; break; default: ret = EINVAL; break; } return (ret); } static moduledata_t alq_mod = { "alq", alq_load_handler, NULL }; DECLARE_MODULE(alq, alq_mod, SI_SUB_LAST, SI_ORDER_ANY); MODULE_VERSION(alq, 1); Index: head/sys/kern/kern_clocksource.c =================================================================== --- head/sys/kern/kern_clocksource.c (revision 326270) +++ head/sys/kern/kern_clocksource.c (revision 326271) @@ -1,967 +1,969 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010-2013 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Common routines to manage event timers hardware. */ #include "opt_device_polling.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int cpu_disable_c2_sleep = 0; /* Timer dies in C2. */ int cpu_disable_c3_sleep = 0; /* Timer dies in C3. */ static void setuptimer(void); static void loadtimer(sbintime_t now, int first); static int doconfigtimer(void); static void configtimer(int start); static int round_freq(struct eventtimer *et, int freq); static sbintime_t getnextcpuevent(int idle); static sbintime_t getnextevent(void); static int handleevents(sbintime_t now, int fake); static struct mtx et_hw_mtx; #define ET_HW_LOCK(state) \ { \ if (timer->et_flags & ET_FLAGS_PERCPU) \ mtx_lock_spin(&(state)->et_hw_mtx); \ else \ mtx_lock_spin(&et_hw_mtx); \ } #define ET_HW_UNLOCK(state) \ { \ if (timer->et_flags & ET_FLAGS_PERCPU) \ mtx_unlock_spin(&(state)->et_hw_mtx); \ else \ mtx_unlock_spin(&et_hw_mtx); \ } static struct eventtimer *timer = NULL; static sbintime_t timerperiod; /* Timer period for periodic mode. */ static sbintime_t statperiod; /* statclock() events period. */ static sbintime_t profperiod; /* profclock() events period. */ static sbintime_t nexttick; /* Next global timer tick time. */ static u_int busy = 1; /* Reconfiguration is in progress. */ static int profiling; /* Profiling events enabled. */ static char timername[32]; /* Wanted timer. */ TUNABLE_STR("kern.eventtimer.timer", timername, sizeof(timername)); static int singlemul; /* Multiplier for periodic mode. 
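 *
 * In periodic mode one hardware rate has to service hardclock(),
 * statclock() and profclock() together; setuptimer() clamps this
 * multiplier to the range 1..20 and scales hz by it to reach a base
 * frequency high enough for all three.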
*/ SYSCTL_INT(_kern_eventtimer, OID_AUTO, singlemul, CTLFLAG_RWTUN, &singlemul, 0, "Multiplier for periodic mode"); static u_int idletick; /* Run periodic events when idle. */ SYSCTL_UINT(_kern_eventtimer, OID_AUTO, idletick, CTLFLAG_RWTUN, &idletick, 0, "Run periodic events when idle"); static int periodic; /* Periodic or one-shot mode. */ static int want_periodic; /* What mode to prefer. */ TUNABLE_INT("kern.eventtimer.periodic", &want_periodic); struct pcpu_state { struct mtx et_hw_mtx; /* Per-CPU timer mutex. */ u_int action; /* Reconfiguration requests. */ u_int handle; /* Immediate handle requests. */ sbintime_t now; /* Last tick time. */ sbintime_t nextevent; /* Next scheduled event on this CPU. */ sbintime_t nexttick; /* Next timer tick time. */ sbintime_t nexthard; /* Next hardclock() event. */ sbintime_t nextstat; /* Next statclock() event. */ sbintime_t nextprof; /* Next profclock() event. */ sbintime_t nextcall; /* Next callout event. */ sbintime_t nextcallopt; /* Next optional callout event. */ int ipi; /* This CPU needs IPI. */ int idle; /* This CPU is in idle mode. */ }; static DPCPU_DEFINE(struct pcpu_state, timerstate); DPCPU_DEFINE(sbintime_t, hardclocktime); /* * Timer broadcast IPI handler. */ int hardclockintr(void) { sbintime_t now; struct pcpu_state *state; int done; if (doconfigtimer() || busy) return (FILTER_HANDLED); state = DPCPU_PTR(timerstate); now = state->now; CTR3(KTR_SPARE2, "ipi at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); done = handleevents(now, 0); return (done ? FILTER_HANDLED : FILTER_STRAY); } /* * Handle all events for the specified time on this CPU. */ static int handleevents(sbintime_t now, int fake) { sbintime_t t, *hct; struct trapframe *frame; struct pcpu_state *state; int usermode; int done, runs; CTR3(KTR_SPARE2, "handle at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); done = 0; if (fake) { frame = NULL; usermode = 0; } else { frame = curthread->td_intr_frame; usermode = TRAPF_USERMODE(frame); } state = DPCPU_PTR(timerstate); runs = 0; while (now >= state->nexthard) { state->nexthard += tick_sbt; runs++; } if (runs) { hct = DPCPU_PTR(hardclocktime); *hct = state->nexthard - tick_sbt; if (fake < 2) { hardclock_cnt(runs, usermode); done = 1; } } runs = 0; while (now >= state->nextstat) { state->nextstat += statperiod; runs++; } if (runs && fake < 2) { statclock_cnt(runs, usermode); done = 1; } if (profiling) { runs = 0; while (now >= state->nextprof) { state->nextprof += profperiod; runs++; } if (runs && !fake) { profclock_cnt(runs, usermode, TRAPF_PC(frame)); done = 1; } } else state->nextprof = state->nextstat; if (now >= state->nextcallopt || now >= state->nextcall) { state->nextcall = state->nextcallopt = SBT_MAX; callout_process(now); } t = getnextcpuevent(0); ET_HW_LOCK(state); if (!busy) { state->idle = 0; state->nextevent = t; loadtimer(now, (fake == 2) && (timer->et_flags & ET_FLAGS_PERCPU)); } ET_HW_UNLOCK(state); return (done); } /* * Schedule binuptime of the next event on current CPU. */ static sbintime_t getnextcpuevent(int idle) { sbintime_t event; struct pcpu_state *state; u_int hardfreq; state = DPCPU_PTR(timerstate); /* Handle hardclock() events, skipping some if CPU is idle. */ event = state->nexthard; if (idle) { hardfreq = (u_int)hz / 2; if (tc_min_ticktock_freq > 2 #ifdef SMP && curcpu == CPU_FIRST() #endif ) hardfreq = hz / tc_min_ticktock_freq; if (hardfreq > 1) event += tick_sbt * (hardfreq - 1); } /* Handle callout events.
*/ if (event > state->nextcall) event = state->nextcall; if (!idle) { /* If CPU is active - handle other types of events. */ if (event > state->nextstat) event = state->nextstat; if (profiling && event > state->nextprof) event = state->nextprof; } return (event); } /* * Schedule binuptime of the next event on all CPUs. */ static sbintime_t getnextevent(void) { struct pcpu_state *state; sbintime_t event; #ifdef SMP int cpu; #endif int c; state = DPCPU_PTR(timerstate); event = state->nextevent; c = -1; #ifdef SMP if ((timer->et_flags & ET_FLAGS_PERCPU) == 0) { CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); if (event > state->nextevent) { event = state->nextevent; c = cpu; } } } #endif CTR4(KTR_SPARE2, "next at %d: next %d.%08x by %d", curcpu, (int)(event >> 32), (u_int)(event & 0xffffffff), c); return (event); } /* Hardware timer callback function. */ static void timercb(struct eventtimer *et, void *arg) { sbintime_t now; sbintime_t *next; struct pcpu_state *state; #ifdef SMP int cpu, bcast; #endif /* Do not touch anything if somebody reconfiguring timers. */ if (busy) return; /* Update present and next tick times. */ state = DPCPU_PTR(timerstate); if (et->et_flags & ET_FLAGS_PERCPU) { next = &state->nexttick; } else next = &nexttick; now = sbinuptime(); if (periodic) *next = now + timerperiod; else *next = -1; /* Next tick is not scheduled yet. */ state->now = now; CTR3(KTR_SPARE2, "intr at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); #ifdef SMP #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); #endif /* Prepare broadcasting to other CPUs for non-per-CPU timers. */ bcast = 0; #ifdef EARLY_AP_STARTUP if ((et->et_flags & ET_FLAGS_PERCPU) == 0) { #else if ((et->et_flags & ET_FLAGS_PERCPU) == 0 && smp_started) { #endif CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); ET_HW_LOCK(state); state->now = now; if (now >= state->nextevent) { state->nextevent += SBT_1S; if (curcpu != cpu) { state->ipi = 1; bcast = 1; } } ET_HW_UNLOCK(state); } } #endif /* Handle events for this time on this CPU. */ handleevents(now, 0); #ifdef SMP /* Broadcast interrupt to other CPUs for non-per-CPU timers. */ if (bcast) { CPU_FOREACH(cpu) { if (curcpu == cpu) continue; state = DPCPU_ID_PTR(cpu, timerstate); if (state->ipi) { state->ipi = 0; ipi_cpu(cpu, IPI_HARDCLOCK); } } } #endif } /* * Load new value into hardware timer. */ static void loadtimer(sbintime_t now, int start) { struct pcpu_state *state; sbintime_t new; sbintime_t *next; uint64_t tmp; int eq; if (timer->et_flags & ET_FLAGS_PERCPU) { state = DPCPU_PTR(timerstate); next = &state->nexttick; } else next = &nexttick; if (periodic) { if (start) { /* * Try to start all periodic timers aligned * to period to make events synchronous. */ tmp = now % timerperiod; new = timerperiod - tmp; if (new < tmp) /* Left less then passed. */ new += timerperiod; CTR5(KTR_SPARE2, "load p at %d: now %d.%08x first in %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff), (int)(new >> 32), (u_int)(new & 0xffffffff)); *next = new + now; et_start(timer, new, timerperiod); } } else { new = getnextevent(); eq = (new == *next); CTR4(KTR_SPARE2, "load at %d: next %d.%08x eq %d", curcpu, (int)(new >> 32), (u_int)(new & 0xffffffff), eq); if (!eq) { *next = new; et_start(timer, new - now, 0); } } } /* * Prepare event timer parameters after configuration changes. 
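*/

/*
 * In periodic mode, loadtimer() above aligns the first expiration to a
 * multiple of the period so that all CPUs tick in phase, skipping to
 * the following boundary when the nearest one is too close.  A runnable
 * sketch of just that alignment (the concrete values are made up):
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        int64_t period = 1000;          /* timer period, arbitrary units */
        int64_t now = 12345;            /* current time */
        int64_t passed = now % period;  /* time since the last boundary */
        int64_t left = period - passed; /* time to the next boundary */

        /* Less left than passed: take the boundary after the next one. */
        if (left < passed)
                left += period;
        printf("first expiration in %lld units, at absolute %lld\n",
            (long long)left, (long long)(now + left));
        return (0);
}

/*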
*/ static void setuptimer(void) { int freq; if (periodic && (timer->et_flags & ET_FLAGS_PERIODIC) == 0) periodic = 0; else if (!periodic && (timer->et_flags & ET_FLAGS_ONESHOT) == 0) periodic = 1; singlemul = MIN(MAX(singlemul, 1), 20); freq = hz * singlemul; while (freq < (profiling ? profhz : stathz)) freq += hz; freq = round_freq(timer, freq); timerperiod = SBT_1S / freq; } /* * Reconfigure specified per-CPU timer on other CPU. Called from IPI handler. */ static int doconfigtimer(void) { sbintime_t now; struct pcpu_state *state; state = DPCPU_PTR(timerstate); switch (atomic_load_acq_int(&state->action)) { case 1: now = sbinuptime(); ET_HW_LOCK(state); loadtimer(now, 1); ET_HW_UNLOCK(state); state->handle = 0; atomic_store_rel_int(&state->action, 0); return (1); case 2: ET_HW_LOCK(state); et_stop(timer); ET_HW_UNLOCK(state); state->handle = 0; atomic_store_rel_int(&state->action, 0); return (1); } if (atomic_readandclear_int(&state->handle) && !busy) { now = sbinuptime(); handleevents(now, 0); return (1); } return (0); } /* * Reconfigure specified timer. * For per-CPU timers use IPI to make other CPUs to reconfigure. */ static void configtimer(int start) { sbintime_t now, next; struct pcpu_state *state; int cpu; if (start) { setuptimer(); now = sbinuptime(); } else now = 0; critical_enter(); ET_HW_LOCK(DPCPU_PTR(timerstate)); if (start) { /* Initialize time machine parameters. */ next = now + timerperiod; if (periodic) nexttick = next; else nexttick = -1; #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); #endif CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); state->now = now; #ifndef EARLY_AP_STARTUP if (!smp_started && cpu != CPU_FIRST()) state->nextevent = SBT_MAX; else #endif state->nextevent = next; if (periodic) state->nexttick = next; else state->nexttick = -1; state->nexthard = next; state->nextstat = next; state->nextprof = next; state->nextcall = next; state->nextcallopt = next; hardclock_sync(cpu); } busy = 0; /* Start global timer or per-CPU timer of this CPU. */ loadtimer(now, 1); } else { busy = 1; /* Stop global timer or per-CPU timer of this CPU. */ et_stop(timer); } ET_HW_UNLOCK(DPCPU_PTR(timerstate)); #ifdef SMP #ifdef EARLY_AP_STARTUP /* If timer is global we are done. */ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0) { #else /* If timer is global or there is no other CPUs yet - we are done. */ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || !smp_started) { #endif critical_exit(); return; } /* Set reconfigure flags for other CPUs. */ CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); atomic_store_rel_int(&state->action, (cpu == curcpu) ? 0 : ( start ? 1 : 2)); } /* Broadcast reconfigure IPI. */ ipi_all_but_self(IPI_HARDCLOCK); /* Wait for reconfiguration completed. */ restart: cpu_spinwait(); CPU_FOREACH(cpu) { if (cpu == curcpu) continue; state = DPCPU_ID_PTR(cpu, timerstate); if (atomic_load_acq_int(&state->action)) goto restart; } #endif critical_exit(); } /* * Calculate nearest frequency supported by hardware timer. 
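*/

/*
 * configtimer() above hands start/stop requests to the other CPUs
 * through a per-CPU atomic "action" word plus an IPI, then spins until
 * every CPU has acknowledged; doconfigtimer() is the receiving side.
 * A compressed, single-threaded sketch of that handshake using C11
 * atomics (the kernel uses atomic_load_acq_int()/atomic_store_rel_int()
 * and a real IPI; the demo names are hypothetical):
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint action;      /* 0 = idle, 1 = start, 2 = stop */

static void
target_cpu_handler(void)        /* stands in for the IPI handler */
{
        switch (atomic_load_explicit(&action, memory_order_acquire)) {
        case 1:
                printf("target: (re)load timer\n");
                break;
        case 2:
                printf("target: stop timer\n");
                break;
        }
        /* Release store acknowledges completion to the initiator. */
        atomic_store_explicit(&action, 0, memory_order_release);
}

int
main(void)
{
        atomic_store_explicit(&action, 1, memory_order_release);
        target_cpu_handler();   /* stands in for ipi_all_but_self() */
        /* The initiator spins until the ack; trivially satisfied here. */
        while (atomic_load_explicit(&action, memory_order_acquire) != 0)
                ;
        printf("initiator: reconfiguration complete\n");
        return (0);
}

/*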
*/ static int round_freq(struct eventtimer *et, int freq) { uint64_t div; if (et->et_frequency != 0) { div = lmax((et->et_frequency + freq / 2) / freq, 1); if (et->et_flags & ET_FLAGS_POW2DIV) div = 1 << (flsl(div + div / 2) - 1); freq = (et->et_frequency + div / 2) / div; } if (et->et_min_period > SBT_1S) panic("Event timer \"%s\" doesn't support sub-second periods!", et->et_name); else if (et->et_min_period != 0) freq = min(freq, SBT2FREQ(et->et_min_period)); if (et->et_max_period < SBT_1S && et->et_max_period != 0) freq = max(freq, SBT2FREQ(et->et_max_period)); return (freq); } /* * Configure and start event timers (BSP part). */ void cpu_initclocks_bsp(void) { struct pcpu_state *state; int base, div, cpu; mtx_init(&et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN); CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); mtx_init(&state->et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN); state->nextcall = SBT_MAX; state->nextcallopt = SBT_MAX; } periodic = want_periodic; /* Grab requested timer or the best of present. */ if (timername[0]) timer = et_find(timername, 0, 0); if (timer == NULL && periodic) { timer = et_find(NULL, ET_FLAGS_PERIODIC, ET_FLAGS_PERIODIC); } if (timer == NULL) { timer = et_find(NULL, ET_FLAGS_ONESHOT, ET_FLAGS_ONESHOT); } if (timer == NULL && !periodic) { timer = et_find(NULL, ET_FLAGS_PERIODIC, ET_FLAGS_PERIODIC); } if (timer == NULL) panic("No usable event timer found!"); et_init(timer, timercb, NULL, NULL); /* Adapt to timer capabilities. */ if (periodic && (timer->et_flags & ET_FLAGS_PERIODIC) == 0) periodic = 0; else if (!periodic && (timer->et_flags & ET_FLAGS_ONESHOT) == 0) periodic = 1; if (timer->et_flags & ET_FLAGS_C3STOP) cpu_disable_c3_sleep++; /* * We honor the requested 'hz' value. * We want to run stathz in the neighborhood of 128hz. * We would like profhz to run as often as possible. */ if (singlemul <= 0 || singlemul > 20) { if (hz >= 1500 || (hz % 128) == 0) singlemul = 1; else if (hz >= 750) singlemul = 2; else singlemul = 4; } if (periodic) { base = round_freq(timer, hz * singlemul); singlemul = max((base + hz / 2) / hz, 1); hz = (base + singlemul / 2) / singlemul; if (base <= 128) stathz = base; else { div = base / 128; if (div >= singlemul && (div % singlemul) == 0) div++; stathz = base / div; } profhz = stathz; while ((profhz + stathz) <= 128 * 64) profhz += stathz; profhz = round_freq(timer, profhz); } else { hz = round_freq(timer, hz); stathz = round_freq(timer, 127); profhz = round_freq(timer, stathz * 64); } tick = 1000000 / hz; tick_sbt = SBT_1S / hz; tick_bt = sbttobt(tick_sbt); statperiod = SBT_1S / stathz; profperiod = SBT_1S / profhz; ET_LOCK(); configtimer(1); ET_UNLOCK(); } /* * Start per-CPU event timers on APs. */ void cpu_initclocks_ap(void) { sbintime_t now; struct pcpu_state *state; struct thread *td; state = DPCPU_PTR(timerstate); now = sbinuptime(); ET_HW_LOCK(state); state->now = now; hardclock_sync(curcpu); spinlock_enter(); ET_HW_UNLOCK(state); td = curthread; td->td_intr_nesting_level++; handleevents(state->now, 2); td->td_intr_nesting_level--; spinlock_exit(); } /* * Switch to profiling clock rates. */ void cpu_startprofclock(void) { ET_LOCK(); if (profiling == 0) { if (periodic) { configtimer(0); profiling = 1; configtimer(1); } else profiling = 1; } else profiling++; ET_UNLOCK(); } /* * Switch to regular clock rates. 
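*/

/*
 * round_freq() above picks the divisor of the hardware base frequency
 * closest to the requested rate; timers flagged ET_FLAGS_POW2DIV only
 * accept power-of-two divisors.  A self-contained sketch of that
 * rounding with fls reimplemented inline (the sample base clock is
 * illustrative only):
 */
#include <stdio.h>
#include <stdint.h>

static int
demo_flsl(long v)               /* index of the highest set bit, 1-based */
{
        int i = 0;

        while (v != 0) {
                v >>= 1;
                i++;
        }
        return (i);
}

int
main(void)
{
        uint64_t hwfreq = 14318182;     /* sample base clock, made up */
        int freq = 1000;                /* requested event rate */
        uint64_t div;

        div = (hwfreq + freq / 2) / freq;       /* nearest divisor */
        if (div < 1)
                div = 1;
        /* Power-of-two restriction: round div to the nearest 2^n. */
        div = 1UL << (demo_flsl((long)(div + div / 2)) - 1);
        printf("div %llu -> actual freq %llu Hz\n",
            (unsigned long long)div,
            (unsigned long long)((hwfreq + div / 2) / div));
        return (0);
}

/*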
*/ void cpu_stopprofclock(void) { ET_LOCK(); if (profiling == 1) { if (periodic) { configtimer(0); profiling = 0; configtimer(1); } else profiling = 0; } else profiling--; ET_UNLOCK(); } /* * Switch to idle mode (all ticks handled). */ sbintime_t cpu_idleclock(void) { sbintime_t now, t; struct pcpu_state *state; if (idletick || busy || (periodic && (timer->et_flags & ET_FLAGS_PERCPU)) #ifdef DEVICE_POLLING || curcpu == CPU_FIRST() #endif ) return (-1); state = DPCPU_PTR(timerstate); if (periodic) now = state->now; else now = sbinuptime(); CTR3(KTR_SPARE2, "idle at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); t = getnextcpuevent(1); ET_HW_LOCK(state); state->idle = 1; state->nextevent = t; if (!periodic) loadtimer(now, 0); ET_HW_UNLOCK(state); return (MAX(t - now, 0)); } /* * Switch to active mode (skip empty ticks). */ void cpu_activeclock(void) { sbintime_t now; struct pcpu_state *state; struct thread *td; state = DPCPU_PTR(timerstate); if (state->idle == 0 || busy) return; if (periodic) now = state->now; else now = sbinuptime(); CTR3(KTR_SPARE2, "active at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); spinlock_enter(); td = curthread; td->td_intr_nesting_level++; handleevents(now, 1); td->td_intr_nesting_level--; spinlock_exit(); } /* * Change the frequency of the given timer. This changes et->et_frequency and * if et is the active timer it reconfigures the timer on all CPUs. This is * intended to be a private interface for the use of et_change_frequency() only. */ void cpu_et_frequency(struct eventtimer *et, uint64_t newfreq) { ET_LOCK(); if (et == timer) { configtimer(0); et->et_frequency = newfreq; configtimer(1); } else et->et_frequency = newfreq; ET_UNLOCK(); } void cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt) { struct pcpu_state *state; /* Do not touch anything if somebody reconfiguring timers. */ if (busy) return; CTR6(KTR_SPARE2, "new co at %d: on %d at %d.%08x - %d.%08x", curcpu, cpu, (int)(bt_opt >> 32), (u_int)(bt_opt & 0xffffffff), (int)(bt >> 32), (u_int)(bt & 0xffffffff)); KASSERT(!CPU_ABSENT(cpu), ("Absent CPU %d", cpu)); state = DPCPU_ID_PTR(cpu, timerstate); ET_HW_LOCK(state); /* * If there is callout time already set earlier -- do nothing. * This check may appear redundant because we check already in * callout_process() but this double check guarantees we're safe * with respect to race conditions between interrupts execution * and scheduling. */ state->nextcallopt = bt_opt; if (bt >= state->nextcall) goto done; state->nextcall = bt; /* If there is some other event set earlier -- do nothing. */ if (bt >= state->nextevent) goto done; state->nextevent = bt; /* If timer is periodic -- there is nothing to reprogram. */ if (periodic) goto done; /* If timer is global or of the current CPU -- reprogram it. */ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || cpu == curcpu) { loadtimer(sbinuptime(), 0); done: ET_HW_UNLOCK(state); return; } /* Otherwise make other CPU to reprogram it. */ state->handle = 1; ET_HW_UNLOCK(state); #ifdef SMP ipi_cpu(cpu, IPI_HARDCLOCK); #endif } /* * Report or change the active event timers hardware. 
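*/

/*
 * cpu_new_callout() above only ever moves a CPU's deadlines earlier: a
 * later request is already covered by whatever the hardware has
 * programmed, and only a strictly earlier one forces a one-shot
 * reprogram.  The essential "keep the minimum" logic as a runnable
 * sketch (deadlines are arbitrary numbers here):
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_SBT_MAX INT64_MAX

static int64_t nextcall = DEMO_SBT_MAX;   /* next callout deadline */
static int64_t nextevent = DEMO_SBT_MAX;  /* next programmed event */

static void
demo_new_callout(int64_t bt)
{
        if (bt >= nextcall)
                return;         /* an earlier callout is already set */
        nextcall = bt;
        if (bt >= nextevent)
                return;         /* hardware already fires soon enough */
        nextevent = bt;
        printf("reprogram one-shot timer for %lld\n", (long long)bt);
}

int
main(void)
{
        demo_new_callout(1000);         /* programs the timer */
        demo_new_callout(2000);         /* no-op: later than 1000 */
        demo_new_callout(500);          /* reprograms earlier */
        return (0);
}

/*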
*/ static int sysctl_kern_eventtimer_timer(SYSCTL_HANDLER_ARGS) { char buf[32]; struct eventtimer *et; int error; ET_LOCK(); et = timer; snprintf(buf, sizeof(buf), "%s", et->et_name); ET_UNLOCK(); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); ET_LOCK(); et = timer; if (error != 0 || req->newptr == NULL || strcasecmp(buf, et->et_name) == 0) { ET_UNLOCK(); return (error); } et = et_find(buf, 0, 0); if (et == NULL) { ET_UNLOCK(); return (ENOENT); } configtimer(0); et_free(timer); if (et->et_flags & ET_FLAGS_C3STOP) cpu_disable_c3_sleep++; if (timer->et_flags & ET_FLAGS_C3STOP) cpu_disable_c3_sleep--; periodic = want_periodic; timer = et; et_init(timer, timercb, NULL, NULL); configtimer(1); ET_UNLOCK(); return (error); } SYSCTL_PROC(_kern_eventtimer, OID_AUTO, timer, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_eventtimer_timer, "A", "Chosen event timer"); /* * Report or change the active event timer periodicity. */ static int sysctl_kern_eventtimer_periodic(SYSCTL_HANDLER_ARGS) { int error, val; val = periodic; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); ET_LOCK(); configtimer(0); periodic = want_periodic = val; configtimer(1); ET_UNLOCK(); return (error); } SYSCTL_PROC(_kern_eventtimer, OID_AUTO, periodic, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_eventtimer_periodic, "I", "Enable event timer periodic mode"); #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(clocksource, db_show_clocksource) { struct pcpu_state *st; int c; CPU_FOREACH(c) { st = DPCPU_ID_PTR(c, timerstate); db_printf( "CPU %2d: action %d handle %d ipi %d idle %d\n" " now %#jx nevent %#jx (%jd)\n" " ntick %#jx (%jd) nhard %#jx (%jd)\n" " nstat %#jx (%jd) nprof %#jx (%jd)\n" " ncall %#jx (%jd) ncallopt %#jx (%jd)\n", c, st->action, st->handle, st->ipi, st->idle, (uintmax_t)st->now, (uintmax_t)st->nextevent, (uintmax_t)(st->nextevent - st->now) / tick_sbt, (uintmax_t)st->nexttick, (uintmax_t)(st->nexttick - st->now) / tick_sbt, (uintmax_t)st->nexthard, (uintmax_t)(st->nexthard - st->now) / tick_sbt, (uintmax_t)st->nextstat, (uintmax_t)(st->nextstat - st->now) / tick_sbt, (uintmax_t)st->nextprof, (uintmax_t)(st->nextprof - st->now) / tick_sbt, (uintmax_t)st->nextcall, (uintmax_t)(st->nextcall - st->now) / tick_sbt, (uintmax_t)st->nextcallopt, (uintmax_t)(st->nextcallopt - st->now) / tick_sbt); } } #endif Index: head/sys/kern/kern_condvar.c =================================================================== --- head/sys/kern/kern_condvar.c (revision 326270) +++ head/sys/kern/kern_condvar.c (revision 326271) @@ -1,446 +1,448 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2000 Jake Burkholder . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif /* * A bound below which cv_waiters is valid. Once cv_waiters reaches this bound, * cv_signal must manually check the wait queue for threads. */ #define CV_WAITERS_BOUND INT_MAX #define CV_WAITERS_INC(cvp) do { \ if ((cvp)->cv_waiters < CV_WAITERS_BOUND) \ (cvp)->cv_waiters++; \ } while (0) /* * Common sanity checks for cv_wait* functions. */ #define CV_ASSERT(cvp, lock, td) do { \ KASSERT((td) != NULL, ("%s: td NULL", __func__)); \ KASSERT(TD_IS_RUNNING(td), ("%s: not TDS_RUNNING", __func__)); \ KASSERT((cvp) != NULL, ("%s: cvp NULL", __func__)); \ KASSERT((lock) != NULL, ("%s: lock NULL", __func__)); \ } while (0) /* * Initialize a condition variable. Must be called before use. */ void cv_init(struct cv *cvp, const char *desc) { cvp->cv_description = desc; cvp->cv_waiters = 0; } /* * Destroy a condition variable. The condition variable must be re-initialized * in order to be re-used. */ void cv_destroy(struct cv *cvp) { #ifdef INVARIANTS struct sleepqueue *sq; sleepq_lock(cvp); sq = sleepq_lookup(cvp); sleepq_release(cvp); KASSERT(sq == NULL, ("%s: associated sleep queue non-empty", __func__)); #endif } /* * Wait on a condition variable. The current thread is placed on the condition * variable's wait queue and suspended. A cv_signal or cv_broadcast on the same * condition variable will resume the thread. The mutex is released before * sleeping and will be held on return. It is recommended that the mutex be * held when cv_signal or cv_broadcast are called. */ void _cv_wait(struct cv *cvp, struct lock_object *lock) { WITNESS_SAVE_DECL(lock_witness); struct lock_class *class; struct thread *td; uintptr_t lock_state; td = curthread; lock_state = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, cv_wmesg(cvp)); #endif CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); class = LOCK_CLASS(lock); if (SCHEDULER_STOPPED_TD(td)) return; sleepq_lock(cvp); CV_WAITERS_INC(cvp); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); } sleepq_wait(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, cv_wmesg(cvp)); #endif PICKUP_GIANT(); if (lock != &Giant.lock_object) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } } /* * Wait on a condition variable. This function differs from cv_wait by * not acquiring the mutex after condition variable was signaled. 
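*/

/*
 * cv_waiters is a saturating counter: once it reaches CV_WAITERS_BOUND
 * it is pinned and no longer decremented, and cv_signal() must consult
 * the sleep queue directly to learn when the waiters are really gone.
 * A tiny sketch of the saturation rule (the bound is shrunk to 3 purely
 * for illustration):
 */
#include <stdio.h>

#define DEMO_BOUND 3

static int waiters;

static void
demo_inc(void)
{
        if (waiters < DEMO_BOUND)       /* saturate, never overflow */
                waiters++;
}

static void
demo_dec(void)
{
        if (waiters < DEMO_BOUND)       /* a pinned count stays pinned */
                waiters--;
}

int
main(void)
{
        for (int i = 0; i < 5; i++)
                demo_inc();
        printf("after 5 increments: %d (capped)\n", waiters);
        demo_dec();                     /* no-op: counter is pinned */
        printf("after 1 decrement: %d\n", waiters);
        return (0);
}

/*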
*/ void _cv_wait_unlock(struct cv *cvp, struct lock_object *lock) { struct lock_class *class; struct thread *td; td = curthread; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, cv_wmesg(cvp)); #endif CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); KASSERT(lock != &Giant.lock_object, ("cv_wait_unlock cannot be used with Giant")); class = LOCK_CLASS(lock); if (SCHEDULER_STOPPED_TD(td)) { class->lc_unlock(lock); return; } sleepq_lock(cvp); CV_WAITERS_INC(cvp); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); sleepq_wait(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, cv_wmesg(cvp)); #endif PICKUP_GIANT(); } /* * Wait on a condition variable, allowing interruption by signals. Return 0 if * the thread was resumed with cv_signal or cv_broadcast, EINTR or ERESTART if * a signal was caught. If ERESTART is returned the system call should be * restarted if possible. */ int _cv_wait_sig(struct cv *cvp, struct lock_object *lock) { WITNESS_SAVE_DECL(lock_witness); struct lock_class *class; struct thread *td; uintptr_t lock_state; int rval; td = curthread; lock_state = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, cv_wmesg(cvp)); #endif CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); class = LOCK_CLASS(lock); if (SCHEDULER_STOPPED_TD(td)) return (0); sleepq_lock(cvp); CV_WAITERS_INC(cvp); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR | SLEEPQ_INTERRUPTIBLE, 0); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); } rval = sleepq_wait_sig(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, cv_wmesg(cvp)); #endif PICKUP_GIANT(); if (lock != &Giant.lock_object) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } /* * Wait on a condition variable for (at most) the value specified in sbt * argument. Returns 0 if the process was resumed by cv_signal or cv_broadcast, * EWOULDBLOCK if the timeout expires. 
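*/

/*
 * The canonical consumer of a timed wait re-checks its predicate in a
 * loop and distinguishes a timeout from a wakeup.  A userspace analogue
 * using POSIX threads, not the kernel API itself (demo names are
 * hypothetical; in the kernel the loop would use mtx_lock() and
 * cv_timedwait() instead):
 */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t demo_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t demo_cv = PTHREAD_COND_INITIALIZER;
static int demo_ready;          /* the predicate being waited on */

static int
demo_wait(int seconds)
{
        struct timespec ts;
        int error = 0;

        clock_gettime(CLOCK_REALTIME, &ts);
        ts.tv_sec += seconds;
        pthread_mutex_lock(&demo_mtx);
        while (demo_ready == 0 && error == 0)
                error = pthread_cond_timedwait(&demo_cv, &demo_mtx, &ts);
        pthread_mutex_unlock(&demo_mtx);
        return (error);         /* 0 on wakeup, ETIMEDOUT on timeout */
}

int
main(void)
{
        printf("wait returned %d\n", demo_wait(1)); /* times out in 1 s */
        return (0);
}

/*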
*/ int _cv_timedwait_sbt(struct cv *cvp, struct lock_object *lock, sbintime_t sbt, sbintime_t pr, int flags) { WITNESS_SAVE_DECL(lock_witness); struct lock_class *class; struct thread *td; int lock_state, rval; td = curthread; lock_state = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, cv_wmesg(cvp)); #endif CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); class = LOCK_CLASS(lock); if (SCHEDULER_STOPPED_TD(td)) return (0); sleepq_lock(cvp); CV_WAITERS_INC(cvp); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); sleepq_set_timeout_sbt(cvp, sbt, pr, flags); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); } rval = sleepq_timedwait(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, cv_wmesg(cvp)); #endif PICKUP_GIANT(); if (lock != &Giant.lock_object) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } /* * Wait on a condition variable for (at most) the value specified in sbt * argument, allowing interruption by signals. * Returns 0 if the thread was resumed by cv_signal or cv_broadcast, * EWOULDBLOCK if the timeout expires, and EINTR or ERESTART if a signal * was caught. */ int _cv_timedwait_sig_sbt(struct cv *cvp, struct lock_object *lock, sbintime_t sbt, sbintime_t pr, int flags) { WITNESS_SAVE_DECL(lock_witness); struct lock_class *class; struct thread *td; int lock_state, rval; td = curthread; lock_state = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, cv_wmesg(cvp)); #endif CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); class = LOCK_CLASS(lock); if (SCHEDULER_STOPPED_TD(td)) return (0); sleepq_lock(cvp); CV_WAITERS_INC(cvp); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR | SLEEPQ_INTERRUPTIBLE, 0); sleepq_set_timeout_sbt(cvp, sbt, pr, flags); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); } rval = sleepq_timedwait_sig(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, cv_wmesg(cvp)); #endif PICKUP_GIANT(); if (lock != &Giant.lock_object) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } /* * Signal a condition variable, wakes up one waiting thread. Will also wakeup * the swapper if the process is not in memory, so that it can bring the * sleeping process in. Note that this may also result in additional threads * being made runnable. Should be called with the same mutex as was passed to * cv_wait held. */ void cv_signal(struct cv *cvp) { int wakeup_swapper; if (cvp->cv_waiters == 0) return; wakeup_swapper = 0; sleepq_lock(cvp); if (cvp->cv_waiters > 0) { if (cvp->cv_waiters == CV_WAITERS_BOUND && sleepq_lookup(cvp) == NULL) { cvp->cv_waiters = 0; } else { if (cvp->cv_waiters < CV_WAITERS_BOUND) cvp->cv_waiters--; wakeup_swapper = sleepq_signal(cvp, SLEEPQ_CONDVAR, 0, 0); } } sleepq_release(cvp); if (wakeup_swapper) kick_proc0(); } /* * Broadcast a signal to a condition variable. Wakes up all waiting threads. 
* Should be called with the same mutex as was passed to cv_wait held. */ void cv_broadcastpri(struct cv *cvp, int pri) { int wakeup_swapper; if (cvp->cv_waiters == 0) return; /* * XXX sleepq_broadcast pri argument changed from -1 meaning * no pri to 0 meaning no pri. */ wakeup_swapper = 0; if (pri == -1) pri = 0; sleepq_lock(cvp); if (cvp->cv_waiters > 0) { cvp->cv_waiters = 0; wakeup_swapper = sleepq_broadcast(cvp, SLEEPQ_CONDVAR, pri, 0); } sleepq_release(cvp); if (wakeup_swapper) kick_proc0(); } Index: head/sys/kern/kern_conf.c =================================================================== --- head/sys/kern/kern_conf.c (revision 326270) +++ head/sys/kern/kern_conf.c (revision 326271) @@ -1,1564 +1,1566 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1999-2002 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_DEVT, "cdev", "cdev storage"); struct mtx devmtx; static void destroy_devl(struct cdev *dev); static int destroy_dev_sched_cbl(struct cdev *dev, void (*cb)(void *), void *arg); static void destroy_dev_tq(void *ctx, int pending); static int make_dev_credv(int flags, struct cdev **dres, struct cdevsw *devsw, int unit, struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt, va_list ap); static struct cdev_priv_list cdevp_free_list = TAILQ_HEAD_INITIALIZER(cdevp_free_list); static SLIST_HEAD(free_cdevsw, cdevsw) cdevsw_gt_post_list = SLIST_HEAD_INITIALIZER(cdevsw_gt_post_list); void dev_lock(void) { mtx_lock(&devmtx); } /* * Free all the memory collected while the cdev mutex was * locked. Since devmtx is after the system map mutex, free() cannot * be called immediately and is postponed until cdev mutex can be * dropped. */ static void dev_unlock_and_free(void) { struct cdev_priv_list cdp_free; struct free_cdevsw csw_free; struct cdev_priv *cdp; struct cdevsw *csw; mtx_assert(&devmtx, MA_OWNED); /* * Make the local copy of the list heads while the dev_mtx is * held. Free it later. 
*/ TAILQ_INIT(&cdp_free); TAILQ_CONCAT(&cdp_free, &cdevp_free_list, cdp_list); csw_free = cdevsw_gt_post_list; SLIST_INIT(&cdevsw_gt_post_list); mtx_unlock(&devmtx); while ((cdp = TAILQ_FIRST(&cdp_free)) != NULL) { TAILQ_REMOVE(&cdp_free, cdp, cdp_list); devfs_free(&cdp->cdp_c); } while ((csw = SLIST_FIRST(&csw_free)) != NULL) { SLIST_REMOVE_HEAD(&csw_free, d_postfree_list); free(csw, M_DEVT); } } static void dev_free_devlocked(struct cdev *cdev) { struct cdev_priv *cdp; mtx_assert(&devmtx, MA_OWNED); cdp = cdev2priv(cdev); KASSERT((cdp->cdp_flags & CDP_UNREF_DTR) == 0, ("destroy_dev() was not called after delist_dev(%p)", cdev)); TAILQ_INSERT_HEAD(&cdevp_free_list, cdp, cdp_list); } static void cdevsw_free_devlocked(struct cdevsw *csw) { mtx_assert(&devmtx, MA_OWNED); SLIST_INSERT_HEAD(&cdevsw_gt_post_list, csw, d_postfree_list); } void dev_unlock(void) { mtx_unlock(&devmtx); } void dev_ref(struct cdev *dev) { mtx_assert(&devmtx, MA_NOTOWNED); mtx_lock(&devmtx); dev->si_refcount++; mtx_unlock(&devmtx); } void dev_refl(struct cdev *dev) { mtx_assert(&devmtx, MA_OWNED); dev->si_refcount++; } void dev_rel(struct cdev *dev) { int flag = 0; mtx_assert(&devmtx, MA_NOTOWNED); dev_lock(); dev->si_refcount--; KASSERT(dev->si_refcount >= 0, ("dev_rel(%s) gave negative count", devtoname(dev))); #if 0 if (dev->si_usecount == 0 && (dev->si_flags & SI_CHEAPCLONE) && (dev->si_flags & SI_NAMED)) ; else #endif if (dev->si_devsw == NULL && dev->si_refcount == 0) { LIST_REMOVE(dev, si_list); flag = 1; } dev_unlock(); if (flag) devfs_free(dev); } struct cdevsw * dev_refthread(struct cdev *dev, int *ref) { struct cdevsw *csw; struct cdev_priv *cdp; mtx_assert(&devmtx, MA_NOTOWNED); if ((dev->si_flags & SI_ETERNAL) != 0) { *ref = 0; return (dev->si_devsw); } dev_lock(); csw = dev->si_devsw; if (csw != NULL) { cdp = cdev2priv(dev); if ((cdp->cdp_flags & CDP_SCHED_DTR) == 0) atomic_add_long(&dev->si_threadcount, 1); else csw = NULL; } dev_unlock(); *ref = 1; return (csw); } struct cdevsw * devvn_refthread(struct vnode *vp, struct cdev **devp, int *ref) { struct cdevsw *csw; struct cdev_priv *cdp; struct cdev *dev; mtx_assert(&devmtx, MA_NOTOWNED); if ((vp->v_vflag & VV_ETERNALDEV) != 0) { dev = vp->v_rdev; if (dev == NULL) return (NULL); KASSERT((dev->si_flags & SI_ETERNAL) != 0, ("Not eternal cdev")); *ref = 0; csw = dev->si_devsw; KASSERT(csw != NULL, ("Eternal cdev is destroyed")); *devp = dev; return (csw); } csw = NULL; dev_lock(); dev = vp->v_rdev; if (dev == NULL) { dev_unlock(); return (NULL); } cdp = cdev2priv(dev); if ((cdp->cdp_flags & CDP_SCHED_DTR) == 0) { csw = dev->si_devsw; if (csw != NULL) atomic_add_long(&dev->si_threadcount, 1); } dev_unlock(); if (csw != NULL) { *devp = dev; *ref = 1; } return (csw); } void dev_relthread(struct cdev *dev, int ref) { mtx_assert(&devmtx, MA_NOTOWNED); if (!ref) return; KASSERT(dev->si_threadcount > 0, ("%s threadcount is wrong", dev->si_name)); atomic_subtract_rel_long(&dev->si_threadcount, 1); } int nullop(void) { return (0); } int eopnotsupp(void) { return (EOPNOTSUPP); } static int enxio(void) { return (ENXIO); } static int enodev(void) { return (ENODEV); } /* Define a dead_cdevsw for use when devices leave unexpectedly. 
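*/

/*
 * The dead_cdevsw defined below is a method table whose entries all
 * fail, so a vanished device keeps absorbing calls safely instead of
 * chasing freed driver code.  A self-contained sketch of the same
 * pattern with a hypothetical two-method table:
 */
#include <stdio.h>

#define DEMO_ENXIO 6            /* mirrors errno ENXIO */

struct demo_ops {
        int (*open)(void);
        int (*read)(void);
};

static int demo_enxio(void) { return (DEMO_ENXIO); }
static int live_open(void) { return (0); }
static int live_read(void) { return (0); }

static const struct demo_ops live_ops = { live_open, live_read };
static const struct demo_ops dead_ops = { demo_enxio, demo_enxio };

int
main(void)
{
        const struct demo_ops *ops = &live_ops;

        printf("live open -> %d\n", ops->open());
        ops = &dead_ops;        /* device departed: swap in dead methods */
        printf("dead open -> %d\n", ops->open());
        return (0);
}

/*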
*/ #define dead_open (d_open_t *)enxio #define dead_close (d_close_t *)enxio #define dead_read (d_read_t *)enxio #define dead_write (d_write_t *)enxio #define dead_ioctl (d_ioctl_t *)enxio #define dead_poll (d_poll_t *)enodev #define dead_mmap (d_mmap_t *)enodev static void dead_strategy(struct bio *bp) { biofinish(bp, NULL, ENXIO); } #define dead_dump (dumper_t *)enxio #define dead_kqfilter (d_kqfilter_t *)enxio #define dead_mmap_single (d_mmap_single_t *)enodev static struct cdevsw dead_cdevsw = { .d_version = D_VERSION, .d_open = dead_open, .d_close = dead_close, .d_read = dead_read, .d_write = dead_write, .d_ioctl = dead_ioctl, .d_poll = dead_poll, .d_mmap = dead_mmap, .d_strategy = dead_strategy, .d_name = "dead", .d_dump = dead_dump, .d_kqfilter = dead_kqfilter, .d_mmap_single = dead_mmap_single }; /* Default methods if driver does not specify method */ #define null_open (d_open_t *)nullop #define null_close (d_close_t *)nullop #define no_read (d_read_t *)enodev #define no_write (d_write_t *)enodev #define no_ioctl (d_ioctl_t *)enodev #define no_mmap (d_mmap_t *)enodev #define no_kqfilter (d_kqfilter_t *)enodev #define no_mmap_single (d_mmap_single_t *)enodev static void no_strategy(struct bio *bp) { biofinish(bp, NULL, ENODEV); } static int no_poll(struct cdev *dev __unused, int events, struct thread *td __unused) { return (poll_no_poll(events)); } #define no_dump (dumper_t *)enodev static int giant_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_open(dev, oflags, devtype, td); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_fdopen(struct cdev *dev, int oflags, struct thread *td, struct file *fp) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_fdopen(dev, oflags, td, fp); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_close(dev, fflag, devtype, td); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static void giant_strategy(struct bio *bp) { struct cdevsw *dsw; struct cdev *dev; int ref; dev = bp->bio_dev; dsw = dev_refthread(dev, &ref); if (dsw == NULL) { biofinish(bp, NULL, ENXIO); return; } mtx_lock(&Giant); dsw->d_gianttrick->d_strategy(bp); mtx_unlock(&Giant); dev_relthread(dev, ref); } static int giant_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_ioctl(dev, cmd, data, fflag, td); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_read(struct cdev *dev, struct uio *uio, int ioflag) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_read(dev, uio, ioflag); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_write(struct cdev *dev, struct uio *uio, int ioflag) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); 
mtx_lock(&Giant); retval = dsw->d_gianttrick->d_write(dev, uio, ioflag); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_poll(struct cdev *dev, int events, struct thread *td) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_poll(dev, events, td); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_kqfilter(struct cdev *dev, struct knote *kn) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_kqfilter(dev, kn); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_mmap(dev, offset, paddr, nprot, memattr); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_mmap_single(struct cdev *dev, vm_ooffset_t *offset, vm_size_t size, vm_object_t *object, int nprot) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_mmap_single(dev, offset, size, object, nprot); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static void notify(struct cdev *dev, const char *ev, int flags) { static const char prefix[] = "cdev="; char *data; int namelen, mflags; if (cold) return; mflags = (flags & MAKEDEV_NOWAIT) ? M_NOWAIT : M_WAITOK; namelen = strlen(dev->si_name); data = malloc(namelen + sizeof(prefix), M_TEMP, mflags); if (data == NULL) return; memcpy(data, prefix, sizeof(prefix) - 1); memcpy(data + sizeof(prefix) - 1, dev->si_name, namelen + 1); devctl_notify_f("DEVFS", "CDEV", ev, data, mflags); free(data, M_TEMP); } static void notify_create(struct cdev *dev, int flags) { notify(dev, "CREATE", flags); } static void notify_destroy(struct cdev *dev) { notify(dev, "DESTROY", MAKEDEV_WAITOK); } static struct cdev * newdev(struct make_dev_args *args, struct cdev *si) { struct cdev *si2; struct cdevsw *csw; mtx_assert(&devmtx, MA_OWNED); csw = args->mda_devsw; if (csw->d_flags & D_NEEDMINOR) { /* We may want to return an existing device */ LIST_FOREACH(si2, &csw->d_devs, si_list) { if (dev2unit(si2) == args->mda_unit) { dev_free_devlocked(si); return (si2); } } } si->si_drv0 = args->mda_unit; si->si_devsw = csw; si->si_drv1 = args->mda_si_drv1; si->si_drv2 = args->mda_si_drv2; LIST_INSERT_HEAD(&csw->d_devs, si, si_list); return (si); } static void fini_cdevsw(struct cdevsw *devsw) { struct cdevsw *gt; if (devsw->d_gianttrick != NULL) { gt = devsw->d_gianttrick; memcpy(devsw, gt, sizeof *devsw); cdevsw_free_devlocked(gt); devsw->d_gianttrick = NULL; } devsw->d_flags &= ~D_INIT; } static int prep_cdevsw(struct cdevsw *devsw, int flags) { struct cdevsw *dsw2; mtx_assert(&devmtx, MA_OWNED); if (devsw->d_flags & D_INIT) return (0); if (devsw->d_flags & D_NEEDGIANT) { dev_unlock(); dsw2 = malloc(sizeof *dsw2, M_DEVT, (flags & MAKEDEV_NOWAIT) ? 
M_NOWAIT : M_WAITOK); dev_lock(); if (dsw2 == NULL && !(devsw->d_flags & D_INIT)) return (ENOMEM); } else dsw2 = NULL; if (devsw->d_flags & D_INIT) { if (dsw2 != NULL) cdevsw_free_devlocked(dsw2); return (0); } if (devsw->d_version != D_VERSION_03) { printf( "WARNING: Device driver \"%s\" has wrong version %s\n", devsw->d_name == NULL ? "???" : devsw->d_name, "and is disabled. Recompile KLD module."); devsw->d_open = dead_open; devsw->d_close = dead_close; devsw->d_read = dead_read; devsw->d_write = dead_write; devsw->d_ioctl = dead_ioctl; devsw->d_poll = dead_poll; devsw->d_mmap = dead_mmap; devsw->d_mmap_single = dead_mmap_single; devsw->d_strategy = dead_strategy; devsw->d_dump = dead_dump; devsw->d_kqfilter = dead_kqfilter; } if (devsw->d_flags & D_NEEDGIANT) { if (devsw->d_gianttrick == NULL) { memcpy(dsw2, devsw, sizeof *dsw2); devsw->d_gianttrick = dsw2; dsw2 = NULL; } } #define FIXUP(member, noop, giant) \ do { \ if (devsw->member == NULL) { \ devsw->member = noop; \ } else if (devsw->d_flags & D_NEEDGIANT) \ devsw->member = giant; \ } \ while (0) FIXUP(d_open, null_open, giant_open); FIXUP(d_fdopen, NULL, giant_fdopen); FIXUP(d_close, null_close, giant_close); FIXUP(d_read, no_read, giant_read); FIXUP(d_write, no_write, giant_write); FIXUP(d_ioctl, no_ioctl, giant_ioctl); FIXUP(d_poll, no_poll, giant_poll); FIXUP(d_mmap, no_mmap, giant_mmap); FIXUP(d_strategy, no_strategy, giant_strategy); FIXUP(d_kqfilter, no_kqfilter, giant_kqfilter); FIXUP(d_mmap_single, no_mmap_single, giant_mmap_single); if (devsw->d_dump == NULL) devsw->d_dump = no_dump; LIST_INIT(&devsw->d_devs); devsw->d_flags |= D_INIT; if (dsw2 != NULL) cdevsw_free_devlocked(dsw2); return (0); } static int prep_devname(struct cdev *dev, const char *fmt, va_list ap) { int len; char *from, *q, *s, *to; mtx_assert(&devmtx, MA_OWNED); len = vsnrprintf(dev->si_name, sizeof(dev->si_name), 32, fmt, ap); if (len > sizeof(dev->si_name) - 1) return (ENAMETOOLONG); /* Strip leading slashes. */ for (from = dev->si_name; *from == '/'; from++) ; for (to = dev->si_name; *from != '\0'; from++, to++) { /* * Spaces and double quotation marks cause * problems for the devctl(4) protocol. * Reject names containing those characters. */ if (isspace(*from) || *from == '"') return (EINVAL); /* Treat multiple sequential slashes as single. */ while (from[0] == '/' && from[1] == '/') from++; /* Trailing slash is considered invalid. */ if (from[0] == '/' && from[1] == '\0') return (EINVAL); *to = *from; } *to = '\0'; if (dev->si_name[0] == '\0') return (EINVAL); /* Disallow "." and ".." components. */ for (s = dev->si_name;;) { for (q = s; *q != '/' && *q != '\0'; q++) ; if (q - s == 1 && s[0] == '.') return (EINVAL); if (q - s == 2 && s[0] == '.' 
&& s[1] == '.') return (EINVAL); if (*q != '/') break; s = q + 1; } if (devfs_dev_exists(dev->si_name) != 0) return (EEXIST); return (0); } void make_dev_args_init_impl(struct make_dev_args *args, size_t sz) { bzero(args, sz); args->mda_size = sz; } static int make_dev_sv(struct make_dev_args *args1, struct cdev **dres, const char *fmt, va_list ap) { struct cdev *dev, *dev_new; struct make_dev_args args; int res; bzero(&args, sizeof(args)); if (sizeof(args) < args1->mda_size) return (EINVAL); bcopy(args1, &args, args1->mda_size); KASSERT((args.mda_flags & MAKEDEV_WAITOK) == 0 || (args.mda_flags & MAKEDEV_NOWAIT) == 0, ("make_dev_sv: both WAITOK and NOWAIT specified")); dev_new = devfs_alloc(args.mda_flags); if (dev_new == NULL) return (ENOMEM); dev_lock(); res = prep_cdevsw(args.mda_devsw, args.mda_flags); if (res != 0) { dev_unlock(); devfs_free(dev_new); return (res); } dev = newdev(&args, dev_new); if ((dev->si_flags & SI_NAMED) == 0) { res = prep_devname(dev, fmt, ap); if (res != 0) { if ((args.mda_flags & MAKEDEV_CHECKNAME) == 0) { panic( "make_dev_sv: bad si_name (error=%d, si_name=%s)", res, dev->si_name); } if (dev == dev_new) { LIST_REMOVE(dev, si_list); dev_unlock(); devfs_free(dev); } else dev_unlock(); return (res); } } if ((args.mda_flags & MAKEDEV_REF) != 0) dev_refl(dev); if ((args.mda_flags & MAKEDEV_ETERNAL) != 0) dev->si_flags |= SI_ETERNAL; if (dev->si_flags & SI_CHEAPCLONE && dev->si_flags & SI_NAMED) { /* * This is allowed as it removes races and generally * simplifies cloning devices. * XXX: still ?? */ dev_unlock_and_free(); *dres = dev; return (0); } KASSERT(!(dev->si_flags & SI_NAMED), ("make_dev() by driver %s on pre-existing device (min=%x, name=%s)", args.mda_devsw->d_name, dev2unit(dev), devtoname(dev))); dev->si_flags |= SI_NAMED; if (args.mda_cr != NULL) dev->si_cred = crhold(args.mda_cr); dev->si_uid = args.mda_uid; dev->si_gid = args.mda_gid; dev->si_mode = args.mda_mode; devfs_create(dev); clean_unrhdrl(devfs_inos); dev_unlock_and_free(); notify_create(dev, args.mda_flags); *dres = dev; return (0); } int make_dev_s(struct make_dev_args *args, struct cdev **dres, const char *fmt, ...) { va_list ap; int res; va_start(ap, fmt); res = make_dev_sv(args, dres, fmt, ap); va_end(ap); return (res); } static int make_dev_credv(int flags, struct cdev **dres, struct cdevsw *devsw, int unit, struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt, va_list ap) { struct make_dev_args args; make_dev_args_init(&args); args.mda_flags = flags; args.mda_devsw = devsw; args.mda_cr = cr; args.mda_uid = uid; args.mda_gid = gid; args.mda_mode = mode; args.mda_unit = unit; return (make_dev_sv(&args, dres, fmt, ap)); } struct cdev * make_dev(struct cdevsw *devsw, int unit, uid_t uid, gid_t gid, int mode, const char *fmt, ...) { struct cdev *dev; va_list ap; int res; va_start(ap, fmt); res = make_dev_credv(0, &dev, devsw, unit, NULL, uid, gid, mode, fmt, ap); va_end(ap); KASSERT(res == 0 && dev != NULL, ("make_dev: failed make_dev_credv (error=%d)", res)); return (dev); } struct cdev * make_dev_cred(struct cdevsw *devsw, int unit, struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt, ...) 
{ struct cdev *dev; va_list ap; int res; va_start(ap, fmt); res = make_dev_credv(0, &dev, devsw, unit, cr, uid, gid, mode, fmt, ap); va_end(ap); KASSERT(res == 0 && dev != NULL, ("make_dev_cred: failed make_dev_credv (error=%d)", res)); return (dev); } struct cdev * make_dev_credf(int flags, struct cdevsw *devsw, int unit, struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt, ...) { struct cdev *dev; va_list ap; int res; va_start(ap, fmt); res = make_dev_credv(flags, &dev, devsw, unit, cr, uid, gid, mode, fmt, ap); va_end(ap); KASSERT(((flags & MAKEDEV_NOWAIT) != 0 && res == ENOMEM) || ((flags & MAKEDEV_CHECKNAME) != 0 && res != ENOMEM) || res == 0, ("make_dev_credf: failed make_dev_credv (error=%d)", res)); return (res == 0 ? dev : NULL); } int make_dev_p(int flags, struct cdev **cdev, struct cdevsw *devsw, struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt, ...) { va_list ap; int res; va_start(ap, fmt); res = make_dev_credv(flags, cdev, devsw, 0, cr, uid, gid, mode, fmt, ap); va_end(ap); KASSERT(((flags & MAKEDEV_NOWAIT) != 0 && res == ENOMEM) || ((flags & MAKEDEV_CHECKNAME) != 0 && res != ENOMEM) || res == 0, ("make_dev_p: failed make_dev_credv (error=%d)", res)); return (res); } static void dev_dependsl(struct cdev *pdev, struct cdev *cdev) { cdev->si_parent = pdev; cdev->si_flags |= SI_CHILD; LIST_INSERT_HEAD(&pdev->si_children, cdev, si_siblings); } void dev_depends(struct cdev *pdev, struct cdev *cdev) { dev_lock(); dev_dependsl(pdev, cdev); dev_unlock(); } static int make_dev_alias_v(int flags, struct cdev **cdev, struct cdev *pdev, const char *fmt, va_list ap) { struct cdev *dev; int error; KASSERT(pdev != NULL, ("make_dev_alias_v: pdev is NULL")); KASSERT((flags & MAKEDEV_WAITOK) == 0 || (flags & MAKEDEV_NOWAIT) == 0, ("make_dev_alias_v: both WAITOK and NOWAIT specified")); KASSERT((flags & ~(MAKEDEV_WAITOK | MAKEDEV_NOWAIT | MAKEDEV_CHECKNAME)) == 0, ("make_dev_alias_v: invalid flags specified (flags=%02x)", flags)); dev = devfs_alloc(flags); if (dev == NULL) return (ENOMEM); dev_lock(); dev->si_flags |= SI_ALIAS; error = prep_devname(dev, fmt, ap); if (error != 0) { if ((flags & MAKEDEV_CHECKNAME) == 0) { panic("make_dev_alias_v: bad si_name " "(error=%d, si_name=%s)", error, dev->si_name); } dev_unlock(); devfs_free(dev); return (error); } dev->si_flags |= SI_NAMED; devfs_create(dev); dev_dependsl(pdev, dev); clean_unrhdrl(devfs_inos); dev_unlock(); notify_create(dev, flags); *cdev = dev; return (0); } struct cdev * make_dev_alias(struct cdev *pdev, const char *fmt, ...) { struct cdev *dev; va_list ap; int res; va_start(ap, fmt); res = make_dev_alias_v(MAKEDEV_WAITOK, &dev, pdev, fmt, ap); va_end(ap); KASSERT(res == 0 && dev != NULL, ("make_dev_alias: failed make_dev_alias_v (error=%d)", res)); return (dev); } int make_dev_alias_p(int flags, struct cdev **cdev, struct cdev *pdev, const char *fmt, ...) 
{ va_list ap; int res; va_start(ap, fmt); res = make_dev_alias_v(flags, cdev, pdev, fmt, ap); va_end(ap); return (res); } int make_dev_physpath_alias(int flags, struct cdev **cdev, struct cdev *pdev, struct cdev *old_alias, const char *physpath) { char *devfspath; int physpath_len; int max_parentpath_len; int parentpath_len; int devfspathbuf_len; int mflags; int ret; *cdev = NULL; devfspath = NULL; physpath_len = strlen(physpath); ret = EINVAL; if (physpath_len == 0) goto out; if (strncmp("id1,", physpath, 4) == 0) { physpath += 4; physpath_len -= 4; if (physpath_len == 0) goto out; } max_parentpath_len = SPECNAMELEN - physpath_len - /*/*/1; parentpath_len = strlen(pdev->si_name); if (max_parentpath_len < parentpath_len) { if (bootverbose) printf("WARNING: Unable to alias %s " "to %s/%s - path too long\n", pdev->si_name, physpath, pdev->si_name); ret = ENAMETOOLONG; goto out; } mflags = (flags & MAKEDEV_NOWAIT) ? M_NOWAIT : M_WAITOK; devfspathbuf_len = physpath_len + /*/*/1 + parentpath_len + /*NUL*/1; devfspath = malloc(devfspathbuf_len, M_DEVBUF, mflags); if (devfspath == NULL) { ret = ENOMEM; goto out; } sprintf(devfspath, "%s/%s", physpath, pdev->si_name); if (old_alias != NULL && strcmp(old_alias->si_name, devfspath) == 0) { /* Retain the existing alias. */ *cdev = old_alias; old_alias = NULL; ret = 0; } else { ret = make_dev_alias_p(flags, cdev, pdev, "%s", devfspath); } out: if (old_alias != NULL) destroy_dev(old_alias); if (devfspath != NULL) free(devfspath, M_DEVBUF); return (ret); } static void destroy_devl(struct cdev *dev) { struct cdevsw *csw; struct cdev_privdata *p; struct cdev_priv *cdp; mtx_assert(&devmtx, MA_OWNED); KASSERT(dev->si_flags & SI_NAMED, ("WARNING: Driver mistake: destroy_dev on %d\n", dev2unit(dev))); KASSERT((dev->si_flags & SI_ETERNAL) == 0, ("WARNING: Driver mistake: destroy_dev on eternal %d\n", dev2unit(dev))); cdp = cdev2priv(dev); if ((cdp->cdp_flags & CDP_UNREF_DTR) == 0) { /* * Avoid race with dev_rel(), e.g. from the populate * loop. If CDP_UNREF_DTR flag is set, the reference * to be dropped at the end of destroy_devl() was * already taken by delist_dev_locked(). 
*/ dev_refl(dev); devfs_destroy(dev); } /* Remove name marking */ dev->si_flags &= ~SI_NAMED; /* If we are a child, remove us from the parents list */ if (dev->si_flags & SI_CHILD) { LIST_REMOVE(dev, si_siblings); dev->si_flags &= ~SI_CHILD; } /* Kill our children */ while (!LIST_EMPTY(&dev->si_children)) destroy_devl(LIST_FIRST(&dev->si_children)); /* Remove from clone list */ if (dev->si_flags & SI_CLONELIST) { LIST_REMOVE(dev, si_clone); dev->si_flags &= ~SI_CLONELIST; } csw = dev->si_devsw; dev->si_devsw = NULL; /* already NULL for SI_ALIAS */ while (csw != NULL && csw->d_purge != NULL && dev->si_threadcount) { csw->d_purge(dev); msleep(csw, &devmtx, PRIBIO, "devprg", hz/10); if (dev->si_threadcount) printf("Still %lu threads in %s\n", dev->si_threadcount, devtoname(dev)); } while (dev->si_threadcount != 0) { /* Use unique dummy wait ident */ msleep(&csw, &devmtx, PRIBIO, "devdrn", hz / 10); } dev_unlock(); if ((cdp->cdp_flags & CDP_UNREF_DTR) == 0) { /* avoid out of order notify events */ notify_destroy(dev); } mtx_lock(&cdevpriv_mtx); while ((p = LIST_FIRST(&cdp->cdp_fdpriv)) != NULL) { devfs_destroy_cdevpriv(p); mtx_lock(&cdevpriv_mtx); } mtx_unlock(&cdevpriv_mtx); dev_lock(); dev->si_drv1 = 0; dev->si_drv2 = 0; bzero(&dev->__si_u, sizeof(dev->__si_u)); if (!(dev->si_flags & SI_ALIAS)) { /* Remove from cdevsw list */ LIST_REMOVE(dev, si_list); /* If cdevsw has no more struct cdev *'s, clean it */ if (LIST_EMPTY(&csw->d_devs)) { fini_cdevsw(csw); wakeup(&csw->d_devs); } } dev->si_flags &= ~SI_ALIAS; cdp->cdp_flags &= ~CDP_UNREF_DTR; dev->si_refcount--; if (dev->si_refcount > 0) LIST_INSERT_HEAD(&dead_cdevsw.d_devs, dev, si_list); else dev_free_devlocked(dev); } static void delist_dev_locked(struct cdev *dev) { struct cdev_priv *cdp; struct cdev *child; mtx_assert(&devmtx, MA_OWNED); cdp = cdev2priv(dev); if ((cdp->cdp_flags & CDP_UNREF_DTR) != 0) return; cdp->cdp_flags |= CDP_UNREF_DTR; dev_refl(dev); devfs_destroy(dev); LIST_FOREACH(child, &dev->si_children, si_siblings) delist_dev_locked(child); dev_unlock(); /* ensure the destroy event is queued in order */ notify_destroy(dev); dev_lock(); } /* * This function will delist a character device and its children from * the directory listing and create a destroy event without waiting * for all character device references to go away. At some later point * destroy_dev() must be called to complete the character device * destruction. After calling this function the character device name * can instantly be re-used. */ void delist_dev(struct cdev *dev) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "delist_dev"); dev_lock(); delist_dev_locked(dev); dev_unlock(); } void destroy_dev(struct cdev *dev) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "destroy_dev"); dev_lock(); destroy_devl(dev); dev_unlock_and_free(); } const char * devtoname(struct cdev *dev) { return (dev->si_name); } int dev_stdclone(char *name, char **namep, const char *stem, int *unit) { int u, i; i = strlen(stem); if (bcmp(stem, name, i) != 0) return (0); if (!isdigit(name[i])) return (0); u = 0; if (name[i] == '0' && isdigit(name[i+1])) return (0); while (isdigit(name[i])) { u *= 10; u += name[i++] - '0'; } if (u > 0xffffff) return (0); *unit = u; if (namep) *namep = &name[i]; if (name[i]) return (2); return (1); } /* * Helper functions for cloning device drivers. * * The objective here is to make it unnecessary for the device drivers to * use rman or similar to manage their unit number space. 
Due to the way * we do "on-demand" devices, using rman or other "private" methods * will be very tricky to lock down properly once we lock down this file. * * Instead we give the drivers these routines which puts the struct cdev *'s * that are to be managed on their own list, and gives the driver the ability * to ask for the first free unit number or a given specified unit number. * * In addition these routines support paired devices (pty, nmdm and similar) * by respecting a number of "flag" bits in the minor number. * */ struct clonedevs { LIST_HEAD(,cdev) head; }; void clone_setup(struct clonedevs **cdp) { *cdp = malloc(sizeof **cdp, M_DEVBUF, M_WAITOK | M_ZERO); LIST_INIT(&(*cdp)->head); } int clone_create(struct clonedevs **cdp, struct cdevsw *csw, int *up, struct cdev **dp, int extra) { struct clonedevs *cd; struct cdev *dev, *ndev, *dl, *de; struct make_dev_args args; int unit, low, u; KASSERT(*cdp != NULL, ("clone_setup() not called in driver \"%s\"", csw->d_name)); KASSERT(!(extra & CLONE_UNITMASK), ("Illegal extra bits (0x%x) in clone_create", extra)); KASSERT(*up <= CLONE_UNITMASK, ("Too high unit (0x%x) in clone_create", *up)); KASSERT(csw->d_flags & D_NEEDMINOR, ("clone_create() on cdevsw without minor numbers")); /* * Search the list for a lot of things in one go: * A preexisting match is returned immediately. * The lowest free unit number if we are passed -1, and the place * in the list where we should insert that new element. * The place to insert a specified unit number, if applicable * the end of the list. */ unit = *up; ndev = devfs_alloc(MAKEDEV_WAITOK); dev_lock(); prep_cdevsw(csw, MAKEDEV_WAITOK); low = extra; de = dl = NULL; cd = *cdp; LIST_FOREACH(dev, &cd->head, si_clone) { KASSERT(dev->si_flags & SI_CLONELIST, ("Dev %p(%s) should be on clonelist", dev, dev->si_name)); u = dev2unit(dev); if (u == (unit | extra)) { *dp = dev; dev_unlock(); devfs_free(ndev); return (0); } if (unit == -1 && u == low) { low++; de = dev; continue; } else if (u < (unit | extra)) { de = dev; continue; } else if (u > (unit | extra)) { dl = dev; break; } } if (unit == -1) unit = low & CLONE_UNITMASK; make_dev_args_init(&args); args.mda_unit = unit | extra; args.mda_devsw = csw; dev = newdev(&args, ndev); if (dev->si_flags & SI_CLONELIST) { printf("dev %p (%s) is on clonelist\n", dev, dev->si_name); printf("unit=%d, low=%d, extra=0x%x\n", unit, low, extra); LIST_FOREACH(dev, &cd->head, si_clone) { printf("\t%p %s\n", dev, dev->si_name); } panic("foo"); } KASSERT(!(dev->si_flags & SI_CLONELIST), ("Dev %p(%s) should not be on clonelist", dev, dev->si_name)); if (dl != NULL) LIST_INSERT_BEFORE(dl, dev, si_clone); else if (de != NULL) LIST_INSERT_AFTER(de, dev, si_clone); else LIST_INSERT_HEAD(&cd->head, dev, si_clone); dev->si_flags |= SI_CLONELIST; *up = unit; dev_unlock_and_free(); return (1); } /* * Kill everything still on the list. The driver should already have * disposed of any softc hung of the struct cdev *'s at this time. 
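*/

/*
 * clone_create() above finds the lowest free unit with one pass over
 * the unit-sorted clone list: while the list matches 0, 1, 2, ... the
 * candidate is bumped past each element, and the first gap wins.  The
 * same scan over a plain sorted array, as a runnable sketch (the
 * per-device "extra" flag bits are omitted for clarity):
 */
#include <stdio.h>

int
main(void)
{
        int used[] = { 0, 1, 2, 4, 7 }; /* units in use, sorted */
        int n = sizeof(used) / sizeof(used[0]);
        int low = 0;                    /* candidate unit */

        for (int i = 0; i < n; i++) {
                if (used[i] == low)
                        low++;          /* taken: try the next unit */
                else if (used[i] > low)
                        break;          /* gap found: low is free */
        }
        printf("lowest free unit: %d\n", low);  /* prints 3 */
        return (0);
}

/*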
*/ void clone_cleanup(struct clonedevs **cdp) { struct cdev *dev; struct cdev_priv *cp; struct clonedevs *cd; cd = *cdp; if (cd == NULL) return; dev_lock(); while (!LIST_EMPTY(&cd->head)) { dev = LIST_FIRST(&cd->head); LIST_REMOVE(dev, si_clone); KASSERT(dev->si_flags & SI_CLONELIST, ("Dev %p(%s) should be on clonelist", dev, dev->si_name)); dev->si_flags &= ~SI_CLONELIST; cp = cdev2priv(dev); if (!(cp->cdp_flags & CDP_SCHED_DTR)) { cp->cdp_flags |= CDP_SCHED_DTR; KASSERT(dev->si_flags & SI_NAMED, ("Driver has goofed in cloning underways udev %jx unit %x", (uintmax_t)dev2udev(dev), dev2unit(dev))); destroy_devl(dev); } } dev_unlock_and_free(); free(cd, M_DEVBUF); *cdp = NULL; } static TAILQ_HEAD(, cdev_priv) dev_ddtr = TAILQ_HEAD_INITIALIZER(dev_ddtr); static struct task dev_dtr_task = TASK_INITIALIZER(0, destroy_dev_tq, NULL); static void destroy_dev_tq(void *ctx, int pending) { struct cdev_priv *cp; struct cdev *dev; void (*cb)(void *); void *cb_arg; dev_lock(); while (!TAILQ_EMPTY(&dev_ddtr)) { cp = TAILQ_FIRST(&dev_ddtr); dev = &cp->cdp_c; KASSERT(cp->cdp_flags & CDP_SCHED_DTR, ("cdev %p in dev_destroy_tq without CDP_SCHED_DTR", cp)); TAILQ_REMOVE(&dev_ddtr, cp, cdp_dtr_list); cb = cp->cdp_dtr_cb; cb_arg = cp->cdp_dtr_cb_arg; destroy_devl(dev); dev_unlock_and_free(); dev_rel(dev); if (cb != NULL) cb(cb_arg); dev_lock(); } dev_unlock(); } /* * devmtx shall be locked on entry. devmtx will be unlocked after * function return. */ static int destroy_dev_sched_cbl(struct cdev *dev, void (*cb)(void *), void *arg) { struct cdev_priv *cp; mtx_assert(&devmtx, MA_OWNED); cp = cdev2priv(dev); if (cp->cdp_flags & CDP_SCHED_DTR) { dev_unlock(); return (0); } dev_refl(dev); cp->cdp_flags |= CDP_SCHED_DTR; cp->cdp_dtr_cb = cb; cp->cdp_dtr_cb_arg = arg; TAILQ_INSERT_TAIL(&dev_ddtr, cp, cdp_dtr_list); dev_unlock(); taskqueue_enqueue(taskqueue_swi_giant, &dev_dtr_task); return (1); } int destroy_dev_sched_cb(struct cdev *dev, void (*cb)(void *), void *arg) { dev_lock(); return (destroy_dev_sched_cbl(dev, cb, arg)); } int destroy_dev_sched(struct cdev *dev) { return (destroy_dev_sched_cb(dev, NULL, NULL)); } void destroy_dev_drain(struct cdevsw *csw) { dev_lock(); while (!LIST_EMPTY(&csw->d_devs)) { msleep(&csw->d_devs, &devmtx, PRIBIO, "devscd", hz/10); } dev_unlock(); } void drain_dev_clone_events(void) { sx_xlock(&clone_drain_lock); sx_xunlock(&clone_drain_lock); } #include "opt_ddb.h" #ifdef DDB #include #include DB_SHOW_COMMAND(cdev, db_show_cdev) { struct cdev_priv *cdp; struct cdev *dev; u_int flags; char buf[512]; if (!have_addr) { TAILQ_FOREACH(cdp, &cdevp_list, cdp_list) { dev = &cdp->cdp_c; db_printf("%s %p\n", dev->si_name, dev); if (db_pager_quit) break; } return; } dev = (struct cdev *)addr; cdp = cdev2priv(dev); db_printf("dev %s ref %d use %ld thr %ld inuse %u fdpriv %p\n", dev->si_name, dev->si_refcount, dev->si_usecount, dev->si_threadcount, cdp->cdp_inuse, cdp->cdp_fdpriv.lh_first); db_printf("devsw %p si_drv0 %d si_drv1 %p si_drv2 %p\n", dev->si_devsw, dev->si_drv0, dev->si_drv1, dev->si_drv2); flags = dev->si_flags; #define SI_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 3, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) buf[0] = '\0'; SI_FLAG(SI_ETERNAL); SI_FLAG(SI_ALIAS); SI_FLAG(SI_NAMED); SI_FLAG(SI_CHEAPCLONE); SI_FLAG(SI_CHILD); SI_FLAG(SI_DUMPDEV); SI_FLAG(SI_CLONELIST); db_printf("si_flags %s\n", buf); flags = cdp->cdp_flags; #define CDP_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != 
'\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 4, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) buf[0] = '\0'; CDP_FLAG(CDP_ACTIVE); CDP_FLAG(CDP_SCHED_DTR); db_printf("cdp_flags %s\n", buf); } #endif Index: head/sys/kern/kern_context.c =================================================================== --- head/sys/kern/kern_context.c (revision 326270) +++ head/sys/kern/kern_context.c (revision 326271) @@ -1,129 +1,131 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002 Daniel M. Eischen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include /* * The first two fields of a ucontext_t are the signal mask and the machine * context. The next field is uc_link; we want to avoid destroying the link * when copying out contexts. */ #define UC_COPY_SIZE offsetof(ucontext_t, uc_link) #ifndef _SYS_SYSPROTO_H_ struct getcontext_args { struct __ucontext *ucp; } struct setcontext_args { const struct __ucontext_t *ucp; } struct swapcontext_args { struct __ucontext *oucp; const struct __ucontext_t *ucp; } #endif int sys_getcontext(struct thread *td, struct getcontext_args *uap) { ucontext_t uc; int ret; if (uap->ucp == NULL) ret = EINVAL; else { get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); PROC_LOCK(td->td_proc); uc.uc_sigmask = td->td_sigmask; PROC_UNLOCK(td->td_proc); bzero(uc.__spare__, sizeof(uc.__spare__)); ret = copyout(&uc, uap->ucp, UC_COPY_SIZE); } return (ret); } int sys_setcontext(struct thread *td, struct setcontext_args *uap) { ucontext_t uc; int ret; if (uap->ucp == NULL) ret = EINVAL; else { ret = copyin(uap->ucp, &uc, UC_COPY_SIZE); if (ret == 0) { ret = set_mcontext(td, &uc.uc_mcontext); if (ret == 0) { kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); } } } return (ret == 0 ? 
EJUSTRETURN : ret); } int sys_swapcontext(struct thread *td, struct swapcontext_args *uap) { ucontext_t uc; int ret; if (uap->oucp == NULL || uap->ucp == NULL) ret = EINVAL; else { get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); bzero(uc.__spare__, sizeof(uc.__spare__)); PROC_LOCK(td->td_proc); uc.uc_sigmask = td->td_sigmask; PROC_UNLOCK(td->td_proc); ret = copyout(&uc, uap->oucp, UC_COPY_SIZE); if (ret == 0) { ret = copyin(uap->ucp, &uc, UC_COPY_SIZE); if (ret == 0) { ret = set_mcontext(td, &uc.uc_mcontext); if (ret == 0) { kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); } } } } return (ret == 0 ? EJUSTRETURN : ret); } Index: head/sys/kern/kern_cpu.c =================================================================== --- head/sys/kern/kern_cpu.c (revision 326270) +++ head/sys/kern/kern_cpu.c (revision 326271) @@ -1,1067 +1,1069 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004-2007 Nate Lawson (SDG) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "cpufreq_if.h" /* * Common CPU frequency glue code. Drivers for specific hardware can * attach this interface to allow users to get/set the CPU frequency. */ /* * Number of levels we can handle. Levels are synthesized from settings * so for M settings and N drivers, there may be M*N levels. */ #define CF_MAX_LEVELS 64 struct cf_saved_freq { struct cf_level level; int priority; SLIST_ENTRY(cf_saved_freq) link; }; struct cpufreq_softc { struct sx lock; struct cf_level curr_level; int curr_priority; SLIST_HEAD(, cf_saved_freq) saved_freq; struct cf_level_lst all_levels; int all_count; int max_mhz; device_t dev; struct sysctl_ctx_list sysctl_ctx; struct task startup_task; struct cf_level *levels_buf; }; struct cf_setting_array { struct cf_setting sets[MAX_SETTINGS]; int count; TAILQ_ENTRY(cf_setting_array) link; }; TAILQ_HEAD(cf_setting_lst, cf_setting_array); #define CF_MTX_INIT(x) sx_init((x), "cpufreq lock") #define CF_MTX_LOCK(x) sx_xlock((x)) #define CF_MTX_UNLOCK(x) sx_xunlock((x)) #define CF_MTX_ASSERT(x) sx_assert((x), SX_XLOCKED) #define CF_DEBUG(msg...) 
do { \ if (cf_verbose) \ printf("cpufreq: " msg); \ } while (0) static int cpufreq_attach(device_t dev); static void cpufreq_startup_task(void *ctx, int pending); static int cpufreq_detach(device_t dev); static int cf_set_method(device_t dev, const struct cf_level *level, int priority); static int cf_get_method(device_t dev, struct cf_level *level); static int cf_levels_method(device_t dev, struct cf_level *levels, int *count); static int cpufreq_insert_abs(struct cpufreq_softc *sc, struct cf_setting *sets, int count); static int cpufreq_expand_set(struct cpufreq_softc *sc, struct cf_setting_array *set_arr); static struct cf_level *cpufreq_dup_set(struct cpufreq_softc *sc, struct cf_level *dup, struct cf_setting *set); static int cpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS); static int cpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS); static int cpufreq_settings_sysctl(SYSCTL_HANDLER_ARGS); static device_method_t cpufreq_methods[] = { DEVMETHOD(device_probe, bus_generic_probe), DEVMETHOD(device_attach, cpufreq_attach), DEVMETHOD(device_detach, cpufreq_detach), DEVMETHOD(cpufreq_set, cf_set_method), DEVMETHOD(cpufreq_get, cf_get_method), DEVMETHOD(cpufreq_levels, cf_levels_method), {0, 0} }; static driver_t cpufreq_driver = { "cpufreq", cpufreq_methods, sizeof(struct cpufreq_softc) }; static devclass_t cpufreq_dc; DRIVER_MODULE(cpufreq, cpu, cpufreq_driver, cpufreq_dc, 0, 0); static int cf_lowest_freq; static int cf_verbose; static SYSCTL_NODE(_debug, OID_AUTO, cpufreq, CTLFLAG_RD, NULL, "cpufreq debugging"); SYSCTL_INT(_debug_cpufreq, OID_AUTO, lowest, CTLFLAG_RWTUN, &cf_lowest_freq, 1, "Don't provide levels below this frequency."); SYSCTL_INT(_debug_cpufreq, OID_AUTO, verbose, CTLFLAG_RWTUN, &cf_verbose, 1, "Print verbose debugging messages"); static int cpufreq_attach(device_t dev) { struct cpufreq_softc *sc; struct pcpu *pc; device_t parent; uint64_t rate; int numdevs; CF_DEBUG("initializing %s\n", device_get_nameunit(dev)); sc = device_get_softc(dev); parent = device_get_parent(dev); sc->dev = dev; sysctl_ctx_init(&sc->sysctl_ctx); TAILQ_INIT(&sc->all_levels); CF_MTX_INIT(&sc->lock); sc->curr_level.total_set.freq = CPUFREQ_VAL_UNKNOWN; SLIST_INIT(&sc->saved_freq); /* Try to get nominal CPU freq to use it as maximum later if needed */ sc->max_mhz = cpu_get_nominal_mhz(dev); /* If that fails, try to measure the current rate */ if (sc->max_mhz <= 0) { pc = cpu_get_pcpu(dev); if (cpu_est_clockrate(pc->pc_cpuid, &rate) == 0) sc->max_mhz = rate / 1000000; else sc->max_mhz = CPUFREQ_VAL_UNKNOWN; } /* * Only initialize one set of sysctls for all CPUs. In the future, * if multiple CPUs can have different settings, we can move these * sysctls to be under every CPU instead of just the first one. */ numdevs = devclass_get_count(cpufreq_dc); if (numdevs > 1) return (0); CF_DEBUG("initializing one-time data for %s\n", device_get_nameunit(dev)); sc->levels_buf = malloc(CF_MAX_LEVELS * sizeof(*sc->levels_buf), M_DEVBUF, M_WAITOK); SYSCTL_ADD_PROC(&sc->sysctl_ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(parent)), OID_AUTO, "freq", CTLTYPE_INT | CTLFLAG_RW, sc, 0, cpufreq_curr_sysctl, "I", "Current CPU frequency"); SYSCTL_ADD_PROC(&sc->sysctl_ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(parent)), OID_AUTO, "freq_levels", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, cpufreq_levels_sysctl, "A", "CPU frequency levels"); /* * Queue a one-shot broadcast that levels have changed. * It will run once the system has completed booting. 
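 * (Editor's note: the task handler below simply forwards to
 * cpufreq_settings_changed(); queueing it on taskqueue_thread defers
 * the broadcast until the scheduler is running.  From userland, the
 * sysctls created above surface as dev.cpu.0.freq and
 * dev.cpu.0.freq_levels.)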
*/ TASK_INIT(&sc->startup_task, 0, cpufreq_startup_task, dev); taskqueue_enqueue(taskqueue_thread, &sc->startup_task); return (0); } /* Handle any work to be done for all drivers that attached during boot. */ static void cpufreq_startup_task(void *ctx, int pending) { cpufreq_settings_changed((device_t)ctx); } static int cpufreq_detach(device_t dev) { struct cpufreq_softc *sc; struct cf_saved_freq *saved_freq; int numdevs; CF_DEBUG("shutdown %s\n", device_get_nameunit(dev)); sc = device_get_softc(dev); sysctl_ctx_free(&sc->sysctl_ctx); while ((saved_freq = SLIST_FIRST(&sc->saved_freq)) != NULL) { SLIST_REMOVE_HEAD(&sc->saved_freq, link); free(saved_freq, M_TEMP); } /* Only clean up these resources when the last device is detaching. */ numdevs = devclass_get_count(cpufreq_dc); if (numdevs == 1) { CF_DEBUG("final shutdown for %s\n", device_get_nameunit(dev)); free(sc->levels_buf, M_DEVBUF); } return (0); } static int cf_set_method(device_t dev, const struct cf_level *level, int priority) { struct cpufreq_softc *sc; const struct cf_setting *set; struct cf_saved_freq *saved_freq, *curr_freq; struct pcpu *pc; int error, i; sc = device_get_softc(dev); error = 0; set = NULL; saved_freq = NULL; /* We are going to change levels so notify the pre-change handler. */ EVENTHANDLER_INVOKE(cpufreq_pre_change, level, &error); if (error != 0) { EVENTHANDLER_INVOKE(cpufreq_post_change, level, error); return (error); } CF_MTX_LOCK(&sc->lock); #ifdef SMP #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); #else /* * If still booting and secondary CPUs not started yet, don't allow * changing the frequency until they're online. This is because we * can't switch to them using sched_bind() and thus we'd only be * switching the main CPU. XXXTODO: Need to think more about how to * handle having different CPUs at different frequencies. */ if (mp_ncpus > 1 && !smp_started) { device_printf(dev, "rejecting change, SMP not started yet\n"); error = ENXIO; goto out; } #endif #endif /* SMP */ /* * If the requested level has a lower priority, don't allow * the new level right now. */ if (priority < sc->curr_priority) { CF_DEBUG("ignoring, curr prio %d less than %d\n", priority, sc->curr_priority); error = EPERM; goto out; } /* * If the caller didn't specify a level and one is saved, prepare to * restore the saved level. If none has been saved, return an error. */ if (level == NULL) { saved_freq = SLIST_FIRST(&sc->saved_freq); if (saved_freq == NULL) { CF_DEBUG("NULL level, no saved level\n"); error = ENXIO; goto out; } level = &saved_freq->level; priority = saved_freq->priority; CF_DEBUG("restoring saved level, freq %d prio %d\n", level->total_set.freq, priority); } /* Reject levels that are below our specified threshold. */ if (level->total_set.freq < cf_lowest_freq) { CF_DEBUG("rejecting freq %d, less than %d limit\n", level->total_set.freq, cf_lowest_freq); error = EINVAL; goto out; } /* If already at this level, just return. */ if (sc->curr_level.total_set.freq == level->total_set.freq) { CF_DEBUG("skipping freq %d, same as current level %d\n", level->total_set.freq, sc->curr_level.total_set.freq); goto skip; } /* First, set the absolute frequency via its driver. */ set = &level->abs_set; if (set->dev) { if (!device_is_attached(set->dev)) { error = ENXIO; goto out; } /* Bind to the target CPU before switching. 
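 * (Editor's note: sched_bind() pins curthread to the CPU that owns the
 * setting so that the driver's CPUFREQ_DRV_SET() method executes there;
 * frequency controls are typically per-CPU resources, e.g. MSRs.)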
*/ pc = cpu_get_pcpu(set->dev); thread_lock(curthread); sched_bind(curthread, pc->pc_cpuid); thread_unlock(curthread); CF_DEBUG("setting abs freq %d on %s (cpu %d)\n", set->freq, device_get_nameunit(set->dev), PCPU_GET(cpuid)); error = CPUFREQ_DRV_SET(set->dev, set); thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); if (error) { goto out; } } /* Next, set any/all relative frequencies via their drivers. */ for (i = 0; i < level->rel_count; i++) { set = &level->rel_set[i]; if (!device_is_attached(set->dev)) { error = ENXIO; goto out; } /* Bind to the target CPU before switching. */ pc = cpu_get_pcpu(set->dev); thread_lock(curthread); sched_bind(curthread, pc->pc_cpuid); thread_unlock(curthread); CF_DEBUG("setting rel freq %d on %s (cpu %d)\n", set->freq, device_get_nameunit(set->dev), PCPU_GET(cpuid)); error = CPUFREQ_DRV_SET(set->dev, set); thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); if (error) { /* XXX Back out any successful setting? */ goto out; } } skip: /* * Before recording the current level, check if we're going to a * higher priority. If so, save the previous level and priority. */ if (sc->curr_level.total_set.freq != CPUFREQ_VAL_UNKNOWN && priority > sc->curr_priority) { CF_DEBUG("saving level, freq %d prio %d\n", sc->curr_level.total_set.freq, sc->curr_priority); curr_freq = malloc(sizeof(*curr_freq), M_TEMP, M_NOWAIT); if (curr_freq == NULL) { error = ENOMEM; goto out; } curr_freq->level = sc->curr_level; curr_freq->priority = sc->curr_priority; SLIST_INSERT_HEAD(&sc->saved_freq, curr_freq, link); } sc->curr_level = *level; sc->curr_priority = priority; /* If we were restoring a saved state, reset it to "unused". */ if (saved_freq != NULL) { CF_DEBUG("resetting saved level\n"); sc->curr_level.total_set.freq = CPUFREQ_VAL_UNKNOWN; SLIST_REMOVE_HEAD(&sc->saved_freq, link); free(saved_freq, M_TEMP); } out: CF_MTX_UNLOCK(&sc->lock); /* * We changed levels (or attempted to) so notify the post-change * handler of new frequency or error. */ EVENTHANDLER_INVOKE(cpufreq_post_change, level, error); if (error && set) device_printf(set->dev, "set freq failed, err %d\n", error); return (error); } static int cf_get_method(device_t dev, struct cf_level *level) { struct cpufreq_softc *sc; struct cf_level *levels; struct cf_setting *curr_set, set; struct pcpu *pc; device_t *devs; int bdiff, count, diff, error, i, n, numdevs; uint64_t rate; sc = device_get_softc(dev); error = 0; levels = NULL; /* If we already know the current frequency, we're done. */ CF_MTX_LOCK(&sc->lock); curr_set = &sc->curr_level.total_set; if (curr_set->freq != CPUFREQ_VAL_UNKNOWN) { CF_DEBUG("get returning known freq %d\n", curr_set->freq); goto out; } CF_MTX_UNLOCK(&sc->lock); /* * We need to figure out the current level. Loop through every * driver, getting the current setting. Then, attempt to get a best * match of settings against each level. */ count = CF_MAX_LEVELS; levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT); if (levels == NULL) return (ENOMEM); error = CPUFREQ_LEVELS(sc->dev, levels, &count); if (error) { if (error == E2BIG) printf("cpufreq: need to increase CF_MAX_LEVELS\n"); free(levels, M_TEMP); return (error); } error = device_get_children(device_get_parent(dev), &devs, &numdevs); if (error) { free(levels, M_TEMP); return (error); } /* * Reacquire the lock and search for the given level. 
* * XXX Note: this is not quite right since we really need to go * through each level and compare both absolute and relative * settings for each driver in the system before making a match. * The estimation code below catches this case though. */ CF_MTX_LOCK(&sc->lock); for (n = 0; n < numdevs && curr_set->freq == CPUFREQ_VAL_UNKNOWN; n++) { if (!device_is_attached(devs[n])) continue; if (CPUFREQ_DRV_GET(devs[n], &set) != 0) continue; for (i = 0; i < count; i++) { if (set.freq == levels[i].total_set.freq) { sc->curr_level = levels[i]; break; } } } free(devs, M_TEMP); if (curr_set->freq != CPUFREQ_VAL_UNKNOWN) { CF_DEBUG("get matched freq %d from drivers\n", curr_set->freq); goto out; } /* * We couldn't find an exact match, so attempt to estimate and then * match against a level. */ pc = cpu_get_pcpu(dev); if (pc == NULL) { error = ENXIO; goto out; } cpu_est_clockrate(pc->pc_cpuid, &rate); rate /= 1000000; bdiff = 1 << 30; for (i = 0; i < count; i++) { diff = abs(levels[i].total_set.freq - rate); if (diff < bdiff) { bdiff = diff; sc->curr_level = levels[i]; } } CF_DEBUG("get estimated freq %d\n", curr_set->freq); out: if (error == 0) *level = sc->curr_level; CF_MTX_UNLOCK(&sc->lock); if (levels) free(levels, M_TEMP); return (error); } static int cf_levels_method(device_t dev, struct cf_level *levels, int *count) { struct cf_setting_array *set_arr; struct cf_setting_lst rel_sets; struct cpufreq_softc *sc; struct cf_level *lev; struct cf_setting *sets; struct pcpu *pc; device_t *devs; int error, i, numdevs, set_count, type; uint64_t rate; if (levels == NULL || count == NULL) return (EINVAL); TAILQ_INIT(&rel_sets); sc = device_get_softc(dev); error = device_get_children(device_get_parent(dev), &devs, &numdevs); if (error) return (error); sets = malloc(MAX_SETTINGS * sizeof(*sets), M_TEMP, M_NOWAIT); if (sets == NULL) { free(devs, M_TEMP); return (ENOMEM); } /* Get settings from all cpufreq drivers. */ CF_MTX_LOCK(&sc->lock); for (i = 0; i < numdevs; i++) { /* Skip devices that aren't ready. */ if (!device_is_attached(devs[i])) continue; /* * Get settings, skipping drivers that offer no settings or * provide settings for informational purposes only. */ error = CPUFREQ_DRV_TYPE(devs[i], &type); if (error || (type & CPUFREQ_FLAG_INFO_ONLY)) { if (error == 0) { CF_DEBUG("skipping info-only driver %s\n", device_get_nameunit(devs[i])); } continue; } set_count = MAX_SETTINGS; error = CPUFREQ_DRV_SETTINGS(devs[i], sets, &set_count); if (error || set_count == 0) continue; /* Add the settings to our absolute/relative lists. */ switch (type & CPUFREQ_TYPE_MASK) { case CPUFREQ_TYPE_ABSOLUTE: error = cpufreq_insert_abs(sc, sets, set_count); break; case CPUFREQ_TYPE_RELATIVE: CF_DEBUG("adding %d relative settings\n", set_count); set_arr = malloc(sizeof(*set_arr), M_TEMP, M_NOWAIT); if (set_arr == NULL) { error = ENOMEM; goto out; } bcopy(sets, set_arr->sets, set_count * sizeof(*sets)); set_arr->count = set_count; TAILQ_INSERT_TAIL(&rel_sets, set_arr, link); break; default: error = EINVAL; } if (error) goto out; } /* * If there are no absolute levels, create a fake one at 100%. We * then cache the clockrate for later use as our base frequency. */ if (TAILQ_EMPTY(&sc->all_levels)) { if (sc->max_mhz == CPUFREQ_VAL_UNKNOWN) { sc->max_mhz = cpu_get_nominal_mhz(dev); /* * If the CPU can't report a rate for 100%, hope * the CPU is running at its nominal rate right now, * and use that instead. 
*/ if (sc->max_mhz <= 0) { pc = cpu_get_pcpu(dev); cpu_est_clockrate(pc->pc_cpuid, &rate); sc->max_mhz = rate / 1000000; } } memset(&sets[0], CPUFREQ_VAL_UNKNOWN, sizeof(*sets)); sets[0].freq = sc->max_mhz; sets[0].dev = NULL; error = cpufreq_insert_abs(sc, sets, 1); if (error) goto out; } /* Create a combined list of absolute + relative levels. */ TAILQ_FOREACH(set_arr, &rel_sets, link) cpufreq_expand_set(sc, set_arr); /* If the caller doesn't have enough space, return the actual count. */ if (sc->all_count > *count) { *count = sc->all_count; error = E2BIG; goto out; } /* Finally, output the list of levels. */ i = 0; TAILQ_FOREACH(lev, &sc->all_levels, link) { /* Skip levels that have a frequency that is too low. */ if (lev->total_set.freq < cf_lowest_freq) { sc->all_count--; continue; } levels[i] = *lev; i++; } *count = sc->all_count; error = 0; out: /* Clear all levels since we regenerate them each time. */ while ((lev = TAILQ_FIRST(&sc->all_levels)) != NULL) { TAILQ_REMOVE(&sc->all_levels, lev, link); free(lev, M_TEMP); } sc->all_count = 0; CF_MTX_UNLOCK(&sc->lock); while ((set_arr = TAILQ_FIRST(&rel_sets)) != NULL) { TAILQ_REMOVE(&rel_sets, set_arr, link); free(set_arr, M_TEMP); } free(devs, M_TEMP); free(sets, M_TEMP); return (error); } /* * Create levels for an array of absolute settings and insert them in * sorted order in the specified list. */ static int cpufreq_insert_abs(struct cpufreq_softc *sc, struct cf_setting *sets, int count) { struct cf_level_lst *list; struct cf_level *level, *search; int i; CF_MTX_ASSERT(&sc->lock); list = &sc->all_levels; for (i = 0; i < count; i++) { level = malloc(sizeof(*level), M_TEMP, M_NOWAIT | M_ZERO); if (level == NULL) return (ENOMEM); level->abs_set = sets[i]; level->total_set = sets[i]; level->total_set.dev = NULL; sc->all_count++; if (TAILQ_EMPTY(list)) { CF_DEBUG("adding abs setting %d at head\n", sets[i].freq); TAILQ_INSERT_HEAD(list, level, link); continue; } TAILQ_FOREACH_REVERSE(search, list, cf_level_lst, link) { if (sets[i].freq <= search->total_set.freq) { CF_DEBUG("adding abs setting %d after %d\n", sets[i].freq, search->total_set.freq); TAILQ_INSERT_AFTER(list, search, level, link); break; } } } return (0); } /* * Expand a group of relative settings, creating derived levels from them. */ static int cpufreq_expand_set(struct cpufreq_softc *sc, struct cf_setting_array *set_arr) { struct cf_level *fill, *search; struct cf_setting *set; int i; CF_MTX_ASSERT(&sc->lock); /* * Walk the set of all existing levels in reverse. This is so we * create derived states from the lowest absolute settings first * and discard duplicates created from higher absolute settings. * For instance, a level of 50 MHz derived from 100 MHz + 50% is * preferable to 200 MHz + 25% because absolute settings are more * efficient since they often change the voltage as well. */ TAILQ_FOREACH_REVERSE(search, &sc->all_levels, cf_level_lst, link) { /* Add each setting to the level, duplicating if necessary. */ for (i = 0; i < set_arr->count; i++) { set = &set_arr->sets[i]; /* * If this setting is less than 100%, split the level * into two and add this setting to the new level. */ fill = search; if (set->freq < 10000) { fill = cpufreq_dup_set(sc, search, set); /* * The new level was a duplicate of an existing * level or its absolute setting is too high, * so we freed it. For example, we discard a * derived level of 1000 MHz/25% if a level * of 500 MHz/100% already exists. */ if (fill == NULL) break; } /* Add this setting to the existing or new level.
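 * (Editor's worked example: combining a 2000 MHz absolute level with a
 * 7500 (75%) relative setting creates a derived level of
 * 2000 * 7500 / 10000 = 1500 MHz in cpufreq_dup_set(); a 10000 (100%)
 * setting is attached to the existing level without splitting it.)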
*/ KASSERT(fill->rel_count < MAX_SETTINGS, ("cpufreq: too many relative drivers (%d)", MAX_SETTINGS)); fill->rel_set[fill->rel_count] = *set; fill->rel_count++; CF_DEBUG( "expand set added rel setting %d%% to %d level\n", set->freq / 100, fill->total_set.freq); } } return (0); } static struct cf_level * cpufreq_dup_set(struct cpufreq_softc *sc, struct cf_level *dup, struct cf_setting *set) { struct cf_level_lst *list; struct cf_level *fill, *itr; struct cf_setting *fill_set, *itr_set; int i; CF_MTX_ASSERT(&sc->lock); /* * Create a new level, copy it from the old one, and update the * total frequency and power by the percentage specified in the * relative setting. */ fill = malloc(sizeof(*fill), M_TEMP, M_NOWAIT); if (fill == NULL) return (NULL); *fill = *dup; fill_set = &fill->total_set; fill_set->freq = ((uint64_t)fill_set->freq * set->freq) / 10000; if (fill_set->power != CPUFREQ_VAL_UNKNOWN) { fill_set->power = ((uint64_t)fill_set->power * set->freq) / 10000; } if (set->lat != CPUFREQ_VAL_UNKNOWN) { if (fill_set->lat != CPUFREQ_VAL_UNKNOWN) fill_set->lat += set->lat; else fill_set->lat = set->lat; } CF_DEBUG("dup set considering derived setting %d\n", fill_set->freq); /* * If we copied an old level that we already modified (say, at 100%), * we need to remove that setting before adding this one. Since we * process each setting array in order, we know any settings for this * driver will be found at the end. */ for (i = fill->rel_count; i != 0; i--) { if (fill->rel_set[i - 1].dev != set->dev) break; CF_DEBUG("removed last relative driver: %s\n", device_get_nameunit(set->dev)); fill->rel_count--; } /* * Insert the new level in sorted order. If it is a duplicate of an * existing level (1) or has an absolute setting higher than the * existing level (2), do not add it. We can do this since any such * level is guaranteed to use less power. For example (1), a level with * one absolute setting of 800 MHz uses less power than one composed * of an absolute setting of 1600 MHz and a relative setting at 50%. * Also for example (2), a level of 800 MHz/75% is preferable to * 1600 MHz/25% even though the latter has a lower total frequency. */ list = &sc->all_levels; KASSERT(!TAILQ_EMPTY(list), ("all levels list empty in dup set")); TAILQ_FOREACH_REVERSE(itr, list, cf_level_lst, link) { itr_set = &itr->total_set; if (CPUFREQ_CMP(fill_set->freq, itr_set->freq)) { CF_DEBUG("dup set rejecting %d (dupe)\n", fill_set->freq); itr = NULL; break; } else if (fill_set->freq < itr_set->freq) { if (fill->abs_set.freq <= itr->abs_set.freq) { CF_DEBUG( "dup done, inserting new level %d after %d\n", fill_set->freq, itr_set->freq); TAILQ_INSERT_AFTER(list, itr, fill, link); sc->all_count++; } else { CF_DEBUG("dup set rejecting %d (abs too big)\n", fill_set->freq); itr = NULL; } break; } } /* We didn't find a good place for this new level, so free it.
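 * (Editor's note: itr is NULL here either because the reverse walk
 * above rejected the level as a duplicate or as less efficient than an
 * existing one, or because the walk ended without finding an existing
 * level faster than the new one.)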
*/ if (itr == NULL) { CF_DEBUG("dup set freeing new level %d (not optimal)\n", fill_set->freq); free(fill, M_TEMP); fill = NULL; } return (fill); } static int cpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS) { struct cpufreq_softc *sc; struct cf_level *levels; int best, count, diff, bdiff, devcount, error, freq, i, n; device_t *devs; devs = NULL; sc = oidp->oid_arg1; levels = sc->levels_buf; error = CPUFREQ_GET(sc->dev, &levels[0]); if (error) goto out; freq = levels[0].total_set.freq; error = sysctl_handle_int(oidp, &freq, 0, req); if (error != 0 || req->newptr == NULL) goto out; /* * While we only call cpufreq_get() on one device (assuming all * CPUs have equal levels), we call cpufreq_set() on all CPUs. * This is needed for some MP systems. */ error = devclass_get_devices(cpufreq_dc, &devs, &devcount); if (error) goto out; for (n = 0; n < devcount; n++) { count = CF_MAX_LEVELS; error = CPUFREQ_LEVELS(devs[n], levels, &count); if (error) { if (error == E2BIG) printf( "cpufreq: need to increase CF_MAX_LEVELS\n"); break; } best = 0; bdiff = 1 << 30; for (i = 0; i < count; i++) { diff = abs(levels[i].total_set.freq - freq); if (diff < bdiff) { bdiff = diff; best = i; } } error = CPUFREQ_SET(devs[n], &levels[best], CPUFREQ_PRIO_USER); } out: if (devs) free(devs, M_TEMP); return (error); } static int cpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS) { struct cpufreq_softc *sc; struct cf_level *levels; struct cf_setting *set; struct sbuf sb; int count, error, i; sc = oidp->oid_arg1; sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND); /* Get settings from the device and generate the output string. */ count = CF_MAX_LEVELS; levels = sc->levels_buf; if (levels == NULL) { sbuf_delete(&sb); return (ENOMEM); } error = CPUFREQ_LEVELS(sc->dev, levels, &count); if (error) { if (error == E2BIG) printf("cpufreq: need to increase CF_MAX_LEVELS\n"); goto out; } if (count) { for (i = 0; i < count; i++) { set = &levels[i].total_set; sbuf_printf(&sb, "%d/%d ", set->freq, set->power); } } else sbuf_cpy(&sb, "0"); sbuf_trim(&sb); sbuf_finish(&sb); error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); out: sbuf_delete(&sb); return (error); } static int cpufreq_settings_sysctl(SYSCTL_HANDLER_ARGS) { device_t dev; struct cf_setting *sets; struct sbuf sb; int error, i, set_count; dev = oidp->oid_arg1; sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND); /* Get settings from the device and generate the output string. */ set_count = MAX_SETTINGS; sets = malloc(set_count * sizeof(*sets), M_TEMP, M_NOWAIT); if (sets == NULL) { sbuf_delete(&sb); return (ENOMEM); } error = CPUFREQ_DRV_SETTINGS(dev, sets, &set_count); if (error) goto out; if (set_count) { for (i = 0; i < set_count; i++) sbuf_printf(&sb, "%d/%d ", sets[i].freq, sets[i].power); } else sbuf_cpy(&sb, "0"); sbuf_trim(&sb); sbuf_finish(&sb); error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); out: free(sets, M_TEMP); sbuf_delete(&sb); return (error); } int cpufreq_register(device_t dev) { struct cpufreq_softc *sc; device_t cf_dev, cpu_dev; /* Add a sysctl to get each driver's settings separately. */ SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "freq_settings", CTLTYPE_STRING | CTLFLAG_RD, dev, 0, cpufreq_settings_sysctl, "A", "CPU frequency driver settings"); /* * Add only one cpufreq device to each CPU. Currently, all CPUs * must offer the same levels and be switched at the same time. 
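 * (Editor's note: when a cpufreq child already exists, the code below
 * only resets max_mhz to "unknown", which forces the level list to be
 * rebuilt with the newly registered driver's settings on the next
 * query.)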
*/ cpu_dev = device_get_parent(dev); if ((cf_dev = device_find_child(cpu_dev, "cpufreq", -1))) { sc = device_get_softc(cf_dev); sc->max_mhz = CPUFREQ_VAL_UNKNOWN; return (0); } /* Add the child device and possibly sysctls. */ cf_dev = BUS_ADD_CHILD(cpu_dev, 0, "cpufreq", -1); if (cf_dev == NULL) return (ENOMEM); device_quiet(cf_dev); return (device_probe_and_attach(cf_dev)); } int cpufreq_unregister(device_t dev) { device_t cf_dev, *devs; int cfcount, devcount, error, i, type; /* * If this is the last cpufreq child device, remove the control * device as well. We identify cpufreq children by calling a method * they support. */ error = device_get_children(device_get_parent(dev), &devs, &devcount); if (error) return (error); cf_dev = device_find_child(device_get_parent(dev), "cpufreq", -1); if (cf_dev == NULL) { device_printf(dev, "warning: cpufreq_unregister called with no cpufreq device active\n"); free(devs, M_TEMP); return (0); } cfcount = 0; for (i = 0; i < devcount; i++) { if (!device_is_attached(devs[i])) continue; if (CPUFREQ_DRV_TYPE(devs[i], &type) == 0) cfcount++; } if (cfcount <= 1) device_delete_child(device_get_parent(cf_dev), cf_dev); free(devs, M_TEMP); return (0); } int cpufreq_settings_changed(device_t dev) { EVENTHANDLER_INVOKE(cpufreq_levels_changed, device_get_unit(device_get_parent(dev))); return (0); } Index: head/sys/kern/kern_cpuset.c =================================================================== --- head/sys/kern/kern_cpuset.c (revision 326270) +++ head/sys/kern/kern_cpuset.c (revision 326271) @@ -1,1349 +1,1351 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008, Jeffrey Roberson * All rights reserved. * * Copyright (c) 2008 Nokia Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* DDB */ /* * cpusets provide a mechanism for creating and manipulating sets of * processors for the purpose of constraining the scheduling of threads to * specific processors. 
* * Each process belongs to an identified set, by default this is set 1. Each * thread may further restrict the cpus it may run on to a subset of this * named set. This creates an anonymous set which other threads and processes * may not join by number. * * The named set is referred to herein as the 'base' set to avoid ambiguity. * This set is usually a child of a 'root' set while the anonymous set may * simply be referred to as a mask. In the syscall api these are referred to * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here. * * Threads inherit their set from their creator whether it be anonymous or * not. This means that anonymous sets are immutable because they may be * shared. To modify an anonymous set a new set is created with the desired * mask and the same parent as the existing anonymous set. This gives the * illusion of each thread having a private mask. * * Via the syscall apis a user may ask to retrieve or modify the root, base, * or mask that is discovered via a pid, tid, or setid. Modifying a set * modifies all numbered and anonymous child sets to comply with the new mask. * Modifying a pid or tid's mask applies only to that tid but must still * exist within the assigned parent set. * * A thread may not be assigned to a group separate from other threads in * the process. This is to remove ambiguity when the setid is queried with * a pid argument. There is no other technical limitation. * * This somewhat complex arrangement is intended to make it easy for * applications to query available processors and bind their threads to * specific processors while also allowing administrators to dynamically * reprovision by changing sets which apply to groups of processes. * * A simple application should not concern itself with sets at all and * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id * meaning 'curthread'. It may query available cpus for that tid with a * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...). */ static uma_zone_t cpuset_zone; static struct mtx cpuset_lock; static struct setlist cpuset_ids; static struct unrhdr *cpuset_unr; static struct cpuset *cpuset_zero, *cpuset_default; /* Return the size of cpuset_t at the kernel level */ SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)"); cpuset_t *cpuset_root; cpuset_t cpuset_domain[MAXMEMDOM]; /* * Acquire a reference to a cpuset, all pointers must be tracked with refs. */ struct cpuset * cpuset_ref(struct cpuset *set) { refcount_acquire(&set->cs_ref); return (set); } /* * Walks up the tree from 'set' to find the root. Returns the root * referenced. */ static struct cpuset * cpuset_refroot(struct cpuset *set) { for (; set->cs_parent != NULL; set = set->cs_parent) if (set->cs_flags & CPU_SET_ROOT) break; cpuset_ref(set); return (set); } /* * Find the first non-anonymous set starting from 'set'. Returns this set * referenced. May return the passed in set with an extra ref if it is * not anonymous. */ static struct cpuset * cpuset_refbase(struct cpuset *set) { if (set->cs_id == CPUSET_INVALID) set = set->cs_parent; cpuset_ref(set); return (set); } /* * Release a reference in a context where it is safe to allocate. 
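 */

/*
 * Illustrative sketch (editor's addition): the reference discipline the
 * functions above and below implement.  Any cpuset pointer kept after a
 * lock is dropped must own a reference; example_hold_base() is a
 * hypothetical helper mirroring the pattern used by kern_cpuset_getid()
 * later in this file.
 */
static struct cpuset *
example_hold_base(struct thread *td)
{
	struct cpuset *set;

	thread_lock(td);
	set = cpuset_refbase(td->td_cpuset);	/* returns with a new ref */
	thread_unlock(td);
	return (set);	/* caller must cpuset_rel() it when done */
}

/*
 * cpuset_rel(), described above, follows: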
*/ void cpuset_rel(struct cpuset *set) { cpusetid_t id; if (refcount_release(&set->cs_ref) == 0) return; mtx_lock_spin(&cpuset_lock); LIST_REMOVE(set, cs_siblings); id = set->cs_id; if (id != CPUSET_INVALID) LIST_REMOVE(set, cs_link); mtx_unlock_spin(&cpuset_lock); cpuset_rel(set->cs_parent); uma_zfree(cpuset_zone, set); if (id != CPUSET_INVALID) free_unr(cpuset_unr, id); } /* * Deferred release must be used when in a context that is not safe to * allocate/free. This places any unreferenced sets on the list 'head'. */ static void cpuset_rel_defer(struct setlist *head, struct cpuset *set) { if (refcount_release(&set->cs_ref) == 0) return; mtx_lock_spin(&cpuset_lock); LIST_REMOVE(set, cs_siblings); if (set->cs_id != CPUSET_INVALID) LIST_REMOVE(set, cs_link); LIST_INSERT_HEAD(head, set, cs_link); mtx_unlock_spin(&cpuset_lock); } /* * Complete a deferred release. Removes the set from the list provided to * cpuset_rel_defer. */ static void cpuset_rel_complete(struct cpuset *set) { LIST_REMOVE(set, cs_link); cpuset_rel(set->cs_parent); uma_zfree(cpuset_zone, set); } /* * Find a set based on an id. Returns it with a ref. */ static struct cpuset * cpuset_lookup(cpusetid_t setid, struct thread *td) { struct cpuset *set; if (setid == CPUSET_INVALID) return (NULL); mtx_lock_spin(&cpuset_lock); LIST_FOREACH(set, &cpuset_ids, cs_link) if (set->cs_id == setid) break; if (set) cpuset_ref(set); mtx_unlock_spin(&cpuset_lock); KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__)); if (set != NULL && jailed(td->td_ucred)) { struct cpuset *jset, *tset; jset = td->td_ucred->cr_prison->pr_cpuset; for (tset = set; tset != NULL; tset = tset->cs_parent) if (tset == jset) break; if (tset == NULL) { cpuset_rel(set); set = NULL; } } return (set); } /* * Create a set in the space provided in 'set' with the provided parameters. * The set is returned with a single ref. May return EDEADLK if the set * will have no valid cpu based on restrictions from the parent. */ static int _cpuset_create(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask, cpusetid_t id) { if (!CPU_OVERLAP(&parent->cs_mask, mask)) return (EDEADLK); CPU_COPY(mask, &set->cs_mask); LIST_INIT(&set->cs_children); refcount_init(&set->cs_ref, 1); set->cs_flags = 0; mtx_lock_spin(&cpuset_lock); CPU_AND(&set->cs_mask, &parent->cs_mask); set->cs_id = id; set->cs_parent = cpuset_ref(parent); LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings); if (set->cs_id != CPUSET_INVALID) LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); mtx_unlock_spin(&cpuset_lock); return (0); } /* * Create a new non-anonymous set with the requested parent and mask. May * return failures if the mask is invalid or a new number can not be * allocated. */ static int cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask) { struct cpuset *set; cpusetid_t id; int error; id = alloc_unr(cpuset_unr); if (id == -1) return (ENFILE); *setp = set = uma_zalloc(cpuset_zone, M_WAITOK); error = _cpuset_create(set, parent, mask, id); if (error == 0) return (0); free_unr(cpuset_unr, id); uma_zfree(cpuset_zone, set); return (error); } /* * Recursively check for errors that would occur from applying mask to * the tree of sets starting at 'set'. Checks for sets that would become * empty as well as RDONLY flags. 
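 * (Editor's worked example: with a child whose mask is {2,3}, applying
 * a parent mask of {0,1,2} succeeds and narrows the child to {2}, while
 * a parent mask of {0,1} fails with EDEADLK because the child would be
 * left with no CPUs.)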
*/ static int cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int check_mask) { struct cpuset *nset; cpuset_t newmask; int error; mtx_assert(&cpuset_lock, MA_OWNED); if (set->cs_flags & CPU_SET_RDONLY) return (EPERM); if (check_mask) { if (!CPU_OVERLAP(&set->cs_mask, mask)) return (EDEADLK); CPU_COPY(&set->cs_mask, &newmask); CPU_AND(&newmask, mask); } else CPU_COPY(mask, &newmask); error = 0; LIST_FOREACH(nset, &set->cs_children, cs_siblings) if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0) break; return (error); } /* * Applies the mask 'mask' without checking for empty sets or permissions. */ static void cpuset_update(struct cpuset *set, cpuset_t *mask) { struct cpuset *nset; mtx_assert(&cpuset_lock, MA_OWNED); CPU_AND(&set->cs_mask, mask); LIST_FOREACH(nset, &set->cs_children, cs_siblings) cpuset_update(nset, &set->cs_mask); return; } /* * Modify the set 'set' to use a copy of the mask provided. Apply this new * mask to restrict all children in the tree. Checks for validity before * applying the changes. */ static int cpuset_modify(struct cpuset *set, cpuset_t *mask) { struct cpuset *root; int error; error = priv_check(curthread, PRIV_SCHED_CPUSET); if (error) return (error); /* * In case we are called from within the jail * we do not allow modifying the dedicated root * cpuset of the jail but may still allow to * change child sets. */ if (jailed(curthread->td_ucred) && set->cs_flags & CPU_SET_ROOT) return (EPERM); /* * Verify that we have access to this set of * cpus. */ root = set->cs_parent; if (root && !CPU_SUBSET(&root->cs_mask, mask)) return (EINVAL); mtx_lock_spin(&cpuset_lock); error = cpuset_testupdate(set, mask, 0); if (error) goto out; CPU_COPY(mask, &set->cs_mask); cpuset_update(set, mask); out: mtx_unlock_spin(&cpuset_lock); return (error); } /* * Resolve the 'which' parameter of several cpuset apis. * * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid. Also * checks for permission via p_cansched(). * * For WHICH_SET returns a valid set with a new reference. * * -1 may be supplied for any argument to mean the current proc/thread or * the base set of the current thread. May fail with ESRCH/EPERM. */ int cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp, struct cpuset **setp) { struct cpuset *set; struct thread *td; struct proc *p; int error; *pp = p = NULL; *tdp = td = NULL; *setp = set = NULL; switch (which) { case CPU_WHICH_PID: if (id == -1) { PROC_LOCK(curproc); p = curproc; break; } if ((p = pfind(id)) == NULL) return (ESRCH); break; case CPU_WHICH_TID: if (id == -1) { PROC_LOCK(curproc); p = curproc; td = curthread; break; } td = tdfind(id, -1); if (td == NULL) return (ESRCH); p = td->td_proc; break; case CPU_WHICH_CPUSET: if (id == -1) { thread_lock(curthread); set = cpuset_refbase(curthread->td_cpuset); thread_unlock(curthread); } else set = cpuset_lookup(id, curthread); if (set) { *setp = set; return (0); } return (ESRCH); case CPU_WHICH_JAIL: { /* Find `set' for prison with given id. 
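 * (Editor's note: prison_find_child() returns the prison with its
 * pr_mtx held, which is why the code below only needs mtx_unlock()
 * after taking a reference on the prison's cpuset.)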
*/ struct prison *pr; sx_slock(&allprison_lock); pr = prison_find_child(curthread->td_ucred->cr_prison, id); sx_sunlock(&allprison_lock); if (pr == NULL) return (ESRCH); cpuset_ref(pr->pr_cpuset); *setp = pr->pr_cpuset; mtx_unlock(&pr->pr_mtx); return (0); } case CPU_WHICH_IRQ: case CPU_WHICH_DOMAIN: return (0); default: return (EINVAL); } error = p_cansched(curthread, p); if (error) { PROC_UNLOCK(p); return (error); } if (td == NULL) td = FIRST_THREAD_IN_PROC(p); *pp = p; *tdp = td; return (0); } /* * Create an anonymous set with the provided mask in the space provided by * 'fset'. If the passed in set is anonymous we use its parent otherwise * the new set is a child of 'set'. */ static int cpuset_shadow(struct cpuset *set, struct cpuset *fset, const cpuset_t *mask) { struct cpuset *parent; if (set->cs_id == CPUSET_INVALID) parent = set->cs_parent; else parent = set; if (!CPU_SUBSET(&parent->cs_mask, mask)) return (EDEADLK); return (_cpuset_create(fset, parent, mask, CPUSET_INVALID)); } /* * Handle two cases for replacing the base set or mask of an entire process. * * 1) Set is non-null and mask is null. This reparents all anonymous sets * to the provided set and replaces all non-anonymous td_cpusets with the * provided set. * 2) Mask is non-null and set is null. This replaces or creates anonymous * sets for every thread with the existing base as a parent. * * This is overly complicated because we can't allocate while holding a * spinlock and spinlocks must be held while changing and examining thread * state. */ static int cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask) { struct setlist freelist; struct setlist droplist; struct cpuset *tdset; struct cpuset *nset; struct thread *td; struct proc *p; int threads; int nfree; int error; /* * The algorithm requires two passes due to locking considerations. * * 1) Lookup the process and acquire the locks in the required order. * 2) If enough cpusets have not been allocated release the locks and * allocate them. Loop. */ LIST_INIT(&freelist); LIST_INIT(&droplist); nfree = 0; for (;;) { error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset); if (error) goto out; if (nfree >= p->p_numthreads) break; threads = p->p_numthreads; PROC_UNLOCK(p); for (; nfree < threads; nfree++) { nset = uma_zalloc(cpuset_zone, M_WAITOK); LIST_INSERT_HEAD(&freelist, nset, cs_link); } } PROC_LOCK_ASSERT(p, MA_OWNED); /* * Now that the appropriate locks are held and we have enough cpusets, * make sure the operation will succeed before applying changes. The * proc lock prevents td_cpuset from changing between calls. */ error = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); tdset = td->td_cpuset; /* * Verify that a new mask doesn't specify cpus outside of * the set the thread is a member of. */ if (mask) { if (tdset->cs_id == CPUSET_INVALID) tdset = tdset->cs_parent; if (!CPU_SUBSET(&tdset->cs_mask, mask)) error = EDEADLK; /* * Verify that a new set won't leave an existing thread * mask without a cpu to run on. It can, however, restrict * the set. */ } else if (tdset->cs_id == CPUSET_INVALID) { if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask)) error = EDEADLK; } thread_unlock(td); if (error) goto unlock_out; } /* * Replace each thread's cpuset while using deferred release. We * must do this because the thread lock must be held while operating * on the thread and this limits the type of operations allowed. 
*/ FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); /* * If we presently have an anonymous set or are applying a * mask we must create an anonymous shadow set. That is * either parented to our existing base or the supplied set. * * If we have a base set with no anonymous shadow we simply * replace it outright. */ tdset = td->td_cpuset; if (tdset->cs_id == CPUSET_INVALID || mask) { nset = LIST_FIRST(&freelist); LIST_REMOVE(nset, cs_link); if (mask) error = cpuset_shadow(tdset, nset, mask); else error = _cpuset_create(nset, set, &tdset->cs_mask, CPUSET_INVALID); if (error) { LIST_INSERT_HEAD(&freelist, nset, cs_link); thread_unlock(td); break; } } else nset = cpuset_ref(set); cpuset_rel_defer(&droplist, tdset); td->td_cpuset = nset; sched_affinity(td); thread_unlock(td); } unlock_out: PROC_UNLOCK(p); out: while ((nset = LIST_FIRST(&droplist)) != NULL) cpuset_rel_complete(nset); while ((nset = LIST_FIRST(&freelist)) != NULL) { LIST_REMOVE(nset, cs_link); uma_zfree(cpuset_zone, nset); } return (error); } /* * Return a string representing a valid layout for a cpuset_t object. * It expects an incoming buffer at least sized as CPUSETBUFSIZ. */ char * cpusetobj_strprint(char *buf, const cpuset_t *set) { char *tbuf; size_t i, bytesp, bufsiz; tbuf = buf; bytesp = 0; bufsiz = CPUSETBUFSIZ; for (i = 0; i < (_NCPUWORDS - 1); i++) { bytesp = snprintf(tbuf, bufsiz, "%lx,", set->__bits[i]); bufsiz -= bytesp; tbuf += bytesp; } snprintf(tbuf, bufsiz, "%lx", set->__bits[_NCPUWORDS - 1]); return (buf); } /* * Build a valid cpuset_t object from a string representation. * It expects an incoming buffer at least sized as CPUSETBUFSIZ. */ int cpusetobj_strscan(cpuset_t *set, const char *buf) { u_int nwords; int i, ret; if (strlen(buf) > CPUSETBUFSIZ - 1) return (-1); /* Allow to pass a shorter version of the mask when necessary. */ nwords = 1; for (i = 0; buf[i] != '\0'; i++) if (buf[i] == ',') nwords++; if (nwords > _NCPUWORDS) return (-1); CPU_ZERO(set); for (i = 0; i < (nwords - 1); i++) { ret = sscanf(buf, "%lx,", &set->__bits[i]); if (ret == 0 || ret == -1) return (-1); buf = strstr(buf, ","); if (buf == NULL) return (-1); buf++; } ret = sscanf(buf, "%lx", &set->__bits[nwords - 1]); if (ret == 0 || ret == -1) return (-1); return (0); } /* * Apply an anonymous mask to a single thread. */ int cpuset_setthread(lwpid_t id, cpuset_t *mask) { struct cpuset *nset; struct cpuset *set; struct thread *td; struct proc *p; int error; nset = uma_zalloc(cpuset_zone, M_WAITOK); error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set); if (error) goto out; set = NULL; thread_lock(td); error = cpuset_shadow(td->td_cpuset, nset, mask); if (error == 0) { set = td->td_cpuset; td->td_cpuset = nset; sched_affinity(td); nset = NULL; } thread_unlock(td); PROC_UNLOCK(p); if (set) cpuset_rel(set); out: if (nset) uma_zfree(cpuset_zone, nset); return (error); } /* * Apply new cpumask to the ithread. */ int cpuset_setithread(lwpid_t id, int cpu) { struct cpuset *nset, *rset; struct cpuset *parent, *old_set; struct thread *td; struct proc *p; cpusetid_t cs_id; cpuset_t mask; int error; nset = uma_zalloc(cpuset_zone, M_WAITOK); rset = uma_zalloc(cpuset_zone, M_WAITOK); cs_id = CPUSET_INVALID; CPU_ZERO(&mask); if (cpu == NOCPU) CPU_COPY(cpuset_root, &mask); else CPU_SET(cpu, &mask); error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &old_set); if (error != 0 || ((cs_id = alloc_unr(cpuset_unr)) == CPUSET_INVALID)) goto out; /* cpuset_which() returns with PROC_LOCK held. 
*/ old_set = td->td_cpuset; if (cpu == NOCPU) { /* * roll back to default set. We're not using cpuset_shadow() * here because we can fail CPU_SUBSET() check. This can happen * if default set does not contain all CPUs. */ error = _cpuset_create(nset, cpuset_default, &mask, CPUSET_INVALID); goto applyset; } if (old_set->cs_id == 1 || (old_set->cs_id == CPUSET_INVALID && old_set->cs_parent->cs_id == 1)) { /* * Current set is either default (1) or * shadowed version of default set. * * Allocate new root set to be able to shadow it * with any mask. */ error = _cpuset_create(rset, cpuset_zero, &cpuset_zero->cs_mask, cs_id); if (error != 0) { PROC_UNLOCK(p); goto out; } rset->cs_flags |= CPU_SET_ROOT; parent = rset; rset = NULL; cs_id = CPUSET_INVALID; } else { /* Assume existing set was already allocated by previous call */ parent = old_set; old_set = NULL; } error = cpuset_shadow(parent, nset, &mask); applyset: if (error == 0) { thread_lock(td); td->td_cpuset = nset; sched_affinity(td); thread_unlock(td); nset = NULL; } else old_set = NULL; PROC_UNLOCK(p); if (old_set != NULL) cpuset_rel(old_set); out: if (nset != NULL) uma_zfree(cpuset_zone, nset); if (rset != NULL) uma_zfree(cpuset_zone, rset); if (cs_id != CPUSET_INVALID) free_unr(cpuset_unr, cs_id); return (error); } /* * Creates system-wide cpusets and the cpuset for thread0 including two * sets: * * 0 - The root set which should represent all valid processors in the * system. It is initially created with a mask of all processors * because we don't know what processors are valid until cpuset_init() * runs. This set is immutable. * 1 - The default set which all processes are a member of until changed. * This allows an administrator to move all threads off of given cpus to * dedicate them to high priority tasks or save power etc. */ struct cpuset * cpuset_thread0(void) { struct cpuset *set; int error, i; cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE); /* * Create the root system set for the whole machine. Doesn't use * cpuset_create() due to NULL parent. */ set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); CPU_FILL(&set->cs_mask); LIST_INIT(&set->cs_children); LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); set->cs_ref = 1; set->cs_flags = CPU_SET_ROOT; cpuset_zero = set; cpuset_root = &set->cs_mask; /* * Now derive a default, modifiable set from that to give out. */ set = uma_zalloc(cpuset_zone, M_WAITOK); error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1); KASSERT(error == 0, ("Error creating default set: %d\n", error)); cpuset_default = set; /* * Initialize the unit allocator. 0 and 1 are allocated above. */ cpuset_unr = new_unrhdr(2, INT_MAX, NULL); /* * If MD code has not initialized per-domain cpusets, place all * CPUs in domain 0. */ for (i = 0; i < MAXMEMDOM; i++) if (!CPU_EMPTY(&cpuset_domain[i])) goto domains_set; CPU_COPY(&all_cpus, &cpuset_domain[0]); domains_set: return (set); } /* * Create a cpuset, which would be cpuset_create() but * mark the new 'set' as root. * * We are not going to reparent the td to it. Use cpuset_setproc_update_set() * for that. * * In case of no error, returns the set in *setp locked with a reference. 
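 */

/*
 * Illustrative sketch (editor's addition): the userland view of the set
 * hierarchy described above.  This is a separate program rather than
 * kernel code, so it is guarded out; cpuset(2) and cpuset_setaffinity(2)
 * are the public system calls implemented later in this file.
 */
#if 0
#include <sys/param.h>
#include <sys/cpuset.h>

/* Create a new numbered set, join it, and pin the calling thread. */
int
pin_self_to_cpu0(void)
{
	cpusetid_t setid;
	cpuset_t mask;

	if (cpuset(&setid) != 0)	/* new base set; process joins it */
		return (-1);
	CPU_ZERO(&mask);
	CPU_SET(0, &mask);
	/* id -1 with CPU_WHICH_TID means "the calling thread". */
	return (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
	    sizeof(mask), &mask));
}
#endif

/*
 * cpuset_create_root(), described above, follows: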
*/ int cpuset_create_root(struct prison *pr, struct cpuset **setp) { struct cpuset *set; int error; KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__)); KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__)); error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask); if (error) return (error); KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data", __func__, __LINE__)); /* Mark the set as root. */ set = *setp; set->cs_flags |= CPU_SET_ROOT; return (0); } int cpuset_setproc_update_set(struct proc *p, struct cpuset *set) { int error; KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__)); KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__)); cpuset_ref(set); error = cpuset_setproc(p->p_pid, set, NULL); if (error) return (error); cpuset_rel(set); return (0); } /* * This is called once the final set of system cpus is known. Modifies * the root set and all children and mark the root read-only. */ static void cpuset_init(void *arg) { cpuset_t mask; mask = all_cpus; if (cpuset_modify(cpuset_zero, &mask)) panic("Can't set initial cpuset mask.\n"); cpuset_zero->cs_flags |= CPU_SET_RDONLY; } SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL); #ifndef _SYS_SYSPROTO_H_ struct cpuset_args { cpusetid_t *setid; }; #endif int sys_cpuset(struct thread *td, struct cpuset_args *uap) { struct cpuset *root; struct cpuset *set; int error; thread_lock(td); root = cpuset_refroot(td->td_cpuset); thread_unlock(td); error = cpuset_create(&set, root, &root->cs_mask); cpuset_rel(root); if (error) return (error); error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id)); if (error == 0) error = cpuset_setproc(-1, set, NULL); cpuset_rel(set); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_setid_args { cpuwhich_t which; id_t id; cpusetid_t setid; }; #endif int sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap) { return (kern_cpuset_setid(td, uap->which, uap->id, uap->setid)); } int kern_cpuset_setid(struct thread *td, cpuwhich_t which, id_t id, cpusetid_t setid) { struct cpuset *set; int error; /* * Presently we only support per-process sets. 
*/ if (which != CPU_WHICH_PID) return (EINVAL); set = cpuset_lookup(setid, td); if (set == NULL) return (ESRCH); error = cpuset_setproc(id, set, NULL); cpuset_rel(set); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_getid_args { cpulevel_t level; cpuwhich_t which; id_t id; cpusetid_t *setid; }; #endif int sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap) { return (kern_cpuset_getid(td, uap->level, uap->which, uap->id, uap->setid)); } int kern_cpuset_getid(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, cpusetid_t *setid) { struct cpuset *nset; struct cpuset *set; struct thread *ttd; struct proc *p; cpusetid_t tmpid; int error; if (level == CPU_LEVEL_WHICH && which != CPU_WHICH_CPUSET) return (EINVAL); error = cpuset_which(which, id, &p, &ttd, &set); if (error) return (error); switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_refbase(ttd->td_cpuset); thread_unlock(ttd); PROC_UNLOCK(p); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_DOMAIN: return (EINVAL); } switch (level) { case CPU_LEVEL_ROOT: nset = cpuset_refroot(set); cpuset_rel(set); set = nset; break; case CPU_LEVEL_CPUSET: break; case CPU_LEVEL_WHICH: break; } tmpid = set->cs_id; cpuset_rel(set); if (error == 0) error = copyout(&tmpid, setid, sizeof(tmpid)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_getaffinity_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t cpusetsize; cpuset_t *mask; }; #endif int sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap) { return (kern_cpuset_getaffinity(td, uap->level, uap->which, uap->id, uap->cpusetsize, uap->mask)); } int kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *maskp) { struct thread *ttd; struct cpuset *nset; struct cpuset *set; struct proc *p; cpuset_t *mask; int error; size_t size; if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY) return (ERANGE); /* In Capability mode, you can only get your own CPU set. 
*/ if (IN_CAPABILITY_MODE(td)) { if (level != CPU_LEVEL_WHICH) return (ECAPMODE); if (which != CPU_WHICH_TID && which != CPU_WHICH_PID) return (ECAPMODE); if (id != -1) return (ECAPMODE); } size = cpusetsize; mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO); error = cpuset_which(which, id, &p, &ttd, &set); if (error) goto out; switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: error = EINVAL; goto out; } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); CPU_COPY(&nset->cs_mask, mask); cpuset_rel(nset); break; case CPU_LEVEL_WHICH: switch (which) { case CPU_WHICH_TID: thread_lock(ttd); CPU_COPY(&ttd->td_cpuset->cs_mask, mask); thread_unlock(ttd); break; case CPU_WHICH_PID: FOREACH_THREAD_IN_PROC(p, ttd) { thread_lock(ttd); CPU_OR(mask, &ttd->td_cpuset->cs_mask); thread_unlock(ttd); } break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: CPU_COPY(&set->cs_mask, mask); break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: error = intr_getaffinity(id, which, mask); break; case CPU_WHICH_DOMAIN: if (id < 0 || id >= MAXMEMDOM) error = ESRCH; else CPU_COPY(&cpuset_domain[id], mask); break; } break; default: error = EINVAL; break; } if (set) cpuset_rel(set); if (p) PROC_UNLOCK(p); if (error == 0) error = copyout(mask, maskp, size); out: free(mask, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_setaffinity_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t cpusetsize; const cpuset_t *mask; }; #endif int sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap) { return (kern_cpuset_setaffinity(td, uap->level, uap->which, uap->id, uap->cpusetsize, uap->mask)); } int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, const cpuset_t *maskp) { struct cpuset *nset; struct cpuset *set; struct thread *ttd; struct proc *p; cpuset_t *mask; int error; if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY) return (ERANGE); /* In Capability mode, you can only set your own CPU set. */ if (IN_CAPABILITY_MODE(td)) { if (level != CPU_LEVEL_WHICH) return (ECAPMODE); if (which != CPU_WHICH_TID && which != CPU_WHICH_PID) return (ECAPMODE); if (id != -1) return (ECAPMODE); } mask = malloc(cpusetsize, M_TEMP, M_WAITOK | M_ZERO); error = copyin(maskp, mask, cpusetsize); if (error) goto out; /* * Verify that no high bits are set. 
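 *
 * The caller may pass a cpusetsize larger than the kernel's cpuset_t;
 * since CPUs past CPU_SETSIZE cannot be represented, every byte beyond
 * sizeof(cpuset_t) must be zero or the request is rejected.  In effect
 * (an illustrative restatement of the check that follows):
 *
 *	for (cp = (char *)mask + sizeof(cpuset_t);
 *	    cp < (char *)mask + cpusetsize; cp++)
 *		if (*cp != 0)
 *			return (EINVAL);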
*/ if (cpusetsize > sizeof(cpuset_t)) { char *end; char *cp; end = cp = (char *)&mask->__bits; end += cpusetsize; cp += sizeof(cpuset_t); while (cp != end) if (*cp++ != 0) { error = EINVAL; goto out; } } switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: error = cpuset_which(which, id, &p, &ttd, &set); if (error) break; switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); PROC_UNLOCK(p); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: error = EINVAL; goto out; } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); error = cpuset_modify(nset, mask); cpuset_rel(nset); cpuset_rel(set); break; case CPU_LEVEL_WHICH: switch (which) { case CPU_WHICH_TID: error = cpuset_setthread(id, mask); break; case CPU_WHICH_PID: error = cpuset_setproc(id, NULL, mask); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: error = cpuset_which(which, id, &p, &ttd, &set); if (error == 0) { error = cpuset_modify(set, mask); cpuset_rel(set); } break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: error = intr_setaffinity(id, which, mask); break; default: error = EINVAL; break; } break; default: error = EINVAL; break; } out: free(mask, M_TEMP); return (error); } #ifdef DDB void ddb_display_cpuset(const cpuset_t *set) { int cpu, once; for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) { if (CPU_ISSET(cpu, set)) { if (once == 0) { db_printf("%d", cpu); once = 1; } else db_printf(",%d", cpu); } } if (once == 0) db_printf("<none>"); } DB_SHOW_COMMAND(cpusets, db_show_cpusets) { struct cpuset *set; LIST_FOREACH(set, &cpuset_ids, cs_link) { db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n", set, set->cs_id, set->cs_ref, set->cs_flags, (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0); db_printf(" mask="); ddb_display_cpuset(&set->cs_mask); db_printf("\n"); if (db_pager_quit) break; } } #endif /* DDB */ Index: head/sys/kern/kern_ctf.c =================================================================== --- head/sys/kern/kern_ctf.c (revision 326270) +++ head/sys/kern/kern_ctf.c (revision 326271) @@ -1,324 +1,326 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008 John Birrell * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Note this file is included by both link_elf.c and link_elf_obj.c. * * The CTF header structure definition can't be used here because it's * (annoyingly) covered by the CDDL. We will just use a few bytes from * it as an integer array where we 'know' what they mean. */ #define CTF_HDR_SIZE 36 #define CTF_HDR_STRTAB_U32 7 #define CTF_HDR_STRLEN_U32 8 #ifdef DDB_CTF static void * z_alloc(void *nil, u_int items, u_int size) { void *ptr; ptr = malloc(items * size, M_TEMP, M_NOWAIT); return ptr; } static void z_free(void *nil, void *ptr) { free(ptr, M_TEMP); } #endif static int link_elf_ctf_get(linker_file_t lf, linker_ctf_t *lc) { #ifdef DDB_CTF Elf_Ehdr *hdr = NULL; Elf_Shdr *shdr = NULL; caddr_t ctftab = NULL; caddr_t raw = NULL; caddr_t shstrtab = NULL; elf_file_t ef = (elf_file_t) lf; int flags; int i; int nbytes; size_t sz; struct nameidata nd; struct thread *td = curthread; uint8_t ctf_hdr[CTF_HDR_SIZE]; #endif int error = 0; if (lf == NULL || lc == NULL) return (EINVAL); /* Set the defaults for no CTF present. That's not a crime! */ bzero(lc, sizeof(*lc)); #ifdef DDB_CTF /* * First check if we've tried to load CTF data previously and the * CTF ELF section wasn't found. We flag that condition by setting * ctfcnt to -1. See below. */ if (ef->ctfcnt < 0) return (EFTYPE); /* Now check if we've already loaded the CTF data. */ if (ef->ctfcnt > 0) { /* We only need to load once. */ lc->ctftab = ef->ctftab; lc->ctfcnt = ef->ctfcnt; lc->symtab = ef->ddbsymtab; lc->strtab = ef->ddbstrtab; lc->strcnt = ef->ddbstrcnt; lc->nsym = ef->ddbsymcnt; lc->ctfoffp = (uint32_t **) &ef->ctfoff; lc->typoffp = (uint32_t **) &ef->typoff; lc->typlenp = &ef->typlen; return (0); } /* * We need to try reading the CTF data. Flag no CTF data present * by default and if we actually succeed in reading it, we'll * update ctfcnt to the number of bytes read. */ ef->ctfcnt = -1; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, lf->pathname, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); /* Allocate memory for the ELF header. */ hdr = malloc(sizeof(*hdr), M_LINKER, M_WAITOK); /* Read the ELF header. */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, hdr, sizeof(*hdr), 0, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, NULL, td)) != 0) goto out; /* Sanity check. */ if (!IS_ELF(*hdr)) { error = ENOEXEC; goto out; } nbytes = hdr->e_shnum * hdr->e_shentsize; if (nbytes == 0 || hdr->e_shoff == 0 || hdr->e_shentsize != sizeof(Elf_Shdr)) { error = ENOEXEC; goto out; } /* Allocate memory for all the section headers */ shdr = malloc(nbytes, M_LINKER, M_WAITOK); /* Read all the section headers */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shdr, nbytes, hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, NULL, td)) != 0) goto out; /* * We need to search for the CTF section by name, so if the * section names aren't present, then we can't locate the * .SUNW_ctf section containing the CTF data.
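 *
 * For reference (illustrative, not part of this commit): sh_name is a
 * byte offset into that string table, so the lookup below reduces to
 * something like:
 *
 *	name = shstrtab + shdr[i].sh_name;
 *	if (strcmp(".SUNW_ctf", name) == 0)
 *		... section i holds the CTF data ...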
*/ if (hdr->e_shstrndx == 0 || shdr[hdr->e_shstrndx].sh_type != SHT_STRTAB) { printf("%s(%d): module %s e_shstrndx is %d, sh_type is %d\n", __func__, __LINE__, lf->pathname, hdr->e_shstrndx, shdr[hdr->e_shstrndx].sh_type); error = EFTYPE; goto out; } /* Allocate memory to buffer the section header strings. */ shstrtab = malloc(shdr[hdr->e_shstrndx].sh_size, M_LINKER, M_WAITOK); /* Read the section header strings. */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, shstrtab, shdr[hdr->e_shstrndx].sh_size, shdr[hdr->e_shstrndx].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, NULL, td)) != 0) goto out; /* Search for the section containing the CTF data. */ for (i = 0; i < hdr->e_shnum; i++) if (strcmp(".SUNW_ctf", shstrtab + shdr[i].sh_name) == 0) break; /* Check if the CTF section wasn't found. */ if (i >= hdr->e_shnum) { printf("%s(%d): module %s has no .SUNW_ctf section\n", __func__, __LINE__, lf->pathname); error = EFTYPE; goto out; } /* Read the CTF header. */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, ctf_hdr, sizeof(ctf_hdr), shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, NULL, td)) != 0) goto out; /* Check the CTF magic number. (XXX check for big endian!) */ if (ctf_hdr[0] != 0xf1 || ctf_hdr[1] != 0xcf) { printf("%s(%d): module %s has invalid format\n", __func__, __LINE__, lf->pathname); error = EFTYPE; goto out; } /* Check if version 2. */ if (ctf_hdr[2] != 2) { printf("%s(%d): module %s CTF format version is %d " "(2 expected)\n", __func__, __LINE__, lf->pathname, ctf_hdr[2]); error = EFTYPE; goto out; } /* Check if the data is compressed. */ if ((ctf_hdr[3] & 0x1) != 0) { uint32_t *u32 = (uint32_t *) ctf_hdr; /* * The last two fields in the CTF header are the offset * from the end of the header to the start of the string * data and the length of that string data. Use this * information to determine the decompressed CTF data * buffer required. */ sz = u32[CTF_HDR_STRTAB_U32] + u32[CTF_HDR_STRLEN_U32] + sizeof(ctf_hdr); /* * Allocate memory for the compressed CTF data, including * the header (which isn't compressed). */ raw = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK); } else { /* * The CTF data is not compressed, so the ELF section * size is the same as the buffer size required. */ sz = shdr[i].sh_size; } /* * Allocate memory to buffer the CTF data in its decompressed * form. */ ctftab = malloc(sz, M_LINKER, M_WAITOK); /* * Read the CTF data into the raw buffer if compressed, or * directly into the CTF buffer otherwise. */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, raw == NULL ? ctftab : raw, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, NULL, td)) != 0) goto out; /* Check if decompression is required. */ if (raw != NULL) { z_stream zs; int ret; /* * The header isn't compressed, so copy that into the * CTF buffer first. */ bcopy(ctf_hdr, ctftab, sizeof(ctf_hdr)); /* Initialise the zlib structure. */ bzero(&zs, sizeof(zs)); zs.zalloc = z_alloc; zs.zfree = z_free; if (inflateInit(&zs) != Z_OK) { error = EIO; goto out; } zs.avail_in = shdr[i].sh_size - sizeof(ctf_hdr); zs.next_in = ((uint8_t *) raw) + sizeof(ctf_hdr); zs.avail_out = sz - sizeof(ctf_hdr); zs.next_out = ((uint8_t *) ctftab) + sizeof(ctf_hdr); ret = inflate(&zs, Z_FINISH); inflateEnd(&zs); if (ret != Z_STREAM_END) { printf("%s(%d): zlib inflate returned %d\n", __func__, __LINE__, ret); error = EIO; goto out; } } /* Got the CTF data! */ ef->ctftab = ctftab; ef->ctfcnt = shdr[i].sh_size; /* We'll retain the memory allocated for the CTF data.
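 * Clearing the local pointer below hands ownership of the buffer to
 * ef->ctftab, so the shared cleanup at "out:" will not free the data
 * we just populated.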
*/ ctftab = NULL; /* Let the caller use the CTF data read. */ lc->ctftab = ef->ctftab; lc->ctfcnt = ef->ctfcnt; lc->symtab = ef->ddbsymtab; lc->strtab = ef->ddbstrtab; lc->strcnt = ef->ddbstrcnt; lc->nsym = ef->ddbsymcnt; lc->ctfoffp = (uint32_t **) &ef->ctfoff; lc->typoffp = (uint32_t **) &ef->typoff; lc->typlenp = &ef->typlen; out: VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); if (hdr != NULL) free(hdr, M_LINKER); if (shdr != NULL) free(shdr, M_LINKER); if (shstrtab != NULL) free(shstrtab, M_LINKER); if (ctftab != NULL) free(ctftab, M_LINKER); if (raw != NULL) free(raw, M_LINKER); #else error = EOPNOTSUPP; #endif return (error); } Index: head/sys/kern/kern_dtrace.c =================================================================== --- head/sys/kern/kern_dtrace.c (revision 326270) +++ head/sys/kern/kern_dtrace.c (revision 326271) @@ -1,126 +1,128 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2007-2008 John Birrell * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_kdb.h" #include #include #include #include #include #include #include #include #include #include #define KDTRACE_PROC_SIZE 64 #define KDTRACE_THREAD_SIZE 256 FEATURE(kdtrace_hooks, "Kernel DTrace hooks which are required to load DTrace kernel modules"); static MALLOC_DEFINE(M_KDTRACE, "kdtrace", "DTrace hooks"); /* Hooks used in the machine-dependent trap handlers. */ dtrace_trap_func_t dtrace_trap_func; dtrace_doubletrap_func_t dtrace_doubletrap_func; dtrace_pid_probe_ptr_t dtrace_pid_probe_ptr; dtrace_return_probe_ptr_t dtrace_return_probe_ptr; systrace_probe_func_t __read_frequently systrace_probe_func; /* Return the DTrace process data size compiled in the kernel hooks. */ size_t kdtrace_proc_size() { return (KDTRACE_PROC_SIZE); } static void kdtrace_proc_ctor(void *arg __unused, struct proc *p) { p->p_dtrace = malloc(KDTRACE_PROC_SIZE, M_KDTRACE, M_WAITOK|M_ZERO); } static void kdtrace_proc_dtor(void *arg __unused, struct proc *p) { if (p->p_dtrace != NULL) { free(p->p_dtrace, M_KDTRACE); p->p_dtrace = NULL; } } /* Return the DTrace thread data size compiled in the kernel hooks. 
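 *
 * As with the per-process size above, the hooks reserve a fixed-size,
 * zeroed blob per thread so that dtrace.ko can be loaded later without
 * the core kernel having to know the layout of DTrace's private state.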
*/ size_t kdtrace_thread_size() { return (KDTRACE_THREAD_SIZE); } static void kdtrace_thread_ctor(void *arg __unused, struct thread *td) { td->td_dtrace = malloc(KDTRACE_THREAD_SIZE, M_KDTRACE, M_WAITOK|M_ZERO); } static void kdtrace_thread_dtor(void *arg __unused, struct thread *td) { if (td->td_dtrace != NULL) { free(td->td_dtrace, M_KDTRACE); td->td_dtrace = NULL; } } /* * Initialise the kernel DTrace hooks. */ static void init_dtrace(void *dummy __unused) { EVENTHANDLER_REGISTER(process_ctor, kdtrace_proc_ctor, NULL, EVENTHANDLER_PRI_ANY); EVENTHANDLER_REGISTER(process_dtor, kdtrace_proc_dtor, NULL, EVENTHANDLER_PRI_ANY); EVENTHANDLER_REGISTER(thread_ctor, kdtrace_thread_ctor, NULL, EVENTHANDLER_PRI_ANY); EVENTHANDLER_REGISTER(thread_dtor, kdtrace_thread_dtor, NULL, EVENTHANDLER_PRI_ANY); } SYSINIT(kdtrace, SI_SUB_KDTRACE, SI_ORDER_FIRST, init_dtrace, NULL); Index: head/sys/kern/kern_environment.c =================================================================== --- head/sys/kern/kern_environment.c (revision 326270) +++ head/sys/kern/kern_environment.c (revision 326271) @@ -1,712 +1,714 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1998 Michael Smith * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * The unified bootloader passes us a pointer to a preserved copy of * bootstrap/kernel environment variables. We convert them to a * dynamic array of strings later when the VM subsystem is up. * * We make these available through the kenv(2) syscall for userland * and through kern_getenv()/freeenv() kern_setenv() kern_unsetenv() testenv() for * the kernel. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_KENV, "kenv", "kernel environment"); #define KENV_SIZE 512 /* Maximum number of environment strings */ /* pointer to the static environment */ char *kern_envp; static int env_len; static int env_pos; static char *kernenv_next(char *); /* dynamic environment variables */ char **kenvp; struct mtx kenv_lock; /* * No need to protect this with a mutex since SYSINITS are single threaded. 
*/ int dynamic_kenv = 0; #define KENV_CHECK if (!dynamic_kenv) \ panic("%s: called before SI_SUB_KMEM", __func__) int sys_kenv(td, uap) struct thread *td; struct kenv_args /* { int what; const char *name; char *value; int len; } */ *uap; { char *name, *value, *buffer = NULL; size_t len, done, needed, buflen; int error, i; KASSERT(dynamic_kenv, ("kenv: dynamic_kenv = 0")); error = 0; if (uap->what == KENV_DUMP) { #ifdef MAC error = mac_kenv_check_dump(td->td_ucred); if (error) return (error); #endif done = needed = 0; buflen = uap->len; if (buflen > KENV_SIZE * (KENV_MNAMELEN + KENV_MVALLEN + 2)) buflen = KENV_SIZE * (KENV_MNAMELEN + KENV_MVALLEN + 2); if (uap->len > 0 && uap->value != NULL) buffer = malloc(buflen, M_TEMP, M_WAITOK|M_ZERO); mtx_lock(&kenv_lock); for (i = 0; kenvp[i] != NULL; i++) { len = strlen(kenvp[i]) + 1; needed += len; len = min(len, buflen - done); /* * If called with a NULL or insufficiently large * buffer, just keep computing the required size. */ if (uap->value != NULL && buffer != NULL && len > 0) { bcopy(kenvp[i], buffer + done, len); done += len; } } mtx_unlock(&kenv_lock); if (buffer != NULL) { error = copyout(buffer, uap->value, done); free(buffer, M_TEMP); } td->td_retval[0] = ((done == needed) ? 0 : needed); return (error); } switch (uap->what) { case KENV_SET: error = priv_check(td, PRIV_KENV_SET); if (error) return (error); break; case KENV_UNSET: error = priv_check(td, PRIV_KENV_UNSET); if (error) return (error); break; } name = malloc(KENV_MNAMELEN + 1, M_TEMP, M_WAITOK); error = copyinstr(uap->name, name, KENV_MNAMELEN + 1, NULL); if (error) goto done; switch (uap->what) { case KENV_GET: #ifdef MAC error = mac_kenv_check_get(td->td_ucred, name); if (error) goto done; #endif value = kern_getenv(name); if (value == NULL) { error = ENOENT; goto done; } len = strlen(value) + 1; if (len > uap->len) len = uap->len; error = copyout(value, uap->value, len); freeenv(value); if (error) goto done; td->td_retval[0] = len; break; case KENV_SET: len = uap->len; if (len < 1) { error = EINVAL; goto done; } if (len > KENV_MVALLEN + 1) len = KENV_MVALLEN + 1; value = malloc(len, M_TEMP, M_WAITOK); error = copyinstr(uap->value, value, len, NULL); if (error) { free(value, M_TEMP); goto done; } #ifdef MAC error = mac_kenv_check_set(td->td_ucred, name, value); if (error == 0) #endif kern_setenv(name, value); free(value, M_TEMP); break; case KENV_UNSET: #ifdef MAC error = mac_kenv_check_unset(td->td_ucred, name); if (error) goto done; #endif error = kern_unsetenv(name); if (error) error = ENOENT; break; default: error = EINVAL; break; } done: free(name, M_TEMP); return (error); } /* * Populate the initial kernel environment. * * This is called very early in MD startup, either to provide a copy of the * environment obtained from a boot loader, or to provide an empty buffer into * which MD code can store an initial environment using kern_setenv() calls. * * When a copy of an initial environment is passed in, we start by scanning that * env for overrides to the compiled-in envmode and hintmode variables. * * If the global envmode is 1, the environment is initialized from the global * static_env[], regardless of the arguments passed. This implements the env * keyword described in config(5). In this case env_pos is set to env_len, * causing kern_setenv() to return -1 (if len > 0) or panic (if len == 0) until * the dynamic environment is available. The envmode and static_env variables * are defined in env.c which is generated by config(8). 
* * If len is non-zero, the caller is providing an empty buffer. The caller will * subsequently use kern_setenv() to add up to len bytes of initial environment * before the dynamic environment is available. * * If len is zero, the caller is providing a pre-loaded buffer containing * environment strings. Additional strings cannot be added until the dynamic * environment is available. The memory pointed to must remain stable at least * until sysinit runs init_dynamic_kenv(). If no initial environment is * available from the boot loader, passing a NULL pointer allows the static_env * to be installed if it is configured. */ void init_static_kenv(char *buf, size_t len) { char *cp; for (cp = buf; cp != NULL && cp[0] != '\0'; cp += strlen(cp) + 1) { if (strcmp(cp, "static_env.disabled=1") == 0) envmode = 0; if (strcmp(cp, "static_hints.disabled=1") == 0) hintmode = 0; } if (envmode == 1) { kern_envp = static_env; env_len = len; env_pos = len; } else { kern_envp = buf; env_len = len; env_pos = 0; } } /* * Setup the dynamic kernel environment. */ static void init_dynamic_kenv(void *data __unused) { char *cp, *cpnext; size_t len; int i; kenvp = malloc((KENV_SIZE + 1) * sizeof(char *), M_KENV, M_WAITOK | M_ZERO); i = 0; if (kern_envp && *kern_envp != '\0') { for (cp = kern_envp; cp != NULL; cp = cpnext) { cpnext = kernenv_next(cp); len = strlen(cp) + 1; if (len > KENV_MNAMELEN + 1 + KENV_MVALLEN + 1) { printf( "WARNING: too long kenv string, ignoring %s\n", cp); continue; } if (i < KENV_SIZE) { kenvp[i] = malloc(len, M_KENV, M_WAITOK); strcpy(kenvp[i++], cp); memset(cp, 0, strlen(cp)); } else printf( "WARNING: too many kenv strings, ignoring %s\n", cp); } } kenvp[i] = NULL; mtx_init(&kenv_lock, "kernel environment", NULL, MTX_DEF); dynamic_kenv = 1; } SYSINIT(kenv, SI_SUB_KMEM, SI_ORDER_ANY, init_dynamic_kenv, NULL); void freeenv(char *env) { if (dynamic_kenv && env != NULL) { memset(env, 0, strlen(env)); free(env, M_KENV); } } /* * Internal functions for string lookup. */ static char * _getenv_dynamic(const char *name, int *idx) { char *cp; int len, i; mtx_assert(&kenv_lock, MA_OWNED); len = strlen(name); for (cp = kenvp[0], i = 0; cp != NULL; cp = kenvp[++i]) { if ((strncmp(cp, name, len) == 0) && (cp[len] == '=')) { if (idx != NULL) *idx = i; return (cp + len + 1); } } return (NULL); } static char * _getenv_static(const char *name) { char *cp, *ep; int len; for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) { for (ep = cp; (*ep != '=') && (*ep != 0); ep++) ; if (*ep != '=') continue; len = ep - cp; ep++; if (!strncmp(name, cp, len) && name[len] == 0) return (ep); } return (NULL); } /* * Look up an environment variable by name. * Return a pointer to the string if found. * The pointer has to be freed with freeenv() * after use. */ char * kern_getenv(const char *name) { char buf[KENV_MNAMELEN + 1 + KENV_MVALLEN + 1]; char *ret; if (dynamic_kenv) { if (getenv_string(name, buf, sizeof(buf))) { ret = strdup(buf, M_KENV); } else { ret = NULL; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "getenv"); } } else ret = _getenv_static(name); return (ret); } /* * Test if an environment variable is defined. 
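 *
 * Illustrative in-kernel usage (not part of this commit):
 *
 *	if (testenv("vfs.root.mountfrom"))
 *		... a root device hint was supplied ...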
*/ int testenv(const char *name) { char *cp; if (dynamic_kenv) { mtx_lock(&kenv_lock); cp = _getenv_dynamic(name, NULL); mtx_unlock(&kenv_lock); } else cp = _getenv_static(name); if (cp != NULL) return (1); return (0); } static int setenv_static(const char *name, const char *value) { int len; if (env_pos >= env_len) return (-1); /* Check space for x=y and two nuls */ len = strlen(name) + strlen(value); if (len + 3 < env_len - env_pos) { len = sprintf(&kern_envp[env_pos], "%s=%s", name, value); env_pos += len+1; kern_envp[env_pos] = '\0'; return (0); } else return (-1); } /* * Set an environment variable by name. */ int kern_setenv(const char *name, const char *value) { char *buf, *cp, *oldenv; int namelen, vallen, i; if (dynamic_kenv == 0 && env_len > 0) return (setenv_static(name, value)); KENV_CHECK; namelen = strlen(name) + 1; if (namelen > KENV_MNAMELEN + 1) return (-1); vallen = strlen(value) + 1; if (vallen > KENV_MVALLEN + 1) return (-1); buf = malloc(namelen + vallen, M_KENV, M_WAITOK); sprintf(buf, "%s=%s", name, value); mtx_lock(&kenv_lock); cp = _getenv_dynamic(name, &i); if (cp != NULL) { oldenv = kenvp[i]; kenvp[i] = buf; mtx_unlock(&kenv_lock); free(oldenv, M_KENV); } else { /* We add the option if it wasn't found */ for (i = 0; (cp = kenvp[i]) != NULL; i++) ; /* Bounds checking */ if (i < 0 || i >= KENV_SIZE) { free(buf, M_KENV); mtx_unlock(&kenv_lock); return (-1); } kenvp[i] = buf; kenvp[i + 1] = NULL; mtx_unlock(&kenv_lock); } return (0); } /* * Unset an environment variable string. */ int kern_unsetenv(const char *name) { char *cp, *oldenv; int i, j; KENV_CHECK; mtx_lock(&kenv_lock); cp = _getenv_dynamic(name, &i); if (cp != NULL) { oldenv = kenvp[i]; for (j = i + 1; kenvp[j] != NULL; j++) kenvp[i++] = kenvp[j]; kenvp[i] = NULL; mtx_unlock(&kenv_lock); memset(oldenv, 0, strlen(oldenv)); free(oldenv, M_KENV); return (0); } mtx_unlock(&kenv_lock); return (-1); } /* * Return a string value from an environment variable. */ int getenv_string(const char *name, char *data, int size) { char *cp; if (dynamic_kenv) { mtx_lock(&kenv_lock); cp = _getenv_dynamic(name, NULL); if (cp != NULL) strlcpy(data, cp, size); mtx_unlock(&kenv_lock); } else { cp = _getenv_static(name); if (cp != NULL) strlcpy(data, cp, size); } return (cp != NULL); } /* * Return an integer value from an environment variable. */ int getenv_int(const char *name, int *data) { quad_t tmp; int rval; rval = getenv_quad(name, &tmp); if (rval) *data = (int) tmp; return (rval); } /* * Return an unsigned integer value from an environment variable. */ int getenv_uint(const char *name, unsigned int *data) { quad_t tmp; int rval; rval = getenv_quad(name, &tmp); if (rval) *data = (unsigned int) tmp; return (rval); } /* * Return an int64_t value from an environment variable. */ int getenv_int64(const char *name, int64_t *data) { quad_t tmp; int64_t rval; rval = getenv_quad(name, &tmp); if (rval) *data = (int64_t) tmp; return (rval); } /* * Return an uint64_t value from an environment variable. */ int getenv_uint64(const char *name, uint64_t *data) { quad_t tmp; uint64_t rval; rval = getenv_quad(name, &tmp); if (rval) *data = (uint64_t) tmp; return (rval); } /* * Return a long value from an environment variable. */ int getenv_long(const char *name, long *data) { quad_t tmp; int rval; rval = getenv_quad(name, &tmp); if (rval) *data = (long) tmp; return (rval); } /* * Return an unsigned long value from an environment variable. 
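 *
 * Like the other typed getenv_*() wrappers, this parses through
 * getenv_quad(), so the k/m/g/t suffixes accepted there apply here as
 * well.  Illustrative usage (hypothetical variable name, not part of
 * this commit):
 *
 *	unsigned long val;
 *	if (getenv_ulong("example.bufsize", &val))
 *		... "example.bufsize=1m" yields val == 1048576 ...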
*/ int getenv_ulong(const char *name, unsigned long *data) { quad_t tmp; int rval; rval = getenv_quad(name, &tmp); if (rval) *data = (unsigned long) tmp; return (rval); } /* * Return a quad_t value from an environment variable. */ int getenv_quad(const char *name, quad_t *data) { char value[KENV_MNAMELEN + 1 + KENV_MVALLEN + 1]; char *vtp; quad_t iv; if (!getenv_string(name, value, sizeof(value))) return (0); iv = strtoq(value, &vtp, 0); if (vtp == value || (vtp[0] != '\0' && vtp[1] != '\0')) return (0); switch (vtp[0]) { case 't': case 'T': iv *= 1024; case 'g': case 'G': iv *= 1024; case 'm': case 'M': iv *= 1024; case 'k': case 'K': iv *= 1024; case '\0': break; default: return (0); } *data = iv; return (1); } /* * Find the next entry after the one which (cp) falls within, return a * pointer to its start or NULL if there are no more. */ static char * kernenv_next(char *cp) { if (cp != NULL) { while (*cp != 0) cp++; cp++; if (*cp == 0) cp = NULL; } return (cp); } void tunable_int_init(void *data) { struct tunable_int *d = (struct tunable_int *)data; TUNABLE_INT_FETCH(d->path, d->var); } void tunable_long_init(void *data) { struct tunable_long *d = (struct tunable_long *)data; TUNABLE_LONG_FETCH(d->path, d->var); } void tunable_ulong_init(void *data) { struct tunable_ulong *d = (struct tunable_ulong *)data; TUNABLE_ULONG_FETCH(d->path, d->var); } void tunable_int64_init(void *data) { struct tunable_int64 *d = (struct tunable_int64 *)data; TUNABLE_INT64_FETCH(d->path, d->var); } void tunable_uint64_init(void *data) { struct tunable_uint64 *d = (struct tunable_uint64 *)data; TUNABLE_UINT64_FETCH(d->path, d->var); } void tunable_quad_init(void *data) { struct tunable_quad *d = (struct tunable_quad *)data; TUNABLE_QUAD_FETCH(d->path, d->var); } void tunable_str_init(void *data) { struct tunable_str *d = (struct tunable_str *)data; TUNABLE_STR_FETCH(d->path, d->var, d->size); } Index: head/sys/kern/kern_et.c =================================================================== --- head/sys/kern/kern_et.c (revision 326270) +++ head/sys/kern/kern_et.c (revision 326271) @@ -1,265 +1,267 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010-2013 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include "opt_timer.h" SLIST_HEAD(et_eventtimers_list, eventtimer); static struct et_eventtimers_list eventtimers = SLIST_HEAD_INITIALIZER(et_eventtimers); struct mtx et_eventtimers_mtx; MTX_SYSINIT(et_eventtimers_init, &et_eventtimers_mtx, "et_mtx", MTX_DEF); SYSCTL_NODE(_kern, OID_AUTO, eventtimer, CTLFLAG_RW, 0, "Event timers"); static SYSCTL_NODE(_kern_eventtimer, OID_AUTO, et, CTLFLAG_RW, 0, ""); /* * Register a new event timer hardware. */ int et_register(struct eventtimer *et) { struct eventtimer *tmp, *next; if (et->et_quality >= 0 || bootverbose) { if (et->et_frequency == 0) { printf("Event timer \"%s\" quality %d\n", et->et_name, et->et_quality); } else { printf("Event timer \"%s\" " "frequency %ju Hz quality %d\n", et->et_name, (uintmax_t)et->et_frequency, et->et_quality); } } KASSERT(et->et_start, ("et_register: timer has no start function")); et->et_sysctl = SYSCTL_ADD_NODE_WITH_LABEL(NULL, SYSCTL_STATIC_CHILDREN(_kern_eventtimer_et), OID_AUTO, et->et_name, CTLFLAG_RW, 0, "event timer description", "eventtimer"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(et->et_sysctl), OID_AUTO, "flags", CTLFLAG_RD, &(et->et_flags), 0, "Event timer capabilities"); SYSCTL_ADD_UQUAD(NULL, SYSCTL_CHILDREN(et->et_sysctl), OID_AUTO, "frequency", CTLFLAG_RD, &(et->et_frequency), "Event timer base frequency"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(et->et_sysctl), OID_AUTO, "quality", CTLFLAG_RD, &(et->et_quality), 0, "Goodness of event timer"); ET_LOCK(); if (SLIST_EMPTY(&eventtimers) || SLIST_FIRST(&eventtimers)->et_quality < et->et_quality) { SLIST_INSERT_HEAD(&eventtimers, et, et_all); } else { SLIST_FOREACH(tmp, &eventtimers, et_all) { next = SLIST_NEXT(tmp, et_all); if (next == NULL || next->et_quality < et->et_quality) { SLIST_INSERT_AFTER(tmp, et, et_all); break; } } } ET_UNLOCK(); return (0); } /* * Deregister event timer hardware. */ int et_deregister(struct eventtimer *et) { int err = 0; if (et->et_deregister_cb != NULL) { if ((err = et->et_deregister_cb(et, et->et_arg)) != 0) return (err); } ET_LOCK(); SLIST_REMOVE(&eventtimers, et, eventtimer, et_all); ET_UNLOCK(); sysctl_remove_oid(et->et_sysctl, 1, 1); return (0); } /* * Change the frequency of the given timer. If it is the active timer, * reconfigure it on all CPUs (reschedules all current events based on the new * timer frequency). */ void et_change_frequency(struct eventtimer *et, uint64_t newfreq) { #ifndef NO_EVENTTIMERS cpu_et_frequency(et, newfreq); #endif } /* * Find free event timer hardware with specified parameters. */ struct eventtimer * et_find(const char *name, int check, int want) { struct eventtimer *et = NULL; SLIST_FOREACH(et, &eventtimers, et_all) { if (et->et_active) continue; if (name != NULL && strcasecmp(et->et_name, name) != 0) continue; if (name == NULL && et->et_quality < 0) continue; if ((et->et_flags & check) != want) continue; break; } return (et); } /* * Initialize event timer hardware. Set callbacks. */ int et_init(struct eventtimer *et, et_event_cb_t *event, et_deregister_cb_t *deregister, void *arg) { if (event == NULL) return (EINVAL); if (et->et_active) return (EBUSY); et->et_active = 1; et->et_event_cb = event; et->et_deregister_cb = deregister; et->et_arg = arg; return (0); } /* * Start event timer hardware. * first - delay before first tick. * period - period of subsequent periodic ticks. 
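 *
 * Illustrative consumer sketch (not part of this commit; my_event_cb is
 * hypothetical): a subsystem typically claims a timer and then arms it,
 * e.g. one-shot, one millisecond from now:
 *
 *	et = et_find(NULL, ET_FLAGS_ONESHOT, ET_FLAGS_ONESHOT);
 *	if (et != NULL && et_init(et, my_event_cb, NULL, NULL) == 0)
 *		error = et_start(et, SBT_1MS, 0);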
*/ int et_start(struct eventtimer *et, sbintime_t first, sbintime_t period) { if (!et->et_active) return (ENXIO); KASSERT(period >= 0, ("et_start: negative period")); KASSERT((et->et_flags & ET_FLAGS_PERIODIC) || period == 0, ("et_start: period specified for oneshot-only timer")); KASSERT((et->et_flags & ET_FLAGS_ONESHOT) || period != 0, ("et_start: period not specified for periodic-only timer")); if (period != 0) { if (period < et->et_min_period) period = et->et_min_period; else if (period > et->et_max_period) period = et->et_max_period; } if (period == 0 || first != 0) { if (first < et->et_min_period) first = et->et_min_period; else if (first > et->et_max_period) first = et->et_max_period; } return (et->et_start(et, first, period)); } /* Stop event timer hardware. */ int et_stop(struct eventtimer *et) { if (!et->et_active) return (ENXIO); if (et->et_stop) return (et->et_stop(et)); return (0); } /* Mark event timer hardware as broken. */ int et_ban(struct eventtimer *et) { et->et_flags &= ~(ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT); return (0); } /* Free event timer hardware. */ int et_free(struct eventtimer *et) { if (!et->et_active) return (ENXIO); et->et_active = 0; return (0); } /* Report list of supported event timer hardware via sysctl. */ static int sysctl_kern_eventtimer_choice(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct eventtimer *et; int error; sbuf_new(&sb, NULL, 256, SBUF_AUTOEXTEND | SBUF_INCLUDENUL); ET_LOCK(); SLIST_FOREACH(et, &eventtimers, et_all) { if (et != SLIST_FIRST(&eventtimers)) sbuf_putc(&sb, ' '); sbuf_printf(&sb, "%s(%d)", et->et_name, et->et_quality); } ET_UNLOCK(); error = sbuf_finish(&sb); if (error == 0) error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb)); sbuf_delete(&sb); return (error); } SYSCTL_PROC(_kern_eventtimer, OID_AUTO, choice, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_eventtimer_choice, "A", "Present event timers"); Index: head/sys/kern/kern_event.c =================================================================== --- head/sys/kern/kern_event.c (revision 326270) +++ head/sys/kern/kern_event.c (revision 326271) @@ -1,2639 +1,2641 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1999,2000,2001 Jonathan Lemon * Copyright 2004 John-Mark Gurney * Copyright (c) 2009 Apple, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include "opt_kqueue.h" #ifdef COMPAT_FREEBSD11 #define _WANT_FREEBSD11_KEVENT #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); /* * This lock is used if multiple kq locks are required. This possibly * should be made into a per proc lock. */ static struct mtx kq_global; MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF); #define KQ_GLOBAL_LOCK(lck, haslck) do { \ if (!haslck) \ mtx_lock(lck); \ haslck = 1; \ } while (0) #define KQ_GLOBAL_UNLOCK(lck, haslck) do { \ if (haslck) \ mtx_unlock(lck); \ haslck = 0; \ } while (0) TASKQUEUE_DEFINE_THREAD(kqueue_ctx); static int kevent_copyout(void *arg, struct kevent *kevp, int count); static int kevent_copyin(void *arg, struct kevent *kevp, int count); static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok); static int kqueue_acquire(struct file *fp, struct kqueue **kqp); static void kqueue_release(struct kqueue *kq, int locked); static void kqueue_destroy(struct kqueue *kq); static void kqueue_drain(struct kqueue *kq, struct thread *td); static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int waitok); static void kqueue_task(void *arg, int pending); static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *timeout, struct kevent *keva, struct thread *td); static void kqueue_wakeup(struct kqueue *kq); static struct filterops *kqueue_fo_find(int filt); static void kqueue_fo_release(int filt); struct g_kevent_args; static int kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, struct kevent_copyops *k_ops, const char *struct_name); static fo_ioctl_t kqueue_ioctl; static fo_poll_t kqueue_poll; static fo_kqfilter_t kqueue_kqfilter; static fo_stat_t kqueue_stat; static fo_close_t kqueue_close; static fo_fill_kinfo_t kqueue_fill_kinfo; static struct fileops kqueueops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = kqueue_ioctl, .fo_poll = kqueue_poll, .fo_kqfilter = kqueue_kqfilter, .fo_stat = kqueue_stat, .fo_close = kqueue_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = kqueue_fill_kinfo, }; static int knote_attach(struct knote *kn, struct kqueue *kq); static void knote_drop(struct knote *kn, struct thread *td); static void knote_drop_detached(struct knote *kn, struct thread *td); static void knote_enqueue(struct knote *kn); static void knote_dequeue(struct knote *kn); static void knote_init(void); static struct knote *knote_alloc(int waitok); 
static void knote_free(struct knote *kn); static void filt_kqdetach(struct knote *kn); static int filt_kqueue(struct knote *kn, long hint); static int filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); static int filt_fileattach(struct knote *kn); static void filt_timerexpire(void *knx); static int filt_timerattach(struct knote *kn); static void filt_timerdetach(struct knote *kn); static int filt_timer(struct knote *kn, long hint); static int filt_userattach(struct knote *kn); static void filt_userdetach(struct knote *kn); static int filt_user(struct knote *kn, long hint); static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type); static struct filterops file_filtops = { .f_isfd = 1, .f_attach = filt_fileattach, }; static struct filterops kqread_filtops = { .f_isfd = 1, .f_detach = filt_kqdetach, .f_event = filt_kqueue, }; /* XXX - move to kern_proc.c? */ static struct filterops proc_filtops = { .f_isfd = 0, .f_attach = filt_procattach, .f_detach = filt_procdetach, .f_event = filt_proc, }; static struct filterops timer_filtops = { .f_isfd = 0, .f_attach = filt_timerattach, .f_detach = filt_timerdetach, .f_event = filt_timer, }; static struct filterops user_filtops = { .f_attach = filt_userattach, .f_detach = filt_userdetach, .f_event = filt_user, .f_touch = filt_usertouch, }; static uma_zone_t knote_zone; static unsigned int kq_ncallouts = 0; static unsigned int kq_calloutmax = 4 * 1024; SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); /* XXX - ensure not influx ? */ #define KNOTE_ACTIVATE(kn, islock) do { \ if ((islock)) \ mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \ else \ KQ_LOCK((kn)->kn_kq); \ (kn)->kn_status |= KN_ACTIVE; \ if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ knote_enqueue((kn)); \ if (!(islock)) \ KQ_UNLOCK((kn)->kn_kq); \ } while(0) #define KQ_LOCK(kq) do { \ mtx_lock(&(kq)->kq_lock); \ } while (0) #define KQ_FLUX_WAKEUP(kq) do { \ if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \ (kq)->kq_state &= ~KQ_FLUXWAIT; \ wakeup((kq)); \ } \ } while (0) #define KQ_UNLOCK_FLUX(kq) do { \ KQ_FLUX_WAKEUP(kq); \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_UNLOCK(kq) do { \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_OWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_OWNED); \ } while (0) #define KQ_NOTOWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \ } while (0) static struct knlist * kn_list_lock(struct knote *kn) { struct knlist *knl; knl = kn->kn_knlist; if (knl != NULL) knl->kl_lock(knl->kl_lockarg); return (knl); } static void kn_list_unlock(struct knlist *knl) { bool do_free; if (knl == NULL) return; do_free = knl->kl_autodestroy && knlist_empty(knl); knl->kl_unlock(knl->kl_lockarg); if (do_free) { knlist_destroy(knl); free(knl, M_KQUEUE); } } static bool kn_in_flux(struct knote *kn) { return (kn->kn_influx > 0); } static void kn_enter_flux(struct knote *kn) { KQ_OWNED(kn->kn_kq); MPASS(kn->kn_influx < INT_MAX); kn->kn_influx++; } static bool kn_leave_flux(struct knote *kn) { KQ_OWNED(kn->kn_kq); MPASS(kn->kn_influx > 0); kn->kn_influx--; return (kn->kn_influx == 0); } #define KNL_ASSERT_LOCK(knl, islocked) do { \ if (islocked) \ KNL_ASSERT_LOCKED(knl); \ else \ KNL_ASSERT_UNLOCKED(knl); \ } while (0) #ifdef INVARIANTS #define KNL_ASSERT_LOCKED(knl) do { \ knl->kl_assert_locked((knl)->kl_lockarg); \ } while (0) #define KNL_ASSERT_UNLOCKED(knl) do { \ 
knl->kl_assert_unlocked((knl)->kl_lockarg); \ } while (0) #else /* !INVARIANTS */ #define KNL_ASSERT_LOCKED(knl) do {} while(0) #define KNL_ASSERT_UNLOCKED(knl) do {} while (0) #endif /* INVARIANTS */ #ifndef KN_HASHSIZE #define KN_HASHSIZE 64 /* XXX should be tunable */ #endif #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) static int filt_nullattach(struct knote *kn) { return (ENXIO); }; struct filterops null_filtops = { .f_isfd = 0, .f_attach = filt_nullattach, }; /* XXX - make SYSINIT to add these, and move into respective modules. */ extern struct filterops sig_filtops; extern struct filterops fs_filtops; /* * Table for all system-defined filters. */ static struct mtx filterops_lock; MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", MTX_DEF); static struct { struct filterops *for_fop; int for_nolock; int for_refcnt; } sysfilt_ops[EVFILT_SYSCOUNT] = { { &file_filtops, 1 }, /* EVFILT_READ */ { &file_filtops, 1 }, /* EVFILT_WRITE */ { &null_filtops }, /* EVFILT_AIO */ { &file_filtops, 1 }, /* EVFILT_VNODE */ { &proc_filtops, 1 }, /* EVFILT_PROC */ { &sig_filtops, 1 }, /* EVFILT_SIGNAL */ { &timer_filtops, 1 }, /* EVFILT_TIMER */ { &file_filtops, 1 }, /* EVFILT_PROCDESC */ { &fs_filtops, 1 }, /* EVFILT_FS */ { &null_filtops }, /* EVFILT_LIO */ { &user_filtops, 1 }, /* EVFILT_USER */ { &null_filtops }, /* EVFILT_SENDFILE */ { &file_filtops, 1 }, /* EVFILT_EMPTY */ }; /* * Simple redirection for all cdevsw style objects to call their fo_kqfilter * method. */ static int filt_fileattach(struct knote *kn) { return (fo_kqfilter(kn->kn_fp, kn)); } /*ARGSUSED*/ static int kqueue_kqfilter(struct file *fp, struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_status |= KN_KQUEUE; kn->kn_fop = &kqread_filtops; knlist_add(&kq->kq_sel.si_note, kn, 0); return (0); } static void filt_kqdetach(struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; knlist_remove(&kq->kq_sel.si_note, kn, 0); } /*ARGSUSED*/ static int filt_kqueue(struct knote *kn, long hint) { struct kqueue *kq = kn->kn_fp->f_data; kn->kn_data = kq->kq_count; return (kn->kn_data > 0); } /* XXX - move to kern_proc.c? */ static int filt_procattach(struct knote *kn) { struct proc *p; int error; bool exiting, immediate; exiting = immediate = false; if (kn->kn_sfflags & NOTE_EXIT) p = pfind_any(kn->kn_id); else p = pfind(kn->kn_id); if (p == NULL) return (ESRCH); if (p->p_flag & P_WEXIT) exiting = true; if ((error = p_cansee(curthread, p))) { PROC_UNLOCK(p); return (error); } kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ /* * Internal flag indicating registration done by kernel for the * purposes of getting a NOTE_CHILD notification. */ if (kn->kn_flags & EV_FLAG2) { kn->kn_flags &= ~EV_FLAG2; kn->kn_data = kn->kn_sdata; /* ppid */ kn->kn_fflags = NOTE_CHILD; kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK); immediate = true; /* Force immediate activation of child note. */ } /* * Internal flag indicating registration done by kernel (for other than * NOTE_CHILD). */ if (kn->kn_flags & EV_FLAG1) { kn->kn_flags &= ~EV_FLAG1; } knlist_add(p->p_klist, kn, 1); /* * Immediately activate any child notes or, in the case of a zombie * target process, exit notes. The latter is necessary to handle the * case where the target process, e.g. a child, dies before the kevent * is registered.
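 *
 * Illustrative userland sketch (not part of this commit): the zombie
 * case matters for registrations such as
 *
 *	struct kevent kev;
 *	EV_SET(&kev, (uintptr_t)pid, EVFILT_PROC, EV_ADD | EV_ONESHOT,
 *	    NOTE_EXIT, 0, NULL);
 *	n = kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * where the watched child may already have exited by the time the
 * kevent(2) call reaches this attach routine.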
*/ if (immediate || (exiting && filt_proc(kn, NOTE_EXIT))) KNOTE_ACTIVATE(kn, 0); PROC_UNLOCK(p); return (0); } /* * The knote may be attached to a different process, which may exit, * leaving nothing for the knote to be attached to. So when the process * exits, the knote is marked as DETACHED and also flagged as ONESHOT so * it will be deleted when read out. However, as part of the knote deletion, * this routine is called, so a check is needed to avoid actually performing * a detach, because the original process does not exist any more. */ /* XXX - move to kern_proc.c? */ static void filt_procdetach(struct knote *kn) { knlist_remove(kn->kn_knlist, kn, 0); kn->kn_ptr.p_proc = NULL; } /* XXX - move to kern_proc.c? */ static int filt_proc(struct knote *kn, long hint) { struct proc *p; u_int event; p = kn->kn_ptr.p_proc; if (p == NULL) /* already activated, from attach filter */ return (0); /* Mask off extra data. */ event = (u_int)hint & NOTE_PCTRLMASK; /* If the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* Process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { kn->kn_flags |= EV_EOF | EV_ONESHOT; kn->kn_ptr.p_proc = NULL; if (kn->kn_fflags & NOTE_EXIT) kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig); if (kn->kn_fflags == 0) kn->kn_flags |= EV_DROP; return (1); } return (kn->kn_fflags != 0); } /* * Called when the process forks. It mostly does the same as knote(), * activating all knotes registered to be activated when the process * forks. Additionally, for each knote attached to the parent, check * whether the user wants to track the new process. If so, attach a new * knote to it, and immediately report an event with the child's pid. */ void knote_fork(struct knlist *list, int pid) { struct kqueue *kq; struct knote *kn; struct kevent kev; int error; if (list == NULL) return; list->kl_lock(list->kl_lockarg); SLIST_FOREACH(kn, &list->kl_list, kn_selnext) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { KQ_UNLOCK(kq); continue; } /* * The same as knote(), activate the event. */ if ((kn->kn_sfflags & NOTE_TRACK) == 0) { kn->kn_status |= KN_HASKQLOCK; if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 1); kn->kn_status &= ~KN_HASKQLOCK; KQ_UNLOCK(kq); continue; } /* * The NOTE_TRACK case. In addition to the activation * of the event, we need to register new events to * track the child. Drop the locks in preparation for * the call to kqueue_register(). */ kn_enter_flux(kn); KQ_UNLOCK(kq); list->kl_unlock(list->kl_lockarg); /* * Activate existing knote and register tracking knotes with * new process. * * First register a knote to get just the child notice. This * must be a separate note from a potential NOTE_EXIT * notification since both NOTE_CHILD and NOTE_EXIT are defined * to use the data field (in conflicting ways). */ kev.ident = pid; kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT | EV_FLAG2; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ error = kqueue_register(kq, &kev, NULL, 0); if (error) kn->kn_fflags |= NOTE_TRACKERR; /* * Then register another knote to track other potential events * from the new process.
*/ kev.ident = pid; kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ error = kqueue_register(kq, &kev, NULL, 0); if (error) kn->kn_fflags |= NOTE_TRACKERR; if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 0); KQ_LOCK(kq); kn_leave_flux(kn); KQ_UNLOCK_FLUX(kq); list->kl_lock(list->kl_lockarg); } list->kl_unlock(list->kl_lockarg); } /* * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the * interval timer support code. */ #define NOTE_TIMER_PRECMASK \ (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS) static sbintime_t timer2sbintime(intptr_t data, int flags) { int64_t secs; /* * Macros for converting to the fractional second portion of an * sbintime_t using 64bit multiplication to improve precision. */ #define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32) #define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32) #define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32) switch (flags & NOTE_TIMER_PRECMASK) { case NOTE_SECONDS: #ifdef __LP64__ if (data > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return ((sbintime_t)data << 32); case NOTE_MSECONDS: /* FALLTHROUGH */ case 0: if (data >= 1000) { secs = data / 1000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | MS_TO_SBT(data % 1000)); } return (MS_TO_SBT(data)); case NOTE_USECONDS: if (data >= 1000000) { secs = data / 1000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | US_TO_SBT(data % 1000000)); } return (US_TO_SBT(data)); case NOTE_NSECONDS: if (data >= 1000000000) { secs = data / 1000000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | US_TO_SBT(data % 1000000000)); } return (NS_TO_SBT(data)); default: break; } return (-1); } struct kq_timer_cb_data { struct callout c; sbintime_t next; /* next timer event fires at */ sbintime_t to; /* precalculated timer period, 0 for abs */ }; static void filt_timerexpire(void *knx) { struct knote *kn; struct kq_timer_cb_data *kc; kn = knx; kn->kn_data++; KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */ if ((kn->kn_flags & EV_ONESHOT) != 0) return; kc = kn->kn_ptr.p_v; if (kc->to == 0) return; kc->next += kc->to; callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE); } /* * data contains amount of time to sleep */ static int filt_timerattach(struct knote *kn) { struct kq_timer_cb_data *kc; struct bintime bt; sbintime_t to, sbt; unsigned int ncallouts; if (kn->kn_sdata < 0) return (EINVAL); if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0) kn->kn_sdata = 1; /* Only precision unit are supported in flags so far */ if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0) return (EINVAL); to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags); if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { getboottimebin(&bt); sbt = bttosbt(bt); to -= sbt; } if (to < 0) return (EINVAL); do { ncallouts = kq_ncallouts; if (ncallouts >= kq_calloutmax) return (ENOMEM); } while (!atomic_cmpset_int(&kq_ncallouts, ncallouts, ncallouts + 1)); if ((kn->kn_sfflags & NOTE_ABSTIME) == 0) kn->kn_flags |= EV_CLEAR; /* automatically set */ kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */ kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK); callout_init(&kc->c, 1); if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { 
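		/*
		 * NOTE_ABSTIME: "to" was rebased from wall-clock to the
		 * uptime base above, so arm the callout once at that
		 * absolute time; kc->to == 0 tells filt_timerexpire()
		 * not to re-arm.
		 */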
kc->next = to; kc->to = 0; } else { kc->next = to + sbinuptime(); kc->to = to; } callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE); return (0); } static void filt_timerdetach(struct knote *kn) { struct kq_timer_cb_data *kc; unsigned int old; kc = kn->kn_ptr.p_v; callout_drain(&kc->c); free(kc, M_KQUEUE); old = atomic_fetchadd_int(&kq_ncallouts, -1); KASSERT(old > 0, ("Number of callouts cannot become negative")); kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */ } static int filt_timer(struct knote *kn, long hint) { return (kn->kn_data != 0); } static int filt_userattach(struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ kn->kn_hook = NULL; if (kn->kn_fflags & NOTE_TRIGGER) kn->kn_hookid = 1; else kn->kn_hookid = 0; return (0); } static void filt_userdetach(__unused struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ } static int filt_user(struct knote *kn, __unused long hint) { return (kn->kn_hookid); } static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type) { u_int ffctrl; switch (type) { case EVENT_REGISTER: if (kev->fflags & NOTE_TRIGGER) kn->kn_hookid = 1; ffctrl = kev->fflags & NOTE_FFCTRLMASK; kev->fflags &= NOTE_FFLAGSMASK; switch (ffctrl) { case NOTE_FFNOP: break; case NOTE_FFAND: kn->kn_sfflags &= kev->fflags; break; case NOTE_FFOR: kn->kn_sfflags |= kev->fflags; break; case NOTE_FFCOPY: kn->kn_sfflags = kev->fflags; break; default: /* XXX Return error? */ break; } kn->kn_sdata = kev->data; if (kev->flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; case EVENT_PROCESS: *kev = kn->kn_kevent; kev->fflags = kn->kn_sfflags; kev->data = kn->kn_sdata; if (kn->kn_flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; default: panic("filt_usertouch() - invalid type (%ld)", type); break; } } int sys_kqueue(struct thread *td, struct kqueue_args *uap) { return (kern_kqueue(td, 0, NULL)); } static void kqueue_init(struct kqueue *kq) { mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK); TAILQ_INIT(&kq->kq_head); knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock); TASK_INIT(&kq->kq_task, 0, kqueue_task, kq); } int kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps) { struct filedesc *fdp; struct kqueue *kq; struct file *fp; struct ucred *cred; int fd, error; fdp = td->td_proc->p_fd; cred = td->td_ucred; if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES))) return (ENOMEM); error = falloc_caps(td, &fp, &fd, flags, fcaps); if (error != 0) { chgkqcnt(cred->cr_ruidinfo, -1, 0); return (error); } /* An extra reference on `fp' has been held for us by falloc(). 
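 * It is dropped via fdrop() below, once finit() has published the
 * kqueue through fp.
 *
 * Aside: the EVFILT_USER filter implemented above is driven entirely
 * from userspace. A minimal usage sketch (illustration only; assumes a
 * kqueue descriptor kq):
 *
 *	struct kevent kev;
 *	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	register
 *	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	fire the event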
*/ kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO); kqueue_init(kq); kq->kq_fdp = fdp; kq->kq_cred = crhold(cred); FILEDESC_XLOCK(fdp); TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); FILEDESC_XUNLOCK(fdp); finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); fdrop(fp, td); td->td_retval[0] = fd; return (0); } struct g_kevent_args { int fd; void *changelist; int nchanges; void *eventlist; int nevents; const struct timespec *timeout; }; int sys_kevent(struct thread *td, struct kevent_args *uap) { struct kevent_copyops k_ops = { .arg = uap, .k_copyout = kevent_copyout, .k_copyin = kevent_copyin, .kevent_size = sizeof(struct kevent), }; struct g_kevent_args gk_args = { .fd = uap->fd, .changelist = uap->changelist, .nchanges = uap->nchanges, .eventlist = uap->eventlist, .nevents = uap->nevents, .timeout = uap->timeout, }; return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent")); } static int kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, struct kevent_copyops *k_ops, const char *struct_name) { struct timespec ts, *tsp; #ifdef KTRACE struct kevent *eventlist = uap->eventlist; #endif int error; if (uap->timeout != NULL) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist, uap->nchanges, k_ops->kevent_size); #endif error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents, k_ops, tsp); #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray(struct_name, UIO_USERSPACE, eventlist, td->td_retval[0], k_ops->kevent_size); #endif return (error); } /* * Copy 'count' items into the destination list pointed to by uap->eventlist. */ static int kevent_copyout(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyout(kevp, uap->eventlist, count * sizeof *kevp); if (error == 0) uap->eventlist += count; return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. */ static int kevent_copyin(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyin(uap->changelist, kevp, count * sizeof *kevp); if (error == 0) uap->changelist += count; return (error); } #ifdef COMPAT_FREEBSD11 static int kevent11_copyout(void *arg, struct kevent *kevp, int count) { struct freebsd11_kevent_args *uap; struct kevent_freebsd11 kev11; int error, i; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct freebsd11_kevent_args *)arg; for (i = 0; i < count; i++) { kev11.ident = kevp->ident; kev11.filter = kevp->filter; kev11.flags = kevp->flags; kev11.fflags = kevp->fflags; kev11.data = kevp->data; kev11.udata = kevp->udata; error = copyout(&kev11, uap->eventlist, sizeof(kev11)); if (error != 0) break; uap->eventlist++; kevp++; } return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. 
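 * The FreeBSD 11 ABI kevent has no ext[] members, so kevent11_copyin()
 * below zeroes kevp->ext after translating each entry.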
*/ static int kevent11_copyin(void *arg, struct kevent *kevp, int count) { struct freebsd11_kevent_args *uap; struct kevent_freebsd11 kev11; int error, i; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct freebsd11_kevent_args *)arg; for (i = 0; i < count; i++) { error = copyin(uap->changelist, &kev11, sizeof(kev11)); if (error != 0) break; kevp->ident = kev11.ident; kevp->filter = kev11.filter; kevp->flags = kev11.flags; kevp->fflags = kev11.fflags; kevp->data = (uintptr_t)kev11.data; kevp->udata = kev11.udata; bzero(&kevp->ext, sizeof(kevp->ext)); uap->changelist++; kevp++; } return (error); } int freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap) { struct kevent_copyops k_ops = { .arg = uap, .k_copyout = kevent11_copyout, .k_copyin = kevent11_copyin, .kevent_size = sizeof(struct kevent_freebsd11), }; struct g_kevent_args gk_args = { .fd = uap->fd, .changelist = uap->changelist, .nchanges = uap->nchanges, .eventlist = uap->eventlist, .nevents = uap->nevents, .timeout = uap->timeout, }; return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent_freebsd11")); } #endif int kern_kevent(struct thread *td, int fd, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { cap_rights_t rights; struct file *fp; int error; cap_rights_init(&rights); if (nchanges > 0) cap_rights_set(&rights, CAP_KQUEUE_CHANGE); if (nevents > 0) cap_rights_set(&rights, CAP_KQUEUE_EVENT); error = fget(td, fd, &rights, &fp); if (error != 0) return (error); error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout); fdrop(fp, td); return (error); } static int kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kevent keva[KQ_NEVENTS]; struct kevent *kevp, *changes; int i, n, nerrors, error; nerrors = 0; while (nchanges > 0) { n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges; error = k_ops->k_copyin(k_ops->arg, keva, n); if (error) return (error); changes = keva; for (i = 0; i < n; i++) { kevp = &changes[i]; if (!kevp->filter) continue; kevp->flags &= ~EV_SYSFLAGS; error = kqueue_register(kq, kevp, td, 1); if (error || (kevp->flags & EV_RECEIPT)) { if (nevents == 0) return (error); kevp->flags = EV_ERROR; kevp->data = error; (void)k_ops->k_copyout(k_ops->arg, kevp, 1); nevents--; nerrors++; } } nchanges -= n; } if (nerrors) { td->td_retval[0] = nerrors; return (0); } return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td)); } int kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kqueue *kq; int error; error = kqueue_acquire(fp, &kq); if (error != 0) return (error); error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout); kqueue_release(kq, 0); return (error); } /* * Performs a kevent() call on a temporarily created kqueue. This can be * used to perform one-shot polling, similar to poll() and select(). 
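 * The caller supplies kevent_copyops that move events to and from
 * kernel memory rather than userspace. A minimal array-backed sketch
 * (illustration only; the names here are hypothetical):
 *
 *	static int
 *	kev_copyin_array(void *arg, struct kevent *kevp, int count)
 *	{
 *		memcpy(kevp, arg, count * sizeof(*kevp));
 *		return (0);
 *	}
 *
 * with an analogous k_copyout, wired up as
 *
 *	struct kevent_copyops k_ops = { .arg = kevs,
 *	    .k_copyin = kev_copyin_array, .k_copyout = kev_copyout_array,
 *	    .kevent_size = sizeof(struct kevent) };
 *	error = kern_kevent_anonymous(td, nevents, &k_ops);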
*/ int kern_kevent_anonymous(struct thread *td, int nevents, struct kevent_copyops *k_ops) { struct kqueue kq = {}; int error; kqueue_init(&kq); kq.kq_refcnt = 1; error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL); kqueue_drain(&kq, td); kqueue_destroy(&kq); return (error); } int kqueue_add_filteropts(int filt, struct filterops *filtops) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) { printf( "trying to add a filterop that is out of range: %d is beyond %d\n", ~filt, EVFILT_SYSCOUNT); return EINVAL; } mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop != &null_filtops && sysfilt_ops[~filt].for_fop != NULL) error = EEXIST; else { sysfilt_ops[~filt].for_fop = filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return (error); } int kqueue_del_filteropts(int filt) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return EINVAL; mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop == &null_filtops || sysfilt_ops[~filt].for_fop == NULL) error = EINVAL; else if (sysfilt_ops[~filt].for_refcnt != 0) error = EBUSY; else { sysfilt_ops[~filt].for_fop = &null_filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return error; } static struct filterops * kqueue_fo_find(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return NULL; if (sysfilt_ops[~filt].for_nolock) return sysfilt_ops[~filt].for_fop; mtx_lock(&filterops_lock); sysfilt_ops[~filt].for_refcnt++; if (sysfilt_ops[~filt].for_fop == NULL) sysfilt_ops[~filt].for_fop = &null_filtops; mtx_unlock(&filterops_lock); return sysfilt_ops[~filt].for_fop; } static void kqueue_fo_release(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return; if (sysfilt_ops[~filt].for_nolock) return; mtx_lock(&filterops_lock); KASSERT(sysfilt_ops[~filt].for_refcnt > 0, ("filter object refcount not valid on release")); sysfilt_ops[~filt].for_refcnt--; mtx_unlock(&filterops_lock); } /* * A ref to kq (obtained via kqueue_acquire) must be held. waitok will * influence if memory allocation should wait. Make sure it is 0 if you * hold any mutexes. */ static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok) { struct filterops *fops; struct file *fp; struct knote *kn, *tkn; struct knlist *knl; cap_rights_t rights; int error, filt, event; int haskqglobal, filedesc_unlock; if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE)) return (EINVAL); fp = NULL; kn = NULL; knl = NULL; error = 0; haskqglobal = 0; filedesc_unlock = 0; filt = kev->filter; fops = kqueue_fo_find(filt); if (fops == NULL) return EINVAL; if (kev->flags & EV_ADD) { /* * Prevent waiting with locks. Non-sleepable * allocation failures are handled in the loop, only * if the spare knote appears to be actually required. */ tkn = knote_alloc(waitok); } else { tkn = NULL; } findkn: if (fops->f_isfd) { KASSERT(td != NULL, ("td is NULL")); if (kev->ident > INT_MAX) error = EBADF; else error = fget(td, kev->ident, cap_rights_init(&rights, CAP_EVENT), &fp); if (error) goto done; if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops, kev->ident, 0) != 0) { /* try again */ fdrop(fp, td); fp = NULL; error = kqueue_expand(kq, fops, kev->ident, waitok); if (error) goto done; goto findkn; } if (fp->f_type == DTYPE_KQUEUE) { /* * If we add some intelligence about what we are doing, * we should be able to support events on ourselves. 
* We need to know when we are doing this to prevent * getting both the knlist lock and the kq lock since * they are the same thing. */ if (fp->f_data == kq) { error = EINVAL; goto done; } /* * Pre-lock the filedesc before the global * lock mutex, see the comment in * kqueue_close(). */ FILEDESC_XLOCK(td->td_proc->p_fd); filedesc_unlock = 1; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); } KQ_LOCK(kq); if (kev->ident < kq->kq_knlistsize) { SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link) if (kev->filter == kn->kn_filter) break; } } else { if ((kev->flags & EV_ADD) == EV_ADD) kqueue_expand(kq, fops, kev->ident, waitok); KQ_LOCK(kq); /* * If possible, find an existing knote to use for this kevent. */ if (kev->filter == EVFILT_PROC && (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) { /* This is an internal creation of a process tracking * note. Don't attempt to coalesce this with an * existing note. */ ; } else if (kq->kq_knhashmask != 0) { struct klist *list; list = &kq->kq_knhash[ KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; SLIST_FOREACH(kn, list, kn_link) if (kev->ident == kn->kn_id && kev->filter == kn->kn_filter) break; } } /* knote is in the process of changing, wait for it to stabilize. */ if (kn != NULL && kn_in_flux(kn)) { KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (filedesc_unlock) { FILEDESC_XUNLOCK(td->td_proc->p_fd); filedesc_unlock = 0; } kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0); if (fp != NULL) { fdrop(fp, td); fp = NULL; } goto findkn; } /* * kn now contains the matching knote, or NULL if no match */ if (kn == NULL) { if (kev->flags & EV_ADD) { kn = tkn; tkn = NULL; if (kn == NULL) { KQ_UNLOCK(kq); error = ENOMEM; goto done; } kn->kn_fp = fp; kn->kn_kq = kq; kn->kn_fop = fops; /* * apply reference counts to knote structure, and * do not release it at the end of this routine. */ fops = NULL; fp = NULL; kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kev->fflags = 0; kev->data = 0; kn->kn_kevent = *kev; kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE | EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT); kn->kn_status = KN_DETACHED; kn_enter_flux(kn); error = knote_attach(kn, kq); KQ_UNLOCK(kq); if (error != 0) { tkn = kn; goto done; } if ((error = kn->kn_fop->f_attach(kn)) != 0) { knote_drop_detached(kn, td); goto done; } knl = kn_list_lock(kn); goto done_ev_add; } else { /* No matching knote and the EV_ADD flag is not set. */ KQ_UNLOCK(kq); error = ENOENT; goto done; } } if (kev->flags & EV_DELETE) { kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); goto done; } if (kev->flags & EV_FORCEONESHOT) { kn->kn_flags |= EV_ONESHOT; KNOTE_ACTIVATE(kn, 1); } /* * The user may change some filter values after the initial EV_ADD, * but doing so will not reset any filter which has already been * triggered. */ kn->kn_status |= KN_SCAN; kn_enter_flux(kn); KQ_UNLOCK(kq); knl = kn_list_lock(kn); kn->kn_kevent.udata = kev->udata; if (!fops->f_isfd && fops->f_touch != NULL) { fops->f_touch(kn, kev, EVENT_REGISTER); } else { kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; } /* * We can get here with kn->kn_knlist == NULL. This can happen when * the initial attach event decides that the event is "completed" * already. i.e. filt_procattach is called on a zombie process. It * will call filt_proc which will remove it from the list, and NULL * kn_knlist. 
*/ done_ev_add: if ((kev->flags & EV_ENABLE) != 0) kn->kn_status &= ~KN_DISABLED; else if ((kev->flags & EV_DISABLE) != 0) kn->kn_status |= KN_DISABLED; if ((kn->kn_status & KN_DISABLED) == 0) event = kn->kn_fop->f_event(kn, 0); else event = 0; KQ_LOCK(kq); if (event) kn->kn_status |= KN_ACTIVE; if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) == KN_ACTIVE) knote_enqueue(kn); kn->kn_status &= ~KN_SCAN; kn_leave_flux(kn); kn_list_unlock(knl); KQ_UNLOCK_FLUX(kq); done: KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (filedesc_unlock) FILEDESC_XUNLOCK(td->td_proc->p_fd); if (fp != NULL) fdrop(fp, td); knote_free(tkn); if (fops != NULL) kqueue_fo_release(filt); return (error); } static int kqueue_acquire(struct file *fp, struct kqueue **kqp) { int error; struct kqueue *kq; error = 0; kq = fp->f_data; if (fp->f_type != DTYPE_KQUEUE || kq == NULL) return (EBADF); *kqp = kq; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { KQ_UNLOCK(kq); return (EBADF); } kq->kq_refcnt++; KQ_UNLOCK(kq); return error; } static void kqueue_release(struct kqueue *kq, int locked) { if (locked) KQ_OWNED(kq); else KQ_LOCK(kq); kq->kq_refcnt--; if (kq->kq_refcnt == 1) wakeup(&kq->kq_refcnt); if (!locked) KQ_UNLOCK(kq); } static void kqueue_schedtask(struct kqueue *kq) { KQ_OWNED(kq); KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN), ("scheduling kqueue task while draining")); if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) { taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task); kq->kq_state |= KQ_TASKSCHED; } } /* * Expand the kq to make sure we have storage for fops/ident pair. * * Return 0 on success (or no work necessary), return errno on failure. * * Not calling hashinit w/ waitok (proper malloc flag) should be safe. * If kqueue_register is called from a non-fd context, there usually/should * be no locks held. */ static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int waitok) { struct klist *list, *tmp_knhash, *to_free; u_long tmp_knhashmask; int size; int fd; int mflag = waitok ? M_WAITOK : M_NOWAIT; KQ_NOTOWNED(kq); to_free = NULL; if (fops->f_isfd) { fd = ident; if (kq->kq_knlistsize <= fd) { size = kq->kq_knlistsize; while (size <= fd) size += KQEXTENT; list = malloc(size * sizeof(*list), M_KQUEUE, mflag); if (list == NULL) return ENOMEM; KQ_LOCK(kq); if (kq->kq_knlistsize > fd) { to_free = list; list = NULL; } else { if (kq->kq_knlist != NULL) { bcopy(kq->kq_knlist, list, kq->kq_knlistsize * sizeof(*list)); to_free = kq->kq_knlist; kq->kq_knlist = NULL; } bzero((caddr_t)list + kq->kq_knlistsize * sizeof(*list), (size - kq->kq_knlistsize) * sizeof(*list)); kq->kq_knlistsize = size; kq->kq_knlist = list; } KQ_UNLOCK(kq); } } else { if (kq->kq_knhashmask == 0) { tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, &tmp_knhashmask); if (tmp_knhash == NULL) return ENOMEM; KQ_LOCK(kq); if (kq->kq_knhashmask == 0) { kq->kq_knhash = tmp_knhash; kq->kq_knhashmask = tmp_knhashmask; } else { to_free = tmp_knhash; } KQ_UNLOCK(kq); } } free(to_free, M_KQUEUE); KQ_NOTOWNED(kq); return 0; } static void kqueue_task(void *arg, int pending) { struct kqueue *kq; int haskqglobal; haskqglobal = 0; kq = arg; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); KQ_LOCK(kq); KNOTE_LOCKED(&kq->kq_sel.si_note, 0); kq->kq_state &= ~KQ_TASKSCHED; if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) { wakeup(&kq->kq_state); } KQ_UNLOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); } /* * Scan, update kn_data (if not ONESHOT), and copyout triggered events. 
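 * A private marker knote is enqueued at the tail to bound the scan: the
 * pass is complete once the marker is dequeued, so knotes requeued
 * behind it while scanning are not revisited.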
* We treat KN_MARKER knotes as if they are in flux. */ static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *tsp, struct kevent *keva, struct thread *td) { struct kevent *kevp; struct knote *kn, *marker; struct knlist *knl; sbintime_t asbt, rsbt; int count, error, haskqglobal, influx, nkev, touch; count = maxevents; nkev = 0; error = 0; haskqglobal = 0; if (maxevents == 0) goto done_nl; rsbt = 0; if (tsp != NULL) { if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 || tsp->tv_nsec >= 1000000000) { error = EINVAL; goto done_nl; } if (timespecisset(tsp)) { if (tsp->tv_sec <= INT32_MAX) { rsbt = tstosbt(*tsp); if (TIMESEL(&asbt, rsbt)) asbt += tc_tick_sbt; if (asbt <= SBT_MAX - rsbt) asbt += rsbt; else asbt = 0; rsbt >>= tc_precexp; } else asbt = 0; } else asbt = -1; } else asbt = 0; marker = knote_alloc(1); marker->kn_status = KN_MARKER; KQ_LOCK(kq); retry: kevp = keva; if (kq->kq_count == 0) { if (asbt == -1) { error = EWOULDBLOCK; } else { kq->kq_state |= KQ_SLEEP; error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH, "kqread", asbt, rsbt, C_ABSOLUTE); } if (error == 0) goto retry; /* don't restart after signals... */ if (error == ERESTART) error = EINTR; else if (error == EWOULDBLOCK) error = 0; goto done; } TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe); influx = 0; while (count) { KQ_OWNED(kq); kn = TAILQ_FIRST(&kq->kq_head); if ((kn->kn_status == KN_MARKER && kn != marker) || kn_in_flux(kn)) { if (influx) { influx = 0; KQ_FLUX_WAKEUP(kq); } kq->kq_state |= KQ_FLUXWAIT; error = msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); continue; } TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) { kn->kn_status &= ~KN_QUEUED; kq->kq_count--; continue; } if (kn == marker) { KQ_FLUX_WAKEUP(kq); if (count == maxevents) goto retry; goto done; } KASSERT(!kn_in_flux(kn), ("knote %p is unexpectedly in flux", kn)); if ((kn->kn_flags & EV_DROP) == EV_DROP) { kn->kn_status &= ~KN_QUEUED; kn_enter_flux(kn); kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've * marked it as in flux. */ knote_drop(kn, td); KQ_LOCK(kq); continue; } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) { kn->kn_status &= ~KN_QUEUED; kn_enter_flux(kn); kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've * marked the knote as being in flux. */ *kevp = kn->kn_kevent; knote_drop(kn, td); KQ_LOCK(kq); kn = NULL; } else { kn->kn_status |= KN_SCAN; kn_enter_flux(kn); KQ_UNLOCK(kq); if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE) KQ_GLOBAL_LOCK(&kq_global, haskqglobal); knl = kn_list_lock(kn); if (kn->kn_fop->f_event(kn, 0) == 0) { KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE | KN_SCAN); kn_leave_flux(kn); kq->kq_count--; kn_list_unlock(knl); influx = 1; continue; } touch = (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL); if (touch) kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS); else *kevp = kn->kn_kevent; KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) { /* * Manually clear knotes who weren't * 'touch'ed. 
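 * EV_CLEAR resets kn_data and kn_fflags once the event is reported;
 * EV_DISPATCH leaves them intact but disables the knote until the
 * user re-enables it with EV_ENABLE.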
*/ if (touch == 0 && kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; } if (kn->kn_flags & EV_DISPATCH) kn->kn_status |= KN_DISABLED; kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); kq->kq_count--; } else TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_SCAN; kn_leave_flux(kn); kn_list_unlock(knl); influx = 1; } /* we are returning a copy to the user */ kevp++; nkev++; count--; if (nkev == KQ_NEVENTS) { influx = 0; KQ_UNLOCK_FLUX(kq); error = k_ops->k_copyout(k_ops->arg, keva, nkev); nkev = 0; kevp = keva; KQ_LOCK(kq); if (error) break; } } TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe); done: KQ_OWNED(kq); KQ_UNLOCK_FLUX(kq); knote_free(marker); done_nl: KQ_NOTOWNED(kq); if (nkev != 0) error = k_ops->k_copyout(k_ops->arg, keva, nkev); td->td_retval[0] = maxevents - count; return (error); } /*ARGSUSED*/ static int kqueue_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { /* * Enabling sigio causes two major problems: * 1) infinite recursion: * Synopsis: kevent is being used to track signals and has FIOASYNC * set. On receipt of a signal this will cause a kqueue to recurse * into itself over and over. Sending the sigio causes the kqueue * to become ready, which in turn posts sigio again, forever. * Solution: this can be solved by setting a flag in the kqueue that * we have a SIGIO in progress. * 2) locking problems: * Synopsis: Kqueue is a leaf subsystem, but adding signalling puts * us above the proc and pgrp locks. * Solution: Post a signal using an async mechanism, being sure to * record a generation count in the delivery so that we do not deliver * a signal to the wrong process. * * Note, these two mechanisms are somewhat mutually exclusive! */ #if 0 struct kqueue *kq; kq = fp->f_data; switch (cmd) { case FIOASYNC: if (*(int *)data) { kq->kq_state |= KQ_ASYNC; } else { kq->kq_state &= ~KQ_ASYNC; } return (0); case FIOSETOWN: return (fsetown(*(int *)data, &kq->kq_sigio)); case FIOGETOWN: *(int *)data = fgetown(&kq->kq_sigio); return (0); } #endif return (ENOTTY); } /*ARGSUSED*/ static int kqueue_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct kqueue *kq; int revents = 0; int error; if ((error = kqueue_acquire(fp, &kq))) return POLLERR; KQ_LOCK(kq); if (events & (POLLIN | POLLRDNORM)) { if (kq->kq_count) { revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(td, &kq->kq_sel); if (SEL_WAITING(&kq->kq_sel)) kq->kq_state |= KQ_SEL; } } kqueue_release(kq, 1); KQ_UNLOCK(kq); return (revents); } /*ARGSUSED*/ static int kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { bzero((void *)st, sizeof *st); /* * We no longer return kq_count because the unlocked value is useless. * If you spent all this time getting the count, why not spend your * syscall better by calling kevent? * * XXX - This is needed for libc_r.
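 * Only st_mode is filled in, so fstat(2) on a kqueue reports a FIFO
 * with otherwise zeroed attributes.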
*/ st->st_mode = S_IFIFO; return (0); } static void kqueue_drain(struct kqueue *kq, struct thread *td) { struct knote *kn; int i; KQ_LOCK(kq); KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING, ("kqueue already closing")); kq->kq_state |= KQ_CLOSING; if (kq->kq_refcnt > 1) msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0); KASSERT(kq->kq_refcnt == 1, ("other refs are out there!")); KASSERT(knlist_empty(&kq->kq_sel.si_note), ("kqueue's knlist not empty")); for (i = 0; i < kq->kq_knlistsize; i++) { while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) { if (kn_in_flux(kn)) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0); continue; } kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); KQ_LOCK(kq); } } if (kq->kq_knhashmask != 0) { for (i = 0; i <= kq->kq_knhashmask; i++) { while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) { if (kn_in_flux(kn)) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo2", 0); continue; } kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); KQ_LOCK(kq); } } } if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) { kq->kq_state |= KQ_TASKDRAIN; msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } KQ_UNLOCK(kq); } static void kqueue_destroy(struct kqueue *kq) { KASSERT(kq->kq_fdp == NULL, ("kqueue still attached to a file descriptor")); seldrain(&kq->kq_sel); knlist_destroy(&kq->kq_sel.si_note); mtx_destroy(&kq->kq_lock); if (kq->kq_knhash != NULL) free(kq->kq_knhash, M_KQUEUE); if (kq->kq_knlist != NULL) free(kq->kq_knlist, M_KQUEUE); funsetown(&kq->kq_sigio); } /*ARGSUSED*/ static int kqueue_close(struct file *fp, struct thread *td) { struct kqueue *kq = fp->f_data; struct filedesc *fdp; int error; int filedesc_unlock; if ((error = kqueue_acquire(fp, &kq))) return error; kqueue_drain(kq, td); /* * We could be called due to the knote_drop() doing fdrop(), * called from kqueue_register(). In this case the global * lock is owned, and filedesc sx is locked before, to not * take the sleepable lock after non-sleepable. */ fdp = kq->kq_fdp; kq->kq_fdp = NULL; if (!sx_xlocked(FILEDESC_LOCK(fdp))) { FILEDESC_XLOCK(fdp); filedesc_unlock = 1; } else filedesc_unlock = 0; TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list); if (filedesc_unlock) FILEDESC_XUNLOCK(fdp); kqueue_destroy(kq); chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0); crfree(kq->kq_cred); free(kq, M_KQUEUE); fp->f_data = NULL; return (0); } static int kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_KQUEUE; return (0); } static void kqueue_wakeup(struct kqueue *kq) { KQ_OWNED(kq); if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) { kq->kq_state &= ~KQ_SLEEP; wakeup(kq); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } if (!knlist_empty(&kq->kq_sel.si_note)) kqueue_schedtask(kq); if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) { pgsigio(&kq->kq_sigio, SIGIO, 0); } } /* * Walk down a list of knotes, activating them if their event has triggered. * * There is a possibility to optimize in the case of one kq watching another. * Instead of scheduling a task to wake it up, you could pass enough state * down the chain to make up the parent kqueue. Make this code functional * first. 
*/ void knote(struct knlist *list, long hint, int lockflags) { struct kqueue *kq; struct knote *kn, *tkn; int error; if (list == NULL) return; KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED); if ((lockflags & KNF_LISTLOCKED) == 0) list->kl_lock(list->kl_lockarg); /* * If we unlock the list lock (and enter influx), we can * eliminate the kqueue scheduling, but this will introduce * four lock/unlock's for each knote to test. Also, a marker * would be needed to keep iteration position, since filters * or other threads could remove events. */ SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { /* * Do not process the influx notes, except for * the influx coming from the kq unlock in the * kqueue_scan(). In the latter case, we do * not interfere with the scan, since the code * fragment in kqueue_scan() locks the knlist, * and cannot proceed until we have finished. */ KQ_UNLOCK(kq); } else if ((lockflags & KNF_NOKQLOCK) != 0) { kn_enter_flux(kn); KQ_UNLOCK(kq); error = kn->kn_fop->f_event(kn, hint); KQ_LOCK(kq); kn_leave_flux(kn); if (error) KNOTE_ACTIVATE(kn, 1); KQ_UNLOCK_FLUX(kq); } else { kn->kn_status |= KN_HASKQLOCK; if (kn->kn_fop->f_event(kn, hint)) KNOTE_ACTIVATE(kn, 1); kn->kn_status &= ~KN_HASKQLOCK; KQ_UNLOCK(kq); } } if ((lockflags & KNF_LISTLOCKED) == 0) list->kl_unlock(list->kl_lockarg); } /* * add a knote to a knlist */ void knlist_add(struct knlist *knl, struct knote *kn, int islocked) { KNL_ASSERT_LOCK(knl, islocked); KQ_NOTOWNED(kn->kn_kq); KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn)); KASSERT((kn->kn_status & KN_DETACHED) != 0, ("knote %p was not detached", kn)); if (!islocked) knl->kl_lock(knl->kl_lockarg); SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext); if (!islocked) knl->kl_unlock(knl->kl_lockarg); KQ_LOCK(kn->kn_kq); kn->kn_knlist = knl; kn->kn_status &= ~KN_DETACHED; KQ_UNLOCK(kn->kn_kq); } static void knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked) { KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked")); KNL_ASSERT_LOCK(knl, knlislocked); mtx_assert(&kn->kn_kq->kq_lock, kqislocked ?
MA_OWNED : MA_NOTOWNED); KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn)); KASSERT((kn->kn_status & KN_DETACHED) == 0, ("knote %p was already detached", kn)); if (!knlislocked) knl->kl_lock(knl->kl_lockarg); SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext); kn->kn_knlist = NULL; if (!knlislocked) kn_list_unlock(knl); if (!kqislocked) KQ_LOCK(kn->kn_kq); kn->kn_status |= KN_DETACHED; if (!kqislocked) KQ_UNLOCK(kn->kn_kq); } /* * remove knote from the specified knlist */ void knlist_remove(struct knlist *knl, struct knote *kn, int islocked) { knlist_remove_kq(knl, kn, islocked, 0); } int knlist_empty(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); return (SLIST_EMPTY(&knl->kl_list)); } static struct mtx knlist_lock; MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects", MTX_DEF); static void knlist_mtx_lock(void *arg); static void knlist_mtx_unlock(void *arg); static void knlist_mtx_lock(void *arg) { mtx_lock((struct mtx *)arg); } static void knlist_mtx_unlock(void *arg) { mtx_unlock((struct mtx *)arg); } static void knlist_mtx_assert_locked(void *arg) { mtx_assert((struct mtx *)arg, MA_OWNED); } static void knlist_mtx_assert_unlocked(void *arg) { mtx_assert((struct mtx *)arg, MA_NOTOWNED); } static void knlist_rw_rlock(void *arg) { rw_rlock((struct rwlock *)arg); } static void knlist_rw_runlock(void *arg) { rw_runlock((struct rwlock *)arg); } static void knlist_rw_assert_locked(void *arg) { rw_assert((struct rwlock *)arg, RA_LOCKED); } static void knlist_rw_assert_unlocked(void *arg) { rw_assert((struct rwlock *)arg, RA_UNLOCKED); } void knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *), void (*kl_unlock)(void *), void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *)) { if (lock == NULL) knl->kl_lockarg = &knlist_lock; else knl->kl_lockarg = lock; if (kl_lock == NULL) knl->kl_lock = knlist_mtx_lock; else knl->kl_lock = kl_lock; if (kl_unlock == NULL) knl->kl_unlock = knlist_mtx_unlock; else knl->kl_unlock = kl_unlock; if (kl_assert_locked == NULL) knl->kl_assert_locked = knlist_mtx_assert_locked; else knl->kl_assert_locked = kl_assert_locked; if (kl_assert_unlocked == NULL) knl->kl_assert_unlocked = knlist_mtx_assert_unlocked; else knl->kl_assert_unlocked = kl_assert_unlocked; knl->kl_autodestroy = 0; SLIST_INIT(&knl->kl_list); } void knlist_init_mtx(struct knlist *knl, struct mtx *lock) { knlist_init(knl, lock, NULL, NULL, NULL, NULL); } struct knlist * knlist_alloc(struct mtx *lock) { struct knlist *knl; knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK); knlist_init_mtx(knl, lock); return (knl); } void knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock) { knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock, knlist_rw_assert_locked, knlist_rw_assert_unlocked); } void knlist_destroy(struct knlist *knl) { KASSERT(KNLIST_EMPTY(knl), ("destroying knlist %p with knotes on it", knl)); } void knlist_detach(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); knl->kl_autodestroy = 1; if (knlist_empty(knl)) { knlist_destroy(knl); free(knl, M_KQUEUE); } } /* * Even if we are locked, we may need to drop the lock to allow any influx * knotes time to "settle". 
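 * In-flux knotes are skipped on the first pass; if any remain
 * afterwards, the list lock is dropped, the thread sleeps on the kqueue
 * with KQ_FLUXWAIT set, and the scan restarts from the top.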
*/ void knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn) { struct knote *kn, *kn2; struct kqueue *kq; KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl)); if (islocked) KNL_ASSERT_LOCKED(knl); else { KNL_ASSERT_UNLOCKED(knl); again: /* need to reacquire lock since we have dropped it */ knl->kl_lock(knl->kl_lockarg); } SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn)) { KQ_UNLOCK(kq); continue; } knlist_remove_kq(knl, kn, 1, 1); if (killkn) { kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop_detached(kn, td); } else { /* Make sure cleared knotes disappear soon */ kn->kn_flags |= EV_EOF | EV_ONESHOT; KQ_UNLOCK(kq); } kq = NULL; } if (!SLIST_EMPTY(&knl->kl_list)) { /* there are still in flux knotes remaining */ kn = SLIST_FIRST(&knl->kl_list); kq = kn->kn_kq; KQ_LOCK(kq); KASSERT(kn_in_flux(kn), ("knote removed w/o list lock")); knl->kl_unlock(knl->kl_lockarg); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0); kq = NULL; goto again; } if (islocked) KNL_ASSERT_LOCKED(knl); else { knl->kl_unlock(knl->kl_lockarg); KNL_ASSERT_UNLOCKED(knl); } } /* * Remove all knotes referencing a specified fd; must be called with the * FILEDESC lock held. This prevents a race where a new fd comes along and * occupies the entry and we attach a knote to the fd. */ void knote_fdclose(struct thread *td, int fd) { struct filedesc *fdp = td->td_proc->p_fd; struct kqueue *kq; struct knote *kn; int influx; FILEDESC_XLOCK_ASSERT(fdp); /* * We shouldn't have to worry about new kevents appearing on fd * since filedesc is locked. */ TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) { KQ_LOCK(kq); again: influx = 0; while (kq->kq_knlistsize > fd && (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) { if (kn_in_flux(kn)) { /* someone else might be waiting on our knote */ if (influx) wakeup(kq); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); goto again; } kn_enter_flux(kn); KQ_UNLOCK(kq); influx = 1; knote_drop(kn, td); KQ_LOCK(kq); } KQ_UNLOCK_FLUX(kq); } } static int knote_attach(struct knote *kn, struct kqueue *kq) { struct klist *list; KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn)); KQ_OWNED(kq); if (kn->kn_fop->f_isfd) { if (kn->kn_id >= kq->kq_knlistsize) return (ENOMEM); list = &kq->kq_knlist[kn->kn_id]; } else { if (kq->kq_knhash == NULL) return (ENOMEM); list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; } SLIST_INSERT_HEAD(list, kn, kn_link); return (0); } static void knote_drop(struct knote *kn, struct thread *td) { if ((kn->kn_status & KN_DETACHED) == 0) kn->kn_fop->f_detach(kn); knote_drop_detached(kn, td); } static void knote_drop_detached(struct knote *kn, struct thread *td) { struct kqueue *kq; struct klist *list; kq = kn->kn_kq; KASSERT((kn->kn_status & KN_DETACHED) != 0, ("knote %p still attached", kn)); KQ_NOTOWNED(kq); KQ_LOCK(kq); KASSERT(kn->kn_influx == 1, ("knote_drop called on %p with influx %d", kn, kn->kn_influx)); if (kn->kn_fop->f_isfd) list = &kq->kq_knlist[kn->kn_id]; else list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; if (!SLIST_EMPTY(list)) SLIST_REMOVE(list, kn, knote, kn_link); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); KQ_UNLOCK_FLUX(kq); if (kn->kn_fop->f_isfd) { fdrop(kn->kn_fp, td); kn->kn_fp = NULL; } kqueue_fo_release(kn->kn_kevent.filter); kn->kn_fop = NULL; knote_free(kn); } static void knote_enqueue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT((kn->kn_status &
KN_QUEUED) == 0, ("knote already queued")); TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; kqueue_wakeup(kq); } static void knote_dequeue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; } static void knote_init(void) { knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL); static struct knote * knote_alloc(int waitok) { return (uma_zalloc(knote_zone, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO)); } static void knote_free(struct knote *kn) { uma_zfree(knote_zone, kn); } /* * Register the kev w/ the kq specified by fd. */ int kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok) { struct kqueue *kq; struct file *fp; cap_rights_t rights; int error; error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp); if (error != 0) return (error); if ((error = kqueue_acquire(fp, &kq)) != 0) goto noacquire; error = kqueue_register(kq, kev, td, waitok); kqueue_release(kq, 0); noacquire: fdrop(fp, td); return (error); } Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 326270) +++ head/sys/kern/kern_exec.c (revision 326271) @@ -1,1740 +1,1742 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_compat.h" #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exec; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, , , exec, "char *"); SDT_PROBE_DEFINE1(proc, , , exec__failure, "int"); SDT_PROBE_DEFINE1(proc, , , exec__success, "char *"); MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); int coredump_pack_fileinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN, &coredump_pack_fileinfo, 0, "Enable file path packing in 'procstat -f' coredump notes"); int coredump_pack_vmmapinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN, &coredump_pack_vmmapinfo, 0, "Enable file path packing in 'procstat -v' coredump notes"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_ps_strings, "LU", ""); /* XXX This should be vm_size_t. 
*/ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_usrstack, "LU", ""); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_stackprot, "I", ""); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, ""); static int disallow_high_osrel; SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW, &disallow_high_osrel, 0, "Disallow execution of binaries built for higher version of the world"); static int map_at_zero = 0; SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RWTUN, &map_at_zero, 0, "Permit processes to map an object at virtual address 0."); EVENTHANDLER_LIST_DECLARE(process_exec); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_psstrings; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, sizeof(p->p_sysent->sv_psstrings)); return error; } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_usrstack; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, sizeof(p->p_sysent->sv_usrstack)); return error; } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. 
*/ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif int sys_execve(struct thread *td, struct execve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, NULL); post_execve(td, error, oldvmspace); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fexecve_args { int fd; char **argv; char **envv; }; #endif int sys_fexecve(struct thread *td, struct fexecve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, NULL, UIO_SYSSPACE, uap->argv, uap->envv); if (error == 0) { args.fd = uap->fd; error = kern_execve(td, &args, NULL); } post_execve(td, error, oldvmspace); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif int sys___mac_execve(struct thread *td, struct __mac_execve_args *uap) { #ifdef MAC struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, uap->mac_p); post_execve(td, error, oldvmspace); return (error); #else return (ENOSYS); #endif } int pre_execve(struct thread *td, struct vmspace **oldvmspace) { struct proc *p; int error; KASSERT(td == curthread, ("non-current thread %p", td)); error = 0; p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); if (thread_single(p, SINGLE_BOUNDARY) != 0) error = ERESTART; PROC_UNLOCK(p); } KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0, ("nested execve")); *oldvmspace = p->p_vmspace; return (error); } void post_execve(struct thread *td, int error, struct vmspace *oldvmspace) { struct proc *p; KASSERT(td == curthread, ("non-current thread %p", td)); p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); /* * If successful, we upgrade to SINGLE_EXIT state to * force other threads to suicide. */ if (error == EJUSTRETURN) thread_single(p, SINGLE_EXIT); else thread_single_end(p, SINGLE_BOUNDARY); PROC_UNLOCK(p); } if ((td->td_pflags & TDP_EXECVMSPC) != 0) { KASSERT(p->p_vmspace != oldvmspace, ("oldvmspace still used")); vmspace_free(oldvmspace); td->td_pflags &= ~TDP_EXECVMSPC; } } /* * XXX: kern_execve has the astonishing property of not always returning to * the caller. If sufficiently bad things happen during the call to * do_execve(), it can end up calling exit1(); as a result, callers must * avoid doing anything which they might need to undo (e.g., allocating * memory). */ int kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p) { AUDIT_ARG_ARGV(args->begin_argv, args->argc, args->begin_envv - args->begin_argv); AUDIT_ARG_ENVV(args->begin_envv, args->envc, args->endp - args->begin_envv); return (do_execve(td, args, mac_p)); } /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread.
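 * For reference, the descriptor-based path above is reached from
 * userspace roughly as follows (illustration only):
 *
 *	int fd = open("/bin/ls", O_EXEC);
 *	char *argv[] = { "ls", NULL };
 *	fexecve(fd, argv, environ);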
*/ static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p) { struct proc *p = td->td_proc; struct nameidata nd; struct ucred *oldcred; struct uidinfo *euip = NULL; register_t *stack_base; int error, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts = NULL, *newsigacts = NULL; #ifdef KTRACE struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif struct vnode *oldtextvp = NULL, *newtextvp; cap_rights_t rights; int credential_changing; int textset; #ifdef MAC struct label *interpvplabel = NULL; int will_transition; #endif #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif static const char fexecv_proc_title[] = "(fexecv)"; imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ bzero(imgp, sizeof(*imgp)); imgp->proc = p; imgp->attr = &attr; imgp->args = args; oldcred = p->p_ucred; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) goto exec_fail; #endif /* * Translate the file name. namei() returns a vnode pointer * in ni_vp among other things. * * XXXAUDIT: It would be desirable to also audit the name of the * interpreter if this is an interpreted binary. */ if (args->fname != NULL) { NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | AUDITVNODE1, UIO_SYSSPACE, args->fname, td); } SDT_PROBE1(proc, , , exec, args->fname); interpret: if (args->fname != NULL) { #ifdef CAPABILITY_MODE /* * While capability mode can't reach this point via direct * path arguments to execve(), we also don't allow * interpreters to be used in capability mode (for now). * Catch indirect lookups and return a permissions error. */ if (IN_CAPABILITY_MODE(td)) { error = ECAPMODE; goto exec_fail; } #endif error = namei(&nd); if (error) goto exec_fail; newtextvp = nd.ni_vp; imgp->vp = newtextvp; } else { AUDIT_ARG_FD(args->fd); /* * Descriptors opened only with O_EXEC or O_RDONLY are allowed. */ error = fgetvp_exec(td, args->fd, cap_rights_init(&rights, CAP_FEXECVE), &newtextvp); if (error) goto exec_fail; vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(newtextvp); imgp->vp = newtextvp; } /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; imgp->object = imgp->vp->v_object; if (imgp->object != NULL) vm_object_reference(imgp->object); /* * Set VV_TEXT now so no one can write to the executable while we're * activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ textset = VOP_IS_TEXT(imgp->vp); VOP_SET_TEXT(imgp->vp); error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; imgp->proc->p_osrel = 0; /* * Implement image setuid/setgid. * * Determine new credentials before attempting image activators * so that it can be used by process_exec handlers to determine * credential/setid changes. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. 
* * We disable setuid/setgid/etc in capability mode on the basis * that most setugid applications are not written with that * environment in mind, and will therefore almost certainly operate * incorrectly. In principle there's no reason that setugid * applications might not be useful in capability mode, so we may want * to reconsider this conservative design choice in the future. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ credential_changing = 0; credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp, interpvplabel, imgp); credential_changing |= will_transition; #endif if (credential_changing && #ifdef CAPABILITY_MODE ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) && #endif (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { imgp->credential_setid = true; VOP_UNLOCK(imgp->vp, 0); imgp->newcred = crdup(oldcred); if (attr.va_mode & S_ISUID) { euip = uifind(attr.va_uid); change_euid(imgp->newcred, euip); } vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); if (attr.va_mode & S_ISGID) change_egid(imgp->newcred, attr.va_gid); /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } else { /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { VOP_UNLOCK(imgp->vp, 0); imgp->newcred = crdup(oldcred); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } } /* The new credentials are installed into the process later. */ /* * Do the best to calculate the full path to the image file. */ if (args->fname != NULL && args->fname[0] == '/') imgp->execpath = args->fname; else { VOP_UNLOCK(imgp->vp, 0); if (vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0) imgp->execpath = args->fname; vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); } /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. 
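 * A minimal activator is registered with EXEC_SET(); a sketch, with a
 * hypothetical FOO_MAGIC (illustration only):
 *
 *	static int
 *	exec_foo_imgact(struct image_params *imgp)
 *	{
 *		if (memcmp(imgp->image_header, FOO_MAGIC, 4) != 0)
 *			return (-1);
 *		... map sections, set imgp->entry_addr ...
 *		return (0);
 *	}
 *	static struct execsw foo_execsw = { exec_foo_imgact, "foo" };
 *	EXEC_SET(foo, foo_execsw);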
*/ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) { if (textset == 0) VOP_UNSET_TEXT(imgp->vp); error = ENOEXEC; } goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * VV_TEXT needs to be unset for scripts. There is a short * period before we determine that something is a script where * VV_TEXT will be set. The vnode lock is held over this * entire period so nothing should illegitimately be blocked. */ VOP_UNSET_TEXT(imgp->vp); /* free name buffer and old vnode */ if (args->fname != NULL) NDFREE(&nd, NDF_ONLY_PNBUF); #ifdef MAC mac_execve_interpreter_enter(newtextvp, &interpvplabel); #endif if (imgp->opened) { VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td); imgp->opened = 0; } vput(newtextvp); vm_object_deallocate(imgp->object); imgp->object = NULL; imgp->credential_setid = false; if (imgp->newcred != NULL) { crfree(imgp->newcred); imgp->newcred = NULL; } imgp->execpath = NULL; free(imgp->freepath, M_TEMP); imgp->freepath = NULL; /* set new name to that of the interpreter */ NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, td); args->fname = imgp->interpreter_name; goto interpret; } /* * NB: We unlock the vnode here because it is believed that none * of the sv_copyout_strings/sv_fixup operations require the vnode. */ VOP_UNLOCK(imgp->vp, 0); if (disallow_high_osrel && P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) { error = ENOEXEC; uprintf("Osrel %d for image %s too high\n", p->p_osrel, imgp->execpath != NULL ? imgp->execpath : ""); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* ABI enforces the use of Capsicum. Switch into capabilities mode. */ if (SV_PROC_FLAG(p, SV_CAPSICUM)) sys_cap_enter(td, NULL); /* * Copy out strings (args and env) and initialize stack base */ if (p->p_sysent->sv_copyout_strings) stack_base = (*p->p_sysent->sv_copyout_strings)(imgp); else stack_base = exec_copyout_strings(imgp); /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup != NULL) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->args->argc); if (args->fdp != NULL) { /* Install a brand new file descriptor table. */ fdinstall_remapped(td, args->fdp); args->fdp = NULL; } else { /* * Keep on using the existing file descriptor table. For * security and other reasons, the file descriptor table * cannot be shared after an exec. */ fdunshare(td); /* close files on exec */ fdcloseexec(td); } /* * Malloc things before we need locks. */ i = imgp->args->begin_envv - imgp->args->begin_argv; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { newargs = pargs_alloc(i); bcopy(imgp->args->begin_argv, newargs->ar_args, i); } /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. 
*/ if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); PROC_LOCK(p); if (oldsigacts) p->p_sigacts = newsigacts; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ bzero(p->p_comm, sizeof(p->p_comm)); if (args->fname) bcopy(nd.ni_cnd.cn_nameptr, p->p_comm, min(nd.ni_cnd.cn_namelen, MAXCOMLEN)); else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0) bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title)); bcopy(p->p_comm, td->td_name, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0) p->p_flag2 &= ~P2_NOTRACE; if (p->p_flag & P_PPWAIT) { p->p_flag &= ~(P_PPWAIT | P_PPTRACE); cv_broadcast(&p->p_pwait); /* STOPs are no longer ignored, arrange for AST */ signotify(td); } /* * Implement image setuid/setgid installation. */ if (imgp->credential_setid) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); #ifdef KTRACE if (p->p_tracecred != NULL && priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED, 0)) ktrprocexec(p, &tracecred, &tracevp); #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * Both fdsetugidsafety() and fdcheckstd() may call functions * taking sleepable locks, so temporarily drop our locks. */ PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp, 0); fdsetugidsafety(td); error = fdcheckstd(td); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (error != 0) goto exec_fail_dealloc; PROC_LOCK(p); #ifdef MAC if (will_transition) { mac_vnode_execve_transition(oldcred, imgp->newcred, imgp->vp, interpvplabel, imgp); } #endif } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; } /* * Set the new credentials. */ if (imgp->newcred != NULL) { proc_set_cred(p, imgp->newcred); crfree(oldcred); oldcred = NULL; } /* * Store the vp for use in procfs. This vnode was referenced by namei * or fgetvp_exec. */ oldtextvp = p->p_textvp; p->p_textvp = newtextvp; #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exec if it * has declared an interest. */ if (dtrace_fasttrap_exec) dtrace_fasttrap_exec(p); #endif /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE_LOCKED(p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* * Free any previous argument cache and replace it with * the new argument cache, if any. */ oldargs = p->p_args; p->p_args = newargs; newargs = NULL; PROC_UNLOCK(p); #ifdef HWPMC_HOOKS /* * Check if system-wide sampling is in effect or if the * current process is using PMCs. If so, do exec() time * processing. This processing needs to happen AFTER the * P_INEXEC flag is cleared. 
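The P_PPWAIT handling above is the kernel side of vfork(2): the parent is parked on p_pwait until the child execs or exits, and the cv_broadcast() here is what releases it. A userland sketch of the pattern being served:

#include <sys/types.h>
#include <unistd.h>

int
main(void)
{
	pid_t pid;

	pid = vfork();
	if (pid == 0) {
		/* The parent stays suspended until this exec succeeds
		 * (the cv_broadcast(&p->p_pwait) above) or _exit() runs. */
		execl("/bin/echo", "echo", "hello", (char *)NULL);
		_exit(127);
	}
	return (pid == -1);
}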
*/ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { VOP_UNLOCK(imgp->vp, 0); pe.pm_credentialschanged = credential_changing; pe.pm_entryaddr = imgp->entry_addr; PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } #endif /* Set values passed into the program in registers. */ if (p->p_sysent->sv_setregs) (*p->p_sysent->sv_setregs)(td, imgp, (u_long)(uintptr_t)stack_base); else exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base); vfs_mark_atime(imgp->vp, td->td_ucred); SDT_PROBE1(proc, , , exec__success, args->fname); exec_fail_dealloc: if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { if (args->fname) NDFREE(&nd, NDF_ONLY_PNBUF); if (imgp->opened) VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td); if (error != 0) vput(imgp->vp); else VOP_UNLOCK(imgp->vp, 0); } if (imgp->object != NULL) vm_object_deallocate(imgp->object); free(imgp->freepath, M_TEMP); if (error == 0) { if (p->p_ptevents & PTRACE_EXEC) { PROC_LOCK(p); if (p->p_ptevents & PTRACE_EXEC) td->td_dbgflags |= TDB_EXEC; PROC_UNLOCK(p); } /* * Stop the process here if its stop event mask has * the S_EXEC bit set. */ STOPEVENT(p, S_EXEC, 0); } else { exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); SDT_PROBE1(proc, , , exec__failure, error); } if (imgp->newcred != NULL && oldcred != NULL) crfree(imgp->newcred); #ifdef MAC mac_execve_exit(imgp); mac_execve_interpreter_exit(interpvplabel); #endif exec_free_args(args); /* * Handle deferred decrement of ref counts. */ if (oldtextvp != NULL) vrele(oldtextvp); #ifdef KTRACE if (tracevp != NULL) vrele(tracevp); if (tracecred != NULL) crfree(tracecred); #endif pargs_drop(oldargs); pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); if (euip != NULL) uifree(euip); if (error && imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exit1(td, 0, SIGABRT); /* NOT REACHED */ } #ifdef KTRACE if (error == 0) ktrprocctor(p); #endif /* * We don't want cpu_set_syscall_retval() to overwrite any of * the register values put in place by exec_setregs(). * Implementations of cpu_set_syscall_retval() will leave * registers unmodified when returning EJUSTRETURN. */ return (error == 0 ? 
EJUSTRETURN : error); } int exec_map_first_page(imgp) struct image_params *imgp; { int rv, i, after, initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); object = imgp->vp->v_object; if (object == NULL) return (EACCES); VM_OBJECT_WLOCK(object); #if VM_NRESERVLEVEL > 0 vm_object_color(object, 0); #endif ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); if (ma[0]->valid != VM_PAGE_BITS_ALL) { vm_page_xbusy(ma[0]); if (!vm_pager_has_page(object, 0, NULL, &after)) { vm_page_lock(ma[0]); vm_page_free(ma[0]); vm_page_unlock(ma[0]); VM_OBJECT_WUNLOCK(object); return (EIO); } initial_pagein = min(after, VM_INITIAL_PAGEIN); KASSERT(initial_pagein <= object->size, ("%s: initial_pagein %d object->size %ju", __func__, initial_pagein, (uintmax_t )object->size)); for (i = 1; i < initial_pagein; i++) { if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) { if (ma[i]->valid) break; if (vm_page_tryxbusy(ma[i])) break; } else { ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, NULL, NULL); if (rv != VM_PAGER_OK) { for (i = 0; i < initial_pagein; i++) { vm_page_lock(ma[i]); vm_page_free(ma[i]); vm_page_unlock(ma[i]); } VM_OBJECT_WUNLOCK(object); return (EIO); } vm_page_xunbusy(ma[0]); for (i = 1; i < initial_pagein; i++) vm_page_readahead_finish(ma[i]); } vm_page_lock(ma[0]); vm_page_hold(ma[0]); vm_page_activate(ma[0]); vm_page_unlock(ma[0]); VM_OBJECT_WUNLOCK(object); imgp->firstpage = sf_buf_alloc(ma[0], 0); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); return (0); } void exec_unmap_first_page(struct image_params *imgp) { vm_page_t m; if (imgp->firstpage != NULL) { m = sf_buf_page(imgp->firstpage); sf_buf_free(imgp->firstpage); imgp->firstpage = NULL; vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); } } /* * Destroy old address space, and allocate a new stack. * The new stack is only sgrowsiz large because it is grown * automatically on a page fault. */ int exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv) { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; vm_object_t obj; struct rlimit rlim_stack; vm_offset_t sv_minuser, stack_addr; vm_map_t map; u_long ssiz; imgp->vmspace_destroyed = 1; imgp->sysent = sv; /* May be called with Giant held */ EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (map_at_zero) sv_minuser = sv->sv_minuser; else sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE); if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser && vm_map_max(map) == sv->sv_maxuser) { shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); /* An exec terminates mlockall(MCL_FUTURE). 
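The MAP_WIREFUTURE clearing just below is observable from userland: wiring requested with mlockall(MCL_FUTURE) does not carry across exec. A small illustration:

#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	char *argv[] = { "true", NULL };

	/* Future mappings in this image are wired... */
	mlockall(MCL_CURRENT | MCL_FUTURE);
	/* ...but exec_new_vmspace() clears MAP_WIREFUTURE, so the new
	 * image starts without it. */
	execv("/usr/bin/true", argv);
	return (1);
}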
*/ vm_map_lock(map); vm_map_modflags(map, 0, MAP_WIREFUTURE); vm_map_unlock(map); } else { error = vmspace_exec(p, sv_minuser, sv->sv_maxuser); if (error) return (error); vmspace = p->p_vmspace; map = &vmspace->vm_map; } /* Map a shared page */ obj = sv->sv_shared_page_obj; if (obj != NULL) { vm_object_reference(obj); error = vm_map_fixed(map, obj, 0, sv->sv_shared_page_base, sv->sv_shared_page_len, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); if (error != KERN_SUCCESS) { vm_object_deallocate(obj); return (vm_mmap_to_errno(error)); } } /* Allocate a new stack */ if (imgp->stack_sz != 0) { ssiz = trunc_page(imgp->stack_sz); PROC_LOCK(p); lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack); PROC_UNLOCK(p); if (ssiz > rlim_stack.rlim_max) ssiz = rlim_stack.rlim_max; if (ssiz > rlim_stack.rlim_cur) { rlim_stack.rlim_cur = ssiz; kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack); } } else if (sv->sv_maxssiz != NULL) { ssiz = *sv->sv_maxssiz; } else { ssiz = maxssiz; } stack_addr = sv->sv_usrstack - ssiz; error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); if (error != KERN_SUCCESS) return (vm_mmap_to_errno(error)); /* * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they * are still used to enforce the stack rlimit on the process stack. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)stack_addr; return (0); } /* * Copy out argument and environment strings from the old process address * space into the temporary string buffer. */ int exec_copyin_args(struct image_args *args, char *fname, enum uio_seg segflg, char **argv, char **envv) { u_long argp, envp; int error; size_t length; bzero(args, sizeof(*args)); if (argv == NULL) return (EFAULT); /* * Allocate demand-paged memory for the file name, argument, and * environment strings. */ error = exec_alloc_args(args); if (error != 0) return (error); /* * Copy the file name. */ if (fname != NULL) { args->fname = args->buf; error = (segflg == UIO_SYSSPACE) ? 
copystr(fname, args->fname, PATH_MAX, &length) : copyinstr(fname, args->fname, PATH_MAX, &length); if (error != 0) goto err_exit; } else length = 0; args->begin_argv = args->buf + length; args->endp = args->begin_argv; args->stringspace = ARG_MAX; /* * extract arguments first */ for (;;) { error = fueword(argv++, &argp); if (error == -1) { error = EFAULT; goto err_exit; } if (argp == 0) break; error = copyinstr((void *)(uintptr_t)argp, args->endp, args->stringspace, &length); if (error != 0) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->argc++; } args->begin_envv = args->endp; /* * extract environment strings */ if (envv) { for (;;) { error = fueword(envv++, &envp); if (error == -1) { error = EFAULT; goto err_exit; } if (envp == 0) break; error = copyinstr((void *)(uintptr_t)envp, args->endp, args->stringspace, &length); if (error != 0) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->envc++; } } return (0); err_exit: exec_free_args(args); return (error); } int exec_copyin_data_fds(struct thread *td, struct image_args *args, const void *data, size_t datalen, const int *fds, size_t fdslen) { struct filedesc *ofdp; const char *p; int *kfds; int error; memset(args, '\0', sizeof(*args)); ofdp = td->td_proc->p_fd; if (datalen >= ARG_MAX || fdslen > ofdp->fd_lastfile + 1) return (E2BIG); error = exec_alloc_args(args); if (error != 0) return (error); args->begin_argv = args->buf; args->stringspace = ARG_MAX; if (datalen > 0) { /* * Argument buffer has been provided. Copy it into the * kernel as a single string and add a terminating null * byte. */ error = copyin(data, args->begin_argv, datalen); if (error != 0) goto err_exit; args->begin_argv[datalen] = '\0'; args->endp = args->begin_argv + datalen + 1; args->stringspace -= datalen + 1; /* * Traditional argument counting. Count the number of * null bytes. */ for (p = args->begin_argv; p < args->endp; ++p) if (*p == '\0') ++args->argc; } else { /* No argument buffer provided. */ args->endp = args->begin_argv; } /* There are no environment variables. */ args->begin_envv = args->endp; /* Create new file descriptor table. 
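For reference, the userland shape consumed by the loops above: exec_copyin_args() fetches one pointer at a time with fueword() and one string at a time with copyinstr(), stopping at the NULL sentinels.

#include <unistd.h>

int
main(void)
{
	char *argv[] = { "ls", "-l", NULL };	/* walked until argp == 0 */
	char *envv[] = { "HOME=/root", NULL };	/* walked until envp == 0 */

	execve("/bin/ls", argv, envv);
	return (1);		/* only reached on error */
}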
*/ kfds = malloc(fdslen * sizeof(int), M_TEMP, M_WAITOK); error = copyin(fds, kfds, fdslen * sizeof(int)); if (error != 0) { free(kfds, M_TEMP); goto err_exit; } error = fdcopy_remapped(ofdp, kfds, fdslen, &args->fdp); free(kfds, M_TEMP); if (error != 0) goto err_exit; return (0); err_exit: exec_free_args(args); return (error); } struct exec_args_kva { vm_offset_t addr; u_int gen; SLIST_ENTRY(exec_args_kva) next; }; static DPCPU_DEFINE(struct exec_args_kva *, exec_args_kva); static SLIST_HEAD(, exec_args_kva) exec_args_kva_freelist; static struct mtx exec_args_kva_mtx; static u_int exec_args_gen; static void exec_prealloc_args_kva(void *arg __unused) { struct exec_args_kva *argkva; u_int i; SLIST_INIT(&exec_args_kva_freelist); mtx_init(&exec_args_kva_mtx, "exec args kva", NULL, MTX_DEF); for (i = 0; i < exec_map_entries; i++) { argkva = malloc(sizeof(*argkva), M_PARGS, M_WAITOK); argkva->addr = kmap_alloc_wait(exec_map, exec_map_entry_size); argkva->gen = exec_args_gen; SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); } } SYSINIT(exec_args_kva, SI_SUB_EXEC, SI_ORDER_ANY, exec_prealloc_args_kva, NULL); static vm_offset_t exec_alloc_args_kva(void **cookie) { struct exec_args_kva *argkva; argkva = (void *)atomic_readandclear_ptr( (uintptr_t *)DPCPU_PTR(exec_args_kva)); if (argkva == NULL) { mtx_lock(&exec_args_kva_mtx); while ((argkva = SLIST_FIRST(&exec_args_kva_freelist)) == NULL) (void)mtx_sleep(&exec_args_kva_freelist, &exec_args_kva_mtx, 0, "execkva", 0); SLIST_REMOVE_HEAD(&exec_args_kva_freelist, next); mtx_unlock(&exec_args_kva_mtx); } *(struct exec_args_kva **)cookie = argkva; return (argkva->addr); } static void exec_release_args_kva(struct exec_args_kva *argkva, u_int gen) { vm_offset_t base; base = argkva->addr; if (argkva->gen != gen) { vm_map_madvise(exec_map, base, base + exec_map_entry_size, MADV_FREE); argkva->gen = gen; } if (!atomic_cmpset_ptr((uintptr_t *)DPCPU_PTR(exec_args_kva), (uintptr_t)NULL, (uintptr_t)argkva)) { mtx_lock(&exec_args_kva_mtx); SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); wakeup_one(&exec_args_kva_freelist); mtx_unlock(&exec_args_kva_mtx); } } static void exec_free_args_kva(void *cookie) { exec_release_args_kva(cookie, exec_args_gen); } static void exec_args_kva_lowmem(void *arg __unused) { SLIST_HEAD(, exec_args_kva) head; struct exec_args_kva *argkva; u_int gen; int i; gen = atomic_fetchadd_int(&exec_args_gen, 1) + 1; /* * Force an madvise of each KVA range. Any currently allocated ranges * will have MADV_FREE applied once they are freed. */ SLIST_INIT(&head); mtx_lock(&exec_args_kva_mtx); SLIST_SWAP(&head, &exec_args_kva_freelist, exec_args_kva); mtx_unlock(&exec_args_kva_mtx); while ((argkva = SLIST_FIRST(&head)) != NULL) { SLIST_REMOVE_HEAD(&head, next); exec_release_args_kva(argkva, gen); } CPU_FOREACH(i) { argkva = (void *)atomic_readandclear_ptr( (uintptr_t *)DPCPU_ID_PTR(i, exec_args_kva)); if (argkva != NULL) exec_release_args_kva(argkva, gen); } } EVENTHANDLER_DEFINE(vm_lowmem, exec_args_kva_lowmem, NULL, EVENTHANDLER_PRI_ANY); /* * Allocate temporary demand-paged, zero-filled memory for the file name, * argument, and environment strings. 
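The KVA cache above pairs a lock-free per-CPU slot with a mutex-protected freelist. A hedged userland reduction of the same shape, using C11 atomics and pthreads in place of the kernel primitives (all names hypothetical):

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

static _Atomic(void *) fast_slot;		/* stands in for the DPCPU slot */
static pthread_mutex_t fl_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t fl_cv = PTHREAD_COND_INITIALIZER;
static void *freelist[16];
static int nfree;

static void *
buf_get(void)
{
	void *p;

	/* Fast path: steal the cached buffer, as the kernel does with
	 * atomic_readandclear_ptr() on the per-CPU pointer. */
	p = atomic_exchange(&fast_slot, NULL);
	if (p != NULL)
		return (p);
	/* Slow path: block on the freelist, like the "execkva" sleep. */
	pthread_mutex_lock(&fl_mtx);
	while (nfree == 0)
		pthread_cond_wait(&fl_cv, &fl_mtx);
	p = freelist[--nfree];
	pthread_mutex_unlock(&fl_mtx);
	return (p);
}

static void
buf_put(void *p)
{
	void *expected = NULL;

	/* Prefer refilling the fast slot; otherwise hand it back to the
	 * freelist and wake one waiter. */
	if (atomic_compare_exchange_strong(&fast_slot, &expected, p))
		return;
	pthread_mutex_lock(&fl_mtx);
	freelist[nfree++] = p;
	pthread_cond_signal(&fl_cv);
	pthread_mutex_unlock(&fl_mtx);
}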
*/ int exec_alloc_args(struct image_args *args) { args->buf = (char *)exec_alloc_args_kva(&args->bufkva); return (0); } void exec_free_args(struct image_args *args) { if (args->buf != NULL) { exec_free_args_kva(args->bufkva); args->buf = NULL; } if (args->fname_buf != NULL) { free(args->fname_buf, M_TEMP); args->fname_buf = NULL; } if (args->fdp != NULL) fdescfree_remapped(args->fdp); } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ register_t * exec_copyout_strings(struct image_params *imgp) { int argc, envc; char **vectp; char *stringp; uintptr_t destp; register_t *stack_base; struct ps_strings *arginfo; struct proc *p; size_t execpath_len; int szsigcode, szps; char canary[sizeof(long) * 8]; szps = sizeof(pagesizes[0]) * MAXPAGESIZES; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; p = imgp->proc; szsigcode = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; if (p->p_sysent->sv_sigcode_base == 0) { if (p->p_sysent->sv_szsigcode != NULL) szsigcode = *(p->p_sysent->sv_szsigcode); } destp = (uintptr_t)arginfo; /* * install sigcode */ if (szsigcode != 0) { destp -= szsigcode; destp = rounddown2(destp, sizeof(void *)); copyout(p->p_sysent->sv_sigcode, (void *)destp, szsigcode); } /* * Copy the image path for the rtld. */ if (execpath_len != 0) { destp -= execpath_len; imgp->execpathp = destp; copyout(imgp->execpath, (void *)destp, execpath_len); } /* * Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= sizeof(canary); imgp->canary = destp; copyout(canary, (void *)destp, sizeof(canary)); imgp->canarylen = sizeof(canary); /* * Prepare the pagesizes array. */ destp -= szps; destp = rounddown2(destp, sizeof(void *)); imgp->pagesizes = destp; copyout(pagesizes, (void *)destp, szps); imgp->pagesizeslen = szps; destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *)); } else { /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(char *)); } /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, (void *)destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. 
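Following the destp arithmetic above, the top of the new image's stack ends up laid out roughly as sketched below (highest address first; the execpath and auxargs pieces exist only when imgp->auxargs is set, and the trampoline only when no shared-page sigcode base is in use):

	sv_psstrings ->  struct ps_strings
	                 signal trampoline (szsigcode bytes)
	                 execpath (for the runtime loader)
	                 SSP canary (sizeof(long) * 8 bytes)
	                 pagesizes[] copy (szps bytes)
	                 argument and environment strings
	                 room for ELF auxargs (imgp->auxarg_size pointers)
	                 NULL, env vector, NULL, arg vector
	stack_base ----> (== vectp, returned to the caller)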
*/ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(struct image_params *imgp) { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error, writecount; td = curthread; /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred); if (error) return (error); #ifdef MAC error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that * this file resides on. * 2) Ensure that at least one execute bit is on. Otherwise, a * privileged user will always succeed, and we don't want this * to happen unless the file really is executable. * 3) Ensure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. */ error = VOP_GET_WRITECOUNT(vp, &writecount); if (error != 0) return (error); if (writecount != 0) return (ETXTBSY); /* * Call filesystem specific open routine (which does nothing in the * general case). 
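A worked example of the vector-fill loops above: with argc = 2, envc = 1 and the string area holding "ls\0-l\0HOME=/\0" starting at destp, the loops store the addresses of "ls" and "-l" into the argument slots (destp hopping past each NUL), then a NULL separator, then the address of "HOME=/", then the final NULL, while ps_argvstr/ps_nargvstr and ps_envstr/ps_nenvstr in the ps_strings struct record the two sub-vectors for ps(1) and friends.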
*/ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error == 0) imgp->opened = 1; return (error); } /* * Exec handler registration */ int exec_register(const struct execsw *execsw_arg) { const struct execsw **es, **xs, **newexecsw; int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(const struct execsw *execsw_arg) { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } Index: head/sys/kern/kern_fail.c =================================================================== --- head/sys/kern/kern_fail.c (revision 326270) +++ head/sys/kern/kern_fail.c (revision 326271) @@ -1,1124 +1,1126 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2009 Isilon Inc http://www.isilon.com/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /** * @file * * fail(9) Facility. * * @ingroup failpoint_private */ /** * @defgroup failpoint fail(9) Facility * * Failpoints allow for injecting fake errors into running code on the fly, * without modifying code or recompiling with flags. Failpoints are always * present, and are very efficient when disabled. Failpoints are described * in man fail(9). */ /** * @defgroup failpoint_private Private fail(9) Implementation functions * * Private implementations for the actual failpoint code. 
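As a concrete use of the facility described above, a sketch in the fail(9) style; mymod_read and the point name are hypothetical, while KFAIL_POINT_RETURN and the DEBUG_FP parent appear later in this file:

static int
mymod_read(void *softc)
{
	/* Evaluates debug.fail_point.mymod_read_err; when set to e.g.
	 * "return(5)" or "1%return(5)" this returns early with that
	 * errno, and when unset it costs almost nothing. */
	KFAIL_POINT_RETURN(DEBUG_FP, mymod_read_err);
	/* ... normal read path ... */
	return (0);
}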
* * @ingroup failpoint */ /** * @addtogroup failpoint_private * @{ */ #include __FBSDID("$FreeBSD$"); #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ILOG_DEFINE_FOR_FILE ILOG_DEFINE_FOR_FILE(L_ISI_FAIL_POINT, L_ILOG, fail_point); #endif static MALLOC_DEFINE(M_FAIL_POINT, "Fail Points", "fail points system"); #define fp_free(ptr) free(ptr, M_FAIL_POINT) #define fp_malloc(size, flags) malloc((size), M_FAIL_POINT, (flags)) #define fs_free(ptr) fp_free(ptr) #define fs_malloc() fp_malloc(sizeof(struct fail_point_setting), \ M_WAITOK | M_ZERO) /** * These define the wchans that are used for sleeping, pausing respectively. * They are chosen arbitrarily but need to be distinct to the failpoint and * the sleep/pause distinction. */ #define FP_SLEEP_CHANNEL(fp) (void*)(fp) #define FP_PAUSE_CHANNEL(fp) __DEVOLATILE(void*, &fp->fp_setting) /** * Don't allow more than this many entries in a fail point set by sysctl. * The 99.99...% case is to have 1 entry. I can't imagine having this many * entries, so it should not limit us. Saves on re-mallocs while holding * a non-sleepable lock. */ #define FP_MAX_ENTRY_COUNT 20 /* Used to drain sbufs to the sysctl output */ int fail_sysctl_drain_func(void *, const char *, int); /* Head of tailq of struct fail_point_entry */ TAILQ_HEAD(fail_point_entry_queue, fail_point_entry); /** * fp entries garbage list; outstanding entries are cleaned up in the * garbage collector */ STAILQ_HEAD(fail_point_setting_garbage, fail_point_setting); static struct fail_point_setting_garbage fp_setting_garbage = STAILQ_HEAD_INITIALIZER(fp_setting_garbage); static struct mtx mtx_garbage_list; MTX_SYSINIT(mtx_garbage_list, &mtx_garbage_list, "fail point garbage mtx", MTX_SPIN); static struct sx sx_fp_set; SX_SYSINIT(sx_fp_set, &sx_fp_set, "fail point set sx"); /** * Failpoint types. * Don't change these without changing fail_type_strings in fail.c. * @ingroup failpoint_private */ enum fail_point_t { FAIL_POINT_OFF, /**< don't fail */ FAIL_POINT_PANIC, /**< panic */ FAIL_POINT_RETURN, /**< return an errorcode */ FAIL_POINT_BREAK, /**< break into the debugger */ FAIL_POINT_PRINT, /**< print a message */ FAIL_POINT_SLEEP, /**< sleep for some msecs */ FAIL_POINT_PAUSE, /**< sleep until failpoint is set to off */ FAIL_POINT_YIELD, /**< yield the cpu */ FAIL_POINT_DELAY, /**< busy wait the cpu */ FAIL_POINT_NUMTYPES, FAIL_POINT_INVALID = -1 }; static struct { const char *name; int nmlen; } fail_type_strings[] = { #define FP_TYPE_NM_LEN(s) { s, sizeof(s) - 1 } [FAIL_POINT_OFF] = FP_TYPE_NM_LEN("off"), [FAIL_POINT_PANIC] = FP_TYPE_NM_LEN("panic"), [FAIL_POINT_RETURN] = FP_TYPE_NM_LEN("return"), [FAIL_POINT_BREAK] = FP_TYPE_NM_LEN("break"), [FAIL_POINT_PRINT] = FP_TYPE_NM_LEN("print"), [FAIL_POINT_SLEEP] = FP_TYPE_NM_LEN("sleep"), [FAIL_POINT_PAUSE] = FP_TYPE_NM_LEN("pause"), [FAIL_POINT_YIELD] = FP_TYPE_NM_LEN("yield"), [FAIL_POINT_DELAY] = FP_TYPE_NM_LEN("delay"), }; #define FE_COUNT_UNTRACKED (INT_MIN) /** * Internal structure tracking a single term of a complete failpoint. * @ingroup failpoint_private */ struct fail_point_entry { volatile bool fe_stale; enum fail_point_t fe_type; /**< type of entry */ int fe_arg; /**< argument to type (e.g. 
return value) */ int fe_prob; /**< likelihood of firing in millionths */ int32_t fe_count; /**< number of times to fire, -1 means infinite */ pid_t fe_pid; /**< only fail for this process */ struct fail_point *fe_parent; /**< backpointer to fp */ TAILQ_ENTRY(fail_point_entry) fe_entries; /**< next entry ptr */ }; struct fail_point_setting { STAILQ_ENTRY(fail_point_setting) fs_garbage_link; struct fail_point_entry_queue fp_entry_queue; struct fail_point * fs_parent; struct mtx feq_mtx; /* Gives fail_point_pause something to do. */ }; /** * Defines stating the equivalent of probablilty one (100%) */ enum { PROB_MAX = 1000000, /* probability between zero and this number */ PROB_DIGITS = 6 /* number of zero's in above number */ }; /* Get a ref on an fp's fp_setting */ static inline struct fail_point_setting *fail_point_setting_get_ref( struct fail_point *fp); /* Release a ref on an fp_setting */ static inline void fail_point_setting_release_ref(struct fail_point *fp); /* Allocate and initialize a struct fail_point_setting */ static struct fail_point_setting *fail_point_setting_new(struct fail_point *); /* Free a struct fail_point_setting */ static void fail_point_setting_destroy(struct fail_point_setting *fp_setting); /* Allocate and initialize a struct fail_point_entry */ static struct fail_point_entry *fail_point_entry_new(struct fail_point_setting *); /* Free a struct fail_point_entry */ static void fail_point_entry_destroy(struct fail_point_entry *fp_entry); /* Append fp setting to garbage list */ static inline void fail_point_setting_garbage_append( struct fail_point_setting *fp_setting); /* Swap fp's setting with fp_setting_new */ static inline struct fail_point_setting * fail_point_swap_settings(struct fail_point *fp, struct fail_point_setting *fp_setting_new); /* Free up any zero-ref setting in the garbage queue */ static void fail_point_garbage_collect(void); /* If this fail point's setting are empty, then swap it out to NULL. 
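A worked example of the fixed-point probability encoding above: the term "1.5%return(5)" parses to units = 1 and a PROB_DIGITS - 2 = 4 digit decimal of 5000, so fe_prob = 1 * (PROB_MAX / 100) + 5000 = 15000, i.e. the entry fires on roughly 15000 of every 1000000 (PROB_MAX) evaluations.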
*/ static inline void fail_point_eval_swap_out(struct fail_point *fp, struct fail_point_setting *fp_setting); bool fail_point_is_off(struct fail_point *fp) { bool return_val; struct fail_point_setting *fp_setting; struct fail_point_entry *ent; return_val = true; fp_setting = fail_point_setting_get_ref(fp); if (fp_setting != NULL) { TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, fe_entries) { if (!ent->fe_stale) { return_val = false; break; } } } fail_point_setting_release_ref(fp); return (return_val); } /* Allocate and initialize a struct fail_point_setting */ static struct fail_point_setting * fail_point_setting_new(struct fail_point *fp) { struct fail_point_setting *fs_new; fs_new = fs_malloc(); fs_new->fs_parent = fp; TAILQ_INIT(&fs_new->fp_entry_queue); mtx_init(&fs_new->feq_mtx, "fail point entries", NULL, MTX_SPIN); fail_point_setting_garbage_append(fs_new); return (fs_new); } /* Free a struct fail_point_setting */ static void fail_point_setting_destroy(struct fail_point_setting *fp_setting) { struct fail_point_entry *ent; while (!TAILQ_EMPTY(&fp_setting->fp_entry_queue)) { ent = TAILQ_FIRST(&fp_setting->fp_entry_queue); TAILQ_REMOVE(&fp_setting->fp_entry_queue, ent, fe_entries); fail_point_entry_destroy(ent); } fs_free(fp_setting); } /* Allocate and initialize a struct fail_point_entry */ static struct fail_point_entry * fail_point_entry_new(struct fail_point_setting *fp_setting) { struct fail_point_entry *fp_entry; fp_entry = fp_malloc(sizeof(struct fail_point_entry), M_WAITOK | M_ZERO); fp_entry->fe_parent = fp_setting->fs_parent; fp_entry->fe_prob = PROB_MAX; fp_entry->fe_pid = NO_PID; fp_entry->fe_count = FE_COUNT_UNTRACKED; TAILQ_INSERT_TAIL(&fp_setting->fp_entry_queue, fp_entry, fe_entries); return (fp_entry); } /* Free a struct fail_point_entry */ static void fail_point_entry_destroy(struct fail_point_entry *fp_entry) { fp_free(fp_entry); } /* Get a ref on an fp's fp_setting */ static inline struct fail_point_setting * fail_point_setting_get_ref(struct fail_point *fp) { struct fail_point_setting *fp_setting; /* Invariant: if we have a ref, our pointer to fp_setting is safe */ atomic_add_acq_32(&fp->fp_ref_cnt, 1); fp_setting = fp->fp_setting; return (fp_setting); } /* Release a ref on an fp_setting */ static inline void fail_point_setting_release_ref(struct fail_point *fp) { KASSERT(&fp->fp_ref_cnt > 0, ("Attempting to deref w/no refs")); atomic_subtract_rel_32(&fp->fp_ref_cnt, 1); } /* Append fp entries to fp garbage list */ static inline void fail_point_setting_garbage_append(struct fail_point_setting *fp_setting) { mtx_lock_spin(&mtx_garbage_list); STAILQ_INSERT_TAIL(&fp_setting_garbage, fp_setting, fs_garbage_link); mtx_unlock_spin(&mtx_garbage_list); } /* Swap fp's entries with fp_setting_new */ static struct fail_point_setting * fail_point_swap_settings(struct fail_point *fp, struct fail_point_setting *fp_setting_new) { struct fail_point_setting *fp_setting_old; fp_setting_old = fp->fp_setting; fp->fp_setting = fp_setting_new; return (fp_setting_old); } static inline void fail_point_eval_swap_out(struct fail_point *fp, struct fail_point_setting *fp_setting) { /* We may have already been swapped out and replaced; ignore. 
*/ if (fp->fp_setting == fp_setting) fail_point_swap_settings(fp, NULL); } /* Free up any zero-ref entries in the garbage queue */ static void fail_point_garbage_collect(void) { struct fail_point_setting *fs_current, *fs_next; struct fail_point_setting_garbage fp_ents_free_list; /** * We will transfer the entries to free to fp_ents_free_list while holding * the spin mutex, then free it after we drop the lock. This avoids * triggering witness due to sleepable mutexes in the memory * allocator. */ STAILQ_INIT(&fp_ents_free_list); mtx_lock_spin(&mtx_garbage_list); STAILQ_FOREACH_SAFE(fs_current, &fp_setting_garbage, fs_garbage_link, fs_next) { if (fs_current->fs_parent->fp_setting != fs_current && fs_current->fs_parent->fp_ref_cnt == 0) { STAILQ_REMOVE(&fp_setting_garbage, fs_current, fail_point_setting, fs_garbage_link); STAILQ_INSERT_HEAD(&fp_ents_free_list, fs_current, fs_garbage_link); } } mtx_unlock_spin(&mtx_garbage_list); STAILQ_FOREACH_SAFE(fs_current, &fp_ents_free_list, fs_garbage_link, fs_next) fail_point_setting_destroy(fs_current); } /* Drain out all refs from this fail point */ static inline void fail_point_drain(struct fail_point *fp, int expected_ref) { struct fail_point_setting *entries; entries = fail_point_swap_settings(fp, NULL); /** * We have unpaused all threads; so we will wait no longer * than the time taken for the longest remaining sleep, or * the length of time of a long-running code block. */ while (fp->fp_ref_cnt > expected_ref) { wakeup(FP_PAUSE_CHANNEL(fp)); tsleep(&fp, PWAIT, "fail_point_drain", hz / 100); } fail_point_swap_settings(fp, entries); } static inline void fail_point_pause(struct fail_point *fp, enum fail_point_return_code *pret, struct mtx *mtx_sleep) { if (fp->fp_pre_sleep_fn) fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg); msleep_spin(FP_PAUSE_CHANNEL(fp), mtx_sleep, "failpt", 0); if (fp->fp_post_sleep_fn) fp->fp_post_sleep_fn(fp->fp_post_sleep_arg); } static inline void fail_point_sleep(struct fail_point *fp, int msecs, enum fail_point_return_code *pret) { int timo; /* Convert from millisecs to ticks, rounding up */ timo = howmany((int64_t)msecs * hz, 1000L); if (timo > 0) { if (!(fp->fp_flags & FAIL_POINT_USE_TIMEOUT_PATH)) { if (fp->fp_pre_sleep_fn) fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg); tsleep(FP_SLEEP_CHANNEL(fp), PWAIT, "failpt", timo); if (fp->fp_post_sleep_fn) fp->fp_post_sleep_fn(fp->fp_post_sleep_arg); } else { if (fp->fp_pre_sleep_fn) fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg); timeout(fp->fp_post_sleep_fn, fp->fp_post_sleep_arg, timo); *pret = FAIL_POINT_RC_QUEUED; } } } static char *parse_fail_point(struct fail_point_setting *, char *); static char *parse_term(struct fail_point_setting *, char *); static char *parse_number(int *out_units, int *out_decimal, char *); static char *parse_type(struct fail_point_entry *, char *); /** * Initialize a fail_point. The name is formed in a printf-like fashion * from "fmt" and subsequent arguments. This function is generally used * for custom failpoints located at odd places in the sysctl tree, and is * not explicitly needed for standard in-line-declared failpoints. * * @ingroup failpoint */ void fail_point_init(struct fail_point *fp, const char *fmt, ...) { va_list ap; char *name; int n; fp->fp_setting = NULL; fp->fp_flags = 0; /* Figure out the size of the name. */ va_start(ap, fmt); n = vsnprintf(NULL, 0, fmt, ap); va_end(ap); /* Allocate the name and fill it in. 
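Since the name is built printf-style, a driver can mint per-instance fail points, paired with fail_point_destroy() defined just below. A hedged sketch (the mymod names are hypothetical):

static struct fail_point mymod_fp;

static void
mymod_attach(int unit)
{
	/* Mints a dynamically named point, e.g. "mymod0_io". */
	fail_point_init(&mymod_fp, "mymod%d_io", unit);
}

static void
mymod_detach(void)
{
	/* Frees the FAIL_POINT_DYNAMIC_NAME buffer and wakes pausers. */
	fail_point_destroy(&mymod_fp);
}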
*/ name = fp_malloc(n + 1, M_WAITOK); if (name != NULL) { va_start(ap, fmt); vsnprintf(name, n + 1, fmt, ap); va_end(ap); } fp->fp_name = name; fp->fp_location = ""; fp->fp_flags |= FAIL_POINT_DYNAMIC_NAME; fp->fp_pre_sleep_fn = NULL; fp->fp_pre_sleep_arg = NULL; fp->fp_post_sleep_fn = NULL; fp->fp_post_sleep_arg = NULL; } /** * Free the resources held by a fail_point, and wake any paused threads. * Thou shalt not allow threads to hit this fail point after you enter this * function, nor shall you call this multiple times for a given fp. * @ingroup failpoint */ void fail_point_destroy(struct fail_point *fp) { fail_point_drain(fp, 0); if ((fp->fp_flags & FAIL_POINT_DYNAMIC_NAME) != 0) { fp_free(__DECONST(void *, fp->fp_name)); fp->fp_name = NULL; } fp->fp_flags = 0; sx_xlock(&sx_fp_set); fail_point_garbage_collect(); sx_xunlock(&sx_fp_set); } /** * This does the real work of evaluating a fail point. If the fail point tells * us to return a value, this function returns 1 and fills in 'return_value' * (return_value is allowed to be null). If the fail point tells us to panic, * we never return. Otherwise we just return 0 after doing some work, which * means "keep going". */ enum fail_point_return_code fail_point_eval_nontrivial(struct fail_point *fp, int *return_value) { bool execute = false; struct fail_point_entry *ent; struct fail_point_setting *fp_setting; enum fail_point_return_code ret; int cont; int count; int msecs; int usecs; ret = FAIL_POINT_RC_CONTINUE; cont = 0; /* don't continue by default */ fp_setting = fail_point_setting_get_ref(fp); if (fp_setting == NULL) goto abort; TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, fe_entries) { if (ent->fe_stale) continue; if (ent->fe_prob < PROB_MAX && ent->fe_prob < random() % PROB_MAX) continue; if (ent->fe_pid != NO_PID && ent->fe_pid != curproc->p_pid) continue; if (ent->fe_count != FE_COUNT_UNTRACKED) { count = ent->fe_count; while (count > 0) { if (atomic_cmpset_32(&ent->fe_count, count, count - 1)) { count--; execute = true; break; } count = ent->fe_count; } if (execute == false) /* We lost the race; consider the entry stale and bail now */ continue; if (count == 0) ent->fe_stale = true; } switch (ent->fe_type) { case FAIL_POINT_PANIC: panic("fail point %s panicking", fp->fp_name); /* NOTREACHED */ case FAIL_POINT_RETURN: if (return_value != NULL) *return_value = ent->fe_arg; ret = FAIL_POINT_RC_RETURN; break; case FAIL_POINT_BREAK: printf("fail point %s breaking to debugger\n", fp->fp_name); breakpoint(); break; case FAIL_POINT_PRINT: printf("fail point %s executing\n", fp->fp_name); cont = ent->fe_arg; break; case FAIL_POINT_SLEEP: msecs = ent->fe_arg; if (msecs) fail_point_sleep(fp, msecs, &ret); break; case FAIL_POINT_PAUSE: /** * Pausing is inherently strange with multiple * entries given our design. That is because some * entries could be unreachable, for instance in cases like: * pause->return. We can never reach the return entry. * The sysctl layer actually truncates all entries after * a pause for this reason. */ mtx_lock_spin(&fp_setting->feq_mtx); fail_point_pause(fp, &ret, &fp_setting->feq_mtx); mtx_unlock_spin(&fp_setting->feq_mtx); break; case FAIL_POINT_YIELD: kern_yield(PRI_UNCHANGED); break; case FAIL_POINT_DELAY: usecs = ent->fe_arg; DELAY(usecs); break; default: break; } if (cont == 0) break; } if (fail_point_is_off(fp)) fail_point_eval_swap_out(fp, fp_setting); abort: fail_point_setting_release_ref(fp); return (ret); } /** * Translate internal fail_point structure into human-readable text. 
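Two worked evaluations of the loop above: "5*return(55)" starts fe_count at 5 and decrements it through the atomic_cmpset_32() race-recovery loop, so exactly five hook evaluations return 55 before the entry goes stale; and "1%print(1)->return(7)", when its 1% roll succeeds, prints the "executing" message and, because the print term's nonzero argument becomes cont, falls through to the next entry and returns 7.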
*/ static void fail_point_get(struct fail_point *fp, struct sbuf *sb, bool verbose) { struct fail_point_entry *ent; struct fail_point_setting *fp_setting; struct fail_point_entry *fp_entry_cpy; int cnt_sleeping; int idx; int printed_entry_count; cnt_sleeping = 0; idx = 0; printed_entry_count = 0; fp_entry_cpy = fp_malloc(sizeof(struct fail_point_entry) * (FP_MAX_ENTRY_COUNT + 1), M_WAITOK); fp_setting = fail_point_setting_get_ref(fp); if (fp_setting != NULL) { TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, fe_entries) { if (ent->fe_stale) continue; KASSERT(printed_entry_count < FP_MAX_ENTRY_COUNT, ("FP entry list larger than allowed")); fp_entry_cpy[printed_entry_count] = *ent; ++printed_entry_count; } } fail_point_setting_release_ref(fp); /* This is our equivalent of a NULL terminator */ fp_entry_cpy[printed_entry_count].fe_type = FAIL_POINT_INVALID; while (idx < printed_entry_count) { ent = &fp_entry_cpy[idx]; ++idx; if (ent->fe_prob < PROB_MAX) { int decimal = ent->fe_prob % (PROB_MAX / 100); int units = ent->fe_prob / (PROB_MAX / 100); sbuf_printf(sb, "%d", units); if (decimal) { int digits = PROB_DIGITS - 2; while (!(decimal % 10)) { digits--; decimal /= 10; } sbuf_printf(sb, ".%0*d", digits, decimal); } sbuf_printf(sb, "%%"); } if (ent->fe_count >= 0) sbuf_printf(sb, "%d*", ent->fe_count); sbuf_printf(sb, "%s", fail_type_strings[ent->fe_type].name); if (ent->fe_arg) sbuf_printf(sb, "(%d)", ent->fe_arg); if (ent->fe_pid != NO_PID) sbuf_printf(sb, "[pid %d]", ent->fe_pid); if (TAILQ_NEXT(ent, fe_entries)) sbuf_printf(sb, "->"); } if (!printed_entry_count) sbuf_printf(sb, "off"); fp_free(fp_entry_cpy); if (verbose) { #ifdef STACK /* Print number of sleeping threads. queue=0 is the argument * used by msleep when sending our threads to sleep. */ sbuf_printf(sb, "\nsleeping_thread_stacks = {\n"); sleepq_sbuf_print_stacks(sb, FP_SLEEP_CHANNEL(fp), 0, &cnt_sleeping); sbuf_printf(sb, "},\n"); #endif sbuf_printf(sb, "sleeping_thread_count = %d,\n", cnt_sleeping); #ifdef STACK sbuf_printf(sb, "paused_thread_stacks = {\n"); sleepq_sbuf_print_stacks(sb, FP_PAUSE_CHANNEL(fp), 0, &cnt_sleeping); sbuf_printf(sb, "},\n"); #endif sbuf_printf(sb, "paused_thread_count = %d\n", cnt_sleeping); } } /** * Set an internal fail_point structure from a human-readable failpoint string * in a lock-safe manner. */ static int fail_point_set(struct fail_point *fp, char *buf) { struct fail_point_entry *ent, *ent_next; struct fail_point_setting *entries; bool should_wake_paused; bool should_truncate; int error; error = 0; should_wake_paused = false; should_truncate = false; /* Parse new entries. */ /** * ref protects our new malloc'd stuff from being garbage collected * before we link it. */ fail_point_setting_get_ref(fp); entries = fail_point_setting_new(fp); if (parse_fail_point(entries, buf) == NULL) { STAILQ_REMOVE(&fp_setting_garbage, entries, fail_point_setting, fs_garbage_link); fail_point_setting_destroy(entries); error = EINVAL; goto end; } /** * Transfer the entries we are going to keep to a new list. * Get rid of useless zero probability entries, and entries with hit * count 0. * If 'off' is present, and it has no hit count set, then all entries * after it are discarded since they are unreachable. 
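Round-tripping the earlier probability example through the printing code above: for fe_prob = 15000, units = 15000 / (PROB_MAX / 100) = 1, and decimal = 5000 is reduced by its trailing zeros down to 5 with a remaining width of 1, yielding "1.5%"; with the type name and argument appended, the setting reads back as "1.5%return(5)".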
*/ TAILQ_FOREACH_SAFE(ent, &entries->fp_entry_queue, fe_entries, ent_next) { if (ent->fe_prob == 0 || ent->fe_count == 0) { printf("Discarding entry which cannot execute %s\n", fail_type_strings[ent->fe_type].name); TAILQ_REMOVE(&entries->fp_entry_queue, ent, fe_entries); fp_free(ent); continue; } else if (should_truncate) { printf("Discarding unreachable entry %s\n", fail_type_strings[ent->fe_type].name); TAILQ_REMOVE(&entries->fp_entry_queue, ent, fe_entries); fp_free(ent); continue; } if (ent->fe_type == FAIL_POINT_OFF) { should_wake_paused = true; if (ent->fe_count == FE_COUNT_UNTRACKED) { should_truncate = true; TAILQ_REMOVE(&entries->fp_entry_queue, ent, fe_entries); fp_free(ent); } } else if (ent->fe_type == FAIL_POINT_PAUSE) { should_truncate = true; } else if (ent->fe_type == FAIL_POINT_SLEEP && (fp->fp_flags & FAIL_POINT_NONSLEEPABLE)) { /** * If this fail point is annotated as being in a * non-sleepable ctx, convert sleep to delay and * convert the msec argument to usecs. */ printf("Sleep call request on fail point in " "non-sleepable context; using delay instead " "of sleep\n"); ent->fe_type = FAIL_POINT_DELAY; ent->fe_arg *= 1000; } } if (TAILQ_EMPTY(&entries->fp_entry_queue)) { entries = fail_point_swap_settings(fp, NULL); if (entries != NULL) wakeup(FP_PAUSE_CHANNEL(fp)); } else { if (should_wake_paused) wakeup(FP_PAUSE_CHANNEL(fp)); fail_point_swap_settings(fp, entries); } end: #ifdef IWARNING if (error) IWARNING("Failed to set %s %s to %s", fp->fp_name, fp->fp_location, buf); else INOTICE("Set %s %s to %s", fp->fp_name, fp->fp_location, buf); #endif /* IWARNING */ fail_point_setting_release_ref(fp); return (error); } #define MAX_FAIL_POINT_BUF 1023 /** * Handle kernel failpoint set/get. */ int fail_point_sysctl(SYSCTL_HANDLER_ARGS) { struct fail_point *fp; char *buf; struct sbuf sb, *sb_check; int error; buf = NULL; error = 0; fp = arg1; sb_check = sbuf_new(&sb, NULL, 1024, SBUF_AUTOEXTEND); if (sb_check != &sb) return (ENOMEM); sbuf_set_drain(&sb, (sbuf_drain_func *)fail_sysctl_drain_func, req); /* Setting */ /** * Lock protects any new entries from being garbage collected before we * can link them to the fail point. */ sx_xlock(&sx_fp_set); if (req->newptr) { if (req->newlen > MAX_FAIL_POINT_BUF) { error = EINVAL; goto out; } buf = fp_malloc(req->newlen + 1, M_WAITOK); error = SYSCTL_IN(req, buf, req->newlen); if (error) goto out; buf[req->newlen] = '\0'; error = fail_point_set(fp, buf); } fail_point_garbage_collect(); sx_xunlock(&sx_fp_set); /* Retrieving. */ fail_point_get(fp, &sb, false); out: sbuf_finish(&sb); sbuf_delete(&sb); if (buf) fp_free(buf); return (error); } int fail_point_sysctl_status(SYSCTL_HANDLER_ARGS) { struct fail_point *fp; struct sbuf sb, *sb_check; fp = arg1; sb_check = sbuf_new(&sb, NULL, 1024, SBUF_AUTOEXTEND); if (sb_check != &sb) return (ENOMEM); sbuf_set_drain(&sb, (sbuf_drain_func *)fail_sysctl_drain_func, req); /* Retrieving. */ fail_point_get(fp, &sb, true); sbuf_finish(&sb); sbuf_delete(&sb); /** * Lock protects any new entries from being garbage collected before we * can link them to the fail point. */ sx_xlock(&sx_fp_set); fail_point_garbage_collect(); sx_xunlock(&sx_fp_set); return (0); } int fail_sysctl_drain_func(void *sysctl_args, const char *buf, int len) { struct sysctl_req *sa; int error; sa = sysctl_args; error = SYSCTL_OUT(sa, buf, len); if (error == ENOMEM) return (-1); else return (len); } /** * Internal helper function to translate a human-readable failpoint string * into a internally-parsable fail_point structure. 
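From userland the whole cycle above is driven through sysctl(3). A minimal sketch using the in-tree test point declared at the end of this file, assuming (as fail(9) describes) that KFAIL_POINT_RETURN(DEBUG_FP, test_fail_point) publishes it as debug.fail_point.test_fail_point:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <string.h>

int
main(void)
{
	const char *spec = "1.5%return(5)";

	/* Lands in fail_point_sysctl() and then fail_point_set(). */
	return (sysctlbyname("debug.fail_point.test_fail_point",
	    NULL, NULL, spec, strlen(spec)) != 0);
}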
*/ static char * parse_fail_point(struct fail_point_setting *ents, char *p) { /* :: * ( "->" )* */ uint8_t term_count; term_count = 1; p = parse_term(ents, p); if (p == NULL) return (NULL); while (*p != '\0') { term_count++; if (p[0] != '-' || p[1] != '>' || (p = parse_term(ents, p+2)) == NULL || term_count > FP_MAX_ENTRY_COUNT) return (NULL); } return (p); } /** * Internal helper function to parse an individual term from a failpoint. */ static char * parse_term(struct fail_point_setting *ents, char *p) { struct fail_point_entry *ent; ent = fail_point_entry_new(ents); /* * :: * ( ( "%") | ( "*" ) )* * * [ "(" ")" ] * [ "[pid " "]" ] */ /* ( ( "%") | ( "*" ) )* */ while (isdigit(*p) || *p == '.') { int units, decimal; p = parse_number(&units, &decimal, p); if (p == NULL) return (NULL); if (*p == '%') { if (units > 100) /* prevent overflow early */ units = 100; ent->fe_prob = units * (PROB_MAX / 100) + decimal; if (ent->fe_prob > PROB_MAX) ent->fe_prob = PROB_MAX; } else if (*p == '*') { if (!units || units < 0 || decimal) return (NULL); ent->fe_count = units; } else return (NULL); p++; } /* */ p = parse_type(ent, p); if (p == NULL) return (NULL); if (*p == '\0') return (p); /* [ "(" ")" ] */ if (*p != '(') return (p); p++; if (!isdigit(*p) && *p != '-') return (NULL); ent->fe_arg = strtol(p, &p, 0); if (*p++ != ')') return (NULL); /* [ "[pid " "]" ] */ #define PID_STRING "[pid " if (strncmp(p, PID_STRING, sizeof(PID_STRING) - 1) != 0) return (p); p += sizeof(PID_STRING) - 1; if (!isdigit(*p)) return (NULL); ent->fe_pid = strtol(p, &p, 0); if (*p++ != ']') return (NULL); return (p); } /** * Internal helper function to parse a numeric for a failpoint term. */ static char * parse_number(int *out_units, int *out_decimal, char *p) { char *old_p; /** * :: * [ "." ] | * "." */ /* whole part */ old_p = p; *out_units = strtol(p, &p, 10); if (p == old_p && *p != '.') return (NULL); /* fractional part */ *out_decimal = 0; if (*p == '.') { int digits = 0; p++; while (isdigit(*p)) { int digit = *p - '0'; if (digits < PROB_DIGITS - 2) *out_decimal = *out_decimal * 10 + digit; else if (digits == PROB_DIGITS - 2 && digit >= 5) (*out_decimal)++; digits++; p++; } if (!digits) /* need at least one digit after '.' */ return (NULL); while (digits++ < PROB_DIGITS - 2) /* add implicit zeros */ *out_decimal *= 10; } return (p); /* success */ } /** * Internal helper function to parse an individual type for a failpoint term. */ static char * parse_type(struct fail_point_entry *ent, char *beg) { enum fail_point_t type; int len; for (type = FAIL_POINT_OFF; type < FAIL_POINT_NUMTYPES; type++) { len = fail_type_strings[type].nmlen; if (strncmp(fail_type_strings[type].name, beg, len) == 0) { ent->fe_type = type; return (beg + len); } } return (NULL); } /* The fail point sysctl tree. 
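Putting the grammar together: the input "1.5%2*return(5)[pid 77]" parses into a single entry with fe_prob = 15000, fe_count = 2, fe_type = FAIL_POINT_RETURN, fe_arg = 5 and fe_pid = 77, while a chain such as "print(1)->sleep(100)" simply repeats parse_term() across the "->" separators, up to FP_MAX_ENTRY_COUNT terms.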
*/ SYSCTL_NODE(_debug, OID_AUTO, fail_point, CTLFLAG_RW, 0, "fail points"); /* Debugging/testing stuff for fail point */ static int sysctl_test_fail_point(SYSCTL_HANDLER_ARGS) { KFAIL_POINT_RETURN(DEBUG_FP, test_fail_point); return (0); } SYSCTL_OID(_debug_fail_point, OID_AUTO, test_trigger_fail_point, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_test_fail_point, "A", "Trigger test fail points"); Index: head/sys/kern/kern_ffclock.c =================================================================== --- head/sys/kern/kern_ffclock.c (revision 326270) +++ head/sys/kern/kern_ffclock.c (revision 326271) @@ -1,482 +1,484 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 The University of Melbourne * All rights reserved. * * This software was developed by Julien Ridoux at the University of Melbourne * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ffclock.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef FFCLOCK FEATURE(ffclock, "Feed-forward clock support"); extern struct ffclock_estimate ffclock_estimate; extern struct bintime ffclock_boottime; extern int8_t ffclock_updated; extern struct mtx ffclock_mtx; /* * Feed-forward clock absolute time. This should be the preferred way to read * the feed-forward clock for "wall-clock" type time. The flags allow to compose * various flavours of absolute time (e.g. with or without leap seconds taken * into account). If valid pointers are provided, the ffcounter value and an * upper bound on clock error associated with the bintime are provided. * NOTE: use ffclock_convert_abs() to differ the conversion of a ffcounter value * read earlier. */ void ffclock_abstime(ffcounter *ffcount, struct bintime *bt, struct bintime *error_bound, uint32_t flags) { struct ffclock_estimate cest; ffcounter ffc; ffcounter update_ffcount; ffcounter ffdelta_error; /* Get counter and corresponding time. */ if ((flags & FFCLOCK_FAST) == FFCLOCK_FAST) ffclock_last_tick(&ffc, bt, flags); else { ffclock_read_counter(&ffc); ffclock_convert_abs(ffc, bt, flags); } /* Current ffclock estimate, use update_ffcount as generation number. 
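A sketch of a kernel caller composing time flavours through the flags described above; the surrounding function is hypothetical, and the flag combination matches the ffclock_binuptime() wrapper later in this file:

static void
sample_uptime(void)
{
	struct bintime bt, error_bound;
	ffcounter ffc;

	/* Monotonic (boot-relative) time plus an error bound, from a
	 * fresh counter read; or-in FFCLOCK_FAST to accept the
	 * last-tick value instead of reading the counter. */
	ffclock_abstime(&ffc, &bt, &error_bound,
	    FFCLOCK_LERP | FFCLOCK_UPTIME);
}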
*/ do { update_ffcount = ffclock_estimate.update_ffcount; bcopy(&ffclock_estimate, &cest, sizeof(struct ffclock_estimate)); } while (update_ffcount != ffclock_estimate.update_ffcount); /* * Leap second adjustment. Total as seen by synchronisation algorithm * since it started. cest.leapsec_next is the ffcounter prediction of * when the next leapsecond occurs. */ if ((flags & FFCLOCK_LEAPSEC) == FFCLOCK_LEAPSEC) { bt->sec -= cest.leapsec_total; if (ffc > cest.leapsec_next) bt->sec -= cest.leapsec; } /* Boot time adjustment, for uptime/monotonic clocks. */ if ((flags & FFCLOCK_UPTIME) == FFCLOCK_UPTIME) { bintime_sub(bt, &ffclock_boottime); } /* Compute error bound if a valid pointer has been passed. */ if (error_bound) { ffdelta_error = ffc - cest.update_ffcount; ffclock_convert_diff(ffdelta_error, error_bound); /* 18446744073709 = int(2^64/1e12), err_bound_rate in [ps/s] */ bintime_mul(error_bound, cest.errb_rate * (uint64_t)18446744073709LL); /* 18446744073 = int(2^64 / 1e9), since err_abs in [ns] */ bintime_addx(error_bound, cest.errb_abs * (uint64_t)18446744073LL); } if (ffcount) *ffcount = ffc; } /* * Feed-forward difference clock. This should be the preferred way to convert a * time interval in ffcounter values into a time interval in seconds. If a valid * pointer is passed, an upper bound on the error in computing the time interval * in seconds is provided. */ void ffclock_difftime(ffcounter ffdelta, struct bintime *bt, struct bintime *error_bound) { ffcounter update_ffcount; uint32_t err_rate; ffclock_convert_diff(ffdelta, bt); if (error_bound) { do { update_ffcount = ffclock_estimate.update_ffcount; err_rate = ffclock_estimate.errb_rate; } while (update_ffcount != ffclock_estimate.update_ffcount); ffclock_convert_diff(ffdelta, error_bound); /* 18446744073709 = int(2^64/1e12), err_bound_rate in [ps/s] */ bintime_mul(error_bound, err_rate * (uint64_t)18446744073709LL); } } /* * Create a new kern.sysclock sysctl node, which will be home to some generic * sysclock configuration variables. Feed-forward clock specific variables will * live under the ffclock subnode. */ SYSCTL_NODE(_kern, OID_AUTO, sysclock, CTLFLAG_RW, 0, "System clock related configuration"); SYSCTL_NODE(_kern_sysclock, OID_AUTO, ffclock, CTLFLAG_RW, 0, "Feed-forward clock configuration"); static char *sysclocks[] = {"feedback", "feed-forward"}; #define MAX_SYSCLOCK_NAME_LEN 16 #define NUM_SYSCLOCKS nitems(sysclocks) static int ffclock_version = 2; SYSCTL_INT(_kern_sysclock_ffclock, OID_AUTO, version, CTLFLAG_RD, &ffclock_version, 0, "Feed-forward clock kernel version"); /* List available sysclocks. */ static int sysctl_kern_sysclock_available(SYSCTL_HANDLER_ARGS) { struct sbuf *s; int clk, error; s = sbuf_new_for_sysctl(NULL, NULL, MAX_SYSCLOCK_NAME_LEN * NUM_SYSCLOCKS, req); if (s == NULL) return (ENOMEM); for (clk = 0; clk < NUM_SYSCLOCKS; clk++) { sbuf_cat(s, sysclocks[clk]); if (clk + 1 < NUM_SYSCLOCKS) sbuf_cat(s, " "); } error = sbuf_finish(s); sbuf_delete(s); return (error); } SYSCTL_PROC(_kern_sysclock, OID_AUTO, available, CTLTYPE_STRING | CTLFLAG_RD, 0, 0, sysctl_kern_sysclock_available, "A", "List of available system clocks"); /* * Return the name of the active system clock if read, or attempt to change * the active system clock to the user specified one if written to. The active * system clock is read when calling any of the [get]{bin,nano,micro}[up]time() * functions. 
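Selecting the serving clock through the handler below is likewise a string sysctl from userland; a small sketch:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <string.h>

int
main(void)
{
	const char *clk = "feed-forward";

	/* After this succeeds, the [get]{bin,nano,micro}[up]time()
	 * calls are served by the feed-forward clock. */
	return (sysctlbyname("kern.sysclock.active", NULL, NULL,
	    clk, strlen(clk)) != 0);
}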
*/ static int sysctl_kern_sysclock_active(SYSCTL_HANDLER_ARGS) { char newclock[MAX_SYSCLOCK_NAME_LEN]; int error; int clk; /* Return the name of the current active sysclock. */ strlcpy(newclock, sysclocks[sysclock_active], sizeof(newclock)); error = sysctl_handle_string(oidp, newclock, sizeof(newclock), req); /* Check for error or no change */ if (error != 0 || req->newptr == NULL) goto done; /* Change the active sysclock to the user specified one: */ error = EINVAL; for (clk = 0; clk < NUM_SYSCLOCKS; clk++) { if (strncmp(newclock, sysclocks[clk], MAX_SYSCLOCK_NAME_LEN - 1)) { continue; } sysclock_active = clk; error = 0; break; } done: return (error); } SYSCTL_PROC(_kern_sysclock, OID_AUTO, active, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, sysctl_kern_sysclock_active, "A", "Name of the active system clock which is currently serving time"); static int sysctl_kern_ffclock_ffcounter_bypass = 0; SYSCTL_INT(_kern_sysclock_ffclock, OID_AUTO, ffcounter_bypass, CTLFLAG_RW, &sysctl_kern_ffclock_ffcounter_bypass, 0, "Use reliable hardware timecounter as the feed-forward counter"); /* * High level functions to access the Feed-Forward Clock. */ void ffclock_bintime(struct bintime *bt) { ffclock_abstime(NULL, bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC); } void ffclock_nanotime(struct timespec *tsp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC); bintime2timespec(&bt, tsp); } void ffclock_microtime(struct timeval *tvp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC); bintime2timeval(&bt, tvp); } void ffclock_getbintime(struct bintime *bt) { ffclock_abstime(NULL, bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC | FFCLOCK_FAST); } void ffclock_getnanotime(struct timespec *tsp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC | FFCLOCK_FAST); bintime2timespec(&bt, tsp); } void ffclock_getmicrotime(struct timeval *tvp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC | FFCLOCK_FAST); bintime2timeval(&bt, tvp); } void ffclock_binuptime(struct bintime *bt) { ffclock_abstime(NULL, bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME); } void ffclock_nanouptime(struct timespec *tsp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME); bintime2timespec(&bt, tsp); } void ffclock_microuptime(struct timeval *tvp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME); bintime2timeval(&bt, tvp); } void ffclock_getbinuptime(struct bintime *bt) { ffclock_abstime(NULL, bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME | FFCLOCK_FAST); } void ffclock_getnanouptime(struct timespec *tsp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME | FFCLOCK_FAST); bintime2timespec(&bt, tsp); } void ffclock_getmicrouptime(struct timeval *tvp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME | FFCLOCK_FAST); bintime2timeval(&bt, tvp); } void ffclock_bindifftime(ffcounter ffdelta, struct bintime *bt) { ffclock_difftime(ffdelta, bt, NULL); } void ffclock_nanodifftime(ffcounter ffdelta, struct timespec *tsp) { struct bintime bt; ffclock_difftime(ffdelta, &bt, NULL); bintime2timespec(&bt, tsp); } void ffclock_microdifftime(ffcounter ffdelta, struct timeval *tvp) { struct bintime bt; ffclock_difftime(ffdelta, &bt, NULL); bintime2timeval(&bt, tvp); } /* * System call allowing userland applications to retrieve the current value of * the Feed-Forward Clock counter. 
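The system call defined next is reachable from userland as a thin wrapper; a sketch assuming the usual userland prototype from sys/timeffc.h:

#include <sys/types.h>
#include <sys/timeffc.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	ffcounter ffc;

	/* The kernel side returns EAGAIN if the counter reads as zero. */
	if (ffclock_getcounter(&ffc) != 0)
		return (1);
	printf("ffcounter: %ju\n", (uintmax_t)ffc);
	return (0);
}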
*/ #ifndef _SYS_SYSPROTO_H_ struct ffclock_getcounter_args { ffcounter *ffcount; }; #endif /* ARGSUSED */ int sys_ffclock_getcounter(struct thread *td, struct ffclock_getcounter_args *uap) { ffcounter ffcount; int error; ffcount = 0; ffclock_read_counter(&ffcount); if (ffcount == 0) return (EAGAIN); error = copyout(&ffcount, uap->ffcount, sizeof(ffcounter)); return (error); } /* * System call allowing the synchronisation daemon to push new feed-forward clock * estimates to the kernel. Acquire ffclock_mtx to prevent concurrent updates * and ensure data consistency. * NOTE: ffclock_updated signals the fftimehands that new estimates are * available. The updated estimates are picked up by the fftimehands on the next * tick, which could take as long as 1/hz seconds (if ticks are not missed). */ #ifndef _SYS_SYSPROTO_H_ struct ffclock_setestimate_args { struct ffclock_estimate *cest; }; #endif /* ARGSUSED */ int sys_ffclock_setestimate(struct thread *td, struct ffclock_setestimate_args *uap) { struct ffclock_estimate cest; int error; /* Reuse of PRIV_CLOCK_SETTIME. */ if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0) return (error); if ((error = copyin(uap->cest, &cest, sizeof(struct ffclock_estimate))) != 0) return (error); mtx_lock(&ffclock_mtx); memcpy(&ffclock_estimate, &cest, sizeof(struct ffclock_estimate)); ffclock_updated++; mtx_unlock(&ffclock_mtx); return (error); } /* * System call allowing userland applications to retrieve the clock estimates * stored within the kernel. It is useful to kickstart the synchronisation * daemon with the kernel's knowledge of the hardware timecounter. */ #ifndef _SYS_SYSPROTO_H_ struct ffclock_getestimate_args { struct ffclock_estimate *cest; }; #endif /* ARGSUSED */ int sys_ffclock_getestimate(struct thread *td, struct ffclock_getestimate_args *uap) { struct ffclock_estimate cest; int error; mtx_lock(&ffclock_mtx); memcpy(&cest, &ffclock_estimate, sizeof(struct ffclock_estimate)); mtx_unlock(&ffclock_mtx); error = copyout(&cest, uap->cest, sizeof(struct ffclock_estimate)); return (error); } #else /* !FFCLOCK */ int sys_ffclock_getcounter(struct thread *td, struct ffclock_getcounter_args *uap) { return (ENOSYS); } int sys_ffclock_setestimate(struct thread *td, struct ffclock_setestimate_args *uap) { return (ENOSYS); } int sys_ffclock_getestimate(struct thread *td, struct ffclock_getestimate_args *uap) { return (ENOSYS); } #endif /* FFCLOCK */ Index: head/sys/kern/kern_hhook.c =================================================================== --- head/sys/kern/kern_hhook.c (revision 326270) +++ head/sys/kern/kern_hhook.c (revision 326271) @@ -1,522 +1,524 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010,2013 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Lawrence Stewart while studying at the Centre * for Advanced Internet Architectures, Swinburne University of Technology, * made possible in part by grants from the FreeBSD Foundation and Cisco * University Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include struct hhook { hhook_func_t hhk_func; struct helper *hhk_helper; void *hhk_udata; STAILQ_ENTRY(hhook) hhk_next; }; static MALLOC_DEFINE(M_HHOOK, "hhook", "Helper hooks are linked off hhook_head lists"); LIST_HEAD(hhookheadhead, hhook_head); struct hhookheadhead hhook_head_list; VNET_DEFINE(struct hhookheadhead, hhook_vhead_list); #define V_hhook_vhead_list VNET(hhook_vhead_list) static struct mtx hhook_head_list_lock; MTX_SYSINIT(hhookheadlistlock, &hhook_head_list_lock, "hhook_head list lock", MTX_DEF); /* Protected by hhook_head_list_lock. */ static uint32_t n_hhookheads; /* Private function prototypes. */ static void hhook_head_destroy(struct hhook_head *hhh); void khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags); #define HHHLIST_LOCK() mtx_lock(&hhook_head_list_lock) #define HHHLIST_UNLOCK() mtx_unlock(&hhook_head_list_lock) #define HHHLIST_LOCK_ASSERT() mtx_assert(&hhook_head_list_lock, MA_OWNED) #define HHH_LOCK_INIT(hhh) rm_init(&(hhh)->hhh_lock, "hhook_head rm lock") #define HHH_LOCK_DESTROY(hhh) rm_destroy(&(hhh)->hhh_lock) #define HHH_WLOCK(hhh) rm_wlock(&(hhh)->hhh_lock) #define HHH_WUNLOCK(hhh) rm_wunlock(&(hhh)->hhh_lock) #define HHH_RLOCK(hhh, rmpt) rm_rlock(&(hhh)->hhh_lock, (rmpt)) #define HHH_RUNLOCK(hhh, rmpt) rm_runlock(&(hhh)->hhh_lock, (rmpt)) /* * Run all helper hook functions for a given hook point. */ void hhook_run_hooks(struct hhook_head *hhh, void *ctx_data, struct osd *hosd) { struct hhook *hhk; void *hdata; struct rm_priotracker rmpt; KASSERT(hhh->hhh_refcount > 0, ("hhook_head %p refcount is 0", hhh)); HHH_RLOCK(hhh, &rmpt); STAILQ_FOREACH(hhk, &hhh->hhh_hooks, hhk_next) { if (hhk->hhk_helper != NULL && hhk->hhk_helper->h_flags & HELPER_NEEDS_OSD) { hdata = osd_get(OSD_KHELP, hosd, hhk->hhk_helper->h_id); if (hdata == NULL) continue; } else hdata = NULL; /* * XXXLAS: We currently ignore the int returned by the hook, * but will likely want to handle it in future to allow hhook to * be used like pfil and effect changes at the hhook calling * site e.g. we could define a new hook type of HHOOK_TYPE_PFIL * and standardise what particular return values mean and set * the context data to pass exactly the same information as pfil * hooks currently receive, thus replicating pfil with hhook. 
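hhook_run_hooks() above is the hot path, so it takes hhh_lock in read mode; an rmlock makes concurrent hook runs cheap while registration, which takes the write lock, stays rare. A userland analogue of that read-side traversal, with a pthread rwlock standing in for the rmlock (types and names illustrative):

#include <pthread.h>
#include <stddef.h>

struct hook {
	void	(*fn)(void *udata, void *ctx);
	void	*udata;
	struct hook *next;
};

static pthread_rwlock_t hook_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct hook *hook_list;

/* Run every hook under the read lock; add/remove must take the write lock. */
void
run_hooks(void *ctx)
{
	struct hook *h;

	pthread_rwlock_rdlock(&hook_lock);
	for (h = hook_list; h != NULL; h = h->next)
		h->fn(h->udata, ctx);
	pthread_rwlock_unlock(&hook_lock);
}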
*/ hhk->hhk_func(hhh->hhh_type, hhh->hhh_id, hhk->hhk_udata, ctx_data, hdata, hosd); } HHH_RUNLOCK(hhh, &rmpt); } /* * Register a new helper hook function with a helper hook point. */ int hhook_add_hook(struct hhook_head *hhh, struct hookinfo *hki, uint32_t flags) { struct hhook *hhk, *tmp; int error; error = 0; if (hhh == NULL) return (ENOENT); hhk = malloc(sizeof(struct hhook), M_HHOOK, M_ZERO | ((flags & HHOOK_WAITOK) ? M_WAITOK : M_NOWAIT)); if (hhk == NULL) return (ENOMEM); hhk->hhk_helper = hki->hook_helper; hhk->hhk_func = hki->hook_func; hhk->hhk_udata = hki->hook_udata; HHH_WLOCK(hhh); STAILQ_FOREACH(tmp, &hhh->hhh_hooks, hhk_next) { if (tmp->hhk_func == hki->hook_func && tmp->hhk_udata == hki->hook_udata) { /* The helper hook function is already registered. */ error = EEXIST; break; } } if (!error) { STAILQ_INSERT_TAIL(&hhh->hhh_hooks, hhk, hhk_next); hhh->hhh_nhooks++; } else free(hhk, M_HHOOK); HHH_WUNLOCK(hhh); return (error); } /* * Register a helper hook function with a helper hook point (including all * virtual instances of the hook point if it is virtualised). * * The logic is unfortunately far more complex than for * hhook_remove_hook_lookup() because hhook_add_hook() can call malloc() with * M_WAITOK and thus we cannot call hhook_add_hook() with the * hhook_head_list_lock held. * * The logic assembles an array of hhook_head structs that correspond to the * helper hook point being hooked and bumps the refcount on each (all done with * the hhook_head_list_lock held). The hhook_head_list_lock is then dropped, and * hhook_add_hook() is called and the refcount dropped for each hhook_head * struct in the array. */ int hhook_add_hook_lookup(struct hookinfo *hki, uint32_t flags) { struct hhook_head **heads_to_hook, *hhh; int error, i, n_heads_to_hook; tryagain: error = i = 0; /* * Accessing n_hhookheads without hhook_head_list_lock held opens up a * race with hhook_head_register() which we are unlikely to lose, but * nonetheless have to cope with - hence the complex goto logic. */ n_heads_to_hook = n_hhookheads; heads_to_hook = malloc(n_heads_to_hook * sizeof(struct hhook_head *), M_HHOOK, flags & HHOOK_WAITOK ? M_WAITOK : M_NOWAIT); if (heads_to_hook == NULL) return (ENOMEM); HHHLIST_LOCK(); LIST_FOREACH(hhh, &hhook_head_list, hhh_next) { if (hhh->hhh_type == hki->hook_type && hhh->hhh_id == hki->hook_id) { if (i < n_heads_to_hook) { heads_to_hook[i] = hhh; refcount_acquire(&heads_to_hook[i]->hhh_refcount); i++; } else { /* * We raced with hhook_head_register() which * inserted a hhook_head that we need to hook * but did not malloc space for. Abort this run * and try again. */ for (i--; i >= 0; i--) refcount_release(&heads_to_hook[i]->hhh_refcount); free(heads_to_hook, M_HHOOK); HHHLIST_UNLOCK(); goto tryagain; } } } HHHLIST_UNLOCK(); for (i--; i >= 0; i--) { if (!error) error = hhook_add_hook(heads_to_hook[i], hki, flags); refcount_release(&heads_to_hook[i]->hhh_refcount); } free(heads_to_hook, M_HHOOK); return (error); } /* * Remove a helper hook function from a helper hook point. 
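The tryagain dance above reduces to: sample the list size without the lock (the malloc() that follows may sleep, so it cannot be done locked), then re-check under the lock and restart from scratch if a registration slipped in. A compressed userland sketch of that snapshot/recheck/retry shape (simplified to a count comparison; names illustrative):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int nheads;			/* may grow while we are unlocked */

int
collect_heads(void ***outv, int *outn)
{
	void **v;
	int n;

again:
	n = nheads;			/* racy sample; checked again below */
	v = malloc(n * sizeof(*v));	/* may sleep; no lock held */
	if (v == NULL)
		return (-1);
	pthread_mutex_lock(&list_lock);
	if (nheads > n) {		/* lost the race with a registration */
		pthread_mutex_unlock(&list_lock);
		free(v);
		goto again;		/* cf. the tryagain label above */
	}
	/* ... fill v[] from the list while the lock is held ... */
	pthread_mutex_unlock(&list_lock);
	*outv = v;
	*outn = n;
	return (0);
}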
*/ int hhook_remove_hook(struct hhook_head *hhh, struct hookinfo *hki) { struct hhook *tmp; if (hhh == NULL) return (ENOENT); HHH_WLOCK(hhh); STAILQ_FOREACH(tmp, &hhh->hhh_hooks, hhk_next) { if (tmp->hhk_func == hki->hook_func && tmp->hhk_udata == hki->hook_udata) { STAILQ_REMOVE(&hhh->hhh_hooks, tmp, hhook, hhk_next); free(tmp, M_HHOOK); hhh->hhh_nhooks--; break; } } HHH_WUNLOCK(hhh); return (0); } /* * Remove a helper hook function from a helper hook point (including all * virtual instances of the hook point if it is virtualised). */ int hhook_remove_hook_lookup(struct hookinfo *hki) { struct hhook_head *hhh; HHHLIST_LOCK(); LIST_FOREACH(hhh, &hhook_head_list, hhh_next) { if (hhh->hhh_type == hki->hook_type && hhh->hhh_id == hki->hook_id) hhook_remove_hook(hhh, hki); } HHHLIST_UNLOCK(); return (0); } /* * Register a new helper hook point. */ int hhook_head_register(int32_t hhook_type, int32_t hhook_id, struct hhook_head **hhh, uint32_t flags) { struct hhook_head *tmphhh; tmphhh = hhook_head_get(hhook_type, hhook_id); if (tmphhh != NULL) { /* Hook point previously registered. */ hhook_head_release(tmphhh); return (EEXIST); } tmphhh = malloc(sizeof(struct hhook_head), M_HHOOK, M_ZERO | ((flags & HHOOK_WAITOK) ? M_WAITOK : M_NOWAIT)); if (tmphhh == NULL) return (ENOMEM); tmphhh->hhh_type = hhook_type; tmphhh->hhh_id = hhook_id; tmphhh->hhh_nhooks = 0; STAILQ_INIT(&tmphhh->hhh_hooks); HHH_LOCK_INIT(tmphhh); refcount_init(&tmphhh->hhh_refcount, 1); HHHLIST_LOCK(); if (flags & HHOOK_HEADISINVNET) { tmphhh->hhh_flags |= HHH_ISINVNET; #ifdef VIMAGE KASSERT(curvnet != NULL, ("curvnet is NULL")); tmphhh->hhh_vid = (uintptr_t)curvnet; LIST_INSERT_HEAD(&V_hhook_vhead_list, tmphhh, hhh_vnext); #endif } LIST_INSERT_HEAD(&hhook_head_list, tmphhh, hhh_next); n_hhookheads++; HHHLIST_UNLOCK(); khelp_new_hhook_registered(tmphhh, flags); if (hhh != NULL) *hhh = tmphhh; else refcount_release(&tmphhh->hhh_refcount); return (0); } static void hhook_head_destroy(struct hhook_head *hhh) { struct hhook *tmp, *tmp2; HHHLIST_LOCK_ASSERT(); KASSERT(n_hhookheads > 0, ("n_hhookheads should be > 0")); LIST_REMOVE(hhh, hhh_next); #ifdef VIMAGE if (hhook_head_is_virtualised(hhh) == HHOOK_HEADISINVNET) LIST_REMOVE(hhh, hhh_vnext); #endif HHH_WLOCK(hhh); STAILQ_FOREACH_SAFE(tmp, &hhh->hhh_hooks, hhk_next, tmp2) free(tmp, M_HHOOK); HHH_WUNLOCK(hhh); HHH_LOCK_DESTROY(hhh); free(hhh, M_HHOOK); n_hhookheads--; } /* * Remove a helper hook point. */ int hhook_head_deregister(struct hhook_head *hhh) { int error; error = 0; HHHLIST_LOCK(); if (hhh == NULL) error = ENOENT; else if (hhh->hhh_refcount > 1) error = EBUSY; else hhook_head_destroy(hhh); HHHLIST_UNLOCK(); return (error); } /* * Remove a helper hook point via a hhook_head lookup. */ int hhook_head_deregister_lookup(int32_t hhook_type, int32_t hhook_id) { struct hhook_head *hhh; int error; hhh = hhook_head_get(hhook_type, hhook_id); error = hhook_head_deregister(hhh); if (error == EBUSY) hhook_head_release(hhh); return (error); } /* * Lookup and return the hhook_head struct associated with the specified type * and id, or NULL if not found. If found, the hhook_head's refcount is bumped. 
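The lookup that follows pins its result before the list lock is dropped: the reference is taken while the entry is still guaranteed alive, which is the standard way to hand a pointer out of a locked traversal. A userland sketch of the idiom (illustrative names):

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct head {
	int	type, id;
	atomic_int refs;
	struct head *next;
};

static pthread_mutex_t head_list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct head *head_list;

struct head *
head_get(int type, int id)
{
	struct head *h;

	pthread_mutex_lock(&head_list_lock);
	for (h = head_list; h != NULL; h = h->next) {
		if (h->type == type && h->id == id) {
			atomic_fetch_add(&h->refs, 1);	/* pin before unlock */
			break;
		}
	}
	pthread_mutex_unlock(&head_list_lock);
	return (h);		/* NULL if not found */
}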
*/ struct hhook_head * hhook_head_get(int32_t hhook_type, int32_t hhook_id) { struct hhook_head *hhh; HHHLIST_LOCK(); LIST_FOREACH(hhh, &hhook_head_list, hhh_next) { if (hhh->hhh_type == hhook_type && hhh->hhh_id == hhook_id) { #ifdef VIMAGE if (hhook_head_is_virtualised(hhh) == HHOOK_HEADISINVNET) { KASSERT(curvnet != NULL, ("curvnet is NULL")); if (hhh->hhh_vid != (uintptr_t)curvnet) continue; } #endif refcount_acquire(&hhh->hhh_refcount); break; } } HHHLIST_UNLOCK(); return (hhh); } void hhook_head_release(struct hhook_head *hhh) { refcount_release(&hhh->hhh_refcount); } /* * Check the hhook_head private flags and return the appropriate public * representation of the flag to the caller. The function is implemented in a * way that allows us to cope with other subsystems becoming virtualised in the * future. */ uint32_t hhook_head_is_virtualised(struct hhook_head *hhh) { uint32_t ret; ret = 0; if (hhh != NULL) { if (hhh->hhh_flags & HHH_ISINVNET) ret = HHOOK_HEADISINVNET; } return (ret); } uint32_t hhook_head_is_virtualised_lookup(int32_t hook_type, int32_t hook_id) { struct hhook_head *hhh; uint32_t ret; hhh = hhook_head_get(hook_type, hook_id); if (hhh == NULL) return (0); ret = hhook_head_is_virtualised(hhh); hhook_head_release(hhh); return (ret); } /* * Vnet created and being initialised. */ static void hhook_vnet_init(const void *unused __unused) { LIST_INIT(&V_hhook_vhead_list); } /* * Vnet being torn down and destroyed. */ static void hhook_vnet_uninit(const void *unused __unused) { struct hhook_head *hhh, *tmphhh; /* * If subsystems which export helper hook points use the hhook KPI * correctly, the loop below should have no work to do because the * subsystem should have already called hhook_head_deregister(). */ HHHLIST_LOCK(); LIST_FOREACH_SAFE(hhh, &V_hhook_vhead_list, hhh_vnext, tmphhh) { printf("%s: hhook_head type=%d, id=%d cleanup required\n", __func__, hhh->hhh_type, hhh->hhh_id); hhook_head_destroy(hhh); } HHHLIST_UNLOCK(); } /* * When a vnet is created and being initialised, init the V_hhook_vhead_list. */ VNET_SYSINIT(hhook_vnet_init, SI_SUB_INIT_IF, SI_ORDER_FIRST, hhook_vnet_init, NULL); /* * The hhook KPI provides a mechanism for subsystems which export helper hook * points to clean up on vnet tear down, but in case the KPI is misused, * provide a function to clean up and free memory for a vnet being destroyed. */ VNET_SYSUNINIT(hhook_vnet_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, hhook_vnet_uninit, NULL); Index: head/sys/kern/kern_idle.c =================================================================== --- head/sys/kern/kern_idle.c (revision 326270) +++ head/sys/kern/kern_idle.c (revision 326271) @@ -1,86 +1,88 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (C) 2000-2004 The FreeBSD Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif static void idle_setup(void *dummy); SYSINIT(idle_setup, SI_SUB_SCHED_IDLE, SI_ORDER_FIRST, idle_setup, NULL); /* * Set up per-cpu idle process contexts. The APs shouldn't be running or * accessing their idle processes at this point, so don't bother with * locking. */ static void idle_setup(void *dummy) { #ifdef SMP struct pcpu *pc; #endif struct proc *p; struct thread *td; int error; p = NULL; /* start with no idle process */ #ifdef SMP STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { #endif #ifdef SMP error = kproc_kthread_add(sched_idletd, NULL, &p, &td, RFSTOPPED | RFHIGHPID, 0, "idle", "idle: cpu%d", pc->pc_cpuid); pc->pc_idlethread = td; #else error = kproc_kthread_add(sched_idletd, NULL, &p, &td, RFSTOPPED | RFHIGHPID, 0, "idle", "idle"); PCPU_SET(idlethread, td); #endif if (error) panic("idle_setup: kproc_create error %d\n", error); thread_lock(td); TD_SET_CAN_RUN(td); td->td_flags |= TDF_IDLETD | TDF_NOLOAD; sched_class(td, PRI_IDLE); sched_prio(td, PRI_MAX_IDLE); thread_unlock(td); #ifdef SMP } #endif } Index: head/sys/kern/kern_intr.c =================================================================== --- head/sys/kern/kern_intr.c (revision 326270) +++ head/sys/kern/kern_intr.c (revision 326271) @@ -1,2008 +1,2010 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1997, Stefan Esser * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kstack_usage_prof.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif /* * Describe an interrupt thread. There is one of these per interrupt event. */ struct intr_thread { struct intr_event *it_event; struct thread *it_thread; /* Kernel thread. */ int it_flags; /* (j) IT_* flags. */ int it_need; /* Needs service. */ }; /* Interrupt thread flags kept in it_flags */ #define IT_DEAD 0x000001 /* Thread is waiting to exit. */ #define IT_WAIT 0x000002 /* Thread is waiting for completion. */ struct intr_entropy { struct thread *td; uintptr_t event; }; struct intr_event *clk_intr_event; struct intr_event *tty_intr_event; void *vm_ih; struct proc *intrproc; static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads"); static int intr_storm_threshold = 1000; SYSCTL_INT(_hw, OID_AUTO, intr_storm_threshold, CTLFLAG_RWTUN, &intr_storm_threshold, 0, "Number of consecutive interrupts before storm protection is enabled"); static TAILQ_HEAD(, intr_event) event_list = TAILQ_HEAD_INITIALIZER(event_list); static struct mtx event_lock; MTX_SYSINIT(intr_event_list, &event_lock, "intr event list", MTX_DEF); static void intr_event_update(struct intr_event *ie); #ifdef INTR_FILTER static int intr_event_schedule_thread(struct intr_event *ie, struct intr_thread *ithd); static int intr_filter_loop(struct intr_event *ie, struct trapframe *frame, struct intr_thread **ithd); static struct intr_thread *ithread_create(const char *name, struct intr_handler *ih); #else static int intr_event_schedule_thread(struct intr_event *ie); static struct intr_thread *ithread_create(const char *name); #endif static void ithread_destroy(struct intr_thread *ithread); static void ithread_execute_handlers(struct proc *p, struct intr_event *ie); #ifdef INTR_FILTER static void priv_ithread_execute_handler(struct proc *p, struct intr_handler *ih); #endif static void ithread_loop(void *); static void ithread_update(struct intr_thread *ithd); static void start_softintr(void *); /* Map an interrupt type to an ithread priority. */ u_char intr_priority(enum intr_type flags) { u_char pri; flags &= (INTR_TYPE_TTY | INTR_TYPE_BIO | INTR_TYPE_NET | INTR_TYPE_CAM | INTR_TYPE_MISC | INTR_TYPE_CLK | INTR_TYPE_AV); switch (flags) { case INTR_TYPE_TTY: pri = PI_TTY; break; case INTR_TYPE_BIO: pri = PI_DISK; break; case INTR_TYPE_NET: pri = PI_NET; break; case INTR_TYPE_CAM: pri = PI_DISK; break; case INTR_TYPE_AV: pri = PI_AV; break; case INTR_TYPE_CLK: pri = PI_REALTIME; break; case INTR_TYPE_MISC: pri = PI_DULL; /* don't care */ break; default: /* We didn't specify an interrupt level. */ panic("intr_priority: no interrupt type in flags"); } return pri; } /* * Update an ithread based on the associated intr_event. */ static void ithread_update(struct intr_thread *ithd) { struct intr_event *ie; struct thread *td; u_char pri; ie = ithd->it_event; td = ithd->it_thread; /* Determine the overall priority of this event. */ if (TAILQ_EMPTY(&ie->ie_handlers)) pri = PRI_MAX_ITHD; else pri = TAILQ_FIRST(&ie->ie_handlers)->ih_pri; /* Update name and priority. 
*/ strlcpy(td->td_name, ie->ie_fullname, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif thread_lock(td); sched_prio(td, pri); thread_unlock(td); } /* * Regenerate the full name of an interrupt event and update its priority. */ static void intr_event_update(struct intr_event *ie) { struct intr_handler *ih; char *last; int missed, space; /* Start off with no entropy and just the name of the event. */ mtx_assert(&ie->ie_lock, MA_OWNED); strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname)); ie->ie_flags &= ~IE_ENTROPY; missed = 0; space = 1; /* Run through all the handlers updating values. */ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { if (strlen(ie->ie_fullname) + strlen(ih->ih_name) + 1 < sizeof(ie->ie_fullname)) { strcat(ie->ie_fullname, " "); strcat(ie->ie_fullname, ih->ih_name); space = 0; } else missed++; if (ih->ih_flags & IH_ENTROPY) ie->ie_flags |= IE_ENTROPY; } /* * If the handler names were too long, add +'s to indicate missing * names. If we run out of room and still have +'s to add, change * the last character from a + to a *. */ last = &ie->ie_fullname[sizeof(ie->ie_fullname) - 2]; while (missed-- > 0) { if (strlen(ie->ie_fullname) + 1 == sizeof(ie->ie_fullname)) { if (*last == '+') { *last = '*'; break; } else *last = '+'; } else if (space) { strcat(ie->ie_fullname, " +"); space = 0; } else strcat(ie->ie_fullname, "+"); } /* * If this event has an ithread, update its priority and * name. */ if (ie->ie_thread != NULL) ithread_update(ie->ie_thread); CTR2(KTR_INTR, "%s: updated %s", __func__, ie->ie_fullname); } int intr_event_create(struct intr_event **event, void *source, int flags, int irq, void (*pre_ithread)(void *), void (*post_ithread)(void *), void (*post_filter)(void *), int (*assign_cpu)(void *, int), const char *fmt, ...) { struct intr_event *ie; va_list ap; /* The only valid flag during creation is IE_SOFT. */ if ((flags & ~IE_SOFT) != 0) return (EINVAL); ie = malloc(sizeof(struct intr_event), M_ITHREAD, M_WAITOK | M_ZERO); ie->ie_source = source; ie->ie_pre_ithread = pre_ithread; ie->ie_post_ithread = post_ithread; ie->ie_post_filter = post_filter; ie->ie_assign_cpu = assign_cpu; ie->ie_flags = flags; ie->ie_irq = irq; ie->ie_cpu = NOCPU; TAILQ_INIT(&ie->ie_handlers); mtx_init(&ie->ie_lock, "intr event", NULL, MTX_DEF); va_start(ap, fmt); vsnprintf(ie->ie_name, sizeof(ie->ie_name), fmt, ap); va_end(ap); strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname)); mtx_lock(&event_lock); TAILQ_INSERT_TAIL(&event_list, ie, ie_list); mtx_unlock(&event_lock); if (event != NULL) *event = ie; CTR2(KTR_INTR, "%s: created %s", __func__, ie->ie_name); return (0); } /* * Bind an interrupt event to the specified CPU. Note that not all * platforms support binding an interrupt to a CPU. For those * platforms this request will fail. Using a cpu id of NOCPU unbinds * the interrupt event. */ static int _intr_event_bind(struct intr_event *ie, int cpu, bool bindirq, bool bindithread) { lwpid_t id; int error; /* Need a CPU to bind to. */ if (cpu != NOCPU && CPU_ABSENT(cpu)) return (EINVAL); if (ie->ie_assign_cpu == NULL) return (EOPNOTSUPP); error = priv_check(curthread, PRIV_SCHED_CPUSET_INTR); if (error) return (error); /* * If we have any ithreads try to set their mask first to verify * permissions, etc.
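The '+'/'*' convention in intr_event_update() above is compact enough to restate: handler names are appended while they fit, each name that does not fit contributes one '+', and when even the markers reach the end of the buffer the last one degrades to '*'. A self-contained sketch of the same scheme (buffer size is a caller-supplied assumption, len >= 2; bookkeeping simplified):

#include <string.h>

void
build_fullname(char *buf, size_t len, const char **names, int n)
{
	int i, missed = 0;

	buf[0] = '\0';
	for (i = 0; i < n; i++) {
		/* +1 for the separating space; '<' leaves room for the NUL. */
		if (strlen(buf) + strlen(names[i]) + 1 < len) {
			strcat(buf, " ");
			strcat(buf, names[i]);
		} else
			missed++;
	}
	while (missed-- > 0) {
		if (strlen(buf) + 1 >= len) {	/* full: degrade '+' to '*' */
			buf[len - 2] = '*';
			break;
		}
		strcat(buf, "+");
	}
}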
*/ if (bindithread) { mtx_lock(&ie->ie_lock); if (ie->ie_thread != NULL) { id = ie->ie_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); error = cpuset_setithread(id, cpu); if (error) return (error); } else mtx_unlock(&ie->ie_lock); } if (bindirq) error = ie->ie_assign_cpu(ie->ie_source, cpu); if (error) { if (bindithread) { mtx_lock(&ie->ie_lock); if (ie->ie_thread != NULL) { cpu = ie->ie_cpu; id = ie->ie_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); (void)cpuset_setithread(id, cpu); } else mtx_unlock(&ie->ie_lock); } return (error); } if (bindirq) { mtx_lock(&ie->ie_lock); ie->ie_cpu = cpu; mtx_unlock(&ie->ie_lock); } return (error); } /* * Bind an interrupt event to the specified CPU. For supported platforms, any * associated ithreads as well as the primary interrupt context will be bound * to the specified CPU. */ int intr_event_bind(struct intr_event *ie, int cpu) { return (_intr_event_bind(ie, cpu, true, true)); } /* * Bind an interrupt event to the specified CPU, but do not bind associated * ithreads. */ int intr_event_bind_irqonly(struct intr_event *ie, int cpu) { return (_intr_event_bind(ie, cpu, true, false)); } /* * Bind an interrupt event's ithread to the specified CPU. */ int intr_event_bind_ithread(struct intr_event *ie, int cpu) { return (_intr_event_bind(ie, cpu, false, true)); } static struct intr_event * intr_lookup(int irq) { struct intr_event *ie; mtx_lock(&event_lock); TAILQ_FOREACH(ie, &event_list, ie_list) if (ie->ie_irq == irq && (ie->ie_flags & IE_SOFT) == 0 && TAILQ_FIRST(&ie->ie_handlers) != NULL) break; mtx_unlock(&event_lock); return (ie); } int intr_setaffinity(int irq, int mode, void *m) { struct intr_event *ie; cpuset_t *mask; int cpu, n; mask = m; cpu = NOCPU; /* * If we're setting all cpus we can unbind. Otherwise make sure * only one cpu is in the set.
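_intr_event_bind() above applies the change in two stages, ithread cpuset first (so permission problems surface before any hardware is touched), then the MD interrupt routing, and undoes the first stage if the second fails. That rollback-on-partial-failure skeleton, in isolation (names illustrative):

/* Apply two dependent settings; undo the first if the second fails. */
int
bind_two_stage(int newcpu, int *thread_cpu, int (*hw_bind)(int))
{
	int oldcpu = *thread_cpu;

	*thread_cpu = newcpu;		/* stage 1: thread affinity */
	if (hw_bind(newcpu) != 0) {	/* stage 2: interrupt routing */
		*thread_cpu = oldcpu;	/* roll back stage 1 */
		return (-1);
	}
	return (0);
}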
if (CPU_CMP(cpuset_root, mask)) { for (n = 0; n < CPU_SETSIZE; n++) { if (!CPU_ISSET(n, mask)) continue; if (cpu != NOCPU) return (EINVAL); cpu = n; } } ie = intr_lookup(irq); if (ie == NULL) return (ESRCH); switch (mode) { case CPU_WHICH_IRQ: return (intr_event_bind(ie, cpu)); case CPU_WHICH_INTRHANDLER: return (intr_event_bind_irqonly(ie, cpu)); case CPU_WHICH_ITHREAD: return (intr_event_bind_ithread(ie, cpu)); default: return (EINVAL); } } int intr_getaffinity(int irq, int mode, void *m) { struct intr_event *ie; struct thread *td; struct proc *p; cpuset_t *mask; lwpid_t id; int error; mask = m; ie = intr_lookup(irq); if (ie == NULL) return (ESRCH); error = 0; CPU_ZERO(mask); switch (mode) { case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: mtx_lock(&ie->ie_lock); if (ie->ie_cpu == NOCPU) CPU_COPY(cpuset_root, mask); else CPU_SET(ie->ie_cpu, mask); mtx_unlock(&ie->ie_lock); break; case CPU_WHICH_ITHREAD: mtx_lock(&ie->ie_lock); if (ie->ie_thread == NULL) { mtx_unlock(&ie->ie_lock); CPU_COPY(cpuset_root, mask); } else { id = ie->ie_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); error = cpuset_which(CPU_WHICH_TID, id, &p, &td, NULL); if (error != 0) return (error); CPU_COPY(&td->td_cpuset->cs_mask, mask); PROC_UNLOCK(p); } break; default: return (EINVAL); } return (0); } int intr_event_destroy(struct intr_event *ie) { mtx_lock(&event_lock); mtx_lock(&ie->ie_lock); if (!TAILQ_EMPTY(&ie->ie_handlers)) { mtx_unlock(&ie->ie_lock); mtx_unlock(&event_lock); return (EBUSY); } TAILQ_REMOVE(&event_list, ie, ie_list); #ifndef notyet if (ie->ie_thread != NULL) { ithread_destroy(ie->ie_thread); ie->ie_thread = NULL; } #endif mtx_unlock(&ie->ie_lock); mtx_unlock(&event_lock); mtx_destroy(&ie->ie_lock); free(ie, M_ITHREAD); return (0); } #ifndef INTR_FILTER static struct intr_thread * ithread_create(const char *name) { struct intr_thread *ithd; struct thread *td; int error; ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO); error = kproc_kthread_add(ithread_loop, ithd, &intrproc, &td, RFSTOPPED | RFHIGHPID, 0, "intr", "%s", name); if (error) panic("kproc_create() failed with %d", error); thread_lock(td); sched_class(td, PRI_ITHD); TD_SET_IWAIT(td); thread_unlock(td); td->td_pflags |= TDP_ITHREAD; ithd->it_thread = td; CTR2(KTR_INTR, "%s: created %s", __func__, name); return (ithd); } #else static struct intr_thread * ithread_create(const char *name, struct intr_handler *ih) { struct intr_thread *ithd; struct thread *td; int error; ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO); error = kproc_kthread_add(ithread_loop, ih, &intrproc, &td, RFSTOPPED | RFHIGHPID, 0, "intr", "%s", name); if (error) panic("kproc_create() failed with %d", error); thread_lock(td); sched_class(td, PRI_ITHD); TD_SET_IWAIT(td); thread_unlock(td); td->td_pflags |= TDP_ITHREAD; ithd->it_thread = td; CTR2(KTR_INTR, "%s: created %s", __func__, name); return (ithd); } #endif static void ithread_destroy(struct intr_thread *ithread) { struct thread *td; CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_event->ie_name); td = ithread->it_thread; thread_lock(td); ithread->it_flags |= IT_DEAD; if (TD_AWAITING_INTR(td)) { TD_CLR_IWAIT(td); sched_add(td, SRQ_INTR); } thread_unlock(td); } #ifndef INTR_FILTER int intr_event_add_handler(struct intr_event *ie, const char *name, driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri, enum intr_type flags, void **cookiep) { struct intr_handler *ih, *temp_ih; struct intr_thread *it; if (ie == NULL || name == NULL || (handler == NULL &&
filter == NULL)) return (EINVAL); /* Allocate and populate an interrupt handler structure. */ ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO); ih->ih_filter = filter; ih->ih_handler = handler; ih->ih_argument = arg; strlcpy(ih->ih_name, name, sizeof(ih->ih_name)); ih->ih_event = ie; ih->ih_pri = pri; if (flags & INTR_EXCL) ih->ih_flags = IH_EXCLUSIVE; if (flags & INTR_MPSAFE) ih->ih_flags |= IH_MPSAFE; if (flags & INTR_ENTROPY) ih->ih_flags |= IH_ENTROPY; /* We can only have one exclusive handler in an event. */ mtx_lock(&ie->ie_lock); if (!TAILQ_EMPTY(&ie->ie_handlers)) { if ((flags & INTR_EXCL) || (TAILQ_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) { mtx_unlock(&ie->ie_lock); free(ih, M_ITHREAD); return (EINVAL); } } /* Create a thread if we need one. */ while (ie->ie_thread == NULL && handler != NULL) { if (ie->ie_flags & IE_ADDING_THREAD) msleep(ie, &ie->ie_lock, 0, "ithread", 0); else { ie->ie_flags |= IE_ADDING_THREAD; mtx_unlock(&ie->ie_lock); it = ithread_create("intr: newborn"); mtx_lock(&ie->ie_lock); ie->ie_flags &= ~IE_ADDING_THREAD; ie->ie_thread = it; it->it_event = ie; ithread_update(it); wakeup(ie); } } /* Add the new handler to the event in priority order. */ TAILQ_FOREACH(temp_ih, &ie->ie_handlers, ih_next) { if (temp_ih->ih_pri > ih->ih_pri) break; } if (temp_ih == NULL) TAILQ_INSERT_TAIL(&ie->ie_handlers, ih, ih_next); else TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next); intr_event_update(ie); CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name, ie->ie_name); mtx_unlock(&ie->ie_lock); if (cookiep != NULL) *cookiep = ih; return (0); } #else int intr_event_add_handler(struct intr_event *ie, const char *name, driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri, enum intr_type flags, void **cookiep) { struct intr_handler *ih, *temp_ih; struct intr_thread *it; if (ie == NULL || name == NULL || (handler == NULL && filter == NULL)) return (EINVAL); /* Allocate and populate an interrupt handler structure. */ ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO); ih->ih_filter = filter; ih->ih_handler = handler; ih->ih_argument = arg; strlcpy(ih->ih_name, name, sizeof(ih->ih_name)); ih->ih_event = ie; ih->ih_pri = pri; if (flags & INTR_EXCL) ih->ih_flags = IH_EXCLUSIVE; if (flags & INTR_MPSAFE) ih->ih_flags |= IH_MPSAFE; if (flags & INTR_ENTROPY) ih->ih_flags |= IH_ENTROPY; /* We can only have one exclusive handler in an event. */ mtx_lock(&ie->ie_lock); if (!TAILQ_EMPTY(&ie->ie_handlers)) { if ((flags & INTR_EXCL) || (TAILQ_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) { mtx_unlock(&ie->ie_lock); free(ih, M_ITHREAD); return (EINVAL); } } /* For filtered handlers, create a private ithread to run on. */ if (filter != NULL && handler != NULL) { mtx_unlock(&ie->ie_lock); it = ithread_create("intr: newborn", ih); mtx_lock(&ie->ie_lock); it->it_event = ie; ih->ih_thread = it; ithread_update(it); /* XXX - do we really need this?!?!? */ } else { /* Create the global per-event thread if we need one. */ while (ie->ie_thread == NULL && handler != NULL) { if (ie->ie_flags & IE_ADDING_THREAD) msleep(ie, &ie->ie_lock, 0, "ithread", 0); else { ie->ie_flags |= IE_ADDING_THREAD; mtx_unlock(&ie->ie_lock); it = ithread_create("intr: newborn", ih); mtx_lock(&ie->ie_lock); ie->ie_flags &= ~IE_ADDING_THREAD; ie->ie_thread = it; it->it_event = ie; ithread_update(it); wakeup(ie); } } } /* Add the new handler to the event in priority order.
*/ TAILQ_FOREACH(temp_ih, &ie->ie_handlers, ih_next) { if (temp_ih->ih_pri > ih->ih_pri) break; } if (temp_ih == NULL) TAILQ_INSERT_TAIL(&ie->ie_handlers, ih, ih_next); else TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next); intr_event_update(ie); CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name, ie->ie_name); mtx_unlock(&ie->ie_lock); if (cookiep != NULL) *cookiep = ih; return (0); } #endif /* * Append a description preceded by a ':' to the name of the specified * interrupt handler. */ int intr_event_describe_handler(struct intr_event *ie, void *cookie, const char *descr) { struct intr_handler *ih; size_t space; char *start; mtx_lock(&ie->ie_lock); #ifdef INVARIANTS TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { if (ih == cookie) break; } if (ih == NULL) { mtx_unlock(&ie->ie_lock); panic("handler %p not found in interrupt event %p", cookie, ie); } #endif ih = cookie; /* * Look for an existing description by checking for an * existing ":". This assumes device names do not include * colons. If one is found, prepare to insert the new * description at that point. If one is not found, find the * end of the name to use as the insertion point. */ start = strchr(ih->ih_name, ':'); if (start == NULL) start = strchr(ih->ih_name, 0); /* * See if there is enough remaining room in the string for the * description + ":". The "- 1" leaves room for the trailing * '\0'. The "+ 1" accounts for the colon. */ space = sizeof(ih->ih_name) - (start - ih->ih_name) - 1; if (strlen(descr) + 1 > space) { mtx_unlock(&ie->ie_lock); return (ENOSPC); } /* Append a colon followed by the description. */ *start = ':'; strcpy(start + 1, descr); intr_event_update(ie); mtx_unlock(&ie->ie_lock); return (0); } /* * Return the ie_source field from the intr_event an intr_handler is * associated with. */ void * intr_handler_source(void *cookie) { struct intr_handler *ih; struct intr_event *ie; ih = (struct intr_handler *)cookie; if (ih == NULL) return (NULL); ie = ih->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", ih->ih_name)); return (ie->ie_source); } /* * Sleep until an ithread finishes executing an interrupt handler. * * XXX Doesn't currently handle interrupt filters or fast interrupt * handlers. This is intended for compatibility with linux drivers * only. Do not use in BSD code. */ void _intr_drain(int irq) { struct intr_event *ie; struct intr_thread *ithd; struct thread *td; ie = intr_lookup(irq); if (ie == NULL) return; if (ie->ie_thread == NULL) return; ithd = ie->ie_thread; td = ithd->it_thread; /* * We set the flag and wait for it to be cleared to avoid * long delays with potentially busy interrupt handlers * were we to only sample TD_AWAITING_INTR() every tick. 
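The insertion loop at the top of this block keeps ie_handlers sorted by ih_pri, walking to the first entry with a strictly larger value so that handlers of equal priority stay in registration order. The same walk on a plain singly linked list (illustrative types, not the TAILQ macros):

#include <stddef.h>

struct handler {
	int	pri;			/* lower value runs earlier */
	struct handler *next;
};

/* Insert nh after existing entries of equal priority, before larger ones. */
void
insert_by_pri(struct handler **head, struct handler *nh)
{
	struct handler **pp;

	for (pp = head; *pp != NULL && (*pp)->pri <= nh->pri; )
		pp = &(*pp)->next;
	nh->next = *pp;
	*pp = nh;
}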
*/ thread_lock(td); if (!TD_AWAITING_INTR(td)) { ithd->it_flags |= IT_WAIT; while (ithd->it_flags & IT_WAIT) { thread_unlock(td); pause("idrain", 1); thread_lock(td); } } thread_unlock(td); return; } #ifndef INTR_FILTER int intr_event_remove_handler(void *cookie) { struct intr_handler *handler = (struct intr_handler *)cookie; struct intr_event *ie; #ifdef INVARIANTS struct intr_handler *ih; #endif #ifdef notyet int dead; #endif if (handler == NULL) return (EINVAL); ie = handler->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", handler->ih_name)); mtx_lock(&ie->ie_lock); CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name, ie->ie_name); #ifdef INVARIANTS TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) if (ih == handler) goto ok; mtx_unlock(&ie->ie_lock); panic("interrupt handler \"%s\" not found in interrupt event \"%s\"", ih->ih_name, ie->ie_name); ok: #endif /* * If there is no ithread, then just remove the handler and return. * XXX: Note that an INTR_FAST handler might be running on another * CPU! */ if (ie->ie_thread == NULL) { TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); mtx_unlock(&ie->ie_lock); free(handler, M_ITHREAD); return (0); } /* * If the interrupt thread is already running, then just mark this * handler as being dead and let the ithread do the actual removal. * * During a cold boot while cold is set, msleep() does not sleep, * so we have to remove the handler here rather than letting the * thread do it. */ thread_lock(ie->ie_thread->it_thread); if (!TD_AWAITING_INTR(ie->ie_thread->it_thread) && !cold) { handler->ih_flags |= IH_DEAD; /* * Ensure that the thread will process the handler list * again and remove this handler if it has already passed * it on the list. * * The release part of the following store ensures * that the update of ih_flags is ordered before the * it_need setting. See the comment before * atomic_cmpset_acq(&ithd->it_need, ...) operation in * the ithread_execute_handlers(). */ atomic_store_rel_int(&ie->ie_thread->it_need, 1); } else TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); thread_unlock(ie->ie_thread->it_thread); while (handler->ih_flags & IH_DEAD) msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0); intr_event_update(ie); #ifdef notyet /* * XXX: This could be bad in the case of ppbus(8). Also, I think * this could lead to races of stale data when servicing an * interrupt. */ dead = 1; TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { if (!(ih->ih_flags & IH_FAST)) { dead = 0; break; } } if (dead) { ithread_destroy(ie->ie_thread); ie->ie_thread = NULL; } #endif mtx_unlock(&ie->ie_lock); free(handler, M_ITHREAD); return (0); } static int intr_event_schedule_thread(struct intr_event *ie) { struct intr_entropy entropy; struct intr_thread *it; struct thread *td; struct thread *ctd; struct proc *p; /* * If no ithread or no handlers, then we have a stray interrupt. */ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers) || ie->ie_thread == NULL) return (EINVAL); ctd = curthread; it = ie->ie_thread; td = it->it_thread; p = td->td_proc; /* * If any of the handlers for this ithread claim to be good * sources of entropy, then gather some. */ if (ie->ie_flags & IE_ENTROPY) { entropy.event = (uintptr_t)ie; entropy.td = ctd; random_harvest_queue(&entropy, sizeof(entropy), 2, RANDOM_INTERRUPT); } KASSERT(p != NULL, ("ithread %s has no process", ie->ie_name)); /* * Set it_need to tell the thread to keep running if it is already * running. Then, lock the thread and see if we actually need to * put it on the runqueue. 
* * Use store_rel to arrange that the store to ih_need in * swi_sched() is before the store to it_need and prepare for * transfer of this order to loads in the ithread. */ atomic_store_rel_int(&it->it_need, 1); thread_lock(td); if (TD_AWAITING_INTR(td)) { CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid, td->td_name); TD_CLR_IWAIT(td); sched_add(td, SRQ_INTR); } else { CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d", __func__, p->p_pid, td->td_name, it->it_need, td->td_state); } thread_unlock(td); return (0); } #else int intr_event_remove_handler(void *cookie) { struct intr_handler *handler = (struct intr_handler *)cookie; struct intr_event *ie; struct intr_thread *it; #ifdef INVARIANTS struct intr_handler *ih; #endif #ifdef notyet int dead; #endif if (handler == NULL) return (EINVAL); ie = handler->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", handler->ih_name)); mtx_lock(&ie->ie_lock); CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name, ie->ie_name); #ifdef INVARIANTS TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) if (ih == handler) goto ok; mtx_unlock(&ie->ie_lock); panic("interrupt handler \"%s\" not found in interrupt event \"%s\"", ih->ih_name, ie->ie_name); ok: #endif /* * If there are no ithreads (per event and per handler), then * just remove the handler and return. * XXX: Note that an INTR_FAST handler might be running on another CPU! */ if (ie->ie_thread == NULL && handler->ih_thread == NULL) { TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); mtx_unlock(&ie->ie_lock); free(handler, M_ITHREAD); return (0); } /* Private or global ithread? */ it = (handler->ih_thread) ? handler->ih_thread : ie->ie_thread; /* * If the interrupt thread is already running, then just mark this * handler as being dead and let the ithread do the actual removal. * * During a cold boot while cold is set, msleep() does not sleep, * so we have to remove the handler here rather than letting the * thread do it. */ thread_lock(it->it_thread); if (!TD_AWAITING_INTR(it->it_thread) && !cold) { handler->ih_flags |= IH_DEAD; /* * Ensure that the thread will process the handler list * again and remove this handler if it has already passed * it on the list. * * The release part of the following store ensures * that the update of ih_flags is ordered before the * it_need setting. See the comment before * atomic_cmpset_acq(&ithd->it_need, ...) operation in * the ithread_execute_handlers(). */ atomic_store_rel_int(&it->it_need, 1); } else TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); thread_unlock(it->it_thread); while (handler->ih_flags & IH_DEAD) msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0); /* * At this point, the handler has been disconnected from the event, * so we can kill the private ithread if any. */ if (handler->ih_thread) { ithread_destroy(handler->ih_thread); handler->ih_thread = NULL; } intr_event_update(ie); #ifdef notyet /* * XXX: This could be bad in the case of ppbus(8). Also, I think * this could lead to races of stale data when servicing an * interrupt. 
*/ dead = 1; TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { if (handler != NULL) { dead = 0; break; } } if (dead) { ithread_destroy(ie->ie_thread); ie->ie_thread = NULL; } #endif mtx_unlock(&ie->ie_lock); free(handler, M_ITHREAD); return (0); } static int intr_event_schedule_thread(struct intr_event *ie, struct intr_thread *it) { struct intr_entropy entropy; struct thread *td; struct thread *ctd; struct proc *p; /* * If no ithread or no handlers, then we have a stray interrupt. */ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers) || it == NULL) return (EINVAL); ctd = curthread; td = it->it_thread; p = td->td_proc; /* * If any of the handlers for this ithread claim to be good * sources of entropy, then gather some. */ if (ie->ie_flags & IE_ENTROPY) { entropy.event = (uintptr_t)ie; entropy.td = ctd; random_harvest_queue(&entropy, sizeof(entropy), 2, RANDOM_INTERRUPT); } KASSERT(p != NULL, ("ithread %s has no process", ie->ie_name)); /* * Set it_need to tell the thread to keep running if it is already * running. Then, lock the thread and see if we actually need to * put it on the runqueue. * * Use store_rel to arrange that the store to ih_need in * swi_sched() is before the store to it_need and prepare for * transfer of this order to loads in the ithread. */ atomic_store_rel_int(&it->it_need, 1); thread_lock(td); if (TD_AWAITING_INTR(td)) { CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid, td->td_name); TD_CLR_IWAIT(td); sched_add(td, SRQ_INTR); } else { CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d", __func__, p->p_pid, td->td_name, it->it_need, td->td_state); } thread_unlock(td); return (0); } #endif /* * Allow interrupt event binding for software interrupt handlers -- a no-op, * since interrupts are generated in software rather than being directed by * a PIC. */ static int swi_assign_cpu(void *arg, int cpu) { return (0); } /* * Add a software interrupt handler to a specified event. If a given event * is not specified, then a new event is created. */ int swi_add(struct intr_event **eventp, const char *name, driver_intr_t handler, void *arg, int pri, enum intr_type flags, void **cookiep) { struct intr_event *ie; int error; if (flags & INTR_ENTROPY) return (EINVAL); ie = (eventp != NULL) ? *eventp : NULL; if (ie != NULL) { if (!(ie->ie_flags & IE_SOFT)) return (EINVAL); } else { error = intr_event_create(&ie, NULL, IE_SOFT, 0, NULL, NULL, NULL, swi_assign_cpu, "swi%d:", pri); if (error) return (error); if (eventp != NULL) *eventp = ie; } error = intr_event_add_handler(ie, name, NULL, handler, arg, PI_SWI(pri), flags, cookiep); return (error); } /* * Schedule a software interrupt thread. */ void swi_sched(void *cookie, int flags) { struct intr_handler *ih = (struct intr_handler *)cookie; struct intr_event *ie = ih->ih_event; struct intr_entropy entropy; int error; CTR3(KTR_INTR, "swi_sched: %s %s need=%d", ie->ie_name, ih->ih_name, ih->ih_need); entropy.event = (uintptr_t)ih; entropy.td = curthread; random_harvest_queue(&entropy, sizeof(entropy), 1, RANDOM_SWI); /* * Set ih_need for this handler so that if the ithread is already * running it will execute this handler on the next pass. Otherwise, * it will execute it the next time it runs. */ ih->ih_need = 1; if (!(flags & SWI_DELAY)) { VM_CNT_INC(v_soft); #ifdef INTR_FILTER error = intr_event_schedule_thread(ie, ie->ie_thread); #else error = intr_event_schedule_thread(ie); #endif KASSERT(error == 0, ("stray software interrupt")); } } /* * Remove a software interrupt handler. 
Currently this code does not * remove the associated interrupt event if it becomes empty. Calling code * may do so manually via intr_event_destroy(), but that's not really * an optimal interface. */ int swi_remove(void *cookie) { return (intr_event_remove_handler(cookie)); } #ifdef INTR_FILTER static void priv_ithread_execute_handler(struct proc *p, struct intr_handler *ih) { struct intr_event *ie; ie = ih->ih_event; /* * If this handler is marked for death, remove it from * the list of handlers and wake up the sleeper. */ if (ih->ih_flags & IH_DEAD) { mtx_lock(&ie->ie_lock); TAILQ_REMOVE(&ie->ie_handlers, ih, ih_next); ih->ih_flags &= ~IH_DEAD; wakeup(ih); mtx_unlock(&ie->ie_lock); return; } /* Execute this handler. */ CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x", __func__, p->p_pid, (void *)ih->ih_handler, ih->ih_argument, ih->ih_name, ih->ih_flags); if (!(ih->ih_flags & IH_MPSAFE)) mtx_lock(&Giant); ih->ih_handler(ih->ih_argument); if (!(ih->ih_flags & IH_MPSAFE)) mtx_unlock(&Giant); } #endif /* * This is a public function for use by drivers that mux interrupt * handlers for child devices from their interrupt handler. */ void intr_event_execute_handlers(struct proc *p, struct intr_event *ie) { struct intr_handler *ih, *ihn; TAILQ_FOREACH_SAFE(ih, &ie->ie_handlers, ih_next, ihn) { /* * If this handler is marked for death, remove it from * the list of handlers and wake up the sleeper. */ if (ih->ih_flags & IH_DEAD) { mtx_lock(&ie->ie_lock); TAILQ_REMOVE(&ie->ie_handlers, ih, ih_next); ih->ih_flags &= ~IH_DEAD; wakeup(ih); mtx_unlock(&ie->ie_lock); continue; } /* Skip filter only handlers */ if (ih->ih_handler == NULL) continue; /* * For software interrupt threads, we only execute * handlers that have their need flag set. Hardware * interrupt threads always invoke all of their handlers. * * ih_need can only be 0 or 1. Failed cmpset below * means that there is no request to execute handlers, * so a retry of the cmpset is not needed. */ if ((ie->ie_flags & IE_SOFT) != 0 && atomic_cmpset_int(&ih->ih_need, 1, 0) == 0) continue; /* Execute this handler. */ CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x", __func__, p->p_pid, (void *)ih->ih_handler, ih->ih_argument, ih->ih_name, ih->ih_flags); if (!(ih->ih_flags & IH_MPSAFE)) mtx_lock(&Giant); ih->ih_handler(ih->ih_argument); if (!(ih->ih_flags & IH_MPSAFE)) mtx_unlock(&Giant); } } static void ithread_execute_handlers(struct proc *p, struct intr_event *ie) { /* Interrupt handlers should not sleep. */ if (!(ie->ie_flags & IE_SOFT)) THREAD_NO_SLEEPING(); intr_event_execute_handlers(p, ie); if (!(ie->ie_flags & IE_SOFT)) THREAD_SLEEPING_OK(); /* * Interrupt storm handling: * * If this interrupt source is currently storming, then throttle * it to only fire the handler once per clock tick. * * If this interrupt source is not currently storming, but the * number of back to back interrupts exceeds the storm threshold, * then enter storming mode. */ if (intr_storm_threshold != 0 && ie->ie_count >= intr_storm_threshold && !(ie->ie_flags & IE_SOFT)) { /* Report the message only once every second. */ if (ppsratecheck(&ie->ie_warntm, &ie->ie_warncnt, 1)) { printf( "interrupt storm detected on \"%s\"; throttling interrupt source\n", ie->ie_name); } pause("istorm", 1); } else ie->ie_count++; /* * Now that all the handlers have had a chance to run, reenable * the interrupt source. */ if (ie->ie_post_ithread != NULL) ie->ie_post_ithread(ie->ie_source); } #ifndef INTR_FILTER /* * This is the main code for interrupt threads. 
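The it_need handshake described in the comments above and consumed in ithread_loop() below maps directly onto C11 release/acquire atomics: the scheduler publishes handler state, then raises the flag with a release store; the ithread claims the flag with an acquire compare-and-swap before running handlers. A userland sketch of just that pairing (names illustrative; the real loop also handles IT_DEAD, IT_WAIT and sleeping):

#include <stdatomic.h>

static atomic_int it_need;	/* "handlers need another pass" */

/* Producer: make handler state visible, then raise the flag (release). */
void
schedule_pass(void)
{

	atomic_store_explicit(&it_need, 1, memory_order_release);
	/* ... wake the service thread if it is sleeping ... */
}

/* Consumer: claim the flag with an acquire CAS; rerun until it stays 0. */
void
service_passes(void)
{
	int one;

	for (;;) {
		one = 1;
		if (!atomic_compare_exchange_strong_explicit(&it_need, &one,
		    0, memory_order_acquire, memory_order_relaxed))
			break;	/* no pending request: back to sleep */
		/* ... run handlers; producer's writes are visible here ... */
	}
}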
*/ static void ithread_loop(void *arg) { struct intr_thread *ithd; struct intr_event *ie; struct thread *td; struct proc *p; int wake; td = curthread; p = td->td_proc; ithd = (struct intr_thread *)arg; KASSERT(ithd->it_thread == td, ("%s: ithread and proc linkage out of sync", __func__)); ie = ithd->it_event; ie->ie_count = 0; wake = 0; /* * As long as we have interrupts outstanding, go through the * list of handlers, giving each one a go at it. */ for (;;) { /* * If we are an orphaned thread, then just die. */ if (ithd->it_flags & IT_DEAD) { CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__, p->p_pid, td->td_name); free(ithd, M_ITHREAD); kthread_exit(); } /* * Service interrupts. If another interrupt arrives while * we are running, it will set it_need to note that we * should make another pass. * * The load_acq part of the following cmpset ensures * that the load of ih_need in ithread_execute_handlers() * is ordered after the load of it_need here. */ while (atomic_cmpset_acq_int(&ithd->it_need, 1, 0) != 0) ithread_execute_handlers(p, ie); WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread"); mtx_assert(&Giant, MA_NOTOWNED); /* * Processed all our interrupts. Now get the sched * lock. This may take a while and it_need may get * set again, so we have to check it again. */ thread_lock(td); if (atomic_load_acq_int(&ithd->it_need) == 0 && (ithd->it_flags & (IT_DEAD | IT_WAIT)) == 0) { TD_SET_IWAIT(td); ie->ie_count = 0; mi_switch(SW_VOL | SWT_IWAIT, NULL); } if (ithd->it_flags & IT_WAIT) { wake = 1; ithd->it_flags &= ~IT_WAIT; } thread_unlock(td); if (wake) { wakeup(ithd); wake = 0; } } } /* * Main interrupt handling body. * * Input: * o ie: the event connected to this interrupt. * o frame: some archs (e.g. i386) pass a frame to some handlers as their main argument. * Return value: * o 0: everything ok. * o EINVAL: stray interrupt. */ int intr_event_handle(struct intr_event *ie, struct trapframe *frame) { struct intr_handler *ih; struct trapframe *oldframe; struct thread *td; int error, ret, thread; td = curthread; #ifdef KSTACK_USAGE_PROF intr_prof_stack_use(td, frame); #endif /* An interrupt with no event or handlers is a stray interrupt. */ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers)) return (EINVAL); /* * Execute fast interrupt handlers directly. * To support clock handlers, if a handler registers * with a NULL argument, then we pass it a pointer to * a trapframe as its argument. */ td->td_intr_nesting_level++; thread = 0; ret = 0; critical_enter(); oldframe = td->td_intr_frame; td->td_intr_frame = frame; TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { if (ih->ih_filter == NULL) { thread = 1; continue; } CTR4(KTR_INTR, "%s: exec %p(%p) for %s", __func__, ih->ih_filter, ih->ih_argument == NULL ? frame : ih->ih_argument, ih->ih_name); if (ih->ih_argument == NULL) ret = ih->ih_filter(frame); else ret = ih->ih_filter(ih->ih_argument); KASSERT(ret == FILTER_STRAY || ((ret & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 && (ret & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0), ("%s: incorrect return value %#x from %s", __func__, ret, ih->ih_name)); /* * Wrapper handler special handling: * * in some particular cases (like pccard and pccbb), * the _real_ device handler is wrapped in a couple of * functions - a filter wrapper and an ithread wrapper. * In this case (and just in this case), the filter wrapper * could ask the system to schedule the ithread and mask * the interrupt source if the wrapped handler is composed * of just an ithread handler.
* * TODO: write a generic wrapper to avoid people rolling * their own */ if (!thread) { if (ret == FILTER_SCHEDULE_THREAD) thread = 1; } } td->td_intr_frame = oldframe; if (thread) { if (ie->ie_pre_ithread != NULL) ie->ie_pre_ithread(ie->ie_source); } else { if (ie->ie_post_filter != NULL) ie->ie_post_filter(ie->ie_source); } /* Schedule the ithread if needed. */ if (thread) { error = intr_event_schedule_thread(ie); KASSERT(error == 0, ("bad stray interrupt")); } critical_exit(); td->td_intr_nesting_level--; return (0); } #else /* * This is the main code for interrupt threads. */ static void ithread_loop(void *arg) { struct intr_thread *ithd; struct intr_handler *ih; struct intr_event *ie; struct thread *td; struct proc *p; int priv; int wake; td = curthread; p = td->td_proc; ih = (struct intr_handler *)arg; priv = (ih->ih_thread != NULL) ? 1 : 0; ithd = (priv) ? ih->ih_thread : ih->ih_event->ie_thread; KASSERT(ithd->it_thread == td, ("%s: ithread and proc linkage out of sync", __func__)); ie = ithd->it_event; ie->ie_count = 0; wake = 0; /* * As long as we have interrupts outstanding, go through the * list of handlers, giving each one a go at it. */ for (;;) { /* * If we are an orphaned thread, then just die. */ if (ithd->it_flags & IT_DEAD) { CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__, p->p_pid, td->td_name); free(ithd, M_ITHREAD); kthread_exit(); } /* * Service interrupts. If another interrupt arrives while * we are running, it will set it_need to note that we * should make another pass. * * The load_acq part of the following cmpset ensures * that the load of ih_need in ithread_execute_handlers() * is ordered after the load of it_need here. */ while (atomic_cmpset_acq_int(&ithd->it_need, 1, 0) != 0) { if (priv) priv_ithread_execute_handler(p, ih); else ithread_execute_handlers(p, ie); } WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread"); mtx_assert(&Giant, MA_NOTOWNED); /* * Processed all our interrupts. Now get the sched * lock. This may take a while and it_need may get * set again, so we have to check it again. */ thread_lock(td); if (atomic_load_acq_int(&ithd->it_need) == 0 && (ithd->it_flags & (IT_DEAD | IT_WAIT)) == 0) { TD_SET_IWAIT(td); ie->ie_count = 0; mi_switch(SW_VOL | SWT_IWAIT, NULL); } if (ithd->it_flags & IT_WAIT) { wake = 1; ithd->it_flags &= ~IT_WAIT; } thread_unlock(td); if (wake) { wakeup(ithd); wake = 0; } } } /* * Main loop for interrupt filter. * * Some architectures (i386, amd64 and arm) require the optional frame * parameter, and use it as the main argument for fast handler execution * when ih_argument == NULL. * * Return value: * o FILTER_STRAY: No filter recognized the event, and no * filter-less handler is registered on this * line. * o FILTER_HANDLED: A filter claimed the event and served it. * o FILTER_SCHEDULE_THREAD: No filter claimed the event, but there's at * least one filter-less handler on this line. * o FILTER_HANDLED | * FILTER_SCHEDULE_THREAD: A filter claimed the event, and asked for * scheduling the per-handler ithread. * * In case an ithread has to be scheduled, in *ithd there will be a * pointer to a struct intr_thread containing the thread to be * scheduled. */ static int intr_filter_loop(struct intr_event *ie, struct trapframe *frame, struct intr_thread **ithd) { struct intr_handler *ih; void *arg; int ret, thread_only; ret = 0; thread_only = 0; TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { /* * Execute fast interrupt handlers directly. 
* To support clock handlers, if a handler registers * with a NULL argument, then we pass it a pointer to * a trapframe as its argument. */ arg = ((ih->ih_argument == NULL) ? frame : ih->ih_argument); CTR5(KTR_INTR, "%s: exec %p/%p(%p) for %s", __func__, ih->ih_filter, ih->ih_handler, arg, ih->ih_name); if (ih->ih_filter != NULL) ret = ih->ih_filter(arg); else { thread_only = 1; continue; } KASSERT(ret == FILTER_STRAY || ((ret & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 && (ret & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0), ("%s: incorrect return value %#x from %s", __func__, ret, ih->ih_name)); if (ret & FILTER_STRAY) continue; else { *ithd = ih->ih_thread; return (ret); } } /* * No filters handled the interrupt and we have at least * one handler without a filter. In this case, we schedule * all of the filter-less handlers to run in the ithread. */ if (thread_only) { *ithd = ie->ie_thread; return (FILTER_SCHEDULE_THREAD); } return (FILTER_STRAY); } /* * Main interrupt handling body. * * Input: * o ie: the event connected to this interrupt. * o frame: some archs (e.g. i386) pass a frame to some handlers as their main argument. * Return value: * o 0: everything ok. * o EINVAL: stray interrupt. */ int intr_event_handle(struct intr_event *ie, struct trapframe *frame) { struct intr_thread *ithd; struct trapframe *oldframe; struct thread *td; int thread; ithd = NULL; td = curthread; if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers)) return (EINVAL); td->td_intr_nesting_level++; thread = 0; critical_enter(); oldframe = td->td_intr_frame; td->td_intr_frame = frame; thread = intr_filter_loop(ie, frame, &ithd); if (thread & FILTER_HANDLED) { if (ie->ie_post_filter != NULL) ie->ie_post_filter(ie->ie_source); } else { if (ie->ie_pre_ithread != NULL) ie->ie_pre_ithread(ie->ie_source); } td->td_intr_frame = oldframe; critical_exit(); /* Interrupt storm logic */ if (thread & FILTER_STRAY) { ie->ie_count++; if (ie->ie_count < intr_storm_threshold) printf("Interrupt stray detection not present\n"); } /* Schedule an ithread if needed.
*/ if (thread & FILTER_SCHEDULE_THREAD) { if (intr_event_schedule_thread(ie, ithd) != 0) panic("%s: impossible stray interrupt", __func__); } td->td_intr_nesting_level--; return (0); } #endif #ifdef DDB /* * Dump details about an interrupt handler */ static void db_dump_intrhand(struct intr_handler *ih) { int comma; db_printf("\t%-10s ", ih->ih_name); switch (ih->ih_pri) { case PI_REALTIME: db_printf("CLK "); break; case PI_AV: db_printf("AV "); break; case PI_TTY: db_printf("TTY "); break; case PI_NET: db_printf("NET "); break; case PI_DISK: db_printf("DISK"); break; case PI_DULL: db_printf("DULL"); break; default: if (ih->ih_pri >= PI_SOFT) db_printf("SWI "); else db_printf("%4u", ih->ih_pri); break; } db_printf(" "); if (ih->ih_filter != NULL) { db_printf("[F]"); db_printsym((uintptr_t)ih->ih_filter, DB_STGY_PROC); } if (ih->ih_handler != NULL) { if (ih->ih_filter != NULL) db_printf(","); db_printf("[H]"); db_printsym((uintptr_t)ih->ih_handler, DB_STGY_PROC); } db_printf("(%p)", ih->ih_argument); if (ih->ih_need || (ih->ih_flags & (IH_EXCLUSIVE | IH_ENTROPY | IH_DEAD | IH_MPSAFE)) != 0) { db_printf(" {"); comma = 0; if (ih->ih_flags & IH_EXCLUSIVE) { if (comma) db_printf(", "); db_printf("EXCL"); comma = 1; } if (ih->ih_flags & IH_ENTROPY) { if (comma) db_printf(", "); db_printf("ENTROPY"); comma = 1; } if (ih->ih_flags & IH_DEAD) { if (comma) db_printf(", "); db_printf("DEAD"); comma = 1; } if (ih->ih_flags & IH_MPSAFE) { if (comma) db_printf(", "); db_printf("MPSAFE"); comma = 1; } if (ih->ih_need) { if (comma) db_printf(", "); db_printf("NEED"); } db_printf("}"); } db_printf("\n"); } /* * Dump details about an event. */ void db_dump_intr_event(struct intr_event *ie, int handlers) { struct intr_handler *ih; struct intr_thread *it; int comma; db_printf("%s ", ie->ie_fullname); it = ie->ie_thread; if (it != NULL) db_printf("(pid %d)", it->it_thread->td_proc->p_pid); else db_printf("(no thread)"); if ((ie->ie_flags & (IE_SOFT | IE_ENTROPY | IE_ADDING_THREAD)) != 0 || (it != NULL && it->it_need)) { db_printf(" {"); comma = 0; if (ie->ie_flags & IE_SOFT) { db_printf("SOFT"); comma = 1; } if (ie->ie_flags & IE_ENTROPY) { if (comma) db_printf(", "); db_printf("ENTROPY"); comma = 1; } if (ie->ie_flags & IE_ADDING_THREAD) { if (comma) db_printf(", "); db_printf("ADDING_THREAD"); comma = 1; } if (it != NULL && it->it_need) { if (comma) db_printf(", "); db_printf("NEED"); } db_printf("}"); } db_printf("\n"); if (handlers) TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) db_dump_intrhand(ih); } /* * Dump data about interrupt handlers */ DB_SHOW_COMMAND(intr, db_show_intr) { struct intr_event *ie; int all, verbose; verbose = strchr(modif, 'v') != NULL; all = strchr(modif, 'a') != NULL; TAILQ_FOREACH(ie, &event_list, ie_list) { if (!all && TAILQ_EMPTY(&ie->ie_handlers)) continue; db_dump_intr_event(ie, verbose); if (db_pager_quit) break; } } #endif /* DDB */ /* * Start standard software interrupt threads */ static void start_softintr(void *dummy) { if (swi_add(NULL, "vm", swi_vm, NULL, SWI_VM, INTR_MPSAFE, &vm_ih)) panic("died while creating vm swi ithread"); } SYSINIT(start_softintr, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softintr, NULL); /* * Sysctls used by systat and others: hw.intrnames and hw.intrcnt. * The data for this is machine dependent, and the declarations are in machine * dependent code. The layout of intrnames and intrcnt, however, is machine * independent. * * We do not know the length of intrcnt and intrnames at compile time, so * calculate things at run time.
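 *
 * A minimal userland consumer sizes its buffer at run time too; a
 * sketch (illustrative only, assuming just sysctlbyname(3), err(3)
 * and malloc(3)):
 *
 *	size_t len;
 *	u_long *cnt;
 *
 *	if (sysctlbyname("hw.intrcnt", NULL, &len, NULL, 0) == -1)
 *		err(1, "hw.intrcnt");
 *	if ((cnt = malloc(len)) == NULL)
 *		err(1, "malloc");
 *	if (sysctlbyname("hw.intrcnt", cnt, &len, NULL, 0) == -1)
 *		err(1, "hw.intrcnt");
 *
 * hw.intrnames is read the same way; it is a packed sequence of
 * nul-terminated strings parallel to the counter array.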
*/ static int sysctl_intrnames(SYSCTL_HANDLER_ARGS) { return (sysctl_handle_opaque(oidp, intrnames, sintrnames, req)); } SYSCTL_PROC(_hw, OID_AUTO, intrnames, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_intrnames, "", "Interrupt Names"); static int sysctl_intrcnt(SYSCTL_HANDLER_ARGS) { #ifdef SCTL_MASK32 uint32_t *intrcnt32; unsigned i; int error; if (req->flags & SCTL_MASK32) { if (!req->oldptr) return (sysctl_handle_opaque(oidp, NULL, sintrcnt / 2, req)); intrcnt32 = malloc(sintrcnt / 2, M_TEMP, M_NOWAIT); if (intrcnt32 == NULL) return (ENOMEM); for (i = 0; i < sintrcnt / sizeof (u_long); i++) intrcnt32[i] = intrcnt[i]; error = sysctl_handle_opaque(oidp, intrcnt32, sintrcnt / 2, req); free(intrcnt32, M_TEMP); return (error); } #endif return (sysctl_handle_opaque(oidp, intrcnt, sintrcnt, req)); } SYSCTL_PROC(_hw, OID_AUTO, intrcnt, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_intrcnt, "", "Interrupt Counts"); #ifdef DDB /* * DDB command to dump the interrupt statistics. */ DB_SHOW_COMMAND(intrcnt, db_show_intrcnt) { u_long *i; char *cp; u_int j; cp = intrnames; j = 0; for (i = intrcnt; j < (sintrcnt / sizeof(u_long)) && !db_pager_quit; i++, j++) { if (*cp == '\0') break; if (*i != 0) db_printf("%s\t%lu\n", cp, *i); cp += strlen(cp) + 1; } } #endif Index: head/sys/kern/kern_jail.c =================================================================== --- head/sys/kern/kern_jail.c (revision 326270) +++ head/sys/kern/kern_jail.c (revision 326271) @@ -1,4117 +1,4119 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1999 Poul-Henning Kamp. * Copyright (c) 2008 Bjoern A. Zeeb. * Copyright (c) 2009 James Gritton. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* DDB */ #include #define DEFAULT_HOSTUUID "00000000-0000-0000-0000-000000000000" MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures"); /* Keep struct prison prison0 and some code in kern_jail_set() readable. */ #ifdef INET #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL #endif #else /* !INET */ #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL 0 #endif #endif /* prison0 describes what is "real" about the system. */ struct prison prison0 = { .pr_id = 0, .pr_name = "0", .pr_ref = 1, .pr_uref = 1, .pr_path = "/", .pr_securelevel = -1, .pr_devfs_rsnum = 0, .pr_childmax = JAIL_MAX, .pr_hostuuid = DEFAULT_HOSTUUID, .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children), #ifdef VIMAGE .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL, #else .pr_flags = PR_HOST|_PR_IP_SADDRSEL, #endif .pr_allow = PR_ALLOW_ALL, }; MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF); /* allprison, allprison_racct and lastprid are protected by allprison_lock. */ struct sx allprison_lock; SX_SYSINIT(allprison_lock, &allprison_lock, "allprison"); struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison); LIST_HEAD(, prison_racct) allprison_racct; int lastprid = 0; static int do_jail_attach(struct thread *td, struct prison *pr); static void prison_complete(void *context, int pending); static void prison_deref(struct prison *pr, int flags); static char *prison_path(struct prison *pr1, struct prison *pr2); static void prison_remove_one(struct prison *pr); #ifdef RACCT static void prison_racct_attach(struct prison *pr); static void prison_racct_modify(struct prison *pr); static void prison_racct_detach(struct prison *pr); #endif /* Flags for prison_deref */ #define PD_DEREF 0x01 #define PD_DEUREF 0x02 #define PD_LOCKED 0x04 #define PD_LIST_SLOCKED 0x08 #define PD_LIST_XLOCKED 0x10 /* * Parameter names corresponding to PR_* flag values. Size values are for kvm * as we cannot figure out the size of a sparse array, or an array without a * terminating entry. 
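 *
 * A kvm(3) consumer cannot apply nitems() to an array it only knows
 * by symbol, so it reads the exported size first; a sketch (the
 * nlist setup and the N_* index are hypothetical):
 *
 *	size_t size;
 *	int nnames;
 *
 *	kvm_read(kd, nl[N_PR_FLAG_NAMES_SIZE].n_value, &size,
 *	    sizeof(size));
 *	nnames = size / sizeof(char *);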
*/ static char *pr_flag_names[] = { [0] = "persist", #ifdef INET [7] = "ip4.saddrsel", #endif #ifdef INET6 [8] = "ip6.saddrsel", #endif }; const size_t pr_flag_names_size = sizeof(pr_flag_names); static char *pr_flag_nonames[] = { [0] = "nopersist", #ifdef INET [7] = "ip4.nosaddrsel", #endif #ifdef INET6 [8] = "ip6.nosaddrsel", #endif }; const size_t pr_flag_nonames_size = sizeof(pr_flag_nonames); struct jailsys_flags { const char *name; unsigned disable; unsigned new; } pr_flag_jailsys[] = { { "host", 0, PR_HOST }, #ifdef VIMAGE { "vnet", 0, PR_VNET }, #endif #ifdef INET { "ip4", PR_IP4_USER, PR_IP4_USER }, #endif #ifdef INET6 { "ip6", PR_IP6_USER, PR_IP6_USER }, #endif }; const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys); static char *pr_allow_names[] = { "allow.set_hostname", "allow.sysvipc", "allow.raw_sockets", "allow.chflags", "allow.mount", "allow.quotas", "allow.socket_af", "allow.mount.devfs", "allow.mount.nullfs", "allow.mount.zfs", "allow.mount.procfs", "allow.mount.tmpfs", "allow.mount.fdescfs", "allow.mount.linprocfs", "allow.mount.linsysfs", "allow.reserved_ports", }; const size_t pr_allow_names_size = sizeof(pr_allow_names); static char *pr_allow_nonames[] = { "allow.noset_hostname", "allow.nosysvipc", "allow.noraw_sockets", "allow.nochflags", "allow.nomount", "allow.noquotas", "allow.nosocket_af", "allow.mount.nodevfs", "allow.mount.nonullfs", "allow.mount.nozfs", "allow.mount.noprocfs", "allow.mount.notmpfs", "allow.mount.nofdescfs", "allow.mount.nolinprocfs", "allow.mount.nolinsysfs", "allow.noreserved_ports", }; const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames); #define JAIL_DEFAULT_ALLOW (PR_ALLOW_SET_HOSTNAME | PR_ALLOW_RESERVED_PORTS) #define JAIL_DEFAULT_ENFORCE_STATFS 2 #define JAIL_DEFAULT_DEVFS_RSNUM 0 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW; static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS; static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM; #if defined(INET) || defined(INET6) static unsigned jail_max_af_ips = 255; #endif /* * Initialize the parts of prison0 that can't be static-initialized with * constants. This is called from proc0_init() after creating thread0 cpuset. */ void prison0_init(void) { prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset); prison0.pr_osreldate = osreldate; strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease)); } /* * struct jail_args { * struct jail *jail; * }; */ int sys_jail(struct thread *td, struct jail_args *uap) { uint32_t version; int error; struct jail j; error = copyin(uap->jail, &version, sizeof(uint32_t)); if (error) return (error); switch (version) { case 0: { struct jail_v0 j0; /* FreeBSD single IPv4 jails. */ bzero(&j, sizeof(struct jail)); error = copyin(uap->jail, &j0, sizeof(struct jail_v0)); if (error) return (error); j.version = j0.version; j.path = j0.path; j.hostname = j0.hostname; j.ip4s = htonl(j0.ip_number); /* jail_v0 is host order */ break; } case 1: /* * Version 1 was used by multi-IPv4 jail implementations * that never made it into the official kernel. */ return (EINVAL); case 2: /* JAIL_API_VERSION */ /* FreeBSD multi-IPv4/IPv6,noIP jails. */ error = copyin(uap->jail, &j, sizeof(struct jail)); if (error) return (error); break; default: /* Sci-Fi jails are not supported, sorry. 
*/ return (EINVAL); } return (kern_jail(td, &j)); } int kern_jail(struct thread *td, struct jail *j) { struct iovec optiov[2 * (4 + nitems(pr_allow_names) #ifdef INET + 1 #endif #ifdef INET6 + 1 #endif )]; struct uio opt; char *u_path, *u_hostname, *u_name; #ifdef INET uint32_t ip4s; struct in_addr *u_ip4; #endif #ifdef INET6 struct in6_addr *u_ip6; #endif size_t tmplen; int error, enforce_statfs, fi; bzero(&optiov, sizeof(optiov)); opt.uio_iov = optiov; opt.uio_iovcnt = 0; opt.uio_offset = -1; opt.uio_resid = -1; opt.uio_segflg = UIO_SYSSPACE; opt.uio_rw = UIO_READ; opt.uio_td = td; /* Set permissions for top-level jails from sysctls. */ if (!jailed(td->td_ucred)) { for (fi = 0; fi < nitems(pr_allow_names); fi++) { optiov[opt.uio_iovcnt].iov_base = (jail_default_allow & (1 << fi)) ? pr_allow_names[fi] : pr_allow_nonames[fi]; optiov[opt.uio_iovcnt].iov_len = strlen(optiov[opt.uio_iovcnt].iov_base) + 1; opt.uio_iovcnt += 2; } optiov[opt.uio_iovcnt].iov_base = "enforce_statfs"; optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs"); opt.uio_iovcnt++; enforce_statfs = jail_default_enforce_statfs; optiov[opt.uio_iovcnt].iov_base = &enforce_statfs; optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs); opt.uio_iovcnt++; } tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN; #ifdef INET ip4s = (j->version == 0) ? 1 : j->ip4s; if (ip4s > jail_max_af_ips) return (EINVAL); tmplen += ip4s * sizeof(struct in_addr); #else if (j->ip4s > 0) return (EINVAL); #endif #ifdef INET6 if (j->ip6s > jail_max_af_ips) return (EINVAL); tmplen += j->ip6s * sizeof(struct in6_addr); #else if (j->ip6s > 0) return (EINVAL); #endif u_path = malloc(tmplen, M_TEMP, M_WAITOK); u_hostname = u_path + MAXPATHLEN; u_name = u_hostname + MAXHOSTNAMELEN; #ifdef INET u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN); #endif #ifdef INET6 #ifdef INET u_ip6 = (struct in6_addr *)(u_ip4 + ip4s); #else u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN); #endif #endif optiov[opt.uio_iovcnt].iov_base = "path"; optiov[opt.uio_iovcnt].iov_len = sizeof("path"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_path; error = copyinstr(j->path, u_path, MAXPATHLEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = "host.hostname"; optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_hostname; error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; if (j->jailname != NULL) { optiov[opt.uio_iovcnt].iov_base = "name"; optiov[opt.uio_iovcnt].iov_len = sizeof("name"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_name; error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; } #ifdef INET optiov[opt.uio_iovcnt].iov_base = "ip4.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip4; optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr); if (j->version == 0) u_ip4->s_addr = j->ip4s; else { error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } } opt.uio_iovcnt++; #endif #ifdef INET6 optiov[opt.uio_iovcnt].iov_base = "ip6.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip6; 
optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr); error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; #endif KASSERT(opt.uio_iovcnt <= nitems(optiov), ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt)); error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH); free(u_path, M_TEMP); return (error); } /* * struct jail_set_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_set(struct thread *td, struct jail_set_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. */ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_set(td, auio, uap->flags); free(auio, M_IOV); return (error); } int kern_jail_set(struct thread *td, struct uio *optuio, int flags) { struct nameidata nd; #ifdef INET struct in_addr *ip4; #endif #ifdef INET6 struct in6_addr *ip6; #endif struct vfsopt *opt; struct vfsoptlist *opts; struct prison *pr, *deadpr, *mypr, *ppr, *tpr; struct vnode *root; char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid; char *g_path, *osrelstr; #if defined(INET) || defined(INET6) struct prison *tppr; void *op; #endif unsigned long hid; size_t namelen, onamelen, pnamelen; int born, created, cuflags, descend, enforce; int error, errmsg_len, errmsg_pos; int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; int fi, jid, jsys, len, level; int childmax, osreldt, rsnum, slevel; int fullpath_disabled; #if defined(INET) || defined(INET6) int ii, ij; #endif #ifdef INET int ip4s, redo_ip4; #endif #ifdef INET6 int ip6s, redo_ip6; #endif uint64_t pr_allow, ch_allow, pr_flags, ch_flags; unsigned tallow; char numbuf[12]; error = priv_check(td, PRIV_JAIL_SET); if (!error && (flags & JAIL_ATTACH)) error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); mypr = td->td_ucred->cr_prison; if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) return (EPERM); if (flags & ~JAIL_SET_MASK) return (EINVAL); /* * Check all the parameters before committing to anything. Not all * errors can be caught early, but we may as well try. Also, this * takes care of some expensive stuff (path lookup) before getting * the allprison lock. * * XXX Jails are not filesystems, and jail parameters are not mount * options. But it makes more sense to re-use the vfsopt code * than duplicate it under a different name. 
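 *
 * The upshot is the convention jail_set(2) exposes: parameters
 * arrive as name/value iovec pairs, with boolean parameters carrying
 * a NULL value. An illustrative call from userland (the jail name is
 * made up; error handling omitted):
 *
 *	struct iovec iov[4];
 *	int jid;
 *
 *	iov[0].iov_base = __DECONST(char *, "name");
 *	iov[0].iov_len = sizeof("name");
 *	iov[1].iov_base = __DECONST(char *, "myjail");
 *	iov[1].iov_len = sizeof("myjail");
 *	iov[2].iov_base = __DECONST(char *, "persist");
 *	iov[2].iov_len = sizeof("persist");
 *	iov[3].iov_base = NULL;
 *	iov[3].iov_len = 0;
 *	jid = jail_set(iov, 4, JAIL_CREATE);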
*/ error = vfs_buildopts(optuio, &opts); if (error) return (error); #ifdef INET ip4 = NULL; #endif #ifdef INET6 ip6 = NULL; #endif g_path = NULL; cuflags = flags & (JAIL_CREATE | JAIL_UPDATE); if (!cuflags) { error = EINVAL; vfs_opterror(opts, "no valid operation (create or update)"); goto done_errmsg; } error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == ENOENT) jid = 0; else if (error != 0) goto done_free; error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel)); if (error == ENOENT) gotslevel = 0; else if (error != 0) goto done_free; else gotslevel = 1; error = vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax)); if (error == ENOENT) gotchildmax = 0; else if (error != 0) goto done_free; else gotchildmax = 1; error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce)); if (error == ENOENT) gotenforce = 0; else if (error != 0) goto done_free; else if (enforce < 0 || enforce > 2) { error = EINVAL; goto done_free; } else gotenforce = 1; error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum)); if (error == ENOENT) gotrsnum = 0; else if (error != 0) goto done_free; else gotrsnum = 1; pr_flags = ch_flags = 0; for (fi = 0; fi < nitems(pr_flag_names); fi++) { if (pr_flag_names[fi] == NULL) continue; vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi); vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi); } ch_flags |= pr_flags; for (fi = 0; fi < nitems(pr_flag_jailsys); fi++) { error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys, sizeof(jsys)); if (error == ENOENT) continue; if (error != 0) goto done_free; switch (jsys) { case JAIL_SYS_DISABLE: if (!pr_flag_jailsys[fi].disable) { error = EINVAL; goto done_free; } pr_flags |= pr_flag_jailsys[fi].disable; break; case JAIL_SYS_NEW: pr_flags |= pr_flag_jailsys[fi].new; break; case JAIL_SYS_INHERIT: break; default: error = EINVAL; goto done_free; } ch_flags |= pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable; } if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE && !(pr_flags & PR_PERSIST)) { error = EINVAL; vfs_opterror(opts, "new jail must persist or attach"); goto done_errmsg; } #ifdef VIMAGE if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) { error = EINVAL; vfs_opterror(opts, "vnet cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET6 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_errmsg; } #endif pr_allow = ch_allow = 0; for (fi = 0; fi < nitems(pr_allow_names); fi++) { vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi); vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi); } ch_allow |= pr_allow; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == ENOENT) name = NULL; else if (error != 0) goto done_free; else { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostname", (void **)&host, &len); if (error == ENOENT) host = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || host[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.domainname", (void 
**)&domain, &len); if (error == ENOENT) domain = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || domain[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len); if (error == ENOENT) uuid = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || uuid[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > HOSTUUIDLEN) { error = ENAMETOOLONG; goto done_free; } } #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32; error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32)); hid = hid32; } else #endif error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid)); if (error == ENOENT) gothid = 0; else if (error != 0) goto done_free; else { gothid = 1; ch_flags |= PR_HOST; pr_flags |= PR_HOST; } #ifdef INET error = vfs_getopt(opts, "ip4.addr", &op, &ip4s); if (error == ENOENT) ip4s = 0; else if (error != 0) goto done_free; else if (ip4s & (sizeof(*ip4) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP4_USER; pr_flags |= PR_IP4_USER; if (ip4s > 0) { ip4s /= sizeof(*ip4); if (ip4s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv4 addresses"); goto done_errmsg; } ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); bcopy(op, ip4, ip4s * sizeof(*ip4)); /* * IP addresses are all sorted except ip[0] to preserve * the primary IP address as given from userland. * This special IP is used for unbound outgoing * connections as well as for "loopback" traffic in case * source address selection cannot find a more fitting * address to connect from. */ if (ip4s > 1) qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), prison_qcmp_v4); /* * Check for duplicate addresses and do some simple * zero and broadcast checks. If users give other bogus * addresses, it is their problem. * * We do not have to care about byte order for these * checks so we will do them in NBO.
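 *
 * For example, a list given from userland as { 10.0.0.9, 10.0.0.5,
 * 10.0.0.1 } is stored as { 10.0.0.9, 10.0.0.1, 10.0.0.5 }: the
 * primary address stays first and the rest are sorted, which is what
 * lets the checks below get away with comparing adjacent entries.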
*/ for (ii = 0; ii < ip4s; ii++) { if (ip4[ii].s_addr == INADDR_ANY || ip4[ii].s_addr == INADDR_BROADCAST) { error = EINVAL; goto done_free; } if ((ii+1) < ip4s && (ip4[0].s_addr == ip4[ii+1].s_addr || ip4[ii].s_addr == ip4[ii+1].s_addr)) { error = EINVAL; goto done_free; } } } } #endif #ifdef INET6 error = vfs_getopt(opts, "ip6.addr", &op, &ip6s); if (error == ENOENT) ip6s = 0; else if (error != 0) goto done_free; else if (ip6s & (sizeof(*ip6) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP6_USER; pr_flags |= PR_IP6_USER; if (ip6s > 0) { ip6s /= sizeof(*ip6); if (ip6s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv6 addresses"); goto done_errmsg; } ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); bcopy(op, ip6, ip6s * sizeof(*ip6)); if (ip6s > 1) qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), prison_qcmp_v6); for (ii = 0; ii < ip6s; ii++) { if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) { error = EINVAL; goto done_free; } if ((ii+1) < ip6s && (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) || IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1]))) { error = EINVAL; goto done_free; } } } } #endif #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_errmsg; } #endif error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len); if (error == ENOENT) osrelstr = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osrelease cannot be changed after creation"); goto done_errmsg; } if (len == 0 || len >= OSRELEASELEN) { error = EINVAL; vfs_opterror(opts, "osrelease string must be 1-%d bytes long", OSRELEASELEN - 1); goto done_errmsg; } } error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt)); if (error == ENOENT) osreldt = 0; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be changed after creation"); goto done_errmsg; } if (osreldt == 0) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be 0"); goto done_errmsg; } } fullpath_disabled = 0; root = NULL; error = vfs_getopt(opts, "path", (void **)&path, &len); if (error == ENOENT) path = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "path cannot be changed after creation"); goto done_errmsg; } if (len == 0 || path[len - 1] != '\0') { error = EINVAL; goto done_free; } NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, td); error = namei(&nd); if (error) goto done_free; root = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); strlcpy(g_path, path, MAXPATHLEN); error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN); if (error == 0) path = g_path; else if (error == ENODEV) { /* proceed if sysctl debug.disablefullpath == 1 */ fullpath_disabled = 1; if (len < 2 || (len == 2 && path[0] == '/')) path = NULL; } else { /* exit on other errors */ goto done_free; } if (root->v_type != VDIR) { error = ENOTDIR; vput(root); goto done_free; } VOP_UNLOCK(root, 0); if (fullpath_disabled) { /* Leave room for a real-root full pathname. */ if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/") ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) { error = ENAMETOOLONG; vrele(root); goto done_free; } } } /* * Find the specified jail, or at least its parent. * This abuses the file error codes ENOENT and EEXIST. 
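 *
 * For example, a JAIL_CREATE by the name "foo.17" implicitly
 * requests jid 17 under the parent "foo", while a non-numeric final
 * component leaves jid at 0 and lets the kernel pick one.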
*/ pr = NULL; ppr = mypr; if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { namelc = strrchr(name, '.'); jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); if (*p != '\0') jid = 0; } sx_xlock(&allprison_lock); if (jid != 0) { /* * See if a requested jid already exists. There is an * information leak here if the jid exists but is not within * the caller's jail hierarchy. Jail creators will get EEXIST * even though they cannot see the jail, and CREATE | UPDATE * will return ENOENT which is not normally a valid error. */ if (jid < 0) { error = EINVAL; vfs_opterror(opts, "negative jid"); goto done_unlock_list; } pr = prison_find(jid); if (pr != NULL) { ppr = pr->pr_parent; /* Create: jid must not exist. */ if (cuflags == JAIL_CREATE) { mtx_unlock(&pr->pr_mtx); error = EEXIST; vfs_opterror(opts, "jail %d already exists", jid); goto done_unlock_list; } if (!prison_ischild(mypr, pr)) { mtx_unlock(&pr->pr_mtx); pr = NULL; } else if (pr->pr_uref == 0) { if (!(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done_unlock_list; } else if ((flags & JAIL_ATTACH) || (pr_flags & PR_PERSIST)) { /* * A dying jail might be resurrected * (via attach or persist), but first * it must determine if another jail * has claimed its name. Accomplish * this by implicitly re-setting the * name. */ if (name == NULL) name = prison_name(mypr, pr); } } } if (pr == NULL) { /* Update: jid must exist. */ if (cuflags == JAIL_UPDATE) { error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_unlock_list; } } } /* * If the caller provided a name, look for a jail by that name. * This has different semantics for creates and updates keyed by jid * (where the name must not already exist in a different jail), * and updates keyed by the name itself (where the name must exist * because that is the jail being updated). */ namelc = NULL; if (name != NULL) { namelc = strrchr(name, '.'); if (namelc == NULL) namelc = name; else { /* * This is a hierarchical name. Split it into the * parent and child names, and make sure the parent * exists or matches an already found jail. */ if (pr != NULL) { if (strncmp(name, ppr->pr_name, namelc - name) || ppr->pr_name[namelc - name] != '\0') { mtx_unlock(&pr->pr_mtx); error = EINVAL; vfs_opterror(opts, "cannot change jail's parent"); goto done_unlock_list; } } else { *namelc = '\0'; ppr = prison_find_name(mypr, name); if (ppr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_unlock_list; } mtx_unlock(&ppr->pr_mtx); *namelc = '.'; } namelc++; } if (namelc[0] != '\0') { pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; name_again: deadpr = NULL; FOREACH_PRISON_CHILD(ppr, tpr) { if (tpr != pr && tpr->pr_ref > 0 && !strcmp(tpr->pr_name + pnamelen, namelc)) { if (pr == NULL && cuflags != JAIL_CREATE) { mtx_lock(&tpr->pr_mtx); if (tpr->pr_ref > 0) { /* * Use this jail * for updates. */ if (tpr->pr_uref > 0) { pr = tpr; break; } deadpr = tpr; } mtx_unlock(&tpr->pr_mtx); } else if (tpr->pr_uref > 0) { /* * Create, or update(jid): * name must not exist in an * active sibling jail. */ error = EEXIST; if (pr != NULL) mtx_unlock(&pr->pr_mtx); vfs_opterror(opts, "jail \"%s\" already exists", name); goto done_unlock_list; } } } /* If no active jail is found, use a dying one. 
*/ if (deadpr != NULL && pr == NULL) { if (flags & JAIL_DYING) { mtx_lock(&deadpr->pr_mtx); if (deadpr->pr_ref == 0) { mtx_unlock(&deadpr->pr_mtx); goto name_again; } pr = deadpr; } else if (cuflags == JAIL_UPDATE) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_unlock_list; } } /* Update: name must exist if no jid. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_unlock_list; } } } /* Update: must provide a jid or name. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "update specified no jail"); goto done_unlock_list; } /* If there's no prison to update, create a new one and link it in. */ if (pr == NULL) { for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent) if (tpr->pr_childcount >= tpr->pr_childmax) { error = EPERM; vfs_opterror(opts, "prison limit exceeded"); goto done_unlock_list; } created = 1; mtx_lock(&ppr->pr_mtx); if (ppr->pr_ref == 0) { mtx_unlock(&ppr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", prison_name(mypr, ppr)); goto done_unlock_list; } ppr->pr_ref++; ppr->pr_uref++; mtx_unlock(&ppr->pr_mtx); pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO); if (jid == 0) { /* Find the next free jid. */ jid = lastprid + 1; findnext: if (jid == JAIL_MAX) jid = 1; TAILQ_FOREACH(tpr, &allprison, pr_list) { if (tpr->pr_id < jid) continue; if (tpr->pr_id > jid || tpr->pr_ref == 0) { TAILQ_INSERT_BEFORE(tpr, pr, pr_list); break; } if (jid == lastprid) { error = EAGAIN; vfs_opterror(opts, "no available jail IDs"); free(pr, M_PRISON); prison_deref(ppr, PD_DEREF | PD_DEUREF | PD_LIST_XLOCKED); goto done_releroot; } jid++; goto findnext; } lastprid = jid; } else { /* * The jail already has a jid (that did not yet exist), * so just find where to insert it. */ TAILQ_FOREACH(tpr, &allprison, pr_list) if (tpr->pr_id >= jid) { TAILQ_INSERT_BEFORE(tpr, pr, pr_list); break; } } if (tpr == NULL) TAILQ_INSERT_TAIL(&allprison, pr, pr_list); LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling); for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount++; pr->pr_parent = ppr; pr->pr_id = jid; /* Set some default values, and inherit some from the parent. */ if (namelc == NULL) namelc = ""; if (path == NULL) { path = "/"; root = mypr->pr_root; vref(root); } strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN); pr->pr_flags |= PR_HOST; #if defined(INET) || defined(INET6) #ifdef VIMAGE if (!(pr_flags & PR_VNET)) #endif { #ifdef INET if (!(ch_flags & PR_IP4_USER)) pr->pr_flags |= PR_IP4 | PR_IP4_USER; else if (!(pr_flags & PR_IP4_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP4; if (ppr->pr_ip4 != NULL) { pr->pr_ip4s = ppr->pr_ip4s; pr->pr_ip4 = malloc(pr->pr_ip4s * sizeof(struct in_addr), M_PRISON, M_WAITOK); bcopy(ppr->pr_ip4, pr->pr_ip4, pr->pr_ip4s * sizeof(*pr->pr_ip4)); } } #endif #ifdef INET6 if (!(ch_flags & PR_IP6_USER)) pr->pr_flags |= PR_IP6 | PR_IP6_USER; else if (!(pr_flags & PR_IP6_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP6; if (ppr->pr_ip6 != NULL) { pr->pr_ip6s = ppr->pr_ip6s; pr->pr_ip6 = malloc(pr->pr_ip6s * sizeof(struct in6_addr), M_PRISON, M_WAITOK); bcopy(ppr->pr_ip6, pr->pr_ip6, pr->pr_ip6s * sizeof(*pr->pr_ip6)); } } #endif } #endif /* Source address selection is always on by default. 
*/ pr->pr_flags |= _PR_IP_SADDRSEL; pr->pr_securelevel = ppr->pr_securelevel; pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow; pr->pr_enforce_statfs = jail_default_enforce_statfs; pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum; pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate; if (osrelstr == NULL) strcpy(pr->pr_osrelease, ppr->pr_osrelease); else strcpy(pr->pr_osrelease, osrelstr); LIST_INIT(&pr->pr_children); mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK); TASK_INIT(&pr->pr_task, 0, prison_complete, pr); #ifdef VIMAGE /* Allocate a new vnet if specified. */ pr->pr_vnet = (pr_flags & PR_VNET) ? vnet_alloc() : ppr->pr_vnet; #endif /* * Allocate a dedicated cpuset for each jail. * Unlike other initial settings, this may return an error. */ error = cpuset_create_root(ppr, &pr->pr_cpuset); if (error) { prison_deref(pr, PD_LIST_XLOCKED); goto done_releroot; } mtx_lock(&pr->pr_mtx); /* * New prisons do not yet have a reference, because we do not * want others to see the incomplete prison once the * allprison_lock is downgraded. */ } else { created = 0; /* * Grab a reference for existing prisons, to ensure they * continue to exist for the duration of the call. */ pr->pr_ref++; #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((pr->pr_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_deref_locked; } #endif #ifdef INET if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_deref_locked; } #endif #ifdef INET6 if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_deref_locked; } #endif } /* Do final error checking before setting anything. */ if (gotslevel) { if (slevel < ppr->pr_securelevel) { error = EPERM; goto done_deref_locked; } } if (gotchildmax) { if (childmax >= ppr->pr_childmax) { error = EPERM; goto done_deref_locked; } } if (gotenforce) { if (enforce < ppr->pr_enforce_statfs) { error = EPERM; goto done_deref_locked; } } if (gotrsnum) { /* * devfs_rsnum is a uint16_t */ if (rsnum < 0 || rsnum > 65535) { error = EINVAL; goto done_deref_locked; } /* * Nested jails always inherit parent's devfs ruleset */ if (jailed(td->td_ucred)) { if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) { error = EPERM; goto done_deref_locked; } else rsnum = ppr->pr_devfs_rsnum; } } #ifdef INET if (ip4s > 0) { if (ppr->pr_flags & PR_IP4) { /* * Make sure the new set of IP addresses is a * subset of the parent's list. Don't worry * about the parent being unlocked, as any * setting is done with allprison_lock held. */ for (ij = 0; ij < ppr->pr_ip4s; ij++) if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij == ppr->pr_ip4s) { error = EPERM; goto done_deref_locked; } if (ip4s > 1) { for (ii = ij = 1; ii < ip4s; ii++) { if (ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) continue; for (; ij < ppr->pr_ip4s; ij++) if (ip4[ii].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij == ppr->pr_ip4s) break; } if (ij == ppr->pr_ip4s) { error = EPERM; goto done_deref_locked; } } } /* * Check for conflicting IP addresses. We permit them * if there is no more than one IP on each jail. If * there is a duplicate on a jail with more than one * IP, stop checking and return an error.
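 *
 * For example, two sibling jails may both hold the single address
 * 192.0.2.1, but as soon as either side carries a second address
 * the shared one counts as a clash and the request fails with
 * EADDRINUSE.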
*/ tppr = ppr; #ifdef VIMAGE for (; tppr != &prison0; tppr = tppr->pr_parent) if (tppr->pr_flags & PR_VNET) break; #endif FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { if (tpr == pr || #ifdef VIMAGE (tpr != tppr && (tpr->pr_flags & PR_VNET)) || #endif tpr->pr_uref == 0) { descend = 0; continue; } if (!(tpr->pr_flags & PR_IP4_USER)) continue; descend = 0; if (tpr->pr_ip4 == NULL || (ip4s == 1 && tpr->pr_ip4s == 1)) continue; for (ii = 0; ii < ip4s; ii++) { if (prison_check_ip4_locked(tpr, &ip4[ii]) == 0) { error = EADDRINUSE; vfs_opterror(opts, "IPv4 addresses clash"); goto done_deref_locked; } } } } #endif #ifdef INET6 if (ip6s > 0) { if (ppr->pr_flags & PR_IP6) { /* * Make sure the new set of IP addresses is a * subset of the parent's list. */ for (ij = 0; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL(&ip6[0], &ppr->pr_ip6[ij])) break; if (ij == ppr->pr_ip6s) { error = EPERM; goto done_deref_locked; } if (ip6s > 1) { for (ii = ij = 1; ii < ip6s; ii++) { if (IN6_ARE_ADDR_EQUAL(&ip6[ii], &ppr->pr_ip6[0])) continue; for (; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL( &ip6[ii], &ppr->pr_ip6[ij])) break; if (ij == ppr->pr_ip6s) break; } if (ij == ppr->pr_ip6s) { error = EPERM; goto done_deref_locked; } } } /* Check for conflicting IP addresses. */ tppr = ppr; #ifdef VIMAGE for (; tppr != &prison0; tppr = tppr->pr_parent) if (tppr->pr_flags & PR_VNET) break; #endif FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { if (tpr == pr || #ifdef VIMAGE (tpr != tppr && (tpr->pr_flags & PR_VNET)) || #endif tpr->pr_uref == 0) { descend = 0; continue; } if (!(tpr->pr_flags & PR_IP6_USER)) continue; descend = 0; if (tpr->pr_ip6 == NULL || (ip6s == 1 && tpr->pr_ip6s == 1)) continue; for (ii = 0; ii < ip6s; ii++) { if (prison_check_ip6_locked(tpr, &ip6[ii]) == 0) { error = EADDRINUSE; vfs_opterror(opts, "IPv6 addresses clash"); goto done_deref_locked; } } } } #endif onamelen = namelen = 0; if (namelc != NULL) { /* Give a default name of the jid. Also allow the name to be * explicitly the jid - but not any other number, and only in * normal form (no leading zero/etc). */ if (namelc[0] == '\0') snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid); else if ((strtoul(namelc, &p, 10) != jid || namelc[0] < '1' || namelc[0] > '9') && *p == '\0') { error = EINVAL; vfs_opterror(opts, "name cannot be numeric (unless it is the jid)"); goto done_deref_locked; } /* * Make sure the name isn't too long for the prison or its * children. */ pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; onamelen = strlen(pr->pr_name + pnamelen); namelen = strlen(namelc); if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref_locked; } FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { if (strlen(tpr->pr_name) + (namelen - onamelen) >= sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref_locked; } } } if (pr_allow & ~ppr->pr_allow) { error = EPERM; goto done_deref_locked; } /* * Let modules check their parameters. This requires unlocking and * then re-locking the prison, but this is still a valid state as long * as allprison_lock remains xlocked. */ mtx_unlock(&pr->pr_mtx); error = osd_jail_call(pr, PR_METHOD_CHECK, opts); if (error != 0) { prison_deref(pr, created ? PD_LIST_XLOCKED : PD_DEREF | PD_LIST_XLOCKED); goto done_releroot; } mtx_lock(&pr->pr_mtx); /* At this point, all valid parameters should have been noted. 
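 *
 * e.g. a misspelled "allow.nount" was never consumed above, so the
 * loop below fails the whole call with EINVAL and an errmsg of
 * "unknown parameter: allow.nount".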
*/ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done_deref_locked; } } /* Set the parameters of the prison. */ #ifdef INET redo_ip4 = 0; if (pr_flags & PR_IP4_USER) { pr->pr_flags |= PR_IP4; free(pr->pr_ip4, M_PRISON); pr->pr_ip4s = ip4s; pr->pr_ip4 = ip4; ip4 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip4(tpr, NULL)) { redo_ip4 = 1; descend = 0; } } } #endif #ifdef INET6 redo_ip6 = 0; if (pr_flags & PR_IP6_USER) { pr->pr_flags |= PR_IP6; free(pr->pr_ip6, M_PRISON); pr->pr_ip6s = ip6s; pr->pr_ip6 = ip6; ip6 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip6(tpr, NULL)) { redo_ip6 = 1; descend = 0; } } } #endif if (gotslevel) { pr->pr_securelevel = slevel; /* Set all child jails to be at least this level. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_securelevel < slevel) tpr->pr_securelevel = slevel; } if (gotchildmax) { pr->pr_childmax = childmax; /* Set all child jails to under this limit. */ FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level) if (tpr->pr_childmax > childmax - level) tpr->pr_childmax = childmax > level ? childmax - level : 0; } if (gotenforce) { pr->pr_enforce_statfs = enforce; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_enforce_statfs < enforce) tpr->pr_enforce_statfs = enforce; } if (gotrsnum) { pr->pr_devfs_rsnum = rsnum; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) tpr->pr_devfs_rsnum = rsnum; } if (namelc != NULL) { if (ppr == &prison0) strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name)); else snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s", ppr->pr_name, namelc); /* Change this component of child names. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen, strlen(tpr->pr_name + onamelen) + 1); bcopy(pr->pr_name, tpr->pr_name, namelen); } } if (path != NULL) { /* Try to keep a real-rooted full pathname. */ if (fullpath_disabled && path[0] == '/' && strcmp(mypr->pr_path, "/")) snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s", mypr->pr_path, path); else strlcpy(pr->pr_path, path, sizeof(pr->pr_path)); pr->pr_root = root; } if (PR_HOST & ch_flags & ~pr_flags) { if (pr->pr_flags & PR_HOST) { /* * Copy the parent's host info. As with pr_ip4 above, * the lack of a lock on the parent is not a problem; * it is always set with allprison_lock at least * shared, and is held exclusively here. */ strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname, sizeof(pr->pr_hostname)); strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname, sizeof(pr->pr_domainname)); strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid, sizeof(pr->pr_hostuuid)); pr->pr_hostid = pr->pr_parent->pr_hostid; } } else if (host != NULL || domain != NULL || uuid != NULL || gothid) { /* Set this prison, and any descendants without PR_HOST. 
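 * e.g. a child created with host=inherit never sets PR_HOST, so an
 * update like this one on its parent is copied into it as well.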
*/ if (host != NULL) strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname)); if (domain != NULL) strlcpy(pr->pr_domainname, domain, sizeof(pr->pr_domainname)); if (uuid != NULL) strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid)); if (gothid) pr->pr_hostid = hid; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { if (tpr->pr_flags & PR_HOST) descend = 0; else { if (host != NULL) strlcpy(tpr->pr_hostname, pr->pr_hostname, sizeof(tpr->pr_hostname)); if (domain != NULL) strlcpy(tpr->pr_domainname, pr->pr_domainname, sizeof(tpr->pr_domainname)); if (uuid != NULL) strlcpy(tpr->pr_hostuuid, pr->pr_hostuuid, sizeof(tpr->pr_hostuuid)); if (gothid) tpr->pr_hostid = hid; } } } if ((tallow = ch_allow & ~pr_allow)) { /* Clear allow bits in all children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) tpr->pr_allow &= ~tallow; } pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow; /* * Persistent prisons get an extra reference, and prisons losing their * persist flag lose that reference. Only do this for existing prisons * for now, so new ones will remain unseen until after the module * handlers have completed. */ born = pr->pr_uref == 0; if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) { if (pr_flags & PR_PERSIST) { pr->pr_ref++; pr->pr_uref++; } else { pr->pr_ref--; pr->pr_uref--; } } pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags; mtx_unlock(&pr->pr_mtx); #ifdef RACCT if (racct_enable && created) prison_racct_attach(pr); #endif /* Locks may have prevented a complete restriction of child IP * addresses. If so, allocate some more memory and try again. */ #ifdef INET while (redo_ip4) { ip4s = pr->pr_ip4s; ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip4 = 0; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip4(tpr, ip4)) { if (ip4 != NULL) ip4 = NULL; else redo_ip4 = 1; } } mtx_unlock(&pr->pr_mtx); } #endif #ifdef INET6 while (redo_ip6) { ip6s = pr->pr_ip6s; ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip6 = 0; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip6(tpr, ip6)) { if (ip6 != NULL) ip6 = NULL; else redo_ip6 = 1; } } mtx_unlock(&pr->pr_mtx); } #endif /* Let the modules do their work. */ sx_downgrade(&allprison_lock); if (born) { error = osd_jail_call(pr, PR_METHOD_CREATE, opts); if (error) { (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); prison_deref(pr, created ? PD_LIST_SLOCKED : PD_DEREF | PD_LIST_SLOCKED); goto done_errmsg; } } error = osd_jail_call(pr, PR_METHOD_SET, opts); if (error) { if (born) (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); prison_deref(pr, created ? PD_LIST_SLOCKED : PD_DEREF | PD_LIST_SLOCKED); goto done_errmsg; } /* Attach this process to the prison if requested. */ if (flags & JAIL_ATTACH) { mtx_lock(&pr->pr_mtx); error = do_jail_attach(td, pr); if (error) { vfs_opterror(opts, "attach failed"); if (!created) prison_deref(pr, PD_DEREF); goto done_errmsg; } } #ifdef RACCT if (racct_enable && !created) { if (!(flags & JAIL_ATTACH)) sx_sunlock(&allprison_lock); prison_racct_modify(pr); if (!(flags & JAIL_ATTACH)) sx_slock(&allprison_lock); } #endif td->td_retval[0] = pr->pr_id; /* * Now that it is all there, drop the temporary reference from existing * prisons. 
Or add a reference to newly created persistent prisons * (which was not done earlier so that the prison would not be publicly * visible). */ if (!created) { prison_deref(pr, (flags & JAIL_ATTACH) ? PD_DEREF : PD_DEREF | PD_LIST_SLOCKED); } else { if (pr_flags & PR_PERSIST) { mtx_lock(&pr->pr_mtx); pr->pr_ref++; pr->pr_uref++; mtx_unlock(&pr->pr_mtx); } if (!(flags & JAIL_ATTACH)) sx_sunlock(&allprison_lock); } goto done_free; done_deref_locked: prison_deref(pr, created ? PD_LOCKED | PD_LIST_XLOCKED : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); goto done_releroot; done_unlock_list: sx_xunlock(&allprison_lock); done_releroot: if (root != NULL) vrele(root); done_errmsg: if (error) { if (vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len) == 0 && errmsg_len > 0) { errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1; if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } done_free: #ifdef INET free(ip4, M_PRISON); #endif #ifdef INET6 free(ip6, M_PRISON); #endif if (g_path != NULL) free(g_path, M_TEMP); vfs_freeopts(opts); return (error); } /* * struct jail_get_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_get(struct thread *td, struct jail_get_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. */ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_get(td, auio, uap->flags); if (error == 0) error = copyout(auio->uio_iov, uap->iovp, uap->iovcnt * sizeof (struct iovec)); free(auio, M_IOV); return (error); } int kern_jail_get(struct thread *td, struct uio *optuio, int flags) { struct prison *pr, *mypr; struct vfsopt *opt; struct vfsoptlist *opts; char *errmsg, *name; int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos; if (flags & ~JAIL_GET_MASK) return (EINVAL); /* Get the parameter list. */ error = vfs_buildopts(optuio, &opts); if (error) return (error); errmsg_pos = vfs_getopt_pos(opts, "errmsg"); mypr = td->td_ucred->cr_prison; /* * Find the prison specified by one of: lastjid, jid, name. 
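 *
 * The lastjid form is what drives enumeration: userland calls
 * jail_get(2) repeatedly, feeding back the jid it just received,
 * until ENOENT. An illustrative sketch:
 *
 *	int jid, lastjid = 0;
 *	struct iovec iov[2];
 *
 *	iov[0].iov_base = __DECONST(char *, "lastjid");
 *	iov[0].iov_len = sizeof("lastjid");
 *	iov[1].iov_base = &lastjid;
 *	iov[1].iov_len = sizeof(lastjid);
 *	while ((jid = jail_get(iov, 2, 0)) >= 0)
 *		lastjid = jid;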
*/ sx_slock(&allprison_lock); error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid)); if (error == 0) { TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id > jid && prison_ischild(mypr, pr)) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0 && (pr->pr_uref > 0 || (flags & JAIL_DYING))) break; mtx_unlock(&pr->pr_mtx); } } if (pr != NULL) goto found_prison; error = ENOENT; vfs_opterror(opts, "no jail after %d", jid); goto done_unlock_list; } else if (error != ENOENT) goto done_unlock_list; error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == 0) { if (jid != 0) { pr = prison_find_child(mypr, jid); if (pr != NULL) { if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done_unlock_list; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_unlock_list; } } else if (error != ENOENT) goto done_unlock_list; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == 0) { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done_unlock_list; } pr = prison_find_name(mypr, name); if (pr != NULL) { if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_unlock_list; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_unlock_list; } else if (error != ENOENT) goto done_unlock_list; vfs_opterror(opts, "no jail specified"); error = ENOENT; goto done_unlock_list; found_prison: /* Get the parameters of the prison. */ pr->pr_ref++; locked = PD_LOCKED; td->td_retval[0] = pr->pr_id; error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id)); if (error != 0 && error != ENOENT) goto done_deref; i = (pr->pr_parent == mypr) ? 
0 : pr->pr_parent->pr_id; error = vfs_setopt(opts, "parent", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "name", prison_name(mypr, pr)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id, sizeof(pr->pr_cpuset->cs_id)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "path", prison_path(mypr, pr)); if (error != 0 && error != ENOENT) goto done_deref; #ifdef INET error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4, pr->pr_ip4s * sizeof(*pr->pr_ip4)); if (error != 0 && error != ENOENT) goto done_deref; #endif #ifdef INET6 error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6, pr->pr_ip6s * sizeof(*pr->pr_ip6)); if (error != 0 && error != ENOENT) goto done_deref; #endif error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel, sizeof(pr->pr_securelevel)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "children.cur", &pr->pr_childcount, sizeof(pr->pr_childcount)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "children.max", &pr->pr_childmax, sizeof(pr->pr_childmax)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "host.hostname", pr->pr_hostname); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "host.domainname", pr->pr_domainname); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid); if (error != 0 && error != ENOENT) goto done_deref; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32 = pr->pr_hostid; error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32)); } else #endif error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid, sizeof(pr->pr_hostid)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs, sizeof(pr->pr_enforce_statfs)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum, sizeof(pr->pr_devfs_rsnum)); if (error != 0 && error != ENOENT) goto done_deref; for (fi = 0; fi < nitems(pr_flag_names); fi++) { if (pr_flag_names[fi] == NULL) continue; i = (pr->pr_flags & (1 << fi)) ? 1 : 0; error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; i = !i; error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; } for (fi = 0; fi < nitems(pr_flag_jailsys); fi++) { i = pr->pr_flags & (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new); i = pr_flag_jailsys[fi].disable && (i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE : (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; } for (fi = 0; fi < nitems(pr_allow_names); fi++) { if (pr_allow_names[fi] == NULL) continue; i = (pr->pr_allow & (1 << fi)) ? 
1 : 0; error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; i = !i; error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; } i = (pr->pr_uref == 0); error = vfs_setopt(opts, "dying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; i = !i; error = vfs_setopt(opts, "nodying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate, sizeof(pr->pr_osreldate)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "osrelease", pr->pr_osrelease); if (error != 0 && error != ENOENT) goto done_deref; /* Get the module parameters. */ mtx_unlock(&pr->pr_mtx); locked = 0; error = osd_jail_call(pr, PR_METHOD_GET, opts); if (error) goto done_deref; prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED); /* By now, all parameters should have been noted. */ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done_errmsg; } } /* Write the fetched parameters back to userspace. */ error = 0; TAILQ_FOREACH(opt, opts, link) { if (opt->pos >= 0 && opt->pos != errmsg_pos) { pos = 2 * opt->pos + 1; optuio->uio_iov[pos].iov_len = opt->len; if (opt->value != NULL) { if (optuio->uio_segflg == UIO_SYSSPACE) { bcopy(opt->value, optuio->uio_iov[pos].iov_base, opt->len); } else { error = copyout(opt->value, optuio->uio_iov[pos].iov_base, opt->len); if (error) break; } } } } goto done_errmsg; done_deref: prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED); goto done_errmsg; done_unlock_list: sx_sunlock(&allprison_lock); done_errmsg: if (error && errmsg_pos >= 0) { vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len); errmsg_pos = 2 * errmsg_pos + 1; if (errmsg_len > 0) { if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } vfs_freeopts(opts); return (error); } /* * struct jail_remove_args { * int jid; * }; */ int sys_jail_remove(struct thread *td, struct jail_remove_args *uap) { struct prison *pr, *cpr, *lpr, *tpr; int descend, error; error = priv_check(td, PRIV_JAIL_REMOVE); if (error) return (error); sx_xlock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_xunlock(&allprison_lock); return (EINVAL); } /* Remove all descendants of this prison, then remove this prison. */ pr->pr_ref++; if (!LIST_EMPTY(&pr->pr_children)) { mtx_unlock(&pr->pr_mtx); lpr = NULL; FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { mtx_lock(&cpr->pr_mtx); if (cpr->pr_ref > 0) { tpr = cpr; cpr->pr_ref++; } else { /* Already removed - do not do it again. */ tpr = NULL; } mtx_unlock(&cpr->pr_mtx); if (lpr != NULL) { mtx_lock(&lpr->pr_mtx); prison_remove_one(lpr); sx_xlock(&allprison_lock); } lpr = tpr; } if (lpr != NULL) { mtx_lock(&lpr->pr_mtx); prison_remove_one(lpr); sx_xlock(&allprison_lock); } mtx_lock(&pr->pr_mtx); } prison_remove_one(pr); return (0); } static void prison_remove_one(struct prison *pr) { struct proc *p; int deuref; /* If the prison was persistent, it is not anymore. */ deuref = 0; if (pr->pr_flags & PR_PERSIST) { pr->pr_ref--; deuref = PD_DEUREF; pr->pr_flags &= ~PR_PERSIST; } /* * jail_remove added a reference. If that's the only one, remove * the prison now. 
*/ KASSERT(pr->pr_ref > 0, ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id)); if (pr->pr_ref == 1) { prison_deref(pr, deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); return; } mtx_unlock(&pr->pr_mtx); sx_xunlock(&allprison_lock); /* * Kill all processes unfortunate enough to be attached to this prison. */ sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { PROC_LOCK(p); if (p->p_state != PRS_NEW && p->p_ucred && p->p_ucred->cr_prison == pr) kern_psignal(p, SIGKILL); PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); /* Remove the temporary reference added by jail_remove. */ prison_deref(pr, deuref | PD_DEREF); } /* * struct jail_attach_args { * int jid; * }; */ int sys_jail_attach(struct thread *td, struct jail_attach_args *uap) { struct prison *pr; int error; error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); /* * Start with exclusive hold on allprison_lock to ensure that a possible * PR_METHOD_REMOVE call isn't concurrent with jail_set or jail_remove. * But then immediately downgrade it since we don't need to stop * readers. */ sx_xlock(&allprison_lock); sx_downgrade(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_sunlock(&allprison_lock); return (EINVAL); } /* * Do not allow a process to attach to a prison that is not * considered to be "alive". */ if (pr->pr_uref == 0) { mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); return (EINVAL); } return (do_jail_attach(td, pr)); } static int do_jail_attach(struct thread *td, struct prison *pr) { struct proc *p; struct ucred *newcred, *oldcred; int error; /* * XXX: Note that there is a slight race here if two threads * in the same privileged process attempt to attach to two * different jails at the same time. It is important for * user processes not to do this, or they might end up with * a process root from one prison, but attached to the jail * of another. */ pr->pr_ref++; pr->pr_uref++; mtx_unlock(&pr->pr_mtx); /* Let modules do whatever they need to prepare for attaching. */ error = osd_jail_call(pr, PR_METHOD_ATTACH, td); if (error) { prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED); return (error); } sx_sunlock(&allprison_lock); /* * Reparent the newly attached process to this jail. */ p = td->td_proc; error = cpuset_setproc_update_set(p, pr->pr_cpuset); if (error) goto e_revert_osd; vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); if ((error = change_dir(pr->pr_root, td)) != 0) goto e_unlock; #ifdef MAC if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) goto e_unlock; #endif VOP_UNLOCK(pr->pr_root, 0); if ((error = pwd_chroot(td, pr->pr_root))) goto e_revert_osd; newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); newcred->cr_prison = pr; proc_set_cred(p, newcred); setsugid(p); PROC_UNLOCK(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif prison_deref(oldcred->cr_prison, PD_DEREF | PD_DEUREF); crfree(oldcred); return (0); e_unlock: VOP_UNLOCK(pr->pr_root, 0); e_revert_osd: /* Tell modules this thread is still in its old jail after all. */ (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td); prison_deref(pr, PD_DEREF | PD_DEUREF); return (error); } /* * Returns a locked prison instance, or NULL on failure. 
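*/

/*
 * The userland sequence that lands in do_jail_attach() above (an
 * illustrative sketch, not part of this file): resolve the jail name to a
 * jid with jail_getid(3) from libjail, then move the current process into
 * the jail with jail_attach(2), which also chroots it to the jail root.
 */
#include <sys/param.h>
#include <sys/jail.h>
#include <err.h>
#include <jail.h>

static void
enter_jail(const char *name)
{
	int jid;

	jid = jail_getid(name);
	if (jid == -1)
		errx(1, "unknown jail: %s", name);
	if (jail_attach(jid) == -1)
		err(1, "jail_attach");
}

/*
 * (prison_find(), documented above, follows:)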
*/ struct prison * prison_find(int prid) { struct prison *pr; sx_assert(&allprison_lock, SX_LOCKED); TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id == prid) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0) return (pr); mtx_unlock(&pr->pr_mtx); } } return (NULL); } /* * Find a prison that is a descendant of mypr. Returns a locked prison or NULL. */ struct prison * prison_find_child(struct prison *mypr, int prid) { struct prison *pr; int descend; sx_assert(&allprison_lock, SX_LOCKED); FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (pr->pr_id == prid) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0) return (pr); mtx_unlock(&pr->pr_mtx); } } return (NULL); } /* * Look for the name relative to mypr. Returns a locked prison or NULL. */ struct prison * prison_find_name(struct prison *mypr, const char *name) { struct prison *pr, *deadpr; size_t mylen; int descend; sx_assert(&allprison_lock, SX_LOCKED); mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1; again: deadpr = NULL; FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (!strcmp(pr->pr_name + mylen, name)) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0) { if (pr->pr_uref > 0) return (pr); deadpr = pr; } mtx_unlock(&pr->pr_mtx); } } /* There was no valid prison - perhaps there was a dying one. */ if (deadpr != NULL) { mtx_lock(&deadpr->pr_mtx); if (deadpr->pr_ref == 0) { mtx_unlock(&deadpr->pr_mtx); goto again; } } return (deadpr); } /* * See if a prison has a specific flag set. */ int prison_flag(struct ucred *cred, unsigned flag) { /* This is an atomic read, so no locking is necessary. */ return (cred->cr_prison->pr_flags & flag); } int prison_allow(struct ucred *cred, unsigned flag) { /* This is an atomic read, so no locking is necessary. */ return (cred->cr_prison->pr_allow & flag); } /* * Remove a prison reference. If that was the last reference, remove the * prison itself - but not in this context in case there are locks held. */ void prison_free_locked(struct prison *pr) { int ref; mtx_assert(&pr->pr_mtx, MA_OWNED); ref = --pr->pr_ref; mtx_unlock(&pr->pr_mtx); if (ref == 0) taskqueue_enqueue(taskqueue_thread, &pr->pr_task); } void prison_free(struct prison *pr) { mtx_lock(&pr->pr_mtx); prison_free_locked(pr); } /* * Complete a call to either prison_free or prison_proc_free. */ static void prison_complete(void *context, int pending) { struct prison *pr = context; sx_xlock(&allprison_lock); mtx_lock(&pr->pr_mtx); prison_deref(pr, pr->pr_uref ? PD_DEREF | PD_DEUREF | PD_LOCKED | PD_LIST_XLOCKED : PD_LOCKED | PD_LIST_XLOCKED); } /* * Remove a prison reference (usually). This internal version assumes no * mutexes are held, except perhaps the prison itself. If there are no more * references, release and delist the prison. On completion, the prison lock * and the allprison lock are both unlocked. */ static void prison_deref(struct prison *pr, int flags) { struct prison *ppr, *tpr; int ref, lasturef; if (!(flags & PD_LOCKED)) mtx_lock(&pr->pr_mtx); for (;;) { if (flags & PD_DEUREF) { KASSERT(pr->pr_uref > 0, ("prison_deref PD_DEUREF on a dead prison (jid=%d)", pr->pr_id)); pr->pr_uref--; lasturef = pr->pr_uref == 0; if (lasturef) pr->pr_ref++; KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0")); } else lasturef = 0; if (flags & PD_DEREF) { KASSERT(pr->pr_ref > 0, ("prison_deref PD_DEREF on a dead prison (jid=%d)", pr->pr_id)); pr->pr_ref--; } ref = pr->pr_ref; mtx_unlock(&pr->pr_mtx); /* * Tell the modules if the last user reference was removed * (even if it sticks around in dying state).
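 * This is where the PR_METHOD_REMOVE methods that modules registered via
 * osd get to run, which is why the allprison lock may have to be taken
 * below before making the call.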
*/ if (lasturef) { if (!(flags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) { sx_xlock(&allprison_lock); flags |= PD_LIST_XLOCKED; } (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); mtx_lock(&pr->pr_mtx); ref = --pr->pr_ref; mtx_unlock(&pr->pr_mtx); } /* If the prison still has references, nothing else to do. */ if (ref > 0) { if (flags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); else if (flags & PD_LIST_XLOCKED) sx_xunlock(&allprison_lock); return; } if (flags & PD_LIST_SLOCKED) { if (!sx_try_upgrade(&allprison_lock)) { sx_sunlock(&allprison_lock); sx_xlock(&allprison_lock); } } else if (!(flags & PD_LIST_XLOCKED)) sx_xlock(&allprison_lock); TAILQ_REMOVE(&allprison, pr, pr_list); LIST_REMOVE(pr, pr_sibling); ppr = pr->pr_parent; for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount--; sx_xunlock(&allprison_lock); #ifdef VIMAGE if (pr->pr_vnet != ppr->pr_vnet) vnet_destroy(pr->pr_vnet); #endif if (pr->pr_root != NULL) vrele(pr->pr_root); mtx_destroy(&pr->pr_mtx); #ifdef INET free(pr->pr_ip4, M_PRISON); #endif #ifdef INET6 free(pr->pr_ip6, M_PRISON); #endif if (pr->pr_cpuset != NULL) cpuset_rel(pr->pr_cpuset); osd_jail_exit(pr); #ifdef RACCT if (racct_enable) prison_racct_detach(pr); #endif free(pr, M_PRISON); /* Removing a prison frees a reference on its parent. */ pr = ppr; mtx_lock(&pr->pr_mtx); flags = PD_DEREF | PD_DEUREF; } } void prison_hold_locked(struct prison *pr) { mtx_assert(&pr->pr_mtx, MA_OWNED); KASSERT(pr->pr_ref > 0, ("Trying to hold dead prison (jid=%d).", pr->pr_id)); pr->pr_ref++; } void prison_hold(struct prison *pr) { mtx_lock(&pr->pr_mtx); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); } void prison_proc_hold(struct prison *pr) { mtx_lock(&pr->pr_mtx); KASSERT(pr->pr_uref > 0, ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id)); pr->pr_uref++; mtx_unlock(&pr->pr_mtx); } void prison_proc_free(struct prison *pr) { mtx_lock(&pr->pr_mtx); KASSERT(pr->pr_uref > 0, ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id)); if (pr->pr_uref > 1) pr->pr_uref--; else { /* * Don't remove the last user reference in this context, which * is expected to be a process that is not only locked, but * also half dead. */ pr->pr_ref++; mtx_unlock(&pr->pr_mtx); taskqueue_enqueue(taskqueue_thread, &pr->pr_task); return; } mtx_unlock(&pr->pr_mtx); } /* * Check if a jail supports the given address family. * * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT * if not. */ int prison_check_af(struct ucred *cred, int af) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); pr = cred->cr_prison; #ifdef VIMAGE /* Prisons with their own network stack are not limited. */ if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (af) { #ifdef INET case AF_INET: if (pr->pr_flags & PR_IP4) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif #ifdef INET6 case AF_INET6: if (pr->pr_flags & PR_IP6) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif case AF_LOCAL: case AF_ROUTE: break; default: if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Check if given address belongs to the jail referenced by cred (wrapper to * prison_check_ip[46]). 
* * Returns 0 if the jail doesn't restrict the address family or if the address * belongs to the jail, EADDRNOTAVAIL if the address doesn't belong, or * EAFNOSUPPORT if the jail doesn't allow the address family. The IPv4 address * is passed in network byte order (NBO). */ int prison_if(struct ucred *cred, struct sockaddr *sa) { #ifdef INET struct sockaddr_in *sai; #endif #ifdef INET6 struct sockaddr_in6 *sai6; #endif int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(sa != NULL, ("%s: sa is NULL", __func__)); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (sa->sa_family) { #ifdef INET case AF_INET: sai = (struct sockaddr_in *)sa; error = prison_check_ip4(cred, &sai->sin_addr); break; #endif #ifdef INET6 case AF_INET6: sai6 = (struct sockaddr_in6 *)sa; error = prison_check_ip6(cred, &sai6->sin6_addr); break; #endif default: if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. */ int prison_check(struct ucred *cred1, struct ucred *cred2) { return ((cred1->cr_prison == cred2->cr_prison || prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH); } /* * Return 1 if p2 is a child of p1, otherwise 0. */ int prison_ischild(struct prison *pr1, struct prison *pr2) { for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent) if (pr1 == pr2) return (1); return (0); } /* * Return 1 if the passed credential is in a jail, otherwise 0. */ int jailed(struct ucred *cred) { return (cred->cr_prison != &prison0); } /* * Return 1 if the passed credential is in a jail and that jail does not * have its own virtual network stack, otherwise 0. */ int jailed_without_vnet(struct ucred *cred) { if (!jailed(cred)) return (0); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (0); #endif return (1); } /* * Return the correct hostname (domainname, et al) for the passed credential. */ void getcredhostname(struct ucred *cred, char *buf, size_t size) { struct prison *pr; /* * A NULL credential can be used to shortcut to the physical * system's hostname. */ pr = (cred != NULL) ? cred->cr_prison : &prison0; mtx_lock(&pr->pr_mtx); strlcpy(buf, pr->pr_hostname, size); mtx_unlock(&pr->pr_mtx); } void getcreddomainname(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_domainname, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostuuid(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_hostuuid, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostid(struct ucred *cred, unsigned long *hostid) { mtx_lock(&cred->cr_prison->pr_mtx); *hostid = cred->cr_prison->pr_hostid; mtx_unlock(&cred->cr_prison->pr_mtx); } #ifdef VIMAGE /* * Determine whether the prison represented by cred owns * its vnet rather than having it inherited. * * Returns 1 in case the prison owns the vnet, 0 otherwise. */ int prison_owns_vnet(struct ucred *cred) { /* * vnets cannot be added/removed after jail creation, * so no need to lock here. */ return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0); } #endif /* * Determine whether the subject represented by cred can "see" the * status of a mount point. * Returns: 0 for permitted, ENOENT otherwise. * XXX: This function should be called cr_canseemount() and should be * placed in kern_prot.c.
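*/

/*
 * jailed() above is what backs the security.jail.jailed sysctl defined
 * later in this file, so a userland process can ask the same question.
 * A minimal sketch:
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int injail;
	size_t len = sizeof(injail);

	if (sysctlbyname("security.jail.jailed", &injail, &len, NULL, 0) == -1)
		return (1);
	printf("%s\n", injail ? "jailed" : "not jailed");
	return (0);
}

/*
 * (prison_canseemount(), documented above, follows:)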
*/ int prison_canseemount(struct ucred *cred, struct mount *mp) { struct prison *pr; struct statfs *sp; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return (0); if (pr->pr_root->v_mount == mp) return (0); if (pr->pr_enforce_statfs == 2) return (ENOENT); /* * If the jail's chroot directory is set to "/" we should be able to see * all mount-points from inside the jail. * This is an ugly check, but it is the only situation in which a jail's * directory ends with '/'. */ if (strcmp(pr->pr_path, "/") == 0) return (0); len = strlen(pr->pr_path); sp = &mp->mnt_stat; if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0) return (ENOENT); /* * Be sure that we don't have a situation where the jail's root directory * is "/some/path" and the mount point is "/some/pathpath". */ if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/') return (ENOENT); return (0); } void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp) { char jpath[MAXPATHLEN]; struct prison *pr; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return; if (prison_canseemount(cred, mp) != 0) { bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); strlcpy(sp->f_mntonname, "[restricted]", sizeof(sp->f_mntonname)); return; } if (pr->pr_root->v_mount == mp) { /* * Clear current buffer data, so we are sure nothing from * the valid path is left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); *sp->f_mntonname = '/'; return; } /* * If the jail's chroot directory is set to "/" we should be able to see * all mount-points from inside the jail. */ if (strcmp(pr->pr_path, "/") == 0) return; len = strlen(pr->pr_path); strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath)); /* * Clear current buffer data, so we are sure nothing from * the valid path is left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); if (*jpath == '\0') { /* Should never happen. */ *sp->f_mntonname = '/'; } else { strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname)); } } /* * Check whether a specific privilege is granted within jail. We * have a specific list of accepted privileges; the rest are denied. */ int prison_priv_check(struct ucred *cred, int priv) { if (!jailed(cred)) return (0); #ifdef VIMAGE /* * Privileges specific to prisons with a virtual network stack. * There might be a duplicate entry here in case the privilege * is only granted conditionally in the legacy jail case. */ switch (priv) { #ifdef notyet /* * NFS-specific privileges. */ case PRIV_NFS_DAEMON: case PRIV_NFS_LOCKD: #endif /* * Network stack privileges. */ case PRIV_NET_BRIDGE: case PRIV_NET_GRE: case PRIV_NET_BPF: case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */ case PRIV_NET_ROUTE: case PRIV_NET_TAP: case PRIV_NET_SETIFMTU: case PRIV_NET_SETIFFLAGS: case PRIV_NET_SETIFCAP: case PRIV_NET_SETIFDESCR: case PRIV_NET_SETIFNAME: case PRIV_NET_SETIFMETRIC: case PRIV_NET_SETIFPHYS: case PRIV_NET_SETIFMAC: case PRIV_NET_ADDMULTI: case PRIV_NET_DELMULTI: case PRIV_NET_HWIOCTL: case PRIV_NET_SETLLADDR: case PRIV_NET_ADDIFGROUP: case PRIV_NET_DELIFGROUP: case PRIV_NET_IFCREATE: case PRIV_NET_IFDESTROY: case PRIV_NET_ADDIFADDR: case PRIV_NET_DELIFADDR: case PRIV_NET_LAGG: case PRIV_NET_GIF: case PRIV_NET_SETIFVNET: case PRIV_NET_SETIFFIB: /* * 802.11-related privileges. */ case PRIV_NET80211_GETKEY: #ifdef notyet case PRIV_NET80211_MANAGE: /* XXX-BZ discuss with sam@ */ #endif #ifdef notyet /* * ATM privileges. */ case PRIV_NETATM_CFG: case PRIV_NETATM_ADD: case PRIV_NETATM_DEL: case PRIV_NETATM_SET: /* * Bluetooth privileges.
*/ case PRIV_NETBLUETOOTH_RAW: #endif /* * Netgraph and netgraph module privileges. */ case PRIV_NETGRAPH_CONTROL: #ifdef notyet case PRIV_NETGRAPH_TTY: #endif /* * IPv4 and IPv6 privileges. */ case PRIV_NETINET_IPFW: case PRIV_NETINET_DIVERT: case PRIV_NETINET_PF: case PRIV_NETINET_DUMMYNET: case PRIV_NETINET_CARP: case PRIV_NETINET_MROUTE: case PRIV_NETINET_RAW: case PRIV_NETINET_ADDRCTRL6: case PRIV_NETINET_ND6: case PRIV_NETINET_SCOPE6: case PRIV_NETINET_ALIFETIME6: case PRIV_NETINET_IPSEC: case PRIV_NETINET_BINDANY: #ifdef notyet /* * NCP privileges. */ case PRIV_NETNCP: /* * SMB privileges. */ case PRIV_NETSMB: #endif /* * No default: or deny here. * In case of no permit fall through to next switch(). */ if (cred->cr_prison->pr_flags & PR_VNET) return (0); } #endif /* VIMAGE */ switch (priv) { /* * Allow ktrace privileges for root in jail. */ case PRIV_KTRACE: #if 0 /* * Allow jailed processes to configure audit identity and * submit audit records (login, etc). In the future we may * want to further refine the relationship between audit and * jail. */ case PRIV_AUDIT_GETAUDIT: case PRIV_AUDIT_SETAUDIT: case PRIV_AUDIT_SUBMIT: #endif /* * Allow jailed processes to manipulate process UNIX * credentials in any way they see fit. */ case PRIV_CRED_SETUID: case PRIV_CRED_SETEUID: case PRIV_CRED_SETGID: case PRIV_CRED_SETEGID: case PRIV_CRED_SETGROUPS: case PRIV_CRED_SETREUID: case PRIV_CRED_SETREGID: case PRIV_CRED_SETRESUID: case PRIV_CRED_SETRESGID: /* * Jail implements visibility constraints already, so allow * jailed root to override uid/gid-based constraints. */ case PRIV_SEEOTHERGIDS: case PRIV_SEEOTHERUIDS: /* * Jail implements inter-process debugging limits already, so * allow jailed root various debugging privileges. */ case PRIV_DEBUG_DIFFCRED: case PRIV_DEBUG_SUGID: case PRIV_DEBUG_UNPRIV: /* * Allow jail to set various resource limits and login * properties, and for now, exceed process resource limits. */ case PRIV_PROC_LIMIT: case PRIV_PROC_SETLOGIN: case PRIV_PROC_SETRLIMIT: /* * System V and POSIX IPC privileges are granted in jail. */ case PRIV_IPC_READ: case PRIV_IPC_WRITE: case PRIV_IPC_ADMIN: case PRIV_IPC_MSGSIZE: case PRIV_MQ_ADMIN: /* * Jail operations within a jail work on child jails. */ case PRIV_JAIL_ATTACH: case PRIV_JAIL_SET: case PRIV_JAIL_REMOVE: /* * Jail implements its own inter-process limits, so allow * root processes in jail to change scheduling on other * processes in the same jail. Likewise for signalling. */ case PRIV_SCHED_DIFFCRED: case PRIV_SCHED_CPUSET: case PRIV_SIGNAL_DIFFCRED: case PRIV_SIGNAL_SUGID: /* * Allow jailed processes to write to sysctls marked as jail * writable. */ case PRIV_SYSCTL_WRITEJAIL: /* * Allow root in jail to manage a variety of quota * properties. These should likely be conditional on a * configuration option. */ case PRIV_VFS_GETQUOTA: case PRIV_VFS_SETQUOTA: /* * Since Jail relies on chroot() to implement file system * protections, grant many VFS privileges to root in jail. * Be careful to exclude mount-related and NFS-related * privileges. */ case PRIV_VFS_READ: case PRIV_VFS_WRITE: case PRIV_VFS_ADMIN: case PRIV_VFS_EXEC: case PRIV_VFS_LOOKUP: case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. 
*/ case PRIV_VFS_CHFLAGS_DEV: case PRIV_VFS_CHOWN: case PRIV_VFS_CHROOT: case PRIV_VFS_RETAINSUGID: case PRIV_VFS_FCHROOT: case PRIV_VFS_LINK: case PRIV_VFS_SETGID: case PRIV_VFS_STAT: case PRIV_VFS_STICKYFILE: /* * As in the non-jail case, non-root users are expected to be * able to read kernel/physical memory (provided /dev/[k]mem * exists in the jail and they have permission to access it). */ case PRIV_KMEM_READ: return (0); /* * Depending on the global setting, allow privilege of * setting system flags. */ case PRIV_VFS_SYSFLAGS: if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS) return (0); else return (EPERM); /* * Depending on the global setting, allow privilege of * mounting/unmounting file systems. */ case PRIV_VFS_MOUNT: case PRIV_VFS_UNMOUNT: case PRIV_VFS_MOUNT_NONUSER: case PRIV_VFS_MOUNT_OWNER: if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT && cred->cr_prison->pr_enforce_statfs < 2) return (0); else return (EPERM); /* * Conditionally allow jailed root to bind reserved ports. */ case PRIV_NETINET_RESERVEDPORT: if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS) return (0); else return (EPERM); /* * Allow jailed root to reuse in-use ports. */ case PRIV_NETINET_REUSEPORT: return (0); /* * Allow jailed root to set certain IPv4/6 (option) headers. */ case PRIV_NETINET_SETHDROPTS: return (0); /* * Conditionally allow creating raw sockets in jail. */ case PRIV_NETINET_RAW: if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS) return (0); else return (EPERM); /* * Since jail implements its own visibility limits on netstat * sysctls, allow getcred. This allows identd to work in * jail. */ case PRIV_NETINET_GETCRED: return (0); /* * Allow jailed root to set loginclass. */ case PRIV_PROC_SETLOGINCLASS: return (0); default: /* * In all remaining cases, deny the privilege request. This * includes almost all network privileges and many system * configuration privileges. */ return (EPERM); } } /* * Return the part of pr2's name that is relative to pr1, or the whole name * if it does not directly follow. */ char * prison_name(struct prison *pr1, struct prison *pr2) { char *name; /* Jails see themselves as "0" (if they see themselves at all). */ if (pr1 == pr2) return "0"; name = pr2->pr_name; if (prison_ischild(pr1, pr2)) { /* * pr1 isn't locked (and allprison_lock may not be either) * so its length can't be counted on. But the number of dots * can be counted on - and counted. */ for (; pr1 != &prison0; pr1 = pr1->pr_parent) name = strchr(name, '.') + 1; } return (name); } /* * Return the part of pr2's path that is relative to pr1, or the whole path * if it does not directly follow. */ static char * prison_path(struct prison *pr1, struct prison *pr2) { char *path1, *path2; int len1; path1 = pr1->pr_path; path2 = pr2->pr_path; if (!strcmp(path1, "/")) return (path2); len1 = strlen(path1); if (strncmp(path1, path2, len1)) return (path2); if (path2[len1] == '\0') return "/"; if (path2[len1] == '/') return (path2 + len1); return (path2); } /* * Jail-related sysctls.
*/ static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0, "Jails"); static int sysctl_jail_list(SYSCTL_HANDLER_ARGS) { struct xprison *xp; struct prison *pr, *cpr; #ifdef INET struct in_addr *ip4 = NULL; int ip4s = 0; #endif #ifdef INET6 struct in6_addr *ip6 = NULL; int ip6s = 0; #endif int descend, error; xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK); pr = req->td->td_ucred->cr_prison; error = 0; sx_slock(&allprison_lock); FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { #if defined(INET) || defined(INET6) again: #endif mtx_lock(&cpr->pr_mtx); #ifdef INET if (cpr->pr_ip4s > 0) { if (ip4s < cpr->pr_ip4s) { ip4s = cpr->pr_ip4s; mtx_unlock(&cpr->pr_mtx); ip4 = realloc(ip4, ip4s * sizeof(struct in_addr), M_TEMP, M_WAITOK); goto again; } bcopy(cpr->pr_ip4, ip4, cpr->pr_ip4s * sizeof(struct in_addr)); } #endif #ifdef INET6 if (cpr->pr_ip6s > 0) { if (ip6s < cpr->pr_ip6s) { ip6s = cpr->pr_ip6s; mtx_unlock(&cpr->pr_mtx); ip6 = realloc(ip6, ip6s * sizeof(struct in6_addr), M_TEMP, M_WAITOK); goto again; } bcopy(cpr->pr_ip6, ip6, cpr->pr_ip6s * sizeof(struct in6_addr)); } #endif if (cpr->pr_ref == 0) { mtx_unlock(&cpr->pr_mtx); continue; } bzero(xp, sizeof(*xp)); xp->pr_version = XPRISON_VERSION; xp->pr_id = cpr->pr_id; xp->pr_state = cpr->pr_uref > 0 ? PRISON_STATE_ALIVE : PRISON_STATE_DYING; strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path)); strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host)); strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name)); #ifdef INET xp->pr_ip4s = cpr->pr_ip4s; #endif #ifdef INET6 xp->pr_ip6s = cpr->pr_ip6s; #endif mtx_unlock(&cpr->pr_mtx); error = SYSCTL_OUT(req, xp, sizeof(*xp)); if (error) break; #ifdef INET if (xp->pr_ip4s > 0) { error = SYSCTL_OUT(req, ip4, xp->pr_ip4s * sizeof(struct in_addr)); if (error) break; } #endif #ifdef INET6 if (xp->pr_ip6s > 0) { error = SYSCTL_OUT(req, ip6, xp->pr_ip6s * sizeof(struct in6_addr)); if (error) break; } #endif } sx_sunlock(&allprison_lock); free(xp, M_TEMP); #ifdef INET free(ip4, M_TEMP); #endif #ifdef INET6 free(ip6, M_TEMP); #endif return (error); } SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_list, "S", "List of active jails"); static int sysctl_jail_jailed(SYSCTL_HANDLER_ARGS) { int error, injail; injail = jailed(req->td->td_ucred); error = SYSCTL_OUT(req, &injail, sizeof(injail)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_jailed, "I", "Process in jail?"); static int sysctl_jail_vnet(SYSCTL_HANDLER_ARGS) { int error, havevnet; #ifdef VIMAGE struct ucred *cred = req->td->td_ucred; havevnet = jailed(cred) && prison_owns_vnet(cred); #else havevnet = 0; #endif error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, vnet, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_vnet, "I", "Jail owns VNET?"); #if defined(INET) || defined(INET6) SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW, &jail_max_af_ips, 0, "Number of IP addresses a jail may have at most per address family (deprecated)"); #endif /* * Default parameters for jail(2) compatibility. For historical reasons, * the sysctl names have varying similarity to the parameter names. Prisons * just see their own parameters, and can't change them. */ static int sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS) { struct prison *pr; int allow, error, i; pr = req->td->td_ucred->cr_prison; allow = (pr == &prison0) ? 
jail_default_allow : pr->pr_allow; /* Get the current flag value, and convert it to a boolean. */ i = (allow & arg2) ? 1 : 0; if (arg1 != NULL) i = !i; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) return (error); i = i ? arg2 : 0; if (arg1 != NULL) i ^= arg2; /* * The sysctls don't have CTLFLAGS_PRISON, so assume prison0 * for writing. */ mtx_lock(&prison0.pr_mtx); jail_default_allow = (jail_default_allow & ~arg2) | i; mtx_unlock(&prison0.pr_mtx); return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I", "Processes in jail can set their hostnames (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I", "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I", "Processes in jail can use System V IPC primitives (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I", "Prison root can create raw sockets (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I", "Processes in jail can alter system file flags (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I", "Processes in jail can mount/unmount jail-friendly file systems (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_devfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_DEVFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the devfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_fdescfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_FDESCFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the fdescfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_nullfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_NULLFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the nullfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_procfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_PROCFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the procfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_linprocfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_LINPROCFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the linprocfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_linsysfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_LINSYSFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the linsysfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_tmpfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_TMPFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the tmpfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_zfs_allowed, 
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_ZFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the zfs file system (deprecated)"); static int sysctl_jail_default_level(SYSCTL_HANDLER_ARGS) { struct prison *pr; int level, error; pr = req->td->td_ucred->cr_prison; level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2); error = sysctl_handle_int(oidp, &level, 0, req); if (error || !req->newptr) return (error); *(int *)arg1 = level; return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs), sysctl_jail_default_level, "I", "Processes in jail cannot see all mounted file systems (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum), sysctl_jail_default_level, "I", "Ruleset for the devfs filesystem in jail (deprecated)"); /* * Nodes to describe jail parameters. Maximum length of string parameters * is returned in the string itself, and the other parameters exist merely * to make themselves and their types known. */ SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0, "Jail parameters"); int sysctl_jail_param(SYSCTL_HANDLER_ARGS) { int i; long l; size_t s; char numbuf[12]; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_LONG: case CTLTYPE_ULONG: l = 0; #ifdef SCTL_MASK32 if (!(req->flags & SCTL_MASK32)) #endif return (SYSCTL_OUT(req, &l, sizeof(l))); case CTLTYPE_INT: case CTLTYPE_UINT: i = 0; return (SYSCTL_OUT(req, &i, sizeof(i))); case CTLTYPE_STRING: snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2); return (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req)); case CTLTYPE_STRUCT: s = (size_t)arg2; return (SYSCTL_OUT(req, &s, sizeof(s))); } return (0); } /* * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at * jail creation time but cannot be changed in an existing jail. 
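*/

/*
 * Userland reads each parameter below through jail_get(2) as name/value
 * iovec pairs, which is what the vfs_setopt() calls earlier in this file
 * ultimately fill in. An illustrative sketch (the helper name and error
 * handling are assumptions):
 */
#include <sys/param.h>
#include <sys/jail.h>
#include <sys/uio.h>

static int
get_securelevel(int jid)
{
	struct iovec iov[4];
	int securelevel;

	iov[0].iov_base = __DECONST(char *, "jid");
	iov[0].iov_len = sizeof("jid");
	iov[1].iov_base = &jid;
	iov[1].iov_len = sizeof(jid);
	iov[2].iov_base = __DECONST(char *, "securelevel");
	iov[2].iov_len = sizeof("securelevel");
	iov[3].iov_base = &securelevel;
	iov[3].iov_len = sizeof(securelevel);
	if (jail_get(iov, nitems(iov), 0) == -1)
		return (-1);
	return (securelevel);
}

/*
 * (The parameter definitions follow:)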
*/ SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID"); SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID"); SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name"); SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path"); SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail secure level"); SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail value for kern.osreldate and uname -K"); SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN, "Jail value for kern.osrelease and uname -r"); SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail cannot see all mounted file systems"); SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW, "I", "Ruleset for in-jail devfs mounts"); SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail persistence"); #ifdef VIMAGE SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN, "E,jailsys", "Virtual network stack"); #endif SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, "B", "Jail is in the process of shutting down"); SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails"); SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD, "I", "Current number of child jails"); SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW, "I", "Maximum number of child jails"); SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info"); SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail hostname"); SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail NIS domainname"); SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN, "Jail host UUID"); SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW, "LU", "Jail host ID"); SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset"); SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID"); #ifdef INET SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN, "Jail IPv4 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr), "S,in_addr,a", "Jail IPv4 addresses"); SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv4 source address selection rather than the " "primary jail IPv4 address."); #endif #ifdef INET6 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN, "Jail IPv6 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr), "S,in6_addr,a", "Jail IPv6 addresses"); SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv6 source address selection rather than the " "primary jail IPv6 address."); #endif SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags"); SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set hostname"); SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may use SYSV IPC"); SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create raw sockets"); SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may alter system file flags"); SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set file quotas"); SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route"); SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may bind sockets to reserved ports"); SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission 
flags"); SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount/unmount jail-friendly file systems in general"); SYSCTL_JAIL_PARAM(_allow_mount, devfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the devfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, fdescfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the fdescfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, nullfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the nullfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, procfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the procfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, linprocfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the linprocfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, linsysfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the linsysfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, tmpfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the tmpfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, zfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the zfs file system"); #ifdef RACCT void prison_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void (*pre)(void), void (*post)(void), void *arg2, void *arg3) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_slock(&allprison_lock); if (pre != NULL) (pre)(); LIST_FOREACH(prr, &allprison_racct, prr_next) (callback)(prr->prr_racct, arg2, arg3); if (post != NULL) (post)(); sx_sunlock(&allprison_lock); } static struct prison_racct * prison_racct_find_locked(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN) return (NULL); LIST_FOREACH(prr, &allprison_racct, prr_next) { if (strcmp(name, prr->prr_name) != 0) continue; /* Found prison_racct with a matching name? */ prison_racct_hold(prr); return (prr); } /* Add new prison_racct. */ prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK); racct_create(&prr->prr_racct); strcpy(prr->prr_name, name); refcount_init(&prr->prr_refcount, 1); LIST_INSERT_HEAD(&allprison_racct, prr, prr_next); return (prr); } struct prison_racct * prison_racct_find(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_xlock(&allprison_lock); prr = prison_racct_find_locked(name); sx_xunlock(&allprison_lock); return (prr); } void prison_racct_hold(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); refcount_acquire(&prr->prr_refcount); } static void prison_racct_free_locked(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (refcount_release(&prr->prr_refcount)) { racct_destroy(&prr->prr_racct); LIST_REMOVE(prr, prr_next); free(prr, M_PRISON_RACCT); } } void prison_racct_free(struct prison_racct *prr) { int old; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); old = prr->prr_refcount; if (old > 1 && atomic_cmpset_int(&prr->prr_refcount, old, old - 1)) return; sx_xlock(&allprison_lock); prison_racct_free_locked(prr); sx_xunlock(&allprison_lock); } static void prison_racct_attach(struct prison *pr) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); prr = prison_racct_find_locked(pr->pr_name); KASSERT(prr != NULL, ("cannot find prison_racct")); pr->pr_prison_racct = prr; } /* * Handle jail renaming. From the racct point of view, renaming means * moving from one prison_racct to another. 
*/ static void prison_racct_modify(struct prison *pr) { struct proc *p; struct ucred *cred; struct prison_racct *oldprr; ASSERT_RACCT_ENABLED(); sx_slock(&allproc_lock); sx_xlock(&allprison_lock); if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) { sx_xunlock(&allprison_lock); sx_sunlock(&allproc_lock); return; } oldprr = pr->pr_prison_racct; pr->pr_prison_racct = NULL; prison_racct_attach(pr); /* * Move resource utilisation records. */ racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct); /* * Force rctl to reattach rules to processes. */ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); cred = crhold(p->p_ucred); PROC_UNLOCK(p); racct_proc_ucred_changed(p, cred, cred); crfree(cred); } sx_sunlock(&allproc_lock); prison_racct_free_locked(oldprr); sx_xunlock(&allprison_lock); } static void prison_racct_detach(struct prison *pr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); if (pr->pr_prison_racct == NULL) return; prison_racct_free(pr->pr_prison_racct); pr->pr_prison_racct = NULL; } #endif /* RACCT */ #ifdef DDB static void db_show_prison(struct prison *pr) { int fi; #if defined(INET) || defined(INET6) int ii; #endif unsigned jsf; #ifdef INET char ip4buf[INET_ADDRSTRLEN]; #endif #ifdef INET6 char ip6buf[INET6_ADDRSTRLEN]; #endif db_printf("prison %p:\n", pr); db_printf(" jid = %d\n", pr->pr_id); db_printf(" name = %s\n", pr->pr_name); db_printf(" parent = %p\n", pr->pr_parent); db_printf(" ref = %d\n", pr->pr_ref); db_printf(" uref = %d\n", pr->pr_uref); db_printf(" path = %s\n", pr->pr_path); db_printf(" cpuset = %d\n", pr->pr_cpuset ? pr->pr_cpuset->cs_id : -1); #ifdef VIMAGE db_printf(" vnet = %p\n", pr->pr_vnet); #endif db_printf(" root = %p\n", pr->pr_root); db_printf(" securelevel = %d\n", pr->pr_securelevel); db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum); db_printf(" children.max = %d\n", pr->pr_childmax); db_printf(" children.cur = %d\n", pr->pr_childcount); db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children)); db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling)); db_printf(" flags = 0x%x", pr->pr_flags); for (fi = 0; fi < nitems(pr_flag_names); fi++) if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi))) db_printf(" %s", pr_flag_names[fi]); for (fi = 0; fi < nitems(pr_flag_jailsys); fi++) { jsf = pr->pr_flags & (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new); db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name, pr_flag_jailsys[fi].disable && (jsf == pr_flag_jailsys[fi].disable) ? "disable" : (jsf == pr_flag_jailsys[fi].new) ? "new" : "inherit"); } db_printf(" allow = 0x%x", pr->pr_allow); for (fi = 0; fi < nitems(pr_allow_names); fi++) if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi))) db_printf(" %s", pr_allow_names[fi]); db_printf("\n"); db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs); db_printf(" host.hostname = %s\n", pr->pr_hostname); db_printf(" host.domainname = %s\n", pr->pr_domainname); db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid); db_printf(" host.hostid = %lu\n", pr->pr_hostid); #ifdef INET db_printf(" ip4s = %d\n", pr->pr_ip4s); for (ii = 0; ii < pr->pr_ip4s; ii++) db_printf(" %s %s\n", ii == 0 ? "ip4.addr =" : " ", inet_ntoa_r(pr->pr_ip4[ii], ip4buf)); #endif #ifdef INET6 db_printf(" ip6s = %d\n", pr->pr_ip6s); for (ii = 0; ii < pr->pr_ip6s; ii++) db_printf(" %s %s\n", ii == 0 ? 
"ip6.addr =" : " ", ip6_sprintf(ip6buf, &pr->pr_ip6[ii])); #endif } DB_SHOW_COMMAND(prison, db_show_prison_command) { struct prison *pr; if (!have_addr) { /* * Show all prisons in the list, and prison0 which is not * listed. */ db_show_prison(&prison0); if (!db_pager_quit) { TAILQ_FOREACH(pr, &allprison, pr_list) { db_show_prison(pr); if (db_pager_quit) break; } } return; } if (addr == 0) pr = &prison0; else { /* Look for a prison with the ID and with references. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr && pr->pr_ref > 0) break; if (pr == NULL) /* Look again, without requiring a reference. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr) break; if (pr == NULL) /* Assume address points to a valid prison. */ pr = (struct prison *)addr; } db_show_prison(pr); } #endif /* DDB */ Index: head/sys/kern/kern_khelp.c =================================================================== --- head/sys/kern/kern_khelp.c (revision 326270) +++ head/sys/kern/kern_khelp.c (revision 326271) @@ -1,372 +1,374 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010,2013 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Lawrence Stewart while studying at the Centre * for Advanced Internet Architectures, Swinburne University of Technology, * made possible in part by grants from the FreeBSD Foundation and Cisco * University Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include static struct rwlock khelp_list_lock; RW_SYSINIT(khelplistlock, &khelp_list_lock, "helper list lock"); static TAILQ_HEAD(helper_head, helper) helpers = TAILQ_HEAD_INITIALIZER(helpers); /* Private function prototypes. 
*/ static inline void khelp_remove_osd(struct helper *h, struct osd *hosd); void khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags); #define KHELP_LIST_WLOCK() rw_wlock(&khelp_list_lock) #define KHELP_LIST_WUNLOCK() rw_wunlock(&khelp_list_lock) #define KHELP_LIST_RLOCK() rw_rlock(&khelp_list_lock) #define KHELP_LIST_RUNLOCK() rw_runlock(&khelp_list_lock) #define KHELP_LIST_LOCK_ASSERT() rw_assert(&khelp_list_lock, RA_LOCKED) int khelp_register_helper(struct helper *h) { struct helper *tmph; int error, i, inserted; error = inserted = 0; refcount_init(&h->h_refcount, 0); h->h_id = osd_register(OSD_KHELP, NULL, NULL); /* It's only safe to add the hooks after osd_register(). */ for (i = 0; i < h->h_nhooks && !error; i++) { /* We don't require the module to assign hook_helper. */ h->h_hooks[i].hook_helper = h; error = hhook_add_hook_lookup(&h->h_hooks[i], HHOOK_WAITOK); if (error) printf("%s: \"%s\" khelp module unable to " "hook type %d id %d due to error %d\n", __func__, h->h_name, h->h_hooks[i].hook_type, h->h_hooks[i].hook_id, error); } if (error) { for (i--; i >= 0; i--) hhook_remove_hook_lookup(&h->h_hooks[i]); osd_deregister(OSD_KHELP, h->h_id); } else { KHELP_LIST_WLOCK(); /* * Keep list of helpers sorted in descending h_id order. Due to * the way osd_set() works, a sorted list ensures * khelp_init_osd() will operate with improved efficiency. */ TAILQ_FOREACH(tmph, &helpers, h_next) { if (tmph->h_id < h->h_id) { TAILQ_INSERT_BEFORE(tmph, h, h_next); inserted = 1; break; } } if (!inserted) TAILQ_INSERT_TAIL(&helpers, h, h_next); KHELP_LIST_WUNLOCK(); } return (error); } int khelp_deregister_helper(struct helper *h) { struct helper *tmph; int error, i; KHELP_LIST_WLOCK(); if (h->h_refcount > 0) error = EBUSY; else { error = ENOENT; TAILQ_FOREACH(tmph, &helpers, h_next) { if (tmph == h) { TAILQ_REMOVE(&helpers, h, h_next); error = 0; break; } } } KHELP_LIST_WUNLOCK(); if (!error) { for (i = 0; i < h->h_nhooks; i++) hhook_remove_hook_lookup(&h->h_hooks[i]); osd_deregister(OSD_KHELP, h->h_id); } return (error); } int khelp_init_osd(uint32_t classes, struct osd *hosd) { struct helper *h; void *hdata; int error; KASSERT(hosd != NULL, ("struct osd not initialised!")); error = 0; KHELP_LIST_RLOCK(); TAILQ_FOREACH(h, &helpers, h_next) { /* If helper is correct class and needs to store OSD... */ if (h->h_classes & classes && h->h_flags & HELPER_NEEDS_OSD) { hdata = uma_zalloc(h->h_zone, M_NOWAIT); if (hdata == NULL) { error = ENOMEM; break; } osd_set(OSD_KHELP, hosd, h->h_id, hdata); refcount_acquire(&h->h_refcount); } } if (error) { /* Delete OSD that was assigned prior to the error. */ TAILQ_FOREACH(h, &helpers, h_next) { if (h->h_classes & classes) khelp_remove_osd(h, hosd); } } KHELP_LIST_RUNLOCK(); return (error); } int khelp_destroy_osd(struct osd *hosd) { struct helper *h; int error; KASSERT(hosd != NULL, ("struct osd not initialised!")); error = 0; KHELP_LIST_RLOCK(); /* * Clean up all khelp related OSD. * * XXXLAS: Would be nice to use something like osd_exit() here but it * doesn't have the right semantics for this purpose. */ TAILQ_FOREACH(h, &helpers, h_next) khelp_remove_osd(h, hosd); KHELP_LIST_RUNLOCK(); return (error); } static inline void khelp_remove_osd(struct helper *h, struct osd *hosd) { void *hdata; if (h->h_flags & HELPER_NEEDS_OSD) { /* * If the current helper uses OSD and calling osd_get() * on the helper's h_id returns non-NULL, the helper has * OSD attached to 'hosd' which needs to be cleaned up. 
*/ hdata = osd_get(OSD_KHELP, hosd, h->h_id); if (hdata != NULL) { uma_zfree(h->h_zone, hdata); osd_del(OSD_KHELP, hosd, h->h_id); refcount_release(&h->h_refcount); } } } void * khelp_get_osd(struct osd *hosd, int32_t id) { return (osd_get(OSD_KHELP, hosd, id)); } int32_t khelp_get_id(char *hname) { struct helper *h; int32_t id; id = -1; KHELP_LIST_RLOCK(); TAILQ_FOREACH(h, &helpers, h_next) { if (strncmp(h->h_name, hname, HELPER_NAME_MAXLEN) == 0) { id = h->h_id; break; } } KHELP_LIST_RUNLOCK(); return (id); } int khelp_add_hhook(struct hookinfo *hki, uint32_t flags) { int error; /* * XXXLAS: Should probably include the functionality to update the * helper's h_hooks struct member. */ error = hhook_add_hook_lookup(hki, flags); return (error); } int khelp_remove_hhook(struct hookinfo *hki) { int error; /* * XXXLAS: Should probably include the functionality to update the * helper's h_hooks struct member. */ error = hhook_remove_hook_lookup(hki); return (error); } /* * Private KPI between hhook and khelp that allows khelp modules to insert hook * functions into hhook points which register after the modules were loaded. */ void khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags) { struct helper *h; int error, i; KHELP_LIST_RLOCK(); TAILQ_FOREACH(h, &helpers, h_next) { for (i = 0; i < h->h_nhooks; i++) { if (hhh->hhh_type != h->h_hooks[i].hook_type || hhh->hhh_id != h->h_hooks[i].hook_id) continue; error = hhook_add_hook(hhh, &h->h_hooks[i], flags); if (error) { printf("%s: \"%s\" khelp module unable to " "hook type %d id %d due to error %d\n", __func__, h->h_name, h->h_hooks[i].hook_type, h->h_hooks[i].hook_id, error); error = 0; } } } KHELP_LIST_RUNLOCK(); } int khelp_modevent(module_t mod, int event_type, void *data) { struct khelp_modevent_data *kmd; int error; kmd = (struct khelp_modevent_data *)data; error = 0; switch(event_type) { case MOD_LOAD: if (kmd->helper->h_flags & HELPER_NEEDS_OSD) { if (kmd->uma_zsize <= 0) { printf("Use KHELP_DECLARE_MOD_UMA() instead!\n"); error = EDOOFUS; break; } kmd->helper->h_zone = uma_zcreate(kmd->name, kmd->uma_zsize, kmd->umactor, kmd->umadtor, NULL, NULL, 0, 0); if (kmd->helper->h_zone == NULL) { error = ENOMEM; break; } } strlcpy(kmd->helper->h_name, kmd->name, HELPER_NAME_MAXLEN); kmd->helper->h_hooks = kmd->hooks; kmd->helper->h_nhooks = kmd->nhooks; if (kmd->helper->mod_init != NULL) error = kmd->helper->mod_init(); if (!error) error = khelp_register_helper(kmd->helper); break; case MOD_QUIESCE: case MOD_SHUTDOWN: case MOD_UNLOAD: error = khelp_deregister_helper(kmd->helper); if (!error) { if (kmd->helper->h_flags & HELPER_NEEDS_OSD) uma_zdestroy(kmd->helper->h_zone); if (kmd->helper->mod_destroy != NULL) kmd->helper->mod_destroy(); } else if (error == ENOENT) /* Do nothing and allow unload if helper not in list. */ error = 0; else if (error == EBUSY) printf("Khelp module \"%s\" can't unload until its " "refcount drops from %d to 0.\n", kmd->name, kmd->helper->h_refcount); break; default: error = EINVAL; break; } return (error); } Index: head/sys/kern/kern_kthread.c =================================================================== --- head/sys/kern/kern_kthread.c (revision 326270) +++ head/sys/kern/kern_kthread.c (revision 326271) @@ -1,470 +1,472 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1999 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Start a kernel process. This is called after a fork() call in * mi_startup() in the file kern/init_main.c. * * This function is used to start "internal" daemons and intended * to be called from SYSINIT(). */ void kproc_start(const void *udata) { const struct kproc_desc *kp = udata; int error; error = kproc_create((void (*)(void *))kp->func, NULL, kp->global_procpp, 0, 0, "%s", kp->arg0); if (error) panic("kproc_start: %s: error %d", kp->arg0, error); } /* * Create a kernel process/thread/whatever. It shares its address space * with proc0 - ie: kernel only. * * func is the function to start. * arg is the parameter to pass to function on first startup. * newpp is the return value pointing to the thread's struct proc. * flags are flags to fork1 (in unistd.h) * fmt and following will be *printf'd into (*newpp)->p_comm (for ps, etc.). */ int kproc_create(void (*func)(void *), void *arg, struct proc **newpp, int flags, int pages, const char *fmt, ...) { struct fork_req fr; int error; va_list ap; struct thread *td; struct proc *p2; if (!proc0.p_stats) panic("kproc_create called too soon"); bzero(&fr, sizeof(fr)); fr.fr_flags = RFMEM | RFFDG | RFPROC | RFSTOPPED | flags; fr.fr_pages = pages; fr.fr_procp = &p2; error = fork1(&thread0, &fr); if (error) return error; /* save a global descriptor, if desired */ if (newpp != NULL) *newpp = p2; /* this is a non-swapped system process */ PROC_LOCK(p2); td = FIRST_THREAD_IN_PROC(p2); p2->p_flag |= P_SYSTEM | P_KPROC; td->td_pflags |= TDP_KTHREAD; mtx_lock(&p2->p_sigacts->ps_mtx); p2->p_sigacts->ps_flag |= PS_NOCLDWAIT; mtx_unlock(&p2->p_sigacts->ps_mtx); PROC_UNLOCK(p2); /* set up arg0 for 'ps', et al */ va_start(ap, fmt); vsnprintf(p2->p_comm, sizeof(p2->p_comm), fmt, ap); va_end(ap); /* set up arg0 for 'ps', et al */ va_start(ap, fmt); vsnprintf(td->td_name, sizeof(td->td_name), fmt, ap); va_end(ap); #ifdef KTR sched_clear_tdname(td); #endif /* call the processes' main()... */ cpu_fork_kthread_handler(td, func, arg); /* Avoid inheriting affinity from a random parent. */ cpuset_setthread(td->td_tid, cpuset_root); thread_lock(td); TD_SET_CAN_RUN(td); sched_prio(td, PVM); sched_user_prio(td, PUSER); /* Delay putting it on the run queue until now. 
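 * A caller that passed RFSTOPPED takes responsibility for making the new
 * process runnable itself (via sched_add()) once any extra setup is done.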
*/ if (!(flags & RFSTOPPED)) sched_add(td, SRQ_BORING); thread_unlock(td); return 0; } void kproc_exit(int ecode) { struct thread *td; struct proc *p; td = curthread; p = td->td_proc; /* * Reparent curthread from proc0 to init so that the zombie * is harvested. */ sx_xlock(&proctree_lock); PROC_LOCK(p); proc_reparent(p, initproc); PROC_UNLOCK(p); sx_xunlock(&proctree_lock); /* * Wakeup anyone waiting for us to exit. */ wakeup(p); /* Buh-bye! */ exit1(td, ecode, 0); } /* * Advise a kernel process to suspend (or resume) in its main loop. * Participation is voluntary. */ int kproc_suspend(struct proc *p, int timo) { /* * Make sure this is indeed a system process and we can safely * use the p_siglist field. */ PROC_LOCK(p); if ((p->p_flag & P_KPROC) == 0) { PROC_UNLOCK(p); return (EINVAL); } SIGADDSET(p->p_siglist, SIGSTOP); wakeup(p); return msleep(&p->p_siglist, &p->p_mtx, PPAUSE | PDROP, "suspkp", timo); } int kproc_resume(struct proc *p) { /* * Make sure this is indeed a system process and we can safely * use the p_siglist field. */ PROC_LOCK(p); if ((p->p_flag & P_KPROC) == 0) { PROC_UNLOCK(p); return (EINVAL); } SIGDELSET(p->p_siglist, SIGSTOP); PROC_UNLOCK(p); wakeup(&p->p_siglist); return (0); } void kproc_suspend_check(struct proc *p) { PROC_LOCK(p); while (SIGISMEMBER(p->p_siglist, SIGSTOP)) { wakeup(&p->p_siglist); msleep(&p->p_siglist, &p->p_mtx, PPAUSE, "kpsusp", 0); } PROC_UNLOCK(p); } /* * Start a kernel thread. * * This function is used to start "internal" daemons and intended * to be called from SYSINIT(). */ void kthread_start(const void *udata) { const struct kthread_desc *kp = udata; int error; error = kthread_add((void (*)(void *))kp->func, NULL, NULL, kp->global_threadpp, 0, 0, "%s", kp->arg0); if (error) panic("kthread_start: %s: error %d", kp->arg0, error); } /* * Create a kernel thread. It shares its address space * with proc0 - ie: kernel only. * * func is the function to start. * arg is the parameter to pass to function on first startup. * newtdp is the return value pointing to the thread's struct thread. * ** XXX fix this --> flags are flags to fork1 (in unistd.h) * fmt and following will be *printf'd into (*newtd)->td_name (for ps, etc.). */ int kthread_add(void (*func)(void *), void *arg, struct proc *p, struct thread **newtdp, int flags, int pages, const char *fmt, ...) { va_list ap; struct thread *newtd, *oldtd; if (!proc0.p_stats) panic("kthread_add called too soon"); /* If no process supplied, put it on proc0 */ if (p == NULL) p = &proc0; /* Initialize our new td */ newtd = thread_alloc(pages); if (newtd == NULL) return (ENOMEM); PROC_LOCK(p); oldtd = FIRST_THREAD_IN_PROC(p); bzero(&newtd->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&oldtd->td_startcopy, &newtd->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); /* set up arg0 for 'ps', et al */ va_start(ap, fmt); vsnprintf(newtd->td_name, sizeof(newtd->td_name), fmt, ap); va_end(ap); newtd->td_proc = p; /* needed for cpu_copy_thread */ /* might be further optimized for kthread */ cpu_copy_thread(newtd, oldtd); /* put the designated function(arg) as the resume context */ cpu_fork_kthread_handler(newtd, func, arg); newtd->td_pflags |= TDP_KTHREAD; thread_cow_get_proc(newtd, p); /* this code almost the same as create_thread() in kern_thr.c */ p->p_flag |= P_HADTHREADS; thread_link(newtd, p); thread_lock(oldtd); /* let the scheduler know about these things. 
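 * sched_fork_thread() seeds the new thread's scheduler state from oldtd,
 * and TD_SET_CAN_RUN() marks it eligible to run without queueing it yet.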
 */ sched_fork_thread(oldtd, newtd); TD_SET_CAN_RUN(newtd); thread_unlock(oldtd); PROC_UNLOCK(p); tidhash_add(newtd); /* Avoid inheriting affinity from a random parent. */ cpuset_setthread(newtd->td_tid, cpuset_root); /* Delay putting it on the run queue until now. */ if (!(flags & RFSTOPPED)) { thread_lock(newtd); sched_add(newtd, SRQ_BORING); thread_unlock(newtd); } if (newtdp) *newtdp = newtd; return 0; } void kthread_exit(void) { struct proc *p; struct thread *td; td = curthread; p = td->td_proc; /* A module may be waiting for us to exit. */ wakeup(td); /* * The last exiting thread in a kernel process must tear down * the whole process. */ rw_wlock(&tidhash_lock); PROC_LOCK(p); if (p->p_numthreads == 1) { PROC_UNLOCK(p); rw_wunlock(&tidhash_lock); kproc_exit(0); } LIST_REMOVE(td, td_hash); rw_wunlock(&tidhash_lock); umtx_thread_exit(td); tdsigcleanup(td); PROC_SLOCK(p); thread_exit(); } /* * Advise a kernel thread to suspend (or resume) in its main loop. * Participation is voluntary. */ int kthread_suspend(struct thread *td, int timo) { struct proc *p; p = td->td_proc; /* * td_pflags should not be read by any thread other than * curthread, but as long as this flag is invariant during the * thread's lifetime, it is OK to check its state. */ if ((td->td_pflags & TDP_KTHREAD) == 0) return (EINVAL); /* * The caller of the primitive should have already checked that the * thread is up and running, thus not being blocked by other * conditions. */ PROC_LOCK(p); thread_lock(td); td->td_flags |= TDF_KTH_SUSP; thread_unlock(td); return (msleep(&td->td_flags, &p->p_mtx, PPAUSE | PDROP, "suspkt", timo)); } /* * Resume a thread previously put asleep with kthread_suspend(). */ int kthread_resume(struct thread *td) { struct proc *p; p = td->td_proc; /* * td_pflags should not be read by any thread other than * curthread, but as long as this flag is invariant during the * thread's lifetime, it is OK to check its state. */ if ((td->td_pflags & TDP_KTHREAD) == 0) return (EINVAL); PROC_LOCK(p); thread_lock(td); td->td_flags &= ~TDF_KTH_SUSP; thread_unlock(td); wakeup(&td->td_flags); PROC_UNLOCK(p); return (0); } /* * Used by the thread to poll as to whether it should yield/sleep * and notify the caller that it has happened. */ void kthread_suspend_check(void) { struct proc *p; struct thread *td; td = curthread; p = td->td_proc; if ((td->td_pflags & TDP_KTHREAD) == 0) panic("%s: curthread is not a valid kthread", __func__); /* * As long as the double-lock protection is used when accessing the * TDF_KTH_SUSP flag, synchronizing the read operation via proc mutex * is fine. */ PROC_LOCK(p); while (td->td_flags & TDF_KTH_SUSP) { wakeup(&td->td_flags); msleep(&td->td_flags, &p->p_mtx, PPAUSE, "ktsusp", 0); } PROC_UNLOCK(p); } int kproc_kthread_add(void (*func)(void *), void *arg, struct proc **procptr, struct thread **tdptr, int flags, int pages, const char *procname, const char *fmt, ...) 
{ int error; va_list ap; char buf[100]; struct thread *td; if (*procptr == NULL) { error = kproc_create(func, arg, procptr, flags, pages, "%s", procname); if (error) return (error); td = FIRST_THREAD_IN_PROC(*procptr); if (tdptr) *tdptr = td; va_start(ap, fmt); vsnprintf(td->td_name, sizeof(td->td_name), fmt, ap); va_end(ap); #ifdef KTR sched_clear_tdname(td); #endif return (0); } va_start(ap, fmt); vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); error = kthread_add(func, arg, *procptr, tdptr, flags, pages, "%s", buf); return (error); } Index: head/sys/kern/kern_ktr.c =================================================================== --- head/sys/kern/kern_ktr.c (revision 326270) +++ head/sys/kern/kern_ktr.c (revision 326271) @@ -1,474 +1,476 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2000 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module holds the global variables used by KTR and the ktr_tracepoint() * function that does the actual tracing. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ktr.h" #include "opt_alq.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #ifndef KTR_BOOT_ENTRIES #define KTR_BOOT_ENTRIES 1024 #endif #ifndef KTR_ENTRIES #define KTR_ENTRIES 1024 #endif /* Limit the allocations to something manageable. 
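Since kproc_kthread_add() above both creates the process on first use and attaches further threads to it afterwards, the usual idiom looks roughly like this (a sketch only; the worker* names are hypothetical):

static struct proc *workerproc;

static void
worker_main(void *arg)
{

	for (;;)
		pause("worker", hz);	/* placeholder work loop */
}

static void
start_workers(int nworkers)
{
	int error, i;

	for (i = 0; i < nworkers; i++) {
		/* First call creates "workers"; later calls add threads. */
		error = kproc_kthread_add(worker_main, NULL, &workerproc,
		    NULL, 0, 0, "workers", "worker_%d", i);
		if (error != 0)
			panic("worker_%d: error %d", i, error);
	}
}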
*/ #define KTR_ENTRIES_MAX (8 * 1024 * 1024) #ifndef KTR_MASK #define KTR_MASK (0) #endif #ifndef KTR_CPUMASK #define KTR_CPUMASK CPUSET_FSET #endif #ifndef KTR_TIME #define KTR_TIME get_cyclecount() #endif #ifndef KTR_CPU #define KTR_CPU PCPU_GET(cpuid) #endif static MALLOC_DEFINE(M_KTR, "KTR", "KTR"); FEATURE(ktr, "Kernel support for KTR kernel tracing facility"); volatile int ktr_idx = 0; uint64_t ktr_mask = KTR_MASK; uint64_t ktr_compile = KTR_COMPILE; int ktr_entries = KTR_BOOT_ENTRIES; int ktr_version = KTR_VERSION; struct ktr_entry ktr_buf_init[KTR_BOOT_ENTRIES]; struct ktr_entry *ktr_buf = ktr_buf_init; cpuset_t ktr_cpumask = CPUSET_T_INITIALIZER(KTR_CPUMASK); static SYSCTL_NODE(_debug, OID_AUTO, ktr, CTLFLAG_RD, 0, "KTR options"); SYSCTL_INT(_debug_ktr, OID_AUTO, version, CTLFLAG_RD, &ktr_version, 0, "Version of the KTR interface"); SYSCTL_UQUAD(_debug_ktr, OID_AUTO, compile, CTLFLAG_RD, &ktr_compile, 0, "Bitmask of KTR event classes compiled into the kernel"); static int sysctl_debug_ktr_cpumask(SYSCTL_HANDLER_ARGS) { char lktr_cpumask_str[CPUSETBUFSIZ]; cpuset_t imask; int error; cpusetobj_strprint(lktr_cpumask_str, &ktr_cpumask); error = sysctl_handle_string(oidp, lktr_cpumask_str, sizeof(lktr_cpumask_str), req); if (error != 0 || req->newptr == NULL) return (error); if (cpusetobj_strscan(&imask, lktr_cpumask_str) == -1) return (EINVAL); CPU_COPY(&imask, &ktr_cpumask); return (error); } SYSCTL_PROC(_debug_ktr, OID_AUTO, cpumask, CTLFLAG_RWTUN | CTLFLAG_MPSAFE | CTLTYPE_STRING, NULL, 0, sysctl_debug_ktr_cpumask, "S", "Bitmask of CPUs on which KTR logging is enabled"); static int sysctl_debug_ktr_clear(SYSCTL_HANDLER_ARGS) { int clear, error; clear = 0; error = sysctl_handle_int(oidp, &clear, 0, req); if (error || !req->newptr) return (error); if (clear) { bzero(ktr_buf, sizeof(*ktr_buf) * ktr_entries); ktr_idx = 0; } return (error); } SYSCTL_PROC(_debug_ktr, OID_AUTO, clear, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_debug_ktr_clear, "I", "Clear KTR Buffer"); /* * This is a sysctl proc so that it is serialized as !MPSAFE along with * the other ktr sysctl procs. */ static int sysctl_debug_ktr_mask(SYSCTL_HANDLER_ARGS) { uint64_t mask; int error; mask = ktr_mask; error = sysctl_handle_64(oidp, &mask, 0, req); if (error || !req->newptr) return (error); ktr_mask = mask; return (error); } SYSCTL_PROC(_debug_ktr, OID_AUTO, mask, CTLTYPE_U64 | CTLFLAG_RWTUN, 0, 0, sysctl_debug_ktr_mask, "QU", "Bitmask of KTR event classes for which logging is enabled"); #if KTR_ENTRIES > KTR_BOOT_ENTRIES /* * A simplified version of sysctl_debug_ktr_entries. * No need to care about SMP, scheduling, etc. */ static void ktr_entries_initializer(void *dummy __unused) { uint64_t mask; /* Temporarily disable ktr in case malloc() is being traced. 
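For context, records enter this buffer through the CTR*() macros from <sys/ktr.h>; a sketch of a typical call site follows (the function and its arguments are hypothetical). The record is emitted only when the event class bit is set in debug.ktr.mask and the current CPU is in debug.ktr.cpumask:

#include <sys/param.h>
#include <sys/ktr.h>
#include <sys/proc.h>

static void
example_trace(struct proc *p, int state)
{

	/* Compiled in only if KTR_PROC is part of KTR_COMPILE. */
	CTR2(KTR_PROC, "example: pid %d -> state %d", p->p_pid, state);
}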
*/ mask = ktr_mask; ktr_mask = 0; ktr_buf = malloc(sizeof(*ktr_buf) * KTR_ENTRIES, M_KTR, M_WAITOK | M_ZERO); memcpy(ktr_buf, ktr_buf_init + ktr_idx, (KTR_BOOT_ENTRIES - ktr_idx) * sizeof(*ktr_buf)); if (ktr_idx != 0) { memcpy(ktr_buf + KTR_BOOT_ENTRIES - ktr_idx, ktr_buf_init, ktr_idx * sizeof(*ktr_buf)); ktr_idx = KTR_BOOT_ENTRIES; } ktr_entries = KTR_ENTRIES; ktr_mask = mask; } SYSINIT(ktr_entries_initializer, SI_SUB_KMEM, SI_ORDER_ANY, ktr_entries_initializer, NULL); #endif static int sysctl_debug_ktr_entries(SYSCTL_HANDLER_ARGS) { uint64_t mask; int entries, error; struct ktr_entry *buf, *oldbuf; entries = ktr_entries; error = sysctl_handle_int(oidp, &entries, 0, req); if (error || !req->newptr) return (error); if (entries > KTR_ENTRIES_MAX) return (ERANGE); /* Disable ktr temporarily. */ mask = ktr_mask; ktr_mask = 0; /* Wait for threads to go idle. */ if ((error = quiesce_all_cpus("ktrent", PCATCH)) != 0) { ktr_mask = mask; return (error); } if (ktr_buf != ktr_buf_init) oldbuf = ktr_buf; else oldbuf = NULL; /* Allocate a new buffer. */ buf = malloc(sizeof(*buf) * entries, M_KTR, M_WAITOK | M_ZERO); /* Install the new buffer and restart ktr. */ ktr_buf = buf; ktr_entries = entries; ktr_idx = 0; ktr_mask = mask; if (oldbuf != NULL) free(oldbuf, M_KTR); return (error); } SYSCTL_PROC(_debug_ktr, OID_AUTO, entries, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_debug_ktr_entries, "I", "Number of entries in the KTR buffer"); #ifdef KTR_VERBOSE int ktr_verbose = KTR_VERBOSE; TUNABLE_INT("debug.ktr.verbose", &ktr_verbose); SYSCTL_INT(_debug_ktr, OID_AUTO, verbose, CTLFLAG_RW, &ktr_verbose, 0, ""); #endif #ifdef KTR_ALQ struct alq *ktr_alq; char ktr_alq_file[MAXPATHLEN] = "/tmp/ktr.out"; int ktr_alq_cnt = 0; int ktr_alq_depth = KTR_ENTRIES; int ktr_alq_enabled = 0; int ktr_alq_failed = 0; int ktr_alq_max = 0; SYSCTL_INT(_debug_ktr, OID_AUTO, alq_max, CTLFLAG_RW, &ktr_alq_max, 0, "Maximum number of entries to write"); SYSCTL_INT(_debug_ktr, OID_AUTO, alq_cnt, CTLFLAG_RD, &ktr_alq_cnt, 0, "Current number of written entries"); SYSCTL_INT(_debug_ktr, OID_AUTO, alq_failed, CTLFLAG_RD, &ktr_alq_failed, 0, "Number of times we overran the buffer"); SYSCTL_INT(_debug_ktr, OID_AUTO, alq_depth, CTLFLAG_RW, &ktr_alq_depth, 0, "Number of items in the write buffer"); SYSCTL_STRING(_debug_ktr, OID_AUTO, alq_file, CTLFLAG_RW, ktr_alq_file, sizeof(ktr_alq_file), "KTR logging file"); static int sysctl_debug_ktr_alq_enable(SYSCTL_HANDLER_ARGS) { int error; int enable; enable = ktr_alq_enabled; error = sysctl_handle_int(oidp, &enable, 0, req); if (error || !req->newptr) return (error); if (enable) { if (ktr_alq_enabled) return (0); error = alq_open(&ktr_alq, (const char *)ktr_alq_file, req->td->td_ucred, ALQ_DEFAULT_CMODE, sizeof(struct ktr_entry), ktr_alq_depth); if (error == 0) { ktr_alq_cnt = 0; ktr_alq_failed = 0; ktr_alq_enabled = 1; } } else { if (ktr_alq_enabled == 0) return (0); ktr_alq_enabled = 0; alq_close(ktr_alq); ktr_alq = NULL; } return (error); } SYSCTL_PROC(_debug_ktr, OID_AUTO, alq_enable, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_debug_ktr_alq_enable, "I", "Enable KTR logging"); #endif void ktr_tracepoint(uint64_t mask, const char *file, int line, const char *format, u_long arg1, u_long arg2, u_long arg3, u_long arg4, u_long arg5, u_long arg6) { struct ktr_entry *entry; #ifdef KTR_ALQ struct ale *ale = NULL; #endif int newindex, saveindex; #if defined(KTR_VERBOSE) || defined(KTR_ALQ) struct thread *td; #endif int cpu; if (panicstr || kdb_active) return; if ((ktr_mask & mask) == 0 || ktr_buf == NULL) return; 
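/*
 * A note on the slot reservation below: it is deliberately lock-free so
 * that tracepoints are safe from almost any context.  Each tracer claims
 * a unique buffer index with a compare-and-set retry loop, conceptually:
 *
 *	do {
 *		old = ktr_idx;
 *		new = (old + 1) % ktr_entries;
 *	} while (atomic_cmpset_rel_int(&ktr_idx, old, new) == 0);
 *
 * so concurrent CPUs never hand out the same entry twice.
 */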
cpu = KTR_CPU; if (!CPU_ISSET(cpu, &ktr_cpumask)) return; #if defined(KTR_VERBOSE) || defined(KTR_ALQ) td = curthread; if (td->td_pflags & TDP_INKTR) return; td->td_pflags |= TDP_INKTR; #endif #ifdef KTR_ALQ if (ktr_alq_enabled) { if (td->td_critnest == 0 && (td->td_flags & TDF_IDLETD) == 0 && td != ald_thread) { if (ktr_alq_max && ktr_alq_cnt > ktr_alq_max) goto done; if ((ale = alq_get(ktr_alq, ALQ_NOWAIT)) == NULL) { ktr_alq_failed++; goto done; } ktr_alq_cnt++; entry = (struct ktr_entry *)ale->ae_data; } else { goto done; } } else #endif { do { saveindex = ktr_idx; newindex = (saveindex + 1) % ktr_entries; } while (atomic_cmpset_rel_int(&ktr_idx, saveindex, newindex) == 0); entry = &ktr_buf[saveindex]; } entry->ktr_timestamp = KTR_TIME; entry->ktr_cpu = cpu; entry->ktr_thread = curthread; if (file != NULL) while (strncmp(file, "../", 3) == 0) file += 3; entry->ktr_file = file; entry->ktr_line = line; #ifdef KTR_VERBOSE if (ktr_verbose) { #ifdef SMP printf("cpu%d ", cpu); #endif if (ktr_verbose > 1) { printf("%s.%d\t", entry->ktr_file, entry->ktr_line); } printf(format, arg1, arg2, arg3, arg4, arg5, arg6); printf("\n"); } #endif entry->ktr_desc = format; entry->ktr_parms[0] = arg1; entry->ktr_parms[1] = arg2; entry->ktr_parms[2] = arg3; entry->ktr_parms[3] = arg4; entry->ktr_parms[4] = arg5; entry->ktr_parms[5] = arg6; #ifdef KTR_ALQ if (ktr_alq_enabled && ale) alq_post(ktr_alq, ale); done: #endif #if defined(KTR_VERBOSE) || defined(KTR_ALQ) td->td_pflags &= ~TDP_INKTR; #endif } #ifdef DDB struct tstate { int cur; int first; }; static struct tstate tstate; static int db_ktr_verbose; static int db_mach_vtrace(void); DB_SHOW_COMMAND(ktr, db_ktr_all) { tstate.cur = (ktr_idx - 1) % ktr_entries; tstate.first = -1; db_ktr_verbose = 0; db_ktr_verbose |= (strchr(modif, 'v') != NULL) ? 2 : 0; db_ktr_verbose |= (strchr(modif, 'V') != NULL) ? 1 : 0; /* just timestamp please */ if (strchr(modif, 'a') != NULL) { db_disable_pager(); while (cncheckc() == -1) if (db_mach_vtrace() == 0) break; } else { while (!db_pager_quit) if (db_mach_vtrace() == 0) break; } } static int db_mach_vtrace(void) { struct ktr_entry *kp; if (tstate.cur == tstate.first || ktr_buf == NULL) { db_printf("--- End of trace buffer ---\n"); return (0); } kp = &ktr_buf[tstate.cur]; /* Skip over unused entries. */ if (kp->ktr_desc == NULL) { db_printf("--- End of trace buffer ---\n"); return (0); } db_printf("%d (%p", tstate.cur, kp->ktr_thread); #ifdef SMP db_printf(":cpu%d", kp->ktr_cpu); #endif db_printf(")"); if (db_ktr_verbose >= 1) { db_printf(" %10.10lld", (long long)kp->ktr_timestamp); } if (db_ktr_verbose >= 2) { db_printf(" %s.%d", kp->ktr_file, kp->ktr_line); } db_printf(": "); db_printf(kp->ktr_desc, kp->ktr_parms[0], kp->ktr_parms[1], kp->ktr_parms[2], kp->ktr_parms[3], kp->ktr_parms[4], kp->ktr_parms[5]); db_printf("\n"); if (tstate.first == -1) tstate.first = tstate.cur; if (--tstate.cur < 0) tstate.cur = ktr_entries - 1; return (1); } #endif /* DDB */ Index: head/sys/kern/kern_linker.c =================================================================== --- head/sys/kern/kern_linker.c (revision 326270) +++ head/sys/kern/kern_linker.c (revision 326271) @@ -1,2235 +1,2237 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1997-2000 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
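The DB_SHOW_COMMAND above is driven from the kernel debugger; usage is roughly as follows, matching the strchr() modifier checks in db_ktr_all():

	db> show ktr		dump entries, newest first, with paging
	db> show ktr /V		also print timestamps
	db> show ktr /v		also print timestamps and file:line
	db> show ktr /a		dump everything without paging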
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kld.h" #include "opt_hwpmc_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include "linker_if.h" #ifdef HWPMC_HOOKS #include #endif #ifdef KLD_DEBUG int kld_debug = 0; SYSCTL_INT(_debug, OID_AUTO, kld_debug, CTLFLAG_RWTUN, &kld_debug, 0, "Set various levels of KLD debug"); #endif /* These variables are used by kernel debuggers to enumerate loaded files. */ const int kld_off_address = offsetof(struct linker_file, address); const int kld_off_filename = offsetof(struct linker_file, filename); const int kld_off_pathname = offsetof(struct linker_file, pathname); const int kld_off_next = offsetof(struct linker_file, link.tqe_next); /* * static char *linker_search_path(const char *name, struct mod_depend * *verinfo); */ static const char *linker_basename(const char *path); /* * Find a currently loaded file given its filename. */ static linker_file_t linker_find_file_by_name(const char* _filename); /* * Find a currently loaded file given its file id. */ static linker_file_t linker_find_file_by_id(int _fileid); /* Metadata from the static kernel */ SET_DECLARE(modmetadata_set, struct mod_metadata); MALLOC_DEFINE(M_LINKER, "linker", "kernel linker"); linker_file_t linker_kernel_file; static struct sx kld_sx; /* kernel linker lock */ /* * Load counter used by clients to determine if a linker file has been * re-loaded. This counter is incremented for each file load. 
*/ static int loadcnt; static linker_class_list_t classes; static linker_file_list_t linker_files; static int next_file_id = 1; static int linker_no_more_classes = 0; #define LINKER_GET_NEXT_FILE_ID(a) do { \ linker_file_t lftmp; \ \ if (!cold) \ sx_assert(&kld_sx, SA_XLOCKED); \ retry: \ TAILQ_FOREACH(lftmp, &linker_files, link) { \ if (next_file_id == lftmp->id) { \ next_file_id++; \ goto retry; \ } \ } \ (a) = next_file_id; \ } while(0) /* XXX wrong name; we're looking at version provision tags here, not modules */ typedef TAILQ_HEAD(, modlist) modlisthead_t; struct modlist { TAILQ_ENTRY(modlist) link; /* chain together all modules */ linker_file_t container; const char *name; int version; }; typedef struct modlist *modlist_t; static modlisthead_t found_modules; static int linker_file_add_dependency(linker_file_t file, linker_file_t dep); static caddr_t linker_file_lookup_symbol_internal(linker_file_t file, const char* name, int deps); static int linker_load_module(const char *kldname, const char *modname, struct linker_file *parent, const struct mod_depend *verinfo, struct linker_file **lfpp); static modlist_t modlist_lookup2(const char *name, const struct mod_depend *verinfo); static void linker_init(void *arg) { sx_init(&kld_sx, "kernel linker"); TAILQ_INIT(&classes); TAILQ_INIT(&linker_files); } SYSINIT(linker, SI_SUB_KLD, SI_ORDER_FIRST, linker_init, 0); static void linker_stop_class_add(void *arg) { linker_no_more_classes = 1; } SYSINIT(linker_class, SI_SUB_KLD, SI_ORDER_ANY, linker_stop_class_add, NULL); int linker_add_class(linker_class_t lc) { /* * We disallow any class registration past SI_ORDER_ANY * of SI_SUB_KLD. We bump the reference count to keep the * ops from being freed. */ if (linker_no_more_classes == 1) return (EPERM); kobj_class_compile((kobj_class_t) lc); ((kobj_class_t)lc)->refs++; /* XXX: kobj_mtx */ TAILQ_INSERT_TAIL(&classes, lc, link); return (0); } static void linker_file_sysinit(linker_file_t lf) { struct sysinit **start, **stop, **sipp, **xipp, *save; KLD_DPF(FILE, ("linker_file_sysinit: calling SYSINITs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysinit_set", &start, &stop, NULL) != 0) return; /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). * * Since some things care about execution order, this is the operation * which ensures continued function. */ for (sipp = start; sipp < stop; sipp++) { for (xipp = sipp + 1; xipp < stop; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip */ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. 
*/ sx_xunlock(&kld_sx); mtx_lock(&Giant); for (sipp = start; sipp < stop; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s) */ /* Call function */ (*((*sipp)->func)) ((*sipp)->udata); } mtx_unlock(&Giant); sx_xlock(&kld_sx); } static void linker_file_sysuninit(linker_file_t lf) { struct sysinit **start, **stop, **sipp, **xipp, *save; KLD_DPF(FILE, ("linker_file_sysuninit: calling SYSUNINITs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysuninit_set", &start, &stop, NULL) != 0) return; /* * Perform a reverse bubble sort of the system initialization objects * by their subsystem (primary key) and order (secondary key). * * Since some things care about execution order, this is the operation * which ensures continued function. */ for (sipp = start; sipp < stop; sipp++) { for (xipp = sipp + 1; xipp < stop; xipp++) { if ((*sipp)->subsystem > (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order >= (*xipp)->order)) continue; /* skip */ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ sx_xunlock(&kld_sx); mtx_lock(&Giant); for (sipp = start; sipp < stop; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s) */ /* Call function */ (*((*sipp)->func)) ((*sipp)->udata); } mtx_unlock(&Giant); sx_xlock(&kld_sx); } static void linker_file_register_sysctls(linker_file_t lf, bool enable) { struct sysctl_oid **start, **stop, **oidp; KLD_DPF(FILE, ("linker_file_register_sysctls: registering SYSCTLs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) return; sx_xunlock(&kld_sx); sysctl_wlock(); for (oidp = start; oidp < stop; oidp++) { if (enable) sysctl_register_oid(*oidp); else sysctl_register_disabled_oid(*oidp); } sysctl_wunlock(); sx_xlock(&kld_sx); } static void linker_file_enable_sysctls(linker_file_t lf) { struct sysctl_oid **start, **stop, **oidp; KLD_DPF(FILE, ("linker_file_enable_sysctls: enable SYSCTLs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) return; sx_xunlock(&kld_sx); sysctl_wlock(); for (oidp = start; oidp < stop; oidp++) sysctl_enable_oid(*oidp); sysctl_wunlock(); sx_xlock(&kld_sx); } static void linker_file_unregister_sysctls(linker_file_t lf) { struct sysctl_oid **start, **stop, **oidp; KLD_DPF(FILE, ("linker_file_unregister_sysctls: unregistering SYSCTLs" " for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) return; sx_xunlock(&kld_sx); sysctl_wlock(); for (oidp = start; oidp < stop; oidp++) sysctl_unregister_oid(*oidp); sysctl_wunlock(); sx_xlock(&kld_sx); } static int linker_file_register_modules(linker_file_t lf) { struct mod_metadata **start, **stop, **mdp; const moduledata_t *moddata; int first_error, error; KLD_DPF(FILE, ("linker_file_register_modules: registering modules" " in %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "modmetadata_set", &start, &stop, NULL) != 0) { /* * This fallback should be unnecessary, but if we get booted * from boot2 instead of loader and we are missing our * metadata then we have to try the best we can. 
 */ if (lf == linker_kernel_file) { start = SET_BEGIN(modmetadata_set); stop = SET_LIMIT(modmetadata_set); } else return (0); } first_error = 0; for (mdp = start; mdp < stop; mdp++) { if ((*mdp)->md_type != MDT_MODULE) continue; moddata = (*mdp)->md_data; KLD_DPF(FILE, ("Registering module %s in %s\n", moddata->name, lf->filename)); error = module_register(moddata, lf); if (error) { printf("Module %s failed to register: %d\n", moddata->name, error); if (first_error == 0) first_error = error; } } return (first_error); } static void linker_init_kernel_modules(void) { sx_xlock(&kld_sx); linker_file_register_modules(linker_kernel_file); sx_xunlock(&kld_sx); } SYSINIT(linker_kernel, SI_SUB_KLD, SI_ORDER_ANY, linker_init_kernel_modules, 0); static int linker_load_file(const char *filename, linker_file_t *result) { linker_class_t lc; linker_file_t lf; int foundfile, error, modules; /* Refuse to load modules if securelevel raised */ if (prison0.pr_securelevel > 0) return (EPERM); sx_assert(&kld_sx, SA_XLOCKED); lf = linker_find_file_by_name(filename); if (lf) { KLD_DPF(FILE, ("linker_load_file: file %s is already loaded," " incrementing refs\n", filename)); *result = lf; lf->refs++; return (0); } foundfile = 0; error = 0; /* * We do not need to protect (lock) classes here because there is * no class registration past startup (SI_SUB_KLD, SI_ORDER_ANY) * and there is no class deregistration mechanism at this time. */ TAILQ_FOREACH(lc, &classes, link) { KLD_DPF(FILE, ("linker_load_file: trying to load %s\n", filename)); error = LINKER_LOAD_FILE(lc, filename, &lf); /* * If we got something other than ENOENT, then it exists but * we cannot load it for some other reason. */ if (error != ENOENT) foundfile = 1; if (lf) { error = linker_file_register_modules(lf); if (error == EEXIST) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (error); } modules = !TAILQ_EMPTY(&lf->modules); linker_file_register_sysctls(lf, false); linker_file_sysinit(lf); lf->flags |= LINKER_FILE_LINKED; /* * If all of the modules in this file failed * to load, unload the file and return an * error of ENOEXEC. */ if (modules && TAILQ_EMPTY(&lf->modules)) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (ENOEXEC); } linker_file_enable_sysctls(lf); EVENTHANDLER_INVOKE(kld_load, lf); *result = lf; return (0); } } /* * Less than ideal, but tells the user whether it failed to load or * the module was not found. */ if (foundfile) { /* * If the file type was not recognized by the last try, * print a message before failing. */ if (error == ENOSYS) printf("%s: %s - unsupported file type\n", __func__, filename); /* * Format not recognized or otherwise unloadable. * When loading a module that is statically built into * the kernel EEXIST percolates back up as the return * value. Preserve this so that apps like sysinstall * can recognize this special case and not post bogus * dialog boxes. 
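The records walked here come from the module declaration macros; a sketch of what a KLD typically supplies is shown below (a hypothetical "foo" module; foo_modevent is assumed to exist). DECLARE_MODULE() emits the MDT_MODULE record consumed by linker_file_register_modules(), while MODULE_VERSION()/MODULE_DEPEND() emit the MDT_VERSION/MDT_DEPEND records the dependency code walks:

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/module.h>

static moduledata_t foo_mod = {
	"foo",		/* name, recorded in an MDT_MODULE entry */
	foo_modevent,	/* event handler (hypothetical) */
	NULL		/* extra data */
};
DECLARE_MODULE(foo, foo_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
MODULE_VERSION(foo, 1);			/* emits an MDT_VERSION record */
MODULE_DEPEND(foo, bar, 1, 1, 1);	/* emits an MDT_DEPEND on "bar" */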
*/ if (error != EEXIST) error = ENOEXEC; } else error = ENOENT; /* Nothing found */ return (error); } int linker_reference_module(const char *modname, struct mod_depend *verinfo, linker_file_t *result) { modlist_t mod; int error; sx_xlock(&kld_sx); if ((mod = modlist_lookup2(modname, verinfo)) != NULL) { *result = mod->container; (*result)->refs++; sx_xunlock(&kld_sx); return (0); } error = linker_load_module(NULL, modname, NULL, verinfo, result); sx_xunlock(&kld_sx); return (error); } int linker_release_module(const char *modname, struct mod_depend *verinfo, linker_file_t lf) { modlist_t mod; int error; sx_xlock(&kld_sx); if (lf == NULL) { KASSERT(modname != NULL, ("linker_release_module: no file or name")); mod = modlist_lookup2(modname, verinfo); if (mod == NULL) { sx_xunlock(&kld_sx); return (ESRCH); } lf = mod->container; } else KASSERT(modname == NULL && verinfo == NULL, ("linker_release_module: both file and name")); error = linker_file_unload(lf, LINKER_UNLOAD_NORMAL); sx_xunlock(&kld_sx); return (error); } static linker_file_t linker_find_file_by_name(const char *filename) { linker_file_t lf; char *koname; koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK); sprintf(koname, "%s.ko", filename); sx_assert(&kld_sx, SA_XLOCKED); TAILQ_FOREACH(lf, &linker_files, link) { if (strcmp(lf->filename, koname) == 0) break; if (strcmp(lf->filename, filename) == 0) break; } free(koname, M_LINKER); return (lf); } static linker_file_t linker_find_file_by_id(int fileid) { linker_file_t lf; sx_assert(&kld_sx, SA_XLOCKED); TAILQ_FOREACH(lf, &linker_files, link) if (lf->id == fileid && lf->flags & LINKER_FILE_LINKED) break; return (lf); } int linker_file_foreach(linker_predicate_t *predicate, void *context) { linker_file_t lf; int retval = 0; sx_xlock(&kld_sx); TAILQ_FOREACH(lf, &linker_files, link) { retval = predicate(lf, context); if (retval != 0) break; } sx_xunlock(&kld_sx); return (retval); } linker_file_t linker_make_file(const char *pathname, linker_class_t lc) { linker_file_t lf; const char *filename; if (!cold) sx_assert(&kld_sx, SA_XLOCKED); filename = linker_basename(pathname); KLD_DPF(FILE, ("linker_make_file: new file, filename='%s' for pathname='%s'\n", filename, pathname)); lf = (linker_file_t)kobj_create((kobj_class_t)lc, M_LINKER, M_WAITOK); if (lf == NULL) return (NULL); lf->ctors_addr = 0; lf->ctors_size = 0; lf->refs = 1; lf->userrefs = 0; lf->flags = 0; lf->filename = strdup(filename, M_LINKER); lf->pathname = strdup(pathname, M_LINKER); LINKER_GET_NEXT_FILE_ID(lf->id); lf->ndeps = 0; lf->deps = NULL; lf->loadcnt = ++loadcnt; STAILQ_INIT(&lf->common); TAILQ_INIT(&lf->modules); TAILQ_INSERT_TAIL(&linker_files, lf, link); return (lf); } int linker_file_unload(linker_file_t file, int flags) { module_t mod, next; modlist_t ml, nextml; struct common_symbol *cp; int error, i; /* Refuse to unload modules if securelevel raised. */ if (prison0.pr_securelevel > 0) return (EPERM); sx_assert(&kld_sx, SA_XLOCKED); KLD_DPF(FILE, ("linker_file_unload: lf->refs=%d\n", file->refs)); /* Easy case of just dropping a reference. */ if (file->refs > 1) { file->refs--; return (0); } /* Give eventhandlers a chance to prevent the unload. */ error = 0; EVENTHANDLER_INVOKE(kld_unload_try, file, &error); if (error != 0) return (EBUSY); KLD_DPF(FILE, ("linker_file_unload: file is unloading," " informing modules\n")); /* * Quiesce all the modules to give them a chance to veto the unload. 
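linker_file_foreach() above iterates over the file list with the lock held until the predicate returns non-zero; a minimal sketch of a consumer (hypothetical names):

static int
count_files_cb(linker_file_t lf, void *ctx)
{

	(*(int *)ctx)++;
	return (0);	/* zero means keep iterating */
}

static int
count_linker_files(void)
{
	int count;

	count = 0;
	linker_file_foreach(count_files_cb, &count);
	return (count);
}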
*/ MOD_SLOCK; for (mod = TAILQ_FIRST(&file->modules); mod; mod = module_getfnext(mod)) { error = module_quiesce(mod); if (error != 0 && flags != LINKER_UNLOAD_FORCE) { KLD_DPF(FILE, ("linker_file_unload: module %s" " vetoed unload\n", module_getname(mod))); /* * XXX: Do we need to tell all the quiesced modules * that they can resume work now via a new module * event? */ MOD_SUNLOCK; return (error); } } MOD_SUNLOCK; /* * Inform any modules associated with this file that they are * being unloaded. */ MOD_XLOCK; for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) { next = module_getfnext(mod); MOD_XUNLOCK; /* * Give the module a chance to veto the unload. */ if ((error = module_unload(mod)) != 0) { #ifdef KLD_DEBUG MOD_SLOCK; KLD_DPF(FILE, ("linker_file_unload: module %s" " failed unload\n", module_getname(mod))); MOD_SUNLOCK; #endif return (error); } MOD_XLOCK; module_release(mod); } MOD_XUNLOCK; TAILQ_FOREACH_SAFE(ml, &found_modules, link, nextml) { if (ml->container == file) { TAILQ_REMOVE(&found_modules, ml, link); free(ml, M_LINKER); } } /* * Don't try to run SYSUNINITs if we are unloaded due to a * link error. */ if (file->flags & LINKER_FILE_LINKED) { file->flags &= ~LINKER_FILE_LINKED; linker_file_unregister_sysctls(file); linker_file_sysuninit(file); } TAILQ_REMOVE(&linker_files, file, link); if (file->deps) { for (i = 0; i < file->ndeps; i++) linker_file_unload(file->deps[i], flags); free(file->deps, M_LINKER); file->deps = NULL; } while ((cp = STAILQ_FIRST(&file->common)) != NULL) { STAILQ_REMOVE_HEAD(&file->common, link); free(cp, M_LINKER); } LINKER_UNLOAD(file); EVENTHANDLER_INVOKE(kld_unload, file->filename, file->address, file->size); if (file->filename) { free(file->filename, M_LINKER); file->filename = NULL; } if (file->pathname) { free(file->pathname, M_LINKER); file->pathname = NULL; } kobj_delete((kobj_t) file, M_LINKER); return (0); } int linker_ctf_get(linker_file_t file, linker_ctf_t *lc) { return (LINKER_CTF_GET(file, lc)); } static int linker_file_add_dependency(linker_file_t file, linker_file_t dep) { linker_file_t *newdeps; sx_assert(&kld_sx, SA_XLOCKED); file->deps = realloc(file->deps, (file->ndeps + 1) * sizeof(*newdeps), M_LINKER, M_WAITOK | M_ZERO); file->deps[file->ndeps] = dep; file->ndeps++; KLD_DPF(FILE, ("linker_file_add_dependency:" " adding %s as dependency for %s\n", dep->filename, file->filename)); return (0); } /* * Locate a linker set and its contents. This is a helper function to avoid * linker_if.h exposure elsewhere. Note: firstp and lastp are really void **. * This function is used in this file so we can avoid having lots of (void **) * casts. */ int linker_file_lookup_set(linker_file_t file, const char *name, void *firstp, void *lastp, int *countp) { sx_assert(&kld_sx, SA_LOCKED); return (LINKER_LOOKUP_SET(file, name, firstp, lastp, countp)); } /* * List all functions in a file. 
*/ int linker_file_function_listall(linker_file_t lf, linker_function_nameval_callback_t callback_func, void *arg) { return (LINKER_EACH_FUNCTION_NAMEVAL(lf, callback_func, arg)); } caddr_t linker_file_lookup_symbol(linker_file_t file, const char *name, int deps) { caddr_t sym; int locked; locked = sx_xlocked(&kld_sx); if (!locked) sx_xlock(&kld_sx); sym = linker_file_lookup_symbol_internal(file, name, deps); if (!locked) sx_xunlock(&kld_sx); return (sym); } static caddr_t linker_file_lookup_symbol_internal(linker_file_t file, const char *name, int deps) { c_linker_sym_t sym; linker_symval_t symval; caddr_t address; size_t common_size = 0; int i; sx_assert(&kld_sx, SA_XLOCKED); KLD_DPF(SYM, ("linker_file_lookup_symbol: file=%p, name=%s, deps=%d\n", file, name, deps)); if (LINKER_LOOKUP_SYMBOL(file, name, &sym) == 0) { LINKER_SYMBOL_VALUES(file, sym, &symval); if (symval.value == 0) /* * For commons, first look them up in the * dependencies and only allocate space if not found * there. */ common_size = symval.size; else { KLD_DPF(SYM, ("linker_file_lookup_symbol: symbol" ".value=%p\n", symval.value)); return (symval.value); } } if (deps) { for (i = 0; i < file->ndeps; i++) { address = linker_file_lookup_symbol_internal( file->deps[i], name, 0); if (address) { KLD_DPF(SYM, ("linker_file_lookup_symbol:" " deps value=%p\n", address)); return (address); } } } if (common_size > 0) { /* * This is a common symbol which was not found in the * dependencies. We maintain a simple common symbol table in * the file object. */ struct common_symbol *cp; STAILQ_FOREACH(cp, &file->common, link) { if (strcmp(cp->name, name) == 0) { KLD_DPF(SYM, ("linker_file_lookup_symbol:" " old common value=%p\n", cp->address)); return (cp->address); } } /* * Round the symbol size up to align. */ common_size = (common_size + sizeof(int) - 1) & -sizeof(int); cp = malloc(sizeof(struct common_symbol) + common_size + strlen(name) + 1, M_LINKER, M_WAITOK | M_ZERO); cp->address = (caddr_t)(cp + 1); cp->name = cp->address + common_size; strcpy(cp->name, name); bzero(cp->address, common_size); STAILQ_INSERT_TAIL(&file->common, cp, link); KLD_DPF(SYM, ("linker_file_lookup_symbol: new common" " value=%p\n", cp->address)); return (cp->address); } KLD_DPF(SYM, ("linker_file_lookup_symbol: fail\n")); return (0); } /* * Both DDB and stack(9) rely on the kernel linker to provide forward and * backward lookup of symbols. However, DDB and sometimes stack(9) need to * do this in a lockfree manner. We provide a set of internal helper * routines to perform these operations without locks, and then wrappers that * optionally lock. * * linker_debug_lookup() is ifdef DDB as currently it's only used by DDB. 
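A sketch of how a caller might resolve an optional symbol through linker_file_lookup_symbol(), including the file's dependencies (the symbol name "foo_hook" is hypothetical):

static void
call_optional_hook(linker_file_t lf)
{
	void (*hook)(void);

	/* With deps != 0, search lf and everything it depends on. */
	hook = (void (*)(void))linker_file_lookup_symbol(lf, "foo_hook", 1);
	if (hook != NULL)
		hook();
}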
#ifdef DDB static int linker_debug_lookup(const char *symstr, c_linker_sym_t *sym) { linker_file_t lf; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_LOOKUP_SYMBOL(lf, symstr, sym) == 0) return (0); } return (ENOENT); } #endif static int linker_debug_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp) { linker_file_t lf; c_linker_sym_t best, es; u_long diff, bestdiff, off; best = 0; off = (uintptr_t)value; bestdiff = off; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_SEARCH_SYMBOL(lf, value, &es, &diff) != 0) continue; if (es != 0 && diff < bestdiff) { best = es; bestdiff = diff; } if (bestdiff == 0) break; } if (best) { *sym = best; *diffp = bestdiff; return (0); } else { *sym = 0; *diffp = off; return (ENOENT); } } static int linker_debug_symbol_values(c_linker_sym_t sym, linker_symval_t *symval) { linker_file_t lf; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_SYMBOL_VALUES(lf, sym, symval) == 0) return (0); } return (ENOENT); } static int linker_debug_search_symbol_name(caddr_t value, char *buf, u_int buflen, long *offset) { linker_symval_t symval; c_linker_sym_t sym; int error; *offset = 0; error = linker_debug_search_symbol(value, &sym, offset); if (error) return (error); error = linker_debug_symbol_values(sym, &symval); if (error) return (error); strlcpy(buf, symval.name, buflen); return (0); } /* * DDB Helpers. DDB has to look across multiple files with their own symbol * tables and string tables. * * Note that we do not obey list locking protocols here. We really don't need * DDB to hang because somebody's got the lock held. We'll take the chance * that the files list is inconsistent instead. */ #ifdef DDB int linker_ddb_lookup(const char *symstr, c_linker_sym_t *sym) { return (linker_debug_lookup(symstr, sym)); } #endif int linker_ddb_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp) { return (linker_debug_search_symbol(value, sym, diffp)); } int linker_ddb_symbol_values(c_linker_sym_t sym, linker_symval_t *symval) { return (linker_debug_symbol_values(sym, symval)); } int linker_ddb_search_symbol_name(caddr_t value, char *buf, u_int buflen, long *offset) { return (linker_debug_search_symbol_name(value, buf, buflen, offset)); } /* * stack(9) helper for non-debugging environments. Unlike DDB helpers, we do * obey locking protocols, and offer a significantly less complex interface. */ int linker_search_symbol_name(caddr_t value, char *buf, u_int buflen, long *offset) { int error; sx_slock(&kld_sx); error = linker_debug_search_symbol_name(value, buf, buflen, offset); sx_sunlock(&kld_sx); return (error); } /* * Syscalls. */ int kern_kldload(struct thread *td, const char *file, int *fileid) { const char *kldname, *modname; linker_file_t lf; int error; if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); if ((error = priv_check(td, PRIV_KLD_LOAD)) != 0) return (error); /* * It is possible that a kldloaded module will attach a new ifnet, * so the vnet context must be set when this occurs. */ CURVNET_SET(TD_TO_VNET(td)); /* * If file does not contain a qualified name or any dot in it * (kldname.ko, or kldname.ver.ko) treat it as an interface * name. 
*/ if (strchr(file, '/') || strchr(file, '.')) { kldname = file; modname = NULL; } else { kldname = NULL; modname = file; } sx_xlock(&kld_sx); error = linker_load_module(kldname, modname, NULL, NULL, &lf); if (error) { sx_xunlock(&kld_sx); goto done; } lf->userrefs++; if (fileid != NULL) *fileid = lf->id; sx_xunlock(&kld_sx); done: CURVNET_RESTORE(); return (error); } int sys_kldload(struct thread *td, struct kldload_args *uap) { char *pathname = NULL; int error, fileid; td->td_retval[0] = -1; pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL); if (error == 0) { error = kern_kldload(td, pathname, &fileid); if (error == 0) td->td_retval[0] = fileid; } free(pathname, M_TEMP); return (error); } int kern_kldunload(struct thread *td, int fileid, int flags) { linker_file_t lf; int error = 0; if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); if ((error = priv_check(td, PRIV_KLD_UNLOAD)) != 0) return (error); CURVNET_SET(TD_TO_VNET(td)); sx_xlock(&kld_sx); lf = linker_find_file_by_id(fileid); if (lf) { KLD_DPF(FILE, ("kldunload: lf->userrefs=%d\n", lf->userrefs)); if (lf->userrefs == 0) { /* * XXX: maybe LINKER_UNLOAD_FORCE should override ? */ printf("kldunload: attempt to unload file that was" " loaded by the kernel\n"); error = EBUSY; } else { lf->userrefs--; error = linker_file_unload(lf, flags); if (error) lf->userrefs++; } } else error = ENOENT; sx_xunlock(&kld_sx); CURVNET_RESTORE(); return (error); } int sys_kldunload(struct thread *td, struct kldunload_args *uap) { return (kern_kldunload(td, uap->fileid, LINKER_UNLOAD_NORMAL)); } int sys_kldunloadf(struct thread *td, struct kldunloadf_args *uap) { if (uap->flags != LINKER_UNLOAD_NORMAL && uap->flags != LINKER_UNLOAD_FORCE) return (EINVAL); return (kern_kldunload(td, uap->fileid, uap->flags)); } int sys_kldfind(struct thread *td, struct kldfind_args *uap) { char *pathname; const char *filename; linker_file_t lf; int error; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif td->td_retval[0] = -1; pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); if ((error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL)) != 0) goto out; filename = linker_basename(pathname); sx_xlock(&kld_sx); lf = linker_find_file_by_name(filename); if (lf) td->td_retval[0] = lf->id; else error = ENOENT; sx_xunlock(&kld_sx); out: free(pathname, M_TEMP); return (error); } int sys_kldnext(struct thread *td, struct kldnext_args *uap) { linker_file_t lf; int error = 0; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif sx_xlock(&kld_sx); if (uap->fileid == 0) lf = TAILQ_FIRST(&linker_files); else { lf = linker_find_file_by_id(uap->fileid); if (lf == NULL) { error = ENOENT; goto out; } lf = TAILQ_NEXT(lf, link); } /* Skip partially loaded files. */ while (lf != NULL && !(lf->flags & LINKER_FILE_LINKED)) lf = TAILQ_NEXT(lf, link); if (lf) td->td_retval[0] = lf->id; else td->td_retval[0] = 0; out: sx_xunlock(&kld_sx); return (error); } int sys_kldstat(struct thread *td, struct kldstat_args *uap) { struct kld_file_stat *stat; int error, version; /* * Check the version of the user's structure. 
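From userland these entry points are reached through the kldload(2)/kldunload(2) wrappers; a minimal sketch (the module name "foo.ko" is hypothetical):

#include <sys/param.h>
#include <sys/linker.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	int fileid;

	if ((fileid = kldload("foo.ko")) == -1)
		err(1, "kldload(foo.ko)");
	printf("loaded as id %d\n", fileid);
	if (kldunload(fileid) == -1)
		err(1, "kldunload");
	return (0);
}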
*/ if ((error = copyin(&uap->stat->version, &version, sizeof(version))) != 0) return (error); if (version != sizeof(struct kld_file_stat_1) && version != sizeof(struct kld_file_stat)) return (EINVAL); stat = malloc(sizeof(*stat), M_TEMP, M_WAITOK | M_ZERO); error = kern_kldstat(td, uap->fileid, stat); if (error == 0) error = copyout(stat, uap->stat, version); free(stat, M_TEMP); return (error); } int kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat) { linker_file_t lf; int namelen; #ifdef MAC int error; error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif sx_xlock(&kld_sx); lf = linker_find_file_by_id(fileid); if (lf == NULL) { sx_xunlock(&kld_sx); return (ENOENT); } /* Version 1 fields: */ namelen = strlen(lf->filename) + 1; if (namelen > sizeof(stat->name)) namelen = sizeof(stat->name); bcopy(lf->filename, &stat->name[0], namelen); stat->refs = lf->refs; stat->id = lf->id; stat->address = lf->address; stat->size = lf->size; /* Version 2 fields: */ namelen = strlen(lf->pathname) + 1; if (namelen > sizeof(stat->pathname)) namelen = sizeof(stat->pathname); bcopy(lf->pathname, &stat->pathname[0], namelen); sx_xunlock(&kld_sx); td->td_retval[0] = 0; return (0); } #ifdef DDB DB_COMMAND(kldstat, db_kldstat) { linker_file_t lf; #define POINTER_WIDTH ((int)(sizeof(void *) * 2 + 2)) db_printf("Id Refs Address%*c Size Name\n", POINTER_WIDTH - 7, ' '); #undef POINTER_WIDTH TAILQ_FOREACH(lf, &linker_files, link) { if (db_pager_quit) return; db_printf("%2d %4d %p %-8zx %s\n", lf->id, lf->refs, lf->address, lf->size, lf->filename); } } #endif /* DDB */ int sys_kldfirstmod(struct thread *td, struct kldfirstmod_args *uap) { linker_file_t lf; module_t mp; int error = 0; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif sx_xlock(&kld_sx); lf = linker_find_file_by_id(uap->fileid); if (lf) { MOD_SLOCK; mp = TAILQ_FIRST(&lf->modules); if (mp != NULL) td->td_retval[0] = module_getid(mp); else td->td_retval[0] = 0; MOD_SUNLOCK; } else error = ENOENT; sx_xunlock(&kld_sx); return (error); } int sys_kldsym(struct thread *td, struct kldsym_args *uap) { char *symstr = NULL; c_linker_sym_t sym; linker_symval_t symval; linker_file_t lf; struct kld_sym_lookup lookup; int error = 0; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif if ((error = copyin(uap->data, &lookup, sizeof(lookup))) != 0) return (error); if (lookup.version != sizeof(lookup) || uap->cmd != KLDSYM_LOOKUP) return (EINVAL); symstr = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); if ((error = copyinstr(lookup.symname, symstr, MAXPATHLEN, NULL)) != 0) goto out; sx_xlock(&kld_sx); if (uap->fileid != 0) { lf = linker_find_file_by_id(uap->fileid); if (lf == NULL) error = ENOENT; else if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 && LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) { lookup.symvalue = (uintptr_t) symval.value; lookup.symsize = symval.size; error = copyout(&lookup, uap->data, sizeof(lookup)); } else error = ENOENT; } else { TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 && LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) { lookup.symvalue = (uintptr_t)symval.value; lookup.symsize = symval.size; error = copyout(&lookup, uap->data, sizeof(lookup)); break; } } if (lf == NULL) error = ENOENT; } sx_xunlock(&kld_sx); out: free(symstr, M_TEMP); return (error); } /* * Preloaded module support */ static modlist_t modlist_lookup(const char *name, int ver) { modlist_t mod; TAILQ_FOREACH(mod, &found_modules, 
link) { if (strcmp(mod->name, name) == 0 && (ver == 0 || mod->version == ver)) return (mod); } return (NULL); } static modlist_t modlist_lookup2(const char *name, const struct mod_depend *verinfo) { modlist_t mod, bestmod; int ver; if (verinfo == NULL) return (modlist_lookup(name, 0)); bestmod = NULL; TAILQ_FOREACH(mod, &found_modules, link) { if (strcmp(mod->name, name) != 0) continue; ver = mod->version; if (ver == verinfo->md_ver_preferred) return (mod); if (ver >= verinfo->md_ver_minimum && ver <= verinfo->md_ver_maximum && (bestmod == NULL || ver > bestmod->version)) bestmod = mod; } return (bestmod); } static modlist_t modlist_newmodule(const char *modname, int version, linker_file_t container) { modlist_t mod; mod = malloc(sizeof(struct modlist), M_LINKER, M_NOWAIT | M_ZERO); if (mod == NULL) panic("no memory for module list"); mod->container = container; mod->name = modname; mod->version = version; TAILQ_INSERT_TAIL(&found_modules, mod, link); return (mod); } static void linker_addmodules(linker_file_t lf, struct mod_metadata **start, struct mod_metadata **stop, int preload) { struct mod_metadata *mp, **mdp; const char *modname; int ver; for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; ver = ((const struct mod_version *)mp->md_data)->mv_version; if (modlist_lookup(modname, ver) != NULL) { printf("module %s already present!\n", modname); /* XXX what can we do? this is a build error. :-( */ continue; } modlist_newmodule(modname, ver, lf); } } static void linker_preload(void *arg) { caddr_t modptr; const char *modname, *nmodname; char *modtype; linker_file_t lf, nlf; linker_class_t lc; int error; linker_file_list_t loaded_files; linker_file_list_t depended_files; struct mod_metadata *mp, *nmp; struct mod_metadata **start, **stop, **mdp, **nmdp; const struct mod_depend *verinfo; int nver; int resolves; modlist_t mod; struct sysinit **si_start, **si_stop; TAILQ_INIT(&loaded_files); TAILQ_INIT(&depended_files); TAILQ_INIT(&found_modules); error = 0; modptr = NULL; sx_xlock(&kld_sx); while ((modptr = preload_search_next_name(modptr)) != NULL) { modname = (char *)preload_search_info(modptr, MODINFO_NAME); modtype = (char *)preload_search_info(modptr, MODINFO_TYPE); if (modname == NULL) { printf("Preloaded module at %p does not have a" " name!\n", modptr); continue; } if (modtype == NULL) { printf("Preloaded module at %p does not have a type!\n", modptr); continue; } if (bootverbose) printf("Preloaded %s \"%s\" at %p.\n", modtype, modname, modptr); lf = NULL; TAILQ_FOREACH(lc, &classes, link) { error = LINKER_LINK_PRELOAD(lc, modname, &lf); if (!error) break; lf = NULL; } if (lf) TAILQ_INSERT_TAIL(&loaded_files, lf, loaded); } /* * First get a list of stuff in the kernel. */ if (linker_file_lookup_set(linker_kernel_file, MDT_SETNAME, &start, &stop, NULL) == 0) linker_addmodules(linker_kernel_file, start, stop, 1); /* * This is a once-off kinky bubble sort to resolve relocation * dependency requirements. */ restart: TAILQ_FOREACH(lf, &loaded_files, loaded) { error = linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, NULL); /* * First, look to see if we would successfully link with this * stuff. 
*/ resolves = 1; /* unless we know otherwise */ if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; for (nmdp = start; nmdp < stop; nmdp++) { nmp = *nmdp; if (nmp->md_type != MDT_VERSION) continue; nmodname = nmp->md_cval; if (strcmp(modname, nmodname) == 0) break; } if (nmdp < stop) /* it's a self reference */ continue; /* * ok, the module isn't here yet, we * are not finished */ if (modlist_lookup2(modname, verinfo) == NULL) resolves = 0; } } /* * OK, if we found our modules, we can link. So, "provide" * the modules inside and add it to the end of the link order * list. */ if (resolves) { if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; nver = ((const struct mod_version *) mp->md_data)->mv_version; if (modlist_lookup(modname, nver) != NULL) { printf("module %s already" " present!\n", modname); TAILQ_REMOVE(&loaded_files, lf, loaded); linker_file_unload(lf, LINKER_UNLOAD_FORCE); /* we changed tailq next ptr */ goto restart; } modlist_newmodule(modname, nver, lf); } } TAILQ_REMOVE(&loaded_files, lf, loaded); TAILQ_INSERT_TAIL(&depended_files, lf, loaded); /* * Since we provided modules, we need to restart the * sort so that the previous files that depend on us * have a chance. Also, we've busted the tailq next * pointer with the REMOVE. */ goto restart; } } /* * At this point, we check to see what could not be resolved.. */ while ((lf = TAILQ_FIRST(&loaded_files)) != NULL) { TAILQ_REMOVE(&loaded_files, lf, loaded); printf("KLD file %s is missing dependencies\n", lf->filename); linker_file_unload(lf, LINKER_UNLOAD_FORCE); } /* * We made it. Finish off the linking in the order we determined. */ TAILQ_FOREACH_SAFE(lf, &depended_files, loaded, nlf) { if (linker_kernel_file) { linker_kernel_file->refs++; error = linker_file_add_dependency(lf, linker_kernel_file); if (error) panic("cannot add dependency"); } error = linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, NULL); if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; mod = modlist_lookup2(modname, verinfo); if (mod == NULL) { printf("KLD file %s - cannot find " "dependency \"%s\"\n", lf->filename, modname); goto fail; } /* Don't count self-dependencies */ if (lf == mod->container) continue; mod->container->refs++; error = linker_file_add_dependency(lf, mod->container); if (error) panic("cannot add dependency"); } } /* * Now do relocation etc using the symbol search paths * established by the dependencies */ error = LINKER_LINK_PRELOAD_FINISH(lf); if (error) { printf("KLD file %s - could not finalize loading\n", lf->filename); goto fail; } linker_file_register_modules(lf); if (!TAILQ_EMPTY(&lf->modules)) lf->flags |= LINKER_FILE_MODULES; if (linker_file_lookup_set(lf, "sysinit_set", &si_start, &si_stop, NULL) == 0) sysinit_add(si_start, si_stop); linker_file_register_sysctls(lf, true); lf->flags |= LINKER_FILE_LINKED; continue; fail: TAILQ_REMOVE(&depended_files, lf, loaded); linker_file_unload(lf, LINKER_UNLOAD_FORCE); } sx_xunlock(&kld_sx); /* woohoo! we made it! */ } SYSINIT(preload, SI_SUB_KLD, SI_ORDER_MIDDLE, linker_preload, 0); /* * Handle preload files that failed to load any modules. 
static void linker_preload_finish(void *arg) { linker_file_t lf, nlf; sx_xlock(&kld_sx); TAILQ_FOREACH_SAFE(lf, &linker_files, link, nlf) { /* * If all of the modules in this file failed to load, unload * the file and return an error of ENOEXEC. (Parity with * linker_load_file.) */ if ((lf->flags & LINKER_FILE_MODULES) != 0 && TAILQ_EMPTY(&lf->modules)) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); continue; } lf->flags &= ~LINKER_FILE_MODULES; lf->userrefs++; /* so we can (try to) kldunload it */ } sx_xunlock(&kld_sx); } /* * Attempt to run after all DECLARE_MODULE SYSINITs. Unfortunately they can be * scheduled at any subsystem and order, so run this as late as possible. init * becomes runnable in SI_SUB_KTHREAD_INIT, so go slightly before that. */ SYSINIT(preload_finish, SI_SUB_KTHREAD_INIT - 100, SI_ORDER_MIDDLE, linker_preload_finish, 0); /* * Search for a not-loaded module by name. * * Modules may be found in the following locations: * * - preloaded (result is just the module name) - on disk (result is full path * to module) * * If the module name is qualified in any way (contains path, etc.) then we * simply return a copy of it. * * The search path can be manipulated via sysctl. Note that we use the ';' * character as a separator to be consistent with the bootloader. */ static char linker_hintfile[] = "linker.hints"; static char linker_path[MAXPATHLEN] = "/boot/kernel;/boot/modules"; SYSCTL_STRING(_kern, OID_AUTO, module_path, CTLFLAG_RWTUN, linker_path, sizeof(linker_path), "module load search path"); TUNABLE_STR("module_path", linker_path, sizeof(linker_path)); static char *linker_ext_list[] = { "", ".ko", NULL }; /* * Check if file actually exists either with or without extension listed in * the linker_ext_list. (probably should be generic for the rest of the * kernel) */ static char * linker_lookup_file(const char *path, int pathlen, const char *name, int namelen, struct vattr *vap) { struct nameidata nd; struct thread *td = curthread; /* XXX */ char *result, **cpp, *sep; int error, len, extlen, reclen, flags; enum vtype type; extlen = 0; for (cpp = linker_ext_list; *cpp; cpp++) { len = strlen(*cpp); if (len > extlen) extlen = len; } extlen++; /* trailing '\0' */ sep = (path[pathlen - 1] != '/') ? "/" : ""; reclen = pathlen + strlen(sep) + namelen + extlen + 1; result = malloc(reclen, M_LINKER, M_WAITOK); for (cpp = linker_ext_list; *cpp; cpp++) { snprintf(result, reclen, "%.*s%s%.*s%s", pathlen, path, sep, namelen, name, *cpp); /* * Attempt to open the file, and return the path if * we succeed and it's a regular file. */ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, result, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error == 0) { NDFREE(&nd, NDF_ONLY_PNBUF); type = nd.ni_vp->v_type; if (vap) VOP_GETATTR(nd.ni_vp, vap, td->td_ucred); VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); if (type == VREG) return (result); } } free(result, M_LINKER); return (NULL); } #define INT_ALIGN(base, ptr) ptr = \ (base) + roundup2((ptr) - (base), sizeof(int)) /* * Look up the KLD which contains the requested module in the "linker.hints" * file. If a version specification is available, then try to find the best * KLD. Otherwise just find the latest one. */ static char * linker_hints_lookup(const char *path, int pathlen, const char *modname, int modnamelen, const struct mod_depend *verinfo) { struct thread *td = curthread; /* XXX */ struct ucred *cred = td ? 
td->td_ucred : NULL; struct nameidata nd; struct vattr vattr, mattr; u_char *hints = NULL; u_char *cp, *recptr, *bufend, *result, *best, *pathbuf, *sep; int error, ival, bestver, *intp, found, flags, clen, blen; ssize_t reclen; result = NULL; bestver = found = 0; sep = (path[pathlen - 1] != '/') ? "/" : ""; reclen = imax(modnamelen, strlen(linker_hintfile)) + pathlen + strlen(sep) + 1; pathbuf = malloc(reclen, M_LINKER, M_WAITOK); snprintf(pathbuf, reclen, "%.*s%s%s", pathlen, path, sep, linker_hintfile); NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, pathbuf, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) goto bad; NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp->v_type != VREG) goto bad; best = cp = NULL; error = VOP_GETATTR(nd.ni_vp, &vattr, cred); if (error) goto bad; /* * XXX: we need to limit this number to some reasonable value */ if (vattr.va_size > LINKER_HINTS_MAX) { printf("hints file too large %ld\n", (long)vattr.va_size); goto bad; } hints = malloc(vattr.va_size, M_TEMP, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)hints, vattr.va_size, 0, UIO_SYSSPACE, IO_NODELOCKED, cred, NOCRED, &reclen, td); if (error) goto bad; VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, cred, td); nd.ni_vp = NULL; if (reclen != 0) { printf("can't read %zd\n", reclen); goto bad; } intp = (int *)hints; ival = *intp++; if (ival != LINKER_HINTS_VERSION) { printf("hints file version mismatch %d\n", ival); goto bad; } bufend = hints + vattr.va_size; recptr = (u_char *)intp; clen = blen = 0; while (recptr < bufend && !found) { intp = (int *)recptr; reclen = *intp++; ival = *intp++; cp = (char *)intp; switch (ival) { case MDT_VERSION: clen = *cp++; if (clen != modnamelen || bcmp(cp, modname, clen) != 0) break; cp += clen; INT_ALIGN(hints, cp); ival = *(int *)cp; cp += sizeof(int); clen = *cp++; if (verinfo == NULL || ival == verinfo->md_ver_preferred) { found = 1; break; } if (ival >= verinfo->md_ver_minimum && ival <= verinfo->md_ver_maximum && ival > bestver) { bestver = ival; best = cp; blen = clen; } break; default: break; } recptr += reclen + sizeof(int); } /* * Finally, check that the KLD is actually in place. */ if (found) result = linker_lookup_file(path, pathlen, cp, clen, &mattr); else if (best) result = linker_lookup_file(path, pathlen, best, blen, &mattr); /* * The KLD is newer than the hints file. What should we do now? */ if (result && timespeccmp(&mattr.va_mtime, &vattr.va_mtime, >)) printf("warning: KLD '%s' is newer than the linker.hints" " file\n", result); bad: free(pathbuf, M_LINKER); if (hints) free(hints, M_TEMP); if (nd.ni_vp != NULL) { VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, cred, td); } /* * If nothing was found or the hints file is absent, fall back to the * old way by using "kldname[.ko]" as the module name. */ if (!found && !bestver && result == NULL) result = linker_lookup_file(path, pathlen, modname, modnamelen, NULL); return (result); } /* * Look up the KLD which contains the requested module in all directories. */ static char * linker_search_module(const char *modname, int modnamelen, const struct mod_depend *verinfo) { char *cp, *ep, *result; /* * traverse the linker path */ for (cp = linker_path; *cp; cp = ep + 1) { /* find the end of this component */ for (ep = cp; (*ep != 0) && (*ep != ';'); ep++); result = linker_hints_lookup(cp, ep - cp, modname, modnamelen, verinfo); if (result != NULL) return (result); if (*ep == 0) break; } return (NULL); } /* * Search for module in all directories listed in the linker_path. 
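The linker.hints file parsed by linker_hints_lookup() is generated by kldxref(8); as consumed above, its layout is approximately the following (a sketch inferred from the parsing code, not a formal specification):

	int	version;	/* must equal LINKER_HINTS_VERSION */
	/* then, repeated until end of file: */
	int	reclen;		/* length of the payload that follows */
	int	type;		/* e.g. MDT_VERSION */
	/* for MDT_VERSION records: */
	u_char	namelen; char name[namelen];	/* module name (counted) */
	int	modver;		/* int-aligned via INT_ALIGN() */
	u_char	filelen; char file[filelen];	/* KLD file name (counted) */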
*/ static char * linker_search_kld(const char *name) { char *cp, *ep, *result; int len; /* qualified at all? */ if (strchr(name, '/')) return (strdup(name, M_LINKER)); /* traverse the linker path */ len = strlen(name); for (ep = linker_path; *ep; ep++) { cp = ep; /* find the end of this component */ for (; *ep != 0 && *ep != ';'; ep++); result = linker_lookup_file(cp, ep - cp, name, len, NULL); if (result != NULL) return (result); } return (NULL); } static const char * linker_basename(const char *path) { const char *filename; filename = strrchr(path, '/'); if (filename == NULL) return path; if (filename[1]) filename++; return (filename); } #ifdef HWPMC_HOOKS /* * Inform hwpmc about the set of kernel modules currently loaded. */ void * linker_hwpmc_list_objects(void) { linker_file_t lf; struct pmckern_map_in *kobase; int i, nmappings; nmappings = 0; sx_slock(&kld_sx); TAILQ_FOREACH(lf, &linker_files, link) nmappings++; /* Allocate nmappings + 1 entries. */ kobase = malloc((nmappings + 1) * sizeof(struct pmckern_map_in), M_LINKER, M_WAITOK | M_ZERO); i = 0; TAILQ_FOREACH(lf, &linker_files, link) { /* Save the info for this linker file. */ kobase[i].pm_file = lf->filename; kobase[i].pm_address = (uintptr_t)lf->address; i++; } sx_sunlock(&kld_sx); KASSERT(i > 0, ("linker_hwpmc_list_objects: no kernel objects?")); /* The last entry of the malloced area consists of all zeros. */ KASSERT(kobase[i].pm_file == NULL, ("linker_hwpmc_list_objects: last object not NULL")); return ((void *)kobase); } #endif /* * Find a file which contains the given module and load it; if "parent" is * not NULL, register a reference to it. */ static int linker_load_module(const char *kldname, const char *modname, struct linker_file *parent, const struct mod_depend *verinfo, struct linker_file **lfpp) { linker_file_t lfdep; const char *filename; char *pathname; int error; sx_assert(&kld_sx, SA_XLOCKED); if (modname == NULL) { /* * We have to load a KLD */ KASSERT(verinfo == NULL, ("linker_load_module: verinfo" " is not NULL")); pathname = linker_search_kld(kldname); } else { if (modlist_lookup2(modname, verinfo) != NULL) return (EEXIST); if (kldname != NULL) pathname = strdup(kldname, M_LINKER); else if (rootvnode == NULL) pathname = NULL; else /* * Need to find a KLD with the required module */ pathname = linker_search_module(modname, strlen(modname), verinfo); } if (pathname == NULL) return (ENOENT); /* * Can't load more than one file with the same basename. XXX: * Actually it should be possible to have multiple KLDs with * the same basename but different path because they can * provide different versions of the same modules. */ filename = linker_basename(pathname); if (linker_find_file_by_name(filename)) error = EEXIST; else do { error = linker_load_file(pathname, &lfdep); if (error) break; if (modname && verinfo && modlist_lookup2(modname, verinfo) == NULL) { linker_file_unload(lfdep, LINKER_UNLOAD_FORCE); error = ENOENT; break; } if (parent) { error = linker_file_add_dependency(parent, lfdep); if (error) break; } if (lfpp) *lfpp = lfdep; } while (0); free(pathname, M_LINKER); return (error); } /* * This routine is responsible for finding dependencies of userland-initiated * kldload(2)'s of files. */ int linker_load_dependencies(linker_file_t lf) { linker_file_t lfdep; struct mod_metadata **start, **stop, **mdp, **nmdp; struct mod_metadata *mp, *nmp; const struct mod_depend *verinfo; modlist_t mod; const char *modname, *nmodname; int ver, error = 0, count; /* * All files are dependent on /kernel.
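 * Resolution then proceeds in three passes over the file's metadata (a summary of * the code below, not a normative spec): the kernel itself is registered as a * dependency first; next every MDT_VERSION record is checked against the already * loaded modules, so a duplicate interface fails early with EEXIST; finally each * MDT_DEPEND record is satisfied either by a module that is already loaded or by * loading the containing KLD through linker_load_module().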
*/ sx_assert(&kld_sx, SA_XLOCKED); if (linker_kernel_file) { linker_kernel_file->refs++; error = linker_file_add_dependency(lf, linker_kernel_file); if (error) return (error); } if (linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, &count) != 0) return (0); for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; ver = ((const struct mod_version *)mp->md_data)->mv_version; mod = modlist_lookup(modname, ver); if (mod != NULL) { printf("interface %s.%d already present in the KLD" " '%s'!\n", modname, ver, mod->container->filename); return (EEXIST); } } for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; nmodname = NULL; for (nmdp = start; nmdp < stop; nmdp++) { nmp = *nmdp; if (nmp->md_type != MDT_VERSION) continue; nmodname = nmp->md_cval; if (strcmp(modname, nmodname) == 0) break; } if (nmdp < stop)/* early exit, it's a self reference */ continue; mod = modlist_lookup2(modname, verinfo); if (mod) { /* woohoo, it's loaded already */ lfdep = mod->container; lfdep->refs++; error = linker_file_add_dependency(lf, lfdep); if (error) break; continue; } error = linker_load_module(NULL, modname, lf, verinfo, NULL); if (error) { printf("KLD %s: depends on %s - not available or" " version mismatch\n", lf->filename, modname); break; } } if (error) return (error); linker_addmodules(lf, start, stop, 0); return (error); } static int sysctl_kern_function_list_iterate(const char *name, void *opaque) { struct sysctl_req *req; req = opaque; return (SYSCTL_OUT(req, name, strlen(name) + 1)); } /* * Export a nul-separated, double-nul-terminated list of all function names * in the kernel. */ static int sysctl_kern_function_list(SYSCTL_HANDLER_ARGS) { linker_file_t lf; int error; #ifdef MAC error = mac_kld_check_stat(req->td->td_ucred); if (error) return (error); #endif error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sx_xlock(&kld_sx); TAILQ_FOREACH(lf, &linker_files, link) { error = LINKER_EACH_FUNCTION_NAME(lf, sysctl_kern_function_list_iterate, req); if (error) { sx_xunlock(&kld_sx); return (error); } } sx_xunlock(&kld_sx); return (SYSCTL_OUT(req, "", 1)); } SYSCTL_PROC(_kern, OID_AUTO, function_list, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_kern_function_list, "", "kernel function list"); Index: head/sys/kern/kern_lock.c =================================================================== --- head/sys/kern/kern_lock.c (revision 326270) +++ head/sys/kern/kern_lock.c (revision 326271) @@ -1,1547 +1,1549 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008 Attilio Rao * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #ifdef DEBUG_LOCKS #include #endif #include #include #include #ifdef DDB #include #endif #ifdef HWPMC_HOOKS #include PMC_SOFT_DECLARE( , , lock, failed); #endif CTASSERT(((LK_ADAPTIVE | LK_NOSHARE) & LO_CLASSFLAGS) == (LK_ADAPTIVE | LK_NOSHARE)); CTASSERT(LK_UNLOCKED == (LK_UNLOCKED & ~(LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS))); #define SQ_EXCLUSIVE_QUEUE 0 #define SQ_SHARED_QUEUE 1 #ifndef INVARIANTS #define _lockmgr_assert(lk, what, file, line) #endif #define TD_SLOCKS_INC(td) ((td)->td_lk_slocks++) #define TD_SLOCKS_DEC(td) ((td)->td_lk_slocks--) #ifndef DEBUG_LOCKS #define STACK_PRINT(lk) #define STACK_SAVE(lk) #define STACK_ZERO(lk) #else #define STACK_PRINT(lk) stack_print_ddb(&(lk)->lk_stack) #define STACK_SAVE(lk) stack_save(&(lk)->lk_stack) #define STACK_ZERO(lk) stack_zero(&(lk)->lk_stack) #endif #define LOCK_LOG2(lk, string, arg1, arg2) \ if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \ CTR2(KTR_LOCK, (string), (arg1), (arg2)) #define LOCK_LOG3(lk, string, arg1, arg2, arg3) \ if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \ CTR3(KTR_LOCK, (string), (arg1), (arg2), (arg3)) #define GIANT_DECLARE \ int _i = 0; \ WITNESS_SAVE_DECL(Giant) #define GIANT_RESTORE() do { \ if (_i > 0) { \ while (_i--) \ mtx_lock(&Giant); \ WITNESS_RESTORE(&Giant.lock_object, Giant); \ } \ } while (0) #define GIANT_SAVE() do { \ if (mtx_owned(&Giant)) { \ WITNESS_SAVE(&Giant.lock_object, Giant); \ while (mtx_owned(&Giant)) { \ _i++; \ mtx_unlock(&Giant); \ } \ } \ } while (0) #define LK_CAN_SHARE(x, flags) \ (((x) & LK_SHARE) && \ (((x) & (LK_EXCLUSIVE_WAITERS | LK_EXCLUSIVE_SPINNERS)) == 0 || \ (curthread->td_lk_slocks != 0 && !(flags & LK_NODDLKTREAT)) || \ (curthread->td_pflags & TDP_DEADLKTREAT))) #define LK_TRYOP(x) \ ((x) & LK_NOWAIT) #define LK_CAN_WITNESS(x) \ (((x) & LK_NOWITNESS) == 0 && !LK_TRYOP(x)) #define LK_TRYWIT(x) \ (LK_TRYOP(x) ? 
LOP_TRYLOCK : 0) #define LK_CAN_ADAPT(lk, f) \ (((lk)->lock_object.lo_flags & LK_ADAPTIVE) != 0 && \ ((f) & LK_SLEEPFAIL) == 0) #define lockmgr_disowned(lk) \ (((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == LK_KERNPROC) #define lockmgr_xlocked(lk) \ (((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == (uintptr_t)curthread) static void assert_lockmgr(const struct lock_object *lock, int how); #ifdef DDB static void db_show_lockmgr(const struct lock_object *lock); #endif static void lock_lockmgr(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_lockmgr(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_lockmgr(struct lock_object *lock); struct lock_class lock_class_lockmgr = { .lc_name = "lockmgr", .lc_flags = LC_RECURSABLE | LC_SLEEPABLE | LC_SLEEPLOCK | LC_UPGRADABLE, .lc_assert = assert_lockmgr, #ifdef DDB .lc_ddb_show = db_show_lockmgr, #endif .lc_lock = lock_lockmgr, .lc_unlock = unlock_lockmgr, #ifdef KDTRACE_HOOKS .lc_owner = owner_lockmgr, #endif }; static bool __always_inline lockmgr_slock_try(struct lock *lk, uintptr_t *xp, int flags); static bool __always_inline lockmgr_sunlock_try(struct lock *lk, uintptr_t x); static void lockmgr_note_shared_acquire(struct lock *lk, int contested, uint64_t waittime, const char *file, int line, int flags) { lock_profile_obtain_lock_success(&lk->lock_object, contested, waittime, file, line); LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, file, line); WITNESS_LOCK(&lk->lock_object, LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); TD_SLOCKS_INC(curthread); STACK_SAVE(lk); } static void lockmgr_note_shared_release(struct lock *lk, const char *file, int line) { lock_profile_release_lock(&lk->lock_object); WITNESS_UNLOCK(&lk->lock_object, 0, file, line); LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, file, line); TD_LOCKS_DEC(curthread); TD_SLOCKS_DEC(curthread); } static void lockmgr_note_exclusive_acquire(struct lock *lk, int contested, uint64_t waittime, const char *file, int line, int flags) { lock_profile_obtain_lock_success(&lk->lock_object, contested, waittime, file, line); LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); STACK_SAVE(lk); } static void lockmgr_note_exclusive_release(struct lock *lk, const char *file, int line) { lock_profile_release_lock(&lk->lock_object); LOCK_LOG_LOCK("XUNLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_DEC(curthread); } static void lockmgr_note_exclusive_upgrade(struct lock *lk, const char *file, int line, int flags) { LOCK_LOG_LOCK("XUPGRADE", &lk->lock_object, 0, 0, file, line); WITNESS_UPGRADE(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_SLOCKS_DEC(curthread); } static __inline struct thread * lockmgr_xholder(const struct lock *lk) { uintptr_t x; x = lk->lk_lock; return ((x & LK_SHARE) ? NULL : (struct thread *)LK_HOLDER(x)); } /* * It assumes sleepq_lock held and returns with this one unheld. * It also assumes the generic interlock is sane and previously checked. * If LK_INTERLOCK is specified the interlock is not reacquired after the * sleep. */ static __inline int sleeplk(struct lock *lk, u_int flags, struct lock_object *ilk, const char *wmesg, int pri, int timo, int queue) { GIANT_DECLARE; struct lock_class *class; int catch, error; class = (flags & LK_INTERLOCK) ? 
LOCK_CLASS(ilk) : NULL; catch = pri & PCATCH; pri &= PRIMASK; error = 0; LOCK_LOG3(lk, "%s: %p blocking on the %s sleepqueue", __func__, lk, (queue == SQ_EXCLUSIVE_QUEUE) ? "exclusive" : "shared"); if (flags & LK_INTERLOCK) class->lc_unlock(ilk); if (queue == SQ_EXCLUSIVE_QUEUE && (flags & LK_SLEEPFAIL) != 0) lk->lk_exslpfail++; GIANT_SAVE(); sleepq_add(&lk->lock_object, NULL, wmesg, SLEEPQ_LK | (catch ? SLEEPQ_INTERRUPTIBLE : 0), queue); if ((flags & LK_TIMELOCK) && timo) sleepq_set_timeout(&lk->lock_object, timo); /* * Decide which sleep variant to use. */ if ((flags & LK_TIMELOCK) && timo && catch) error = sleepq_timedwait_sig(&lk->lock_object, pri); else if ((flags & LK_TIMELOCK) && timo) error = sleepq_timedwait(&lk->lock_object, pri); else if (catch) error = sleepq_wait_sig(&lk->lock_object, pri); else sleepq_wait(&lk->lock_object, pri); GIANT_RESTORE(); if ((flags & LK_SLEEPFAIL) && error == 0) error = ENOLCK; return (error); } static __inline int wakeupshlk(struct lock *lk, const char *file, int line) { uintptr_t v, x; u_int realexslp; int queue, wakeup_swapper; wakeup_swapper = 0; for (;;) { x = lk->lk_lock; if (lockmgr_sunlock_try(lk, x)) break; /* * We should have a sharer with waiters, so enter the hard * path in order to handle wakeups correctly. */ sleepq_lock(&lk->lock_object); x = lk->lk_lock & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); v = LK_UNLOCKED; /* * If the lock has exclusive waiters, give them preference in * order to avoid deadlock with shared runners up. * If interruptible sleeps left the exclusive queue empty * avoid starvation of the threads sleeping on the shared * queue by giving them precedence and cleaning up the * exclusive waiters bit anyway. * Please note that lk_exslpfail count may be lying about * the real number of waiters with the LK_SLEEPFAIL flag on * because they may be used in conjunction with interruptible * sleeps so lk_exslpfail might be considered an 'upper limit' * bound, including the edge cases. */ realexslp = sleepq_sleepcnt(&lk->lock_object, SQ_EXCLUSIVE_QUEUE); if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) { if (lk->lk_exslpfail < realexslp) { lk->lk_exslpfail = 0; queue = SQ_EXCLUSIVE_QUEUE; v |= (x & LK_SHARED_WAITERS); } else { lk->lk_exslpfail = 0; LOCK_LOG2(lk, "%s: %p has only LK_SLEEPFAIL sleepers", __func__, lk); LOCK_LOG2(lk, "%s: %p waking up threads on the exclusive queue", __func__, lk); wakeup_swapper = sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); queue = SQ_SHARED_QUEUE; } } else { /* * Exclusive waiters sleeping with LK_SLEEPFAIL on * and using interruptible sleeps/timeout may have * left spurious lk_exslpfail counts on, so clean * it up anyway. */ lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; } if (!atomic_cmpset_rel_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x, v)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue", __func__, lk, queue == SQ_SHARED_QUEUE ?
"shared" : "exclusive"); wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, queue); sleepq_release(&lk->lock_object); break; } lockmgr_note_shared_release(lk, file, line); return (wakeup_swapper); } static void assert_lockmgr(const struct lock_object *lock, int what) { panic("lockmgr locks do not support assertions"); } static void lock_lockmgr(struct lock_object *lock, uintptr_t how) { panic("lockmgr locks do not support sleep interlocking"); } static uintptr_t unlock_lockmgr(struct lock_object *lock) { panic("lockmgr locks do not support sleep interlocking"); } #ifdef KDTRACE_HOOKS static int owner_lockmgr(const struct lock_object *lock, struct thread **owner) { panic("lockmgr locks do not support owner inquiring"); } #endif void lockinit(struct lock *lk, int pri, const char *wmesg, int timo, int flags) { int iflags; MPASS((flags & ~LK_INIT_MASK) == 0); ASSERT_ATOMIC_LOAD_PTR(lk->lk_lock, ("%s: lockmgr not aligned for %s: %p", __func__, wmesg, &lk->lk_lock)); iflags = LO_SLEEPABLE | LO_UPGRADABLE; if (flags & LK_CANRECURSE) iflags |= LO_RECURSABLE; if ((flags & LK_NODUP) == 0) iflags |= LO_DUPOK; if (flags & LK_NOPROFILE) iflags |= LO_NOPROFILE; if ((flags & LK_NOWITNESS) == 0) iflags |= LO_WITNESS; if (flags & LK_QUIET) iflags |= LO_QUIET; if (flags & LK_IS_VNODE) iflags |= LO_IS_VNODE; iflags |= flags & (LK_ADAPTIVE | LK_NOSHARE); lock_init(&lk->lock_object, &lock_class_lockmgr, wmesg, NULL, iflags); lk->lk_lock = LK_UNLOCKED; lk->lk_recurse = 0; lk->lk_exslpfail = 0; lk->lk_timo = timo; lk->lk_pri = pri; STACK_ZERO(lk); } /* * XXX: Gross hacks to manipulate external lock flags after * initialization. Used for certain vnode and buf locks. */ void lockallowshare(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags &= ~LK_NOSHARE; } void lockdisableshare(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags |= LK_NOSHARE; } void lockallowrecurse(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags |= LO_RECURSABLE; } void lockdisablerecurse(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags &= ~LO_RECURSABLE; } void lockdestroy(struct lock *lk) { KASSERT(lk->lk_lock == LK_UNLOCKED, ("lockmgr still held")); KASSERT(lk->lk_recurse == 0, ("lockmgr still recursed")); KASSERT(lk->lk_exslpfail == 0, ("lockmgr still exclusive waiters")); lock_destroy(&lk->lock_object); } static bool __always_inline lockmgr_slock_try(struct lock *lk, uintptr_t *xp, int flags) { /* * If no other thread has an exclusive lock, or * no exclusive waiter is present, bump the count of * sharers. Since we have to preserve the state of * waiters, if we fail to acquire the shared lock * loop back and retry. */ *xp = lk->lk_lock; while (LK_CAN_SHARE(*xp, flags)) { if (atomic_fcmpset_acq_ptr(&lk->lk_lock, xp, *xp + LK_ONE_SHARER)) { return (true); } } return (false); } static bool __always_inline lockmgr_sunlock_try(struct lock *lk, uintptr_t x) { for (;;) { /* * If there is more than one shared lock held, just drop one * and return. */ if (LK_SHARERS(x) > 1) { if (atomic_fcmpset_rel_ptr(&lk->lk_lock, &x, x - LK_ONE_SHARER)) return (true); continue; } /* * If there are not waiters on the exclusive queue, drop the * lock quickly. 
*/ if ((x & LK_ALL_WAITERS) == 0) { MPASS((x & ~LK_EXCLUSIVE_SPINNERS) == LK_SHARERS_LOCK(1)); if (atomic_fcmpset_rel_ptr(&lk->lk_lock, &x, LK_UNLOCKED)) return (true); continue; } break; } return (false); } int lockmgr_lock_fast_path(struct lock *lk, u_int flags, struct lock_object *ilk, const char *file, int line) { struct lock_class *class; uintptr_t x, v, tid; u_int op; bool locked; op = flags & LK_TYPE_MASK; locked = false; switch (op) { case LK_SHARED: if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, file, line, flags & LK_INTERLOCK ? ilk : NULL); if (__predict_false(lk->lock_object.lo_flags & LK_NOSHARE)) break; if (lockmgr_slock_try(lk, &x, flags)) { lockmgr_note_shared_acquire(lk, 0, 0, file, line, flags); locked = true; } break; case LK_EXCLUSIVE: if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ? ilk : NULL); tid = (uintptr_t)curthread; if (lk->lk_lock == LK_UNLOCKED && atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) { lockmgr_note_exclusive_acquire(lk, 0, 0, file, line, flags); locked = true; } break; case LK_UPGRADE: case LK_TRYUPGRADE: _lockmgr_assert(lk, KA_SLOCKED, file, line); tid = (uintptr_t)curthread; v = lk->lk_lock; x = v & LK_ALL_WAITERS; v &= LK_EXCLUSIVE_SPINNERS; if (atomic_cmpset_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x | v, tid | x)) { lockmgr_note_exclusive_upgrade(lk, file, line, flags); locked = true; } break; default: break; } if (__predict_true(locked)) { if (__predict_false(flags & LK_INTERLOCK)) { class = LOCK_CLASS(ilk); class->lc_unlock(ilk); } return (0); } else { return (__lockmgr_args(lk, flags, ilk, LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, file, line)); } } int lockmgr_unlock_fast_path(struct lock *lk, u_int flags, struct lock_object *ilk) { struct lock_class *class; uintptr_t x, tid; bool unlocked; const char *file; int line; file = __FILE__; line = __LINE__; _lockmgr_assert(lk, KA_LOCKED, file, line); unlocked = false; x = lk->lk_lock; if (__predict_true(x & LK_SHARE) != 0) { if (lockmgr_sunlock_try(lk, x)) { lockmgr_note_shared_release(lk, file, line); unlocked = true; } } else { tid = (uintptr_t)curthread; if (!lockmgr_recursed(lk) && atomic_cmpset_rel_ptr(&lk->lk_lock, tid, LK_UNLOCKED)) { lockmgr_note_exclusive_release(lk, file, line); unlocked = true; } } if (__predict_true(unlocked)) { if (__predict_false(flags & LK_INTERLOCK)) { class = LOCK_CLASS(ilk); class->lc_unlock(ilk); } return (0); } else { return (__lockmgr_args(lk, flags | LK_RELEASE, ilk, LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, LOCK_FILE, LOCK_LINE)); } } int __lockmgr_args(struct lock *lk, u_int flags, struct lock_object *ilk, const char *wmesg, int pri, int timo, const char *file, int line) { GIANT_DECLARE; struct lock_class *class; const char *iwmesg; uintptr_t tid, v, x; u_int op, realexslp; int error, ipri, itimo, queue, wakeup_swapper; #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif error = 0; tid = (uintptr_t)curthread; op = (flags & LK_TYPE_MASK); iwmesg = (wmesg == LK_WMESG_DEFAULT) ? lk->lock_object.lo_name : wmesg; ipri = (pri == LK_PRIO_DEFAULT) ? lk->lk_pri : pri; itimo = (timo == LK_TIMO_DEFAULT) ? 
lk->lk_timo : timo; MPASS((flags & ~LK_TOTAL_MASK) == 0); KASSERT((op & (op - 1)) == 0, ("%s: Invalid requested operation @ %s:%d", __func__, file, line)); KASSERT((flags & (LK_NOWAIT | LK_SLEEPFAIL)) == 0 || (op != LK_DOWNGRADE && op != LK_RELEASE), ("%s: Invalid flags in regard of the operation desired @ %s:%d", __func__, file, line)); KASSERT((flags & LK_INTERLOCK) == 0 || ilk != NULL, ("%s: LK_INTERLOCK passed without valid interlock @ %s:%d", __func__, file, line)); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("%s: idle thread %p on lockmgr %s @ %s:%d", __func__, curthread, lk->lock_object.lo_name, file, line)); class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL; if (panicstr != NULL) { if (flags & LK_INTERLOCK) class->lc_unlock(ilk); return (0); } if (lk->lock_object.lo_flags & LK_NOSHARE) { switch (op) { case LK_SHARED: op = LK_EXCLUSIVE; break; case LK_UPGRADE: case LK_TRYUPGRADE: case LK_DOWNGRADE: _lockmgr_assert(lk, KA_XLOCKED | KA_NOTRECURSED, file, line); if (flags & LK_INTERLOCK) class->lc_unlock(ilk); return (0); } } wakeup_swapper = 0; switch (op) { case LK_SHARED: if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, file, line, flags & LK_INTERLOCK ? ilk : NULL); for (;;) { if (lockmgr_slock_try(lk, &x, flags)) break; #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&lk->lock_object, &contested, &waittime); /* * If the lock is already held by curthread in * an exclusive way, avoid a deadlock. */ if (LK_HOLDER(x) == tid) { LOCK_LOG2(lk, "%s: %p already held in exclusive mode", __func__, lk); error = EDEADLK; break; } /* * If the operation is expected not to sleep, just * give up and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; break; } /* * Acquire the sleepqueue chain lock because we * probably will need to manipulate waiters flags. */ sleepq_lock(&lk->lock_object); x = lk->lk_lock; /* * If the lock can be acquired in shared mode, try * again. */ if (LK_CAN_SHARE(x, flags)) { sleepq_release(&lk->lock_object); continue; } /* * Try to set the LK_SHARED_WAITERS flag. If we fail, * loop back and retry. */ if ((x & LK_SHARED_WAITERS) == 0) { if (!atomic_cmpset_acq_ptr(&lk->lk_lock, x, x | LK_SHARED_WAITERS)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG2(lk, "%s: %p set shared waiters flag", __func__, lk); } /* * Since we have been unable to acquire the * shared lock and the shared waiters flag is set, * we will sleep. */ error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo, SQ_SHARED_QUEUE); flags &= ~LK_INTERLOCK; if (error) { LOCK_LOG3(lk, "%s: interrupted sleep for %p with %d", __func__, lk, error); break; } LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", __func__, lk); } if (error == 0) { #ifdef LOCK_PROFILING lockmgr_note_shared_acquire(lk, contested, waittime, file, line, flags); #else lockmgr_note_shared_acquire(lk, 0, 0, file, line, flags); #endif } break; case LK_UPGRADE: case LK_TRYUPGRADE: _lockmgr_assert(lk, KA_SLOCKED, file, line); v = lk->lk_lock; x = v & LK_ALL_WAITERS; v &= LK_EXCLUSIVE_SPINNERS; /* * Try to switch from one shared lock to an exclusive one. * We need to preserve waiters flags during the operation.
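 * (A sketch of the intended transition, under the lk_lock encoding used in this * file: the CAS below can only succeed while exactly one sharer is left, turning * LK_SHARERS_LOCK(1) | waiters into tid | waiters in a single step.)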
*/ if (atomic_cmpset_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x | v, tid | x)) { LOCK_LOG_LOCK("XUPGRADE", &lk->lock_object, 0, 0, file, line); WITNESS_UPGRADE(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_SLOCKS_DEC(curthread); break; } /* * In LK_TRYUPGRADE mode, do not drop the lock, * returning EBUSY instead. */ if (op == LK_TRYUPGRADE) { LOCK_LOG2(lk, "%s: %p failed the nowait upgrade", __func__, lk); error = EBUSY; break; } /* * We have been unable to upgrade, so just * give up the shared lock. */ wakeup_swapper |= wakeupshlk(lk, file, line); /* FALLTHROUGH */ case LK_EXCLUSIVE: if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ? ilk : NULL); /* * If curthread already holds the lock and this one is * allowed to recurse, simply recurse on it. */ if (lockmgr_xlocked(lk)) { if ((flags & LK_CANRECURSE) == 0 && (lk->lock_object.lo_flags & LO_RECURSABLE) == 0) { /* * If the operation is expected not to * panic, just give up and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; break; } if (flags & LK_INTERLOCK) class->lc_unlock(ilk); panic("%s: recursing on non recursive lockmgr %s @ %s:%d\n", __func__, iwmesg, file, line); } lk->lk_recurse++; LOCK_LOG2(lk, "%s: %p recursing", __func__, lk); LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); break; } for (;;) { if (lk->lk_lock == LK_UNLOCKED && atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) break; #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&lk->lock_object, &contested, &waittime); /* * If the operation is expected not to sleep, just * give up and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; break; } /* * Acquire the sleepqueue chain lock because we * probably will need to manipulate waiters flags. */ sleepq_lock(&lk->lock_object); x = lk->lk_lock; /* * If the lock has been released while we spun on * the sleepqueue chain lock, just try again. */ if (x == LK_UNLOCKED) { sleepq_release(&lk->lock_object); continue; } /* * The lock can be in the state where there is a * pending queue of waiters, but still no owner. * This happens when the lock is contested and an * owner is going to claim the lock. * If curthread is the one successfully acquiring it, * claim lock ownership and return, preserving waiters * flags. */ v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); if ((x & ~v) == LK_UNLOCKED) { v &= ~LK_EXCLUSIVE_SPINNERS; if (atomic_cmpset_acq_ptr(&lk->lk_lock, x, tid | v)) { sleepq_release(&lk->lock_object); LOCK_LOG2(lk, "%s: %p claimed by a new writer", __func__, lk); break; } sleepq_release(&lk->lock_object); continue; } /* * Try to set the LK_EXCLUSIVE_WAITERS flag. If we * fail, loop back and retry. */ if ((x & LK_EXCLUSIVE_WAITERS) == 0) { if (!atomic_cmpset_ptr(&lk->lk_lock, x, x | LK_EXCLUSIVE_WAITERS)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG2(lk, "%s: %p set excl waiters flag", __func__, lk); } /* * Since we have been unable to acquire the * exclusive lock and the exclusive waiters flag * is set, we will sleep.
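 * (The sleeplk() call below releases the sleepqueue lock for us; it returns 0 * after a normal wakeup so this loop retries, ENOLCK when LK_SLEEPFAIL was * requested, or the error from an interrupted or timed-out sleep.)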
*/ error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo, SQ_EXCLUSIVE_QUEUE); flags &= ~LK_INTERLOCK; if (error) { LOCK_LOG3(lk, "%s: interrupted sleep for %p with %d", __func__, lk, error); break; } LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", __func__, lk); } if (error == 0) { #ifdef LOCK_PROFILING lockmgr_note_exclusive_acquire(lk, contested, waittime, file, line, flags); #else lockmgr_note_exclusive_acquire(lk, 0, 0, file, line, flags); #endif } break; case LK_DOWNGRADE: _lockmgr_assert(lk, KA_XLOCKED, file, line); LOCK_LOG_LOCK("XDOWNGRADE", &lk->lock_object, 0, 0, file, line); WITNESS_DOWNGRADE(&lk->lock_object, 0, file, line); /* * Panic if the lock is recursed. */ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) { if (flags & LK_INTERLOCK) class->lc_unlock(ilk); panic("%s: downgrade a recursed lockmgr %s @ %s:%d\n", __func__, iwmesg, file, line); } TD_SLOCKS_INC(curthread); /* * In order to preserve waiters flags, just spin. */ for (;;) { x = lk->lk_lock; MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); x &= LK_ALL_WAITERS; if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x, LK_SHARERS_LOCK(1) | x)) break; cpu_spinwait(); } break; case LK_RELEASE: _lockmgr_assert(lk, KA_LOCKED, file, line); x = lk->lk_lock; if ((x & LK_SHARE) == 0) { /* * As a first option, treat the lock as if it has * no waiters. * Fix up the tid var if the lock has been disowned. */ if (LK_HOLDER(x) == LK_KERNPROC) tid = LK_KERNPROC; else { WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_DEC(curthread); } LOCK_LOG_LOCK("XUNLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); /* * The lock is held in exclusive mode. * If the lock is recursed also, then unrecurse it. */ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) { LOCK_LOG2(lk, "%s: %p unrecursing", __func__, lk); lk->lk_recurse--; break; } if (tid != LK_KERNPROC) lock_profile_release_lock(&lk->lock_object); if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid, LK_UNLOCKED)) break; sleepq_lock(&lk->lock_object); x = lk->lk_lock; v = LK_UNLOCKED; /* * If the lock has exclusive waiters, give them * preference in order to avoid deadlock with * shared runners up. * If interruptible sleeps left the exclusive queue * empty avoid starvation of the threads sleeping * on the shared queue by giving them precedence * and cleaning up the exclusive waiters bit anyway. * Please note that lk_exslpfail count may be lying * about the real number of waiters with the * LK_SLEEPFAIL flag on because they may be used in * conjunction with interruptible sleeps so * lk_exslpfail might be considered an 'upper limit' * bound, including the edge cases. */ MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); realexslp = sleepq_sleepcnt(&lk->lock_object, SQ_EXCLUSIVE_QUEUE); if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) { if (lk->lk_exslpfail < realexslp) { lk->lk_exslpfail = 0; queue = SQ_EXCLUSIVE_QUEUE; v |= (x & LK_SHARED_WAITERS); } else { lk->lk_exslpfail = 0; LOCK_LOG2(lk, "%s: %p has only LK_SLEEPFAIL sleepers", __func__, lk); LOCK_LOG2(lk, "%s: %p waking up threads on the exclusive queue", __func__, lk); wakeup_swapper = sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); queue = SQ_SHARED_QUEUE; } } else { /* * Exclusive waiters sleeping with LK_SLEEPFAIL * on and using interruptible sleeps/timeout * may have left spurious lk_exslpfail counts * on, so clean it up anyway. */ lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; } LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue", __func__, lk, queue == SQ_SHARED_QUEUE ?
"shared" : "exclusive"); atomic_store_rel_ptr(&lk->lk_lock, v); wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, queue); sleepq_release(&lk->lock_object); break; } else wakeup_swapper = wakeupshlk(lk, file, line); break; case LK_DRAIN: if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ? ilk : NULL); /* * Trying to drain a lock we already own will result in a * deadlock. */ if (lockmgr_xlocked(lk)) { if (flags & LK_INTERLOCK) class->lc_unlock(ilk); panic("%s: draining %s with the lock held @ %s:%d\n", __func__, iwmesg, file, line); } for (;;) { if (lk->lk_lock == LK_UNLOCKED && atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) break; #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&lk->lock_object, &contested, &waittime); /* * If the lock is expected to not sleep just give up * and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; break; } /* * Acquire the sleepqueue chain lock because we * probabilly will need to manipulate waiters flags. */ sleepq_lock(&lk->lock_object); x = lk->lk_lock; /* * if the lock has been released while we spun on * the sleepqueue chain lock just try again. */ if (x == LK_UNLOCKED) { sleepq_release(&lk->lock_object); continue; } v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); if ((x & ~v) == LK_UNLOCKED) { v = (x & ~LK_EXCLUSIVE_SPINNERS); /* * If interruptible sleeps left the exclusive * queue empty avoid a starvation for the * threads sleeping on the shared queue by * giving them precedence and cleaning up the * exclusive waiters bit anyway. * Please note that lk_exslpfail count may be * lying about the real number of waiters with * the LK_SLEEPFAIL flag on because they may * be used in conjunction with interruptible * sleeps so lk_exslpfail might be considered * an 'upper limit' bound, including the edge * cases. */ if (v & LK_EXCLUSIVE_WAITERS) { queue = SQ_EXCLUSIVE_QUEUE; v &= ~LK_EXCLUSIVE_WAITERS; } else { /* * Exclusive waiters sleeping with * LK_SLEEPFAIL on and using * interruptible sleeps/timeout may * have left spourious lk_exslpfail * counts on, so clean it up anyway. */ MPASS(v & LK_SHARED_WAITERS); lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; v &= ~LK_SHARED_WAITERS; } if (queue == SQ_EXCLUSIVE_QUEUE) { realexslp = sleepq_sleepcnt(&lk->lock_object, SQ_EXCLUSIVE_QUEUE); if (lk->lk_exslpfail >= realexslp) { lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; v &= ~LK_SHARED_WAITERS; if (realexslp != 0) { LOCK_LOG2(lk, "%s: %p has only LK_SLEEPFAIL sleepers", __func__, lk); LOCK_LOG2(lk, "%s: %p waking up threads on the exclusive queue", __func__, lk); wakeup_swapper = sleepq_broadcast( &lk->lock_object, SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); } } else lk->lk_exslpfail = 0; } if (!atomic_cmpset_ptr(&lk->lk_lock, x, v)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG3(lk, "%s: %p waking up all threads on the %s queue", __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" : "exclusive"); wakeup_swapper |= sleepq_broadcast( &lk->lock_object, SLEEPQ_LK, 0, queue); /* * If shared waiters have been woken up we need * to wait for one of them to acquire the lock * before to set the exclusive waiters in * order to avoid a deadlock. */ if (queue == SQ_SHARED_QUEUE) { for (v = lk->lk_lock; (v & LK_SHARE) && !LK_SHARERS(v); v = lk->lk_lock) cpu_spinwait(); } } /* * Try to set the LK_EXCLUSIVE_WAITERS flag. If we * fail, loop back and retry. 
*/ if ((x & LK_EXCLUSIVE_WAITERS) == 0) { if (!atomic_cmpset_ptr(&lk->lk_lock, x, x | LK_EXCLUSIVE_WAITERS)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG2(lk, "%s: %p set drain waiters flag", __func__, lk); } /* * Since we have been unable to acquire the * exclusive lock and the exclusive waiters flag * is set, we will sleep. */ if (flags & LK_INTERLOCK) { class->lc_unlock(ilk); flags &= ~LK_INTERLOCK; } GIANT_SAVE(); sleepq_add(&lk->lock_object, NULL, iwmesg, SLEEPQ_LK, SQ_EXCLUSIVE_QUEUE); sleepq_wait(&lk->lock_object, ipri & PRIMASK); GIANT_RESTORE(); LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", __func__, lk); } if (error == 0) { lock_profile_obtain_lock_success(&lk->lock_object, contested, waittime, file, line); LOCK_LOG_LOCK("DRAIN", &lk->lock_object, 0, lk->lk_recurse, file, line); WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); STACK_SAVE(lk); } break; default: if (flags & LK_INTERLOCK) class->lc_unlock(ilk); panic("%s: unknown lockmgr request 0x%x\n", __func__, op); } if (flags & LK_INTERLOCK) class->lc_unlock(ilk); if (wakeup_swapper) kick_proc0(); return (error); } void _lockmgr_disown(struct lock *lk, const char *file, int line) { uintptr_t tid, x; if (SCHEDULER_STOPPED()) return; tid = (uintptr_t)curthread; _lockmgr_assert(lk, KA_XLOCKED, file, line); /* * Panic if the lock is recursed. */ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) panic("%s: disown a recursed lockmgr @ %s:%d\n", __func__, file, line); /* * If the owner is already LK_KERNPROC, just skip the whole operation. */ if (LK_HOLDER(lk->lk_lock) != tid) return; lock_profile_release_lock(&lk->lock_object); LOCK_LOG_LOCK("XDISOWN", &lk->lock_object, 0, 0, file, line); WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_DEC(curthread); STACK_SAVE(lk); /* * In order to preserve waiters flags, just spin.
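 * (A sketch of the transition performed below, assuming the encoding used * throughout this file: tid | waiters becomes LK_KERNPROC | waiters through a * release CAS, retried until no concurrent update of the waiter bits interferes.)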
*/ for (;;) { x = lk->lk_lock; MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); x &= LK_ALL_WAITERS; if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x, LK_KERNPROC | x)) return; cpu_spinwait(); } } void lockmgr_printinfo(const struct lock *lk) { struct thread *td; uintptr_t x; if (lk->lk_lock == LK_UNLOCKED) printf("lock type %s: UNLOCKED\n", lk->lock_object.lo_name); else if (lk->lk_lock & LK_SHARE) printf("lock type %s: SHARED (count %ju)\n", lk->lock_object.lo_name, (uintmax_t)LK_SHARERS(lk->lk_lock)); else { td = lockmgr_xholder(lk); if (td == (struct thread *)LK_KERNPROC) printf("lock type %s: EXCL by KERNPROC\n", lk->lock_object.lo_name); else printf("lock type %s: EXCL by thread %p " "(pid %d, %s, tid %d)\n", lk->lock_object.lo_name, td, td->td_proc->p_pid, td->td_proc->p_comm, td->td_tid); } x = lk->lk_lock; if (x & LK_EXCLUSIVE_WAITERS) printf(" with exclusive waiters pending\n"); if (x & LK_SHARED_WAITERS) printf(" with shared waiters pending\n"); if (x & LK_EXCLUSIVE_SPINNERS) printf(" with exclusive spinners pending\n"); STACK_PRINT(lk); } int lockstatus(const struct lock *lk) { uintptr_t v, x; int ret; ret = LK_SHARED; x = lk->lk_lock; v = LK_HOLDER(x); if ((x & LK_SHARE) == 0) { if (v == (uintptr_t)curthread || v == LK_KERNPROC) ret = LK_EXCLUSIVE; else ret = LK_EXCLOTHER; } else if (x == LK_UNLOCKED) ret = 0; return (ret); } #ifdef INVARIANT_SUPPORT FEATURE(invariant_support, "Support for modules compiled with INVARIANTS option"); #ifndef INVARIANTS #undef _lockmgr_assert #endif void _lockmgr_assert(const struct lock *lk, int what, const char *file, int line) { int slocked = 0; if (panicstr != NULL) return; switch (what) { case KA_SLOCKED: case KA_SLOCKED | KA_NOTRECURSED: case KA_SLOCKED | KA_RECURSED: slocked = 1; case KA_LOCKED: case KA_LOCKED | KA_NOTRECURSED: case KA_LOCKED | KA_RECURSED: #ifdef WITNESS /* * We cannot trust WITNESS if the lock is held in exclusive * mode and a call to lockmgr_disown() happened. * Work around this by skipping the check if the lock is held * in exclusive mode, even for the KA_LOCKED case. */ if (slocked || (lk->lk_lock & LK_SHARE)) { witness_assert(&lk->lock_object, what, file, line); break; } #endif if (lk->lk_lock == LK_UNLOCKED || ((lk->lk_lock & LK_SHARE) == 0 && (slocked || (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk))))) panic("Lock %s not %slocked @ %s:%d\n", lk->lock_object.lo_name, slocked ?
"share" : "", file, line); if ((lk->lk_lock & LK_SHARE) == 0) { if (lockmgr_recursed(lk)) { if (what & KA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); } else if (what & KA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); } break; case KA_XLOCKED: case KA_XLOCKED | KA_NOTRECURSED: case KA_XLOCKED | KA_RECURSED: if (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk)) panic("Lock %s not exclusively locked @ %s:%d\n", lk->lock_object.lo_name, file, line); if (lockmgr_recursed(lk)) { if (what & KA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); } else if (what & KA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); break; case KA_UNLOCKED: if (lockmgr_xlocked(lk) || lockmgr_disowned(lk)) panic("Lock %s exclusively locked @ %s:%d\n", lk->lock_object.lo_name, file, line); break; default: panic("Unknown lockmgr assertion: %d @ %s:%d\n", what, file, line); } } #endif #ifdef DDB int lockmgr_chain(struct thread *td, struct thread **ownerp) { struct lock *lk; lk = td->td_wchan; if (LOCK_CLASS(&lk->lock_object) != &lock_class_lockmgr) return (0); db_printf("blocked on lockmgr %s", lk->lock_object.lo_name); if (lk->lk_lock & LK_SHARE) db_printf("SHARED (count %ju)\n", (uintmax_t)LK_SHARERS(lk->lk_lock)); else db_printf("EXCL\n"); *ownerp = lockmgr_xholder(lk); return (1); } static void db_show_lockmgr(const struct lock_object *lock) { struct thread *td; const struct lock *lk; lk = (const struct lock *)lock; db_printf(" state: "); if (lk->lk_lock == LK_UNLOCKED) db_printf("UNLOCKED\n"); else if (lk->lk_lock & LK_SHARE) db_printf("SLOCK: %ju\n", (uintmax_t)LK_SHARERS(lk->lk_lock)); else { td = lockmgr_xholder(lk); if (td == (struct thread *)LK_KERNPROC) db_printf("XLOCK: LK_KERNPROC\n"); else db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_proc->p_comm); if (lockmgr_recursed(lk)) db_printf(" recursed: %d\n", lk->lk_recurse); } db_printf(" waiters: "); switch (lk->lk_lock & LK_ALL_WAITERS) { case LK_SHARED_WAITERS: db_printf("shared\n"); break; case LK_EXCLUSIVE_WAITERS: db_printf("exclusive\n"); break; case LK_ALL_WAITERS: db_printf("shared and exclusive\n"); break; default: db_printf("none\n"); } db_printf(" spinners: "); if (lk->lk_lock & LK_EXCLUSIVE_SPINNERS) db_printf("exclusive\n"); else db_printf("none\n"); } #endif Index: head/sys/kern/kern_lockstat.c =================================================================== --- head/sys/kern/kern_lockstat.c (revision 326270) +++ head/sys/kern/kern_lockstat.c (revision 326271) @@ -1,82 +1,84 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright 2008-2009 Stacey Son * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include SDT_PROVIDER_DEFINE(lockstat); SDT_PROBE_DEFINE1(lockstat, , , adaptive__acquire, "struct mtx *"); SDT_PROBE_DEFINE1(lockstat, , , adaptive__release, "struct mtx *"); SDT_PROBE_DEFINE2(lockstat, , , adaptive__spin, "struct mtx *", "uint64_t"); SDT_PROBE_DEFINE2(lockstat, , , adaptive__block, "struct mtx *", "uint64_t"); SDT_PROBE_DEFINE1(lockstat, , , spin__acquire, "struct mtx *"); SDT_PROBE_DEFINE1(lockstat, , , spin__release, "struct mtx *"); SDT_PROBE_DEFINE2(lockstat, , , spin__spin, "struct mtx *", "uint64_t"); SDT_PROBE_DEFINE2(lockstat, , , rw__acquire, "struct rwlock *", "int"); SDT_PROBE_DEFINE2(lockstat, , , rw__release, "struct rwlock *", "int"); SDT_PROBE_DEFINE5(lockstat, , , rw__block, "struct rwlock *", "uint64_t", "int", "int", "int"); SDT_PROBE_DEFINE2(lockstat, , , rw__spin, "struct rwlock *", "uint64_t"); SDT_PROBE_DEFINE1(lockstat, , , rw__upgrade, "struct rwlock *"); SDT_PROBE_DEFINE1(lockstat, , , rw__downgrade, "struct rwlock *"); SDT_PROBE_DEFINE2(lockstat, , , sx__acquire, "struct sx *", "int"); SDT_PROBE_DEFINE2(lockstat, , , sx__release, "struct sx *", "int"); SDT_PROBE_DEFINE5(lockstat, , , sx__block, "struct sx *", "uint64_t", "int", "int", "int"); SDT_PROBE_DEFINE2(lockstat, , , sx__spin, "struct sx *", "uint64_t"); SDT_PROBE_DEFINE1(lockstat, , , sx__upgrade, "struct sx *"); SDT_PROBE_DEFINE1(lockstat, , , sx__downgrade, "struct sx *"); SDT_PROBE_DEFINE2(lockstat, , , thread__spin, "struct mtx *", "uint64_t"); volatile int __read_frequently lockstat_enabled; uint64_t lockstat_nsecs(struct lock_object *lo) { struct bintime bt; uint64_t ns; if (!lockstat_enabled) return (0); if ((lo->lo_flags & LO_NOPROFILE) != 0) return (0); binuptime(&bt); ns = bt.sec * (uint64_t)1000000000; ns += ((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32; return (ns); } Index: head/sys/kern/kern_loginclass.c =================================================================== --- head/sys/kern/kern_loginclass.c (revision 326270) +++ head/sys/kern/kern_loginclass.c (revision 326271) @@ -1,256 +1,258 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Processes may set login class name using setloginclass(2). This * is usually done through call to setusercontext(3), by programs * such as login(1), based on information from master.passwd(5). Kernel * uses this information to enforce per-class resource limits. Current * login class can be determined using id(1). Login class is inherited * from the parent process during fork(2). If not set, it defaults * to "default". * * Code in this file implements setloginclass(2) and getloginclass(2) * system calls, and maintains class name storage and retrieval. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_LOGINCLASS, "loginclass", "loginclass structures"); LIST_HEAD(, loginclass) loginclasses; /* * Lock protecting loginclasses list. */ static struct rwlock loginclasses_lock; RW_SYSINIT(loginclasses_init, &loginclasses_lock, "loginclasses lock"); void loginclass_hold(struct loginclass *lc) { refcount_acquire(&lc->lc_refcount); } void loginclass_free(struct loginclass *lc) { int old; old = lc->lc_refcount; if (old > 1 && atomic_cmpset_int(&lc->lc_refcount, old, old - 1)) return; rw_wlock(&loginclasses_lock); if (!refcount_release(&lc->lc_refcount)) { rw_wunlock(&loginclasses_lock); return; } racct_destroy(&lc->lc_racct); LIST_REMOVE(lc, lc_next); rw_wunlock(&loginclasses_lock); free(lc, M_LOGINCLASS); } /* * Look up a loginclass struct for the parameter name. * loginclasses_lock must be locked. * Increase refcount on loginclass struct returned. */ static struct loginclass * loginclass_lookup(const char *name) { struct loginclass *lc; rw_assert(&loginclasses_lock, RA_LOCKED); LIST_FOREACH(lc, &loginclasses, lc_next) if (strcmp(name, lc->lc_name) == 0) { loginclass_hold(lc); break; } return (lc); } /* * Return loginclass structure with a corresponding name. Not * performance critical, as it's used mainly by setloginclass(2), * which happens once per login session. Caller has to use * loginclass_free() on the returned value when it's no longer * needed. 
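 * * A minimal usage sketch (mirroring what sys_setloginclass() below does; "staff" * is only an example class name): * * lc = loginclass_find("staff"); * if (lc == NULL) * return (EINVAL); * ... store lc in a ucred ... * loginclass_free(lc);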
*/ struct loginclass * loginclass_find(const char *name) { struct loginclass *lc, *new_lc; if (name[0] == '\0' || strlen(name) >= MAXLOGNAME) return (NULL); lc = curthread->td_ucred->cr_loginclass; if (strcmp(name, lc->lc_name) == 0) { loginclass_hold(lc); return (lc); } rw_rlock(&loginclasses_lock); lc = loginclass_lookup(name); rw_runlock(&loginclasses_lock); if (lc != NULL) return (lc); new_lc = malloc(sizeof(*new_lc), M_LOGINCLASS, M_ZERO | M_WAITOK); racct_create(&new_lc->lc_racct); refcount_init(&new_lc->lc_refcount, 1); strcpy(new_lc->lc_name, name); rw_wlock(&loginclasses_lock); /* * There's a chance someone created our loginclass while we * were in malloc and not holding the lock, so we have to * make sure we don't insert a duplicate loginclass. */ if ((lc = loginclass_lookup(name)) == NULL) { LIST_INSERT_HEAD(&loginclasses, new_lc, lc_next); rw_wunlock(&loginclasses_lock); lc = new_lc; } else { rw_wunlock(&loginclasses_lock); racct_destroy(&new_lc->lc_racct); free(new_lc, M_LOGINCLASS); } return (lc); } /* * Get login class name. */ #ifndef _SYS_SYSPROTO_H_ struct getloginclass_args { char *namebuf; size_t namelen; }; #endif /* ARGSUSED */ int sys_getloginclass(struct thread *td, struct getloginclass_args *uap) { struct loginclass *lc; size_t lcnamelen; lc = td->td_ucred->cr_loginclass; lcnamelen = strlen(lc->lc_name) + 1; if (lcnamelen > uap->namelen) return (ERANGE); return (copyout(lc->lc_name, uap->namebuf, lcnamelen)); } /* * Set login class name. */ #ifndef _SYS_SYSPROTO_H_ struct setloginclass_args { const char *namebuf; }; #endif /* ARGSUSED */ int sys_setloginclass(struct thread *td, struct setloginclass_args *uap) { struct proc *p = td->td_proc; int error; char lcname[MAXLOGNAME]; struct loginclass *newlc; struct ucred *newcred, *oldcred; error = priv_check(td, PRIV_PROC_SETLOGINCLASS); if (error != 0) return (error); error = copyinstr(uap->namebuf, lcname, sizeof(lcname), NULL); if (error != 0) return (error); newlc = loginclass_find(lcname); if (newlc == NULL) return (EINVAL); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); newcred->cr_loginclass = newlc; proc_set_cred(p, newcred); PROC_UNLOCK(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif loginclass_free(oldcred->cr_loginclass); crfree(oldcred); return (0); } void loginclass_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void (*pre)(void), void (*post)(void), void *arg2, void *arg3) { struct loginclass *lc; rw_rlock(&loginclasses_lock); if (pre != NULL) (pre)(); LIST_FOREACH(lc, &loginclasses, lc_next) (callback)(lc->lc_racct, arg2, arg3); if (post != NULL) (post)(); rw_runlock(&loginclasses_lock); } Index: head/sys/kern/kern_mbuf.c =================================================================== --- head/sys/kern/kern_mbuf.c (revision 326270) +++ head/sys/kern/kern_mbuf.c (revision 326271) @@ -1,944 +1,946 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004, 2005, * Bosko Milekic . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA * Zones. * * Mbuf Clusters (2K, contiguous) are allocated from the Cluster * Zone. The Zone can be capped at kern.ipc.nmbclusters, if the * administrator so desires. * * Mbufs are allocated from a UMA Master Zone called the Mbuf * Zone. * * Additionally, FreeBSD provides a Packet Zone, which it * configures as a Secondary Zone to the Mbuf Master Zone, * thus sharing backend Slab kegs with the Mbuf Master Zone. * * Thus common-case allocations and locking are simplified: * * m_clget() m_getcl() * | | * | .------------>[(Packet Cache)] m_get(), m_gethdr() * | | [ Packet ] | * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ] * [ Cluster Zone ] [ Zone ] [ Mbuf Master Zone ] * | \________ | * [ Cluster Keg ] \ / * | [ Mbuf Keg ] * [ Cluster Slabs ] | * | [ Mbuf Slabs ] * \____________(VM)_________________/ * * * Whenever an object is allocated with uma_zalloc() out of * one of the Zones its _ctor_ function is executed. The same * for any deallocation through uma_zfree() the _dtor_ function * is executed. * * Caches are per-CPU and are filled from the Master Zone. * * Whenever an object is allocated from the underlying global * memory pool it gets pre-initialized with the _zinit_ functions. * When the Keg's are overfull objects get decommissioned with * _zfini_ functions and free'd back to the global memory pool. * */ int nmbufs; /* limits number of mbufs */ int nmbclusters; /* limits number of mbuf clusters */ int nmbjumbop; /* limits number of page size jumbo clusters */ int nmbjumbo9; /* limits number of 9k jumbo clusters */ int nmbjumbo16; /* limits number of 16k jumbo clusters */ static quad_t maxmbufmem; /* overall real memory limit for all mbufs */ SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0, "Maximum real memory allocatable to various mbuf types"); /* * tunable_mbinit() has to be run before any mbuf allocations are done. */ static void tunable_mbinit(void *dummy) { quad_t realmem; /* * The default limit for all mbuf related memory is 1/2 of all * available kernel memory (physical or kmem). * At most it can be 3/4 of available kernel memory. 
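 * For a worked example (illustrative numbers only): with 8 GB of usable kernel * memory, maxmbufmem defaults to 4 GB, and a kern.ipc.maxmbufmem tunable is * clamped to at most 6 GB; nmbclusters, if left at zero, then defaults to * maxmbufmem / MCLBYTES / 4, i.e. 4 GB / 2 KB / 4 = 524288 standard clusters.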
*/ realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size); maxmbufmem = realmem / 2; TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem); if (maxmbufmem > realmem / 4 * 3) maxmbufmem = realmem / 4 * 3; TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); if (nmbclusters == 0) nmbclusters = maxmbufmem / MCLBYTES / 4; TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop); if (nmbjumbop == 0) nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4; TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9); if (nmbjumbo9 == 0) nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6; TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16); if (nmbjumbo16 == 0) nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6; /* * We need at least as many mbufs as we have clusters of * the various types added together. */ TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) nmbufs = lmax(maxmbufmem / MSIZE / 5, nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16); } SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL); static int sysctl_nmbclusters(SYSCTL_HANDLER_ARGS) { int error, newnmbclusters; newnmbclusters = nmbclusters; error = sysctl_handle_int(oidp, &newnmbclusters, 0, req); if (error == 0 && req->newptr && newnmbclusters != nmbclusters) { if (newnmbclusters > nmbclusters && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbclusters = newnmbclusters; nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); EVENTHANDLER_INVOKE(nmbclusters_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT|CTLFLAG_RW, &nmbclusters, 0, sysctl_nmbclusters, "IU", "Maximum number of mbuf clusters allowed"); static int sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbop; newnmbjumbop = nmbjumbop; error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req); if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) { if (newnmbjumbop > nmbjumbop && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbop = newnmbjumbop; nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbop, 0, sysctl_nmbjumbop, "IU", "Maximum number of mbuf page size jumbo clusters allowed"); static int sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbo9; newnmbjumbo9 = nmbjumbo9; error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req); if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) { if (newnmbjumbo9 > nmbjumbo9 && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbo9 = newnmbjumbo9; nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU", "Maximum number of mbuf 9k jumbo clusters allowed"); static int sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbo16; newnmbjumbo16 = nmbjumbo16; error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req); if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) { if (newnmbjumbo16 > nmbjumbo16 && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbo16 = newnmbjumbo16; nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU", "Maximum number of mbuf 16k jumbo clusters allowed"); static int 
sysctl_nmbufs(SYSCTL_HANDLER_ARGS) { int error, newnmbufs; newnmbufs = nmbufs; error = sysctl_handle_int(oidp, &newnmbufs, 0, req); if (error == 0 && req->newptr && newnmbufs != nmbufs) { if (newnmbufs > nmbufs) { nmbufs = newnmbufs; nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); EVENTHANDLER_INVOKE(nmbufs_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, CTLTYPE_INT|CTLFLAG_RW, &nmbufs, 0, sysctl_nmbufs, "IU", "Maximum number of mbufs allowed"); /* * Zones from which we allocate. */ uma_zone_t zone_mbuf; uma_zone_t zone_clust; uma_zone_t zone_pack; uma_zone_t zone_jumbop; uma_zone_t zone_jumbo9; uma_zone_t zone_jumbo16; /* * Local prototypes. */ static int mb_ctor_mbuf(void *, int, void *, int); static int mb_ctor_clust(void *, int, void *, int); static int mb_ctor_pack(void *, int, void *, int); static void mb_dtor_mbuf(void *, int, void *); static void mb_dtor_pack(void *, int, void *); static int mb_zinit_pack(void *, int, int); static void mb_zfini_pack(void *, int); static void mb_reclaim(uma_zone_t, int); static void *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, uint8_t *, int); /* Ensure that MSIZE is a power of 2. */ CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE); /* * Initialize FreeBSD Network buffer allocation. */ static void mbuf_init(void *dummy) { /* * Configure UMA zones for Mbufs, Clusters, and Packets. */ zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, #ifdef INVARIANTS trash_init, trash_fini, #else NULL, NULL, #endif MSIZE - 1, UMA_ZONE_MAXBUCKET); if (nmbufs > 0) nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached"); uma_zone_set_maxaction(zone_mbuf, mb_reclaim); zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES, mb_ctor_clust, #ifdef INVARIANTS trash_dtor, trash_init, trash_fini, #else NULL, NULL, NULL, #endif UMA_ALIGN_PTR, 0); if (nmbclusters > 0) nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached"); uma_zone_set_maxaction(zone_clust, mb_reclaim); zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack, mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf); /* Make jumbo frame zone too. Page size, 9k and 16k. 
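 *
 * Each jumbo zone is created with the same call shape already used
 * for zone_clust above; a minimal sketch of the recurring pattern,
 * using the names from this file (the trash_* hooks are NULL unless
 * INVARIANTS is compiled in):
 *
 *	zone = uma_zcreate(name, size, mb_ctor_clust,
 *	    trash_dtor, trash_init, trash_fini, UMA_ALIGN_PTR, 0);
 *	uma_zone_set_allocf(zone, mbuf_jumbo_alloc);  (9k and 16k only)
 *	uma_zone_set_max(zone, limit);                (if a limit is set)
 *	uma_zone_set_warning(zone, "kern.ipc.<tunable> limit reached");
 *	uma_zone_set_maxaction(zone, mb_reclaim);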
*/ zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE, mb_ctor_clust, #ifdef INVARIANTS trash_dtor, trash_init, trash_fini, #else NULL, NULL, NULL, #endif UMA_ALIGN_PTR, 0); if (nmbjumbop > 0) nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached"); uma_zone_set_maxaction(zone_jumbop, mb_reclaim); zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES, mb_ctor_clust, #ifdef INVARIANTS trash_dtor, trash_init, trash_fini, #else NULL, NULL, NULL, #endif UMA_ALIGN_PTR, 0); uma_zone_set_allocf(zone_jumbo9, mbuf_jumbo_alloc); if (nmbjumbo9 > 0) nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached"); uma_zone_set_maxaction(zone_jumbo9, mb_reclaim); zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES, mb_ctor_clust, #ifdef INVARIANTS trash_dtor, trash_init, trash_fini, #else NULL, NULL, NULL, #endif UMA_ALIGN_PTR, 0); uma_zone_set_allocf(zone_jumbo16, mbuf_jumbo_alloc); if (nmbjumbo16 > 0) nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached"); uma_zone_set_maxaction(zone_jumbo16, mb_reclaim); /* * Hook event handler for low-memory situation, used to * drain protocols and push data back to the caches (UMA * later pushes it back to VM). */ EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL); /* * UMA backend page allocator for the jumbo frame zones. * * Allocates kernel virtual memory that is backed by contiguous physical * pages. */ static void * mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) { /* Inform UMA that this allocator uses kernel_map/object. */ *flags = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT)); } /* * Constructor for Mbuf master zone. * * The 'arg' pointer points to a mb_args structure which * contains call-specific information required to support the * mbuf allocation API. See mbuf.h. */ static int mb_ctor_mbuf(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; int error; int flags; short type; #ifdef INVARIANTS trash_ctor(mem, size, arg, how); #endif args = (struct mb_args *)arg; type = args->type; /* * The mbuf is initialized later. The caller has the * responsibility to set up any MAC labels too. */ if (type == MT_NOINIT) return (0); m = (struct mbuf *)mem; flags = args->flags; MPASS((flags & M_NOFREE) == 0); error = m_init(m, how, type, flags); return (error); } /* * The Mbuf master zone destructor. */ static void mb_dtor_mbuf(void *mem, int size, void *arg) { struct mbuf *m; unsigned long flags; m = (struct mbuf *)mem; flags = (unsigned long)arg; KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__)); if (!(flags & MB_DTOR_SKIP) && (m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags)) m_tag_delete_chain(m, NULL); #ifdef INVARIANTS trash_dtor(mem, size, arg); #endif } /* * The Mbuf Packet zone destructor. */ static void mb_dtor_pack(void *mem, int size, void *arg) { struct mbuf *m; m = (struct mbuf *)mem; if ((m->m_flags & M_PKTHDR) != 0) m_tag_delete_chain(m, NULL); /* Make sure we've got a clean cluster back. 
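 * (Recall the lifecycle sketched at the top of the file: this
 * destructor runs on every uma_zfree(zone_pack, m), while the cluster
 * stays attached to the mbuf for as long as the object sits in the
 * packet zone cache; only mb_zfini_pack() below detaches it on the
 * transition back to the keg slab.)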
*/ KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__)); KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__)); KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__)); KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__)); KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__)); KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__)); KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__)); #ifdef INVARIANTS trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg); #endif /* * If there are processes blocked on zone_clust, waiting for pages * to be freed up, * cause them to be woken up by draining the * packet zone. We are exposed to a race here * (in the check for * the UMA_ZFLAG_FULL) where we might miss the flag set, but that * is deliberate. We don't want to acquire the zone lock for every * mbuf free. */ if (uma_zone_exhausted_nolock(zone_clust)) zone_drain(zone_pack); } /* * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor. * * Here the 'arg' pointer points to the Mbuf which we * are configuring cluster storage for. If 'arg' is * empty we allocate just the cluster without attaching * it to an mbuf. See mbuf.h. */ static int mb_ctor_clust(void *mem, int size, void *arg, int how) { struct mbuf *m; #ifdef INVARIANTS trash_ctor(mem, size, arg, how); #endif m = (struct mbuf *)arg; if (m != NULL) { m->m_ext.ext_buf = (char *)mem; m->m_data = m->m_ext.ext_buf; m->m_flags |= M_EXT; m->m_ext.ext_free = NULL; m->m_ext.ext_arg1 = NULL; m->m_ext.ext_arg2 = NULL; m->m_ext.ext_size = size; m->m_ext.ext_type = m_gettype(size); m->m_ext.ext_flags = EXT_FLAG_EMBREF; m->m_ext.ext_count = 1; } return (0); } /* * The Packet secondary zone's init routine, executed on the * object's transition from mbuf keg slab to zone cache. */ static int mb_zinit_pack(void *mem, int size, int how) { struct mbuf *m; m = (struct mbuf *)mem; /* m is virgin. */ if (uma_zalloc_arg(zone_clust, m, how) == NULL || m->m_ext.ext_buf == NULL) return (ENOMEM); m->m_ext.ext_type = EXT_PACKET; /* Override. */ #ifdef INVARIANTS trash_init(m->m_ext.ext_buf, MCLBYTES, how); #endif return (0); } /* * The Packet secondary zone's fini routine, executed on the * object's transition from zone cache to keg slab. */ static void mb_zfini_pack(void *mem, int size) { struct mbuf *m; m = (struct mbuf *)mem; #ifdef INVARIANTS trash_fini(m->m_ext.ext_buf, MCLBYTES); #endif uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL); #ifdef INVARIANTS trash_dtor(mem, size, NULL); #endif } /* * The "packet" keg constructor. */ static int mb_ctor_pack(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; int error, flags; short type; m = (struct mbuf *)mem; args = (struct mb_args *)arg; flags = args->flags; type = args->type; MPASS((flags & M_NOFREE) == 0); #ifdef INVARIANTS trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how); #endif error = m_init(m, how, type, flags); /* m_ext is already initialized. */ m->m_data = m->m_ext.ext_buf; m->m_flags = (flags | M_EXT); return (error); } /* * This is the protocol drain routine. Called by UMA whenever any of the * mbuf zones is close to its limit. * * No locks should be held when this is called. The drain routines have to * presently acquire some locks which raises the possibility of lock order * reversal.
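 * Concretely, mb_reclaim() below walks every registered domain and
 * invokes each protocol's pr_drain hook (e.g. tcp_drain()), so it
 * must not be entered while holding any lock those hooks might also
 * take.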
*/ static void mb_reclaim(uma_zone_t zone __unused, int pending __unused) { struct domain *dp; struct protosw *pr; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, __func__); for (dp = domains; dp != NULL; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_drain != NULL) (*pr->pr_drain)(); } /* * Clean up after mbufs with M_EXT storage attached to them if the * reference count hits 1. */ void mb_free_ext(struct mbuf *m) { volatile u_int *refcnt; struct mbuf *mref; int freembuf; KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m)); /* See if this is the mbuf that holds the embedded refcount. */ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { refcnt = &m->m_ext.ext_count; mref = m; } else { KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); refcnt = m->m_ext.ext_cnt; mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); } /* * Check if the header is embedded in the cluster. It is * important that we can't touch any of the mbuf fields * after we have freed the external storage, since mbuf * could have been embedded in it. For now, the mbufs * embedded into the cluster are always of type EXT_EXTREF, * and for this type we won't free the mref. */ if (m->m_flags & M_NOFREE) { freembuf = 0; KASSERT(m->m_ext.ext_type == EXT_EXTREF, ("%s: no-free mbuf %p has wrong type", __func__, m)); } else freembuf = 1; /* Free attached storage if this mbuf is the only reference to it. */ if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) { switch (m->m_ext.ext_type) { case EXT_PACKET: /* The packet zone is special. */ if (*refcnt == 0) *refcnt = 1; uma_zfree(zone_pack, mref); break; case EXT_CLUSTER: uma_zfree(zone_clust, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_JUMBOP: uma_zfree(zone_jumbop, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_JUMBO9: uma_zfree(zone_jumbo9, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_JUMBO16: uma_zfree(zone_jumbo16, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_SFBUF: case EXT_NET_DRV: case EXT_MOD_TYPE: case EXT_DISPOSABLE: KASSERT(mref->m_ext.ext_free != NULL, ("%s: ext_free not set", __func__)); mref->m_ext.ext_free(mref); uma_zfree(zone_mbuf, mref); break; case EXT_EXTREF: KASSERT(m->m_ext.ext_free != NULL, ("%s: ext_free not set", __func__)); m->m_ext.ext_free(m); break; default: KASSERT(m->m_ext.ext_type == 0, ("%s: unknown ext_type", __func__)); } } if (freembuf && m != mref) uma_zfree(zone_mbuf, m); } /* * Official mbuf(9) allocation KPI for stack and drivers: * * m_get() - a single mbuf without any attachments, sys/mbuf.h. * m_gethdr() - a single mbuf initialized as M_PKTHDR, sys/mbuf.h. * m_getcl() - an mbuf + 2k cluster, sys/mbuf.h. * m_clget() - attach cluster to already allocated mbuf. * m_cljget() - attach jumbo cluster to already allocated mbuf. * m_get2() - allocate minimum mbuf that would fit size argument. * m_getm2() - allocate a chain of mbufs/clusters. * m_extadd() - attach external cluster to mbuf. * * m_free() - free single mbuf with its tags and ext, sys/mbuf.h. * m_freem() - free chain of mbufs. */ int m_clget(struct mbuf *m, int how) { KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", __func__, m)); m->m_ext.ext_buf = (char *)NULL; uma_zalloc_arg(zone_clust, m, how); /* * On a cluster allocation failure, drain the packet zone and retry, * we might be able to loosen a few clusters up on the drain. 
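 *
 * A typical caller-side pattern, as a hedged sketch (hypothetical
 * driver code, not taken from this file):
 *
 *	m = m_gethdr(M_NOWAIT, MT_DATA);
 *	if (m != NULL && !m_clget(m, M_NOWAIT)) {
 *		m_freem(m);		(no cluster could be attached)
 *		m = NULL;
 *	}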
*/ if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) { zone_drain(zone_pack); uma_zalloc_arg(zone_clust, m, how); } MBUF_PROBE2(m__clget, m, how); return (m->m_flags & M_EXT); } /* * m_cljget() is different from m_clget() as it can allocate clusters without * attaching them to an mbuf. In that case the return value is the pointer * to the cluster of the requested size. If an mbuf was specified, it gets * the cluster attached to it and the return value can be safely ignored. * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. */ void * m_cljget(struct mbuf *m, int how, int size) { uma_zone_t zone; void *retval; if (m != NULL) { KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", __func__, m)); m->m_ext.ext_buf = NULL; } zone = m_getzone(size); retval = uma_zalloc_arg(zone, m, how); MBUF_PROBE4(m__cljget, m, how, size, retval); return (retval); } /* * m_get2() allocates minimum mbuf that would fit "size" argument. */ struct mbuf * m_get2(int size, int how, short type, int flags) { struct mb_args args; struct mbuf *m, *n; args.flags = flags; args.type = type; if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0)) return (uma_zalloc_arg(zone_mbuf, &args, how)); if (size <= MCLBYTES) return (uma_zalloc_arg(zone_pack, &args, how)); if (size > MJUMPAGESIZE) return (NULL); m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); n = uma_zalloc_arg(zone_jumbop, m, how); if (n == NULL) { uma_zfree(zone_mbuf, m); return (NULL); } return (m); } /* * m_getjcl() returns an mbuf with a cluster of the specified size attached. * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. */ struct mbuf * m_getjcl(int how, short type, int flags, int size) { struct mb_args args; struct mbuf *m, *n; uma_zone_t zone; if (size == MCLBYTES) return m_getcl(how, type, flags); args.flags = flags; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); zone = m_getzone(size); n = uma_zalloc_arg(zone, m, how); if (n == NULL) { uma_zfree(zone_mbuf, m); return (NULL); } return (m); } /* * Allocate a given length worth of mbufs and/or clusters (whatever fits * best) and return a pointer to the top of the allocated chain. If an * existing mbuf chain is provided, then we will append the new chain * to the existing one but still return the top of the newly allocated * chain. */ struct mbuf * m_getm2(struct mbuf *m, int len, int how, short type, int flags) { struct mbuf *mb, *nm = NULL, *mtail = NULL; KASSERT(len >= 0, ("%s: len is < 0", __func__)); /* Validate flags. */ flags &= (M_PKTHDR | M_EOR); /* Packet header mbuf must be first in chain. */ if ((flags & M_PKTHDR) && m != NULL) flags &= ~M_PKTHDR; /* Loop and append maximum sized mbufs to the chain tail. */ while (len > 0) { if (len > MCLBYTES) mb = m_getjcl(how, type, (flags & M_PKTHDR), MJUMPAGESIZE); else if (len >= MINCLSIZE) mb = m_getcl(how, type, (flags & M_PKTHDR)); else if (flags & M_PKTHDR) mb = m_gethdr(how, type); else mb = m_get(how, type); /* Fail the whole operation if one mbuf can't be allocated. */ if (mb == NULL) { if (nm != NULL) m_freem(nm); return (NULL); } /* Book keeping. */ len -= M_SIZE(mb); if (mtail != NULL) mtail->m_next = mb; else nm = mb; mtail = mb; flags &= ~M_PKTHDR; /* Only valid on the first mbuf. */ } if (flags & M_EOR) mtail->m_flags |= M_EOR; /* Only valid on the last mbuf. */ /* If mbuf was supplied, append new chain to the end of it. 
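 *
 * As an illustrative worked example (assuming 4k pages, numbers not
 * from the source): m_getm2(NULL, 6000, M_WAITOK, MT_DATA, M_PKTHDR)
 * first allocates an MJUMPAGESIZE mbuf (6000 > MCLBYTES), leaving
 * 6000 - 4096 = 1904 bytes, which is >= MINCLSIZE, so a 2k cluster
 * mbuf follows; the result is a two-mbuf chain whose head carries
 * the packet header.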
*/ if (m != NULL) { for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next) ; mtail->m_next = nm; mtail->m_flags &= ~M_EOR; } else m = nm; return (m); } /*- * Configure a provided mbuf to refer to the provided external storage * buffer and setup a reference count for said buffer. * * Arguments: * mb The existing mbuf to which to attach the provided buffer. * buf The address of the provided external storage buffer. * size The size of the provided buffer. * freef A pointer to a routine that is responsible for freeing the * provided external storage buffer. * args A pointer to an argument structure (of any type) to be passed * to the provided freef routine (may be NULL). * flags Any other flags to be passed to the provided mbuf. * type The type that the external storage buffer should be * labeled with. * * Returns: * Nothing. */ void m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef, void *arg1, void *arg2, int flags, int type) { KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__)); mb->m_flags |= (M_EXT | flags); mb->m_ext.ext_buf = buf; mb->m_data = mb->m_ext.ext_buf; mb->m_ext.ext_size = size; mb->m_ext.ext_free = freef; mb->m_ext.ext_arg1 = arg1; mb->m_ext.ext_arg2 = arg2; mb->m_ext.ext_type = type; if (type != EXT_EXTREF) { mb->m_ext.ext_count = 1; mb->m_ext.ext_flags = EXT_FLAG_EMBREF; } else mb->m_ext.ext_flags = 0; } /* * Free an entire chain of mbufs and associated external buffers, if * applicable. */ void m_freem(struct mbuf *mb) { MBUF_PROBE1(m__freem, mb); while (mb != NULL) mb = m_free(mb); } Index: head/sys/kern/kern_module.c =================================================================== --- head/sys/kern/kern_module.c (revision 326270) +++ head/sys/kern/kern_module.c (revision 326271) @@ -1,519 +1,521 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1997 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include "opt_compat.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MODULE, "module", "module data structures"); struct module { TAILQ_ENTRY(module) link; /* chain together all modules */ TAILQ_ENTRY(module) flink; /* all modules in a file */ struct linker_file *file; /* file which contains this module */ int refs; /* reference count */ int id; /* unique id number */ char *name; /* module name */ modeventhand_t handler; /* event handler */ void *arg; /* argument for handler */ modspecific_t data; /* module specific data */ }; #define MOD_EVENT(mod, type) (mod)->handler((mod), (type), (mod)->arg) static TAILQ_HEAD(modulelist, module) modules; struct sx modules_sx; static int nextid = 1; static void module_shutdown(void *, int); static int modevent_nop(module_t mod, int what, void *arg) { switch(what) { case MOD_LOAD: return (0); case MOD_UNLOAD: return (EBUSY); default: return (EOPNOTSUPP); } } static void module_init(void *arg) { sx_init(&modules_sx, "module subsystem sx lock"); TAILQ_INIT(&modules); EVENTHANDLER_REGISTER(shutdown_final, module_shutdown, NULL, SHUTDOWN_PRI_DEFAULT); } SYSINIT(module, SI_SUB_KLD, SI_ORDER_FIRST, module_init, 0); static void module_shutdown(void *arg1, int arg2) { module_t mod; if (arg2 & RB_NOSYNC) return; mtx_lock(&Giant); MOD_SLOCK; TAILQ_FOREACH_REVERSE(mod, &modules, modulelist, link) MOD_EVENT(mod, MOD_SHUTDOWN); MOD_SUNLOCK; mtx_unlock(&Giant); } void module_register_init(const void *arg) { const moduledata_t *data = (const moduledata_t *)arg; int error; module_t mod; mtx_lock(&Giant); MOD_SLOCK; mod = module_lookupbyname(data->name); if (mod == NULL) panic("module_register_init: module named %s not found\n", data->name); MOD_SUNLOCK; error = MOD_EVENT(mod, MOD_LOAD); if (error) { MOD_EVENT(mod, MOD_UNLOAD); MOD_XLOCK; module_release(mod); MOD_XUNLOCK; printf("module_register_init: MOD_LOAD (%s, %p, %p) error" " %d\n", data->name, (void *)data->evhand, data->priv, error); } else { MOD_XLOCK; if (mod->file) { /* * Once a module is successfully loaded, move * it to the head of the module list for this * linker file. This resorts the list so that * when the kernel linker iterates over the * modules to unload them, it will unload them * in the reverse order they were loaded. */ TAILQ_REMOVE(&mod->file->modules, mod, flink); TAILQ_INSERT_HEAD(&mod->file->modules, mod, flink); } MOD_XUNLOCK; } mtx_unlock(&Giant); } int module_register(const moduledata_t *data, linker_file_t container) { size_t namelen; module_t newmod; MOD_XLOCK; newmod = module_lookupbyname(data->name); if (newmod != NULL) { MOD_XUNLOCK; printf("%s: cannot register %s from %s; already loaded from %s\n", __func__, data->name, container->filename, newmod->file->filename); return (EEXIST); } namelen = strlen(data->name) + 1; newmod = malloc(sizeof(struct module) + namelen, M_MODULE, M_WAITOK); newmod->refs = 1; newmod->id = nextid++; newmod->name = (char *)(newmod + 1); strcpy(newmod->name, data->name); newmod->handler = data->evhand ? 
data->evhand : modevent_nop; newmod->arg = data->priv; bzero(&newmod->data, sizeof(newmod->data)); TAILQ_INSERT_TAIL(&modules, newmod, link); if (container) TAILQ_INSERT_TAIL(&container->modules, newmod, flink); newmod->file = container; MOD_XUNLOCK; return (0); } void module_reference(module_t mod) { MOD_XLOCK_ASSERT; MOD_DPF(REFS, ("module_reference: before, refs=%d\n", mod->refs)); mod->refs++; } void module_release(module_t mod) { MOD_XLOCK_ASSERT; if (mod->refs <= 0) panic("module_release: bad reference count"); MOD_DPF(REFS, ("module_release: before, refs=%d\n", mod->refs)); mod->refs--; if (mod->refs == 0) { TAILQ_REMOVE(&modules, mod, link); if (mod->file) TAILQ_REMOVE(&mod->file->modules, mod, flink); free(mod, M_MODULE); } } module_t module_lookupbyname(const char *name) { module_t mod; int err; MOD_LOCK_ASSERT; TAILQ_FOREACH(mod, &modules, link) { err = strcmp(mod->name, name); if (err == 0) return (mod); } return (NULL); } module_t module_lookupbyid(int modid) { module_t mod; MOD_LOCK_ASSERT; TAILQ_FOREACH(mod, &modules, link) if (mod->id == modid) return(mod); return (NULL); } int module_quiesce(module_t mod) { int error; mtx_lock(&Giant); error = MOD_EVENT(mod, MOD_QUIESCE); mtx_unlock(&Giant); if (error == EOPNOTSUPP || error == EINVAL) error = 0; return (error); } int module_unload(module_t mod) { int error; mtx_lock(&Giant); error = MOD_EVENT(mod, MOD_UNLOAD); mtx_unlock(&Giant); return (error); } int module_getid(module_t mod) { MOD_LOCK_ASSERT; return (mod->id); } module_t module_getfnext(module_t mod) { MOD_LOCK_ASSERT; return (TAILQ_NEXT(mod, flink)); } const char * module_getname(module_t mod) { MOD_LOCK_ASSERT; return (mod->name); } void module_setspecific(module_t mod, modspecific_t *datap) { MOD_XLOCK_ASSERT; mod->data = *datap; } linker_file_t module_file(module_t mod) { return (mod->file); } /* * Syscalls. */ int sys_modnext(struct thread *td, struct modnext_args *uap) { module_t mod; int error = 0; td->td_retval[0] = -1; MOD_SLOCK; if (uap->modid == 0) { mod = TAILQ_FIRST(&modules); if (mod) td->td_retval[0] = mod->id; else error = ENOENT; goto done2; } mod = module_lookupbyid(uap->modid); if (mod == NULL) { error = ENOENT; goto done2; } if (TAILQ_NEXT(mod, link)) td->td_retval[0] = TAILQ_NEXT(mod, link)->id; else td->td_retval[0] = 0; done2: MOD_SUNLOCK; return (error); } int sys_modfnext(struct thread *td, struct modfnext_args *uap) { module_t mod; int error; td->td_retval[0] = -1; MOD_SLOCK; mod = module_lookupbyid(uap->modid); if (mod == NULL) { error = ENOENT; } else { error = 0; if (TAILQ_NEXT(mod, flink)) td->td_retval[0] = TAILQ_NEXT(mod, flink)->id; else td->td_retval[0] = 0; } MOD_SUNLOCK; return (error); } struct module_stat_v1 { int version; /* set to sizeof(struct module_stat) */ char name[MAXMODNAME]; int refs; int id; }; int sys_modstat(struct thread *td, struct modstat_args *uap) { module_t mod; modspecific_t data; int error = 0; int id, namelen, refs, version; struct module_stat *stat; char *name; MOD_SLOCK; mod = module_lookupbyid(uap->modid); if (mod == NULL) { MOD_SUNLOCK; return (ENOENT); } id = mod->id; refs = mod->refs; name = mod->name; data = mod->data; MOD_SUNLOCK; stat = uap->stat; /* * Check the version of the user's structure. 
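 * The version field holds the size of the structure the caller was
 * compiled against (struct module_stat_v1 above notes it is "set to
 * sizeof(struct module_stat)"), so comparing it against
 * sizeof(struct module_stat_v1) and sizeof(struct module_stat)
 * distinguishes old binaries from new ones without a separate
 * version numbering scheme.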
*/ if ((error = copyin(&stat->version, &version, sizeof(version))) != 0) return (error); if (version != sizeof(struct module_stat_v1) && version != sizeof(struct module_stat)) return (EINVAL); namelen = strlen(mod->name) + 1; if (namelen > MAXMODNAME) namelen = MAXMODNAME; if ((error = copyout(name, &stat->name[0], namelen)) != 0) return (error); if ((error = copyout(&refs, &stat->refs, sizeof(int))) != 0) return (error); if ((error = copyout(&id, &stat->id, sizeof(int))) != 0) return (error); /* * >v1 stat includes module data. */ if (version == sizeof(struct module_stat)) if ((error = copyout(&data, &stat->data, sizeof(data))) != 0) return (error); td->td_retval[0] = 0; return (error); } int sys_modfind(struct thread *td, struct modfind_args *uap) { int error = 0; char name[MAXMODNAME]; module_t mod; if ((error = copyinstr(uap->name, name, sizeof name, 0)) != 0) return (error); MOD_SLOCK; mod = module_lookupbyname(name); if (mod == NULL) error = ENOENT; else td->td_retval[0] = module_getid(mod); MOD_SUNLOCK; return (error); } MODULE_VERSION(kernel, __FreeBSD_version); #ifdef COMPAT_FREEBSD32 #include #include #include #include #include typedef union modspecific32 { int intval; uint32_t uintval; int longval; uint32_t ulongval; } modspecific32_t; struct module_stat32 { int version; char name[MAXMODNAME]; int refs; int id; modspecific32_t data; }; int freebsd32_modstat(struct thread *td, struct freebsd32_modstat_args *uap) { module_t mod; modspecific32_t data32; int error = 0; int id, namelen, refs, version; struct module_stat32 *stat32; char *name; MOD_SLOCK; mod = module_lookupbyid(uap->modid); if (mod == NULL) { MOD_SUNLOCK; return (ENOENT); } id = mod->id; refs = mod->refs; name = mod->name; CP(mod->data, data32, intval); CP(mod->data, data32, uintval); CP(mod->data, data32, longval); CP(mod->data, data32, ulongval); MOD_SUNLOCK; stat32 = uap->stat; if ((error = copyin(&stat32->version, &version, sizeof(version))) != 0) return (error); if (version != sizeof(struct module_stat_v1) && version != sizeof(struct module_stat32)) return (EINVAL); namelen = strlen(mod->name) + 1; if (namelen > MAXMODNAME) namelen = MAXMODNAME; if ((error = copyout(name, &stat32->name[0], namelen)) != 0) return (error); if ((error = copyout(&refs, &stat32->refs, sizeof(int))) != 0) return (error); if ((error = copyout(&id, &stat32->id, sizeof(int))) != 0) return (error); /* * >v1 stat includes module data. */ if (version == sizeof(struct module_stat32)) if ((error = copyout(&data32, &stat32->data, sizeof(data32))) != 0) return (error); td->td_retval[0] = 0; return (error); } #endif Index: head/sys/kern/kern_mtxpool.c =================================================================== --- head/sys/kern/kern_mtxpool.c (revision 326270) +++ head/sys/kern/kern_mtxpool.c (revision 326271) @@ -1,188 +1,190 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2001 Matthew Dillon. All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* Mutex pool routines. These routines are designed to be used as short * term leaf mutexes (e.g. the last mutex you might acquire other than * calling msleep()). They operate using a shared pool. A mutex is chosen * from the pool based on the supplied pointer (which may or may not be * valid). * * Advantages: * - no structural overhead. Mutexes can be associated with structures * without adding bloat to the structures. * - mutexes can be obtained for invalid pointers, useful when using * mutexes to interlock destructor ops. * - no initialization/destructor overhead. * - can be used with msleep. * * Disadvantages: * - should generally only be used as leaf mutexes. * - pool/pool dependency ordering cannot be depended on. * - possible L1 cache mastership contention between cpus. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MTXPOOL, "mtx_pool", "mutex pool"); /* Pool sizes must be a power of two */ #ifndef MTX_POOL_SLEEP_SIZE #define MTX_POOL_SLEEP_SIZE 128 #endif struct mtxpool_header { int mtxpool_size; int mtxpool_mask; int mtxpool_shift; int mtxpool_next; }; struct mtx_pool { struct mtxpool_header mtx_pool_header; struct mtx mtx_pool_ary[1]; }; #define mtx_pool_size mtx_pool_header.mtxpool_size #define mtx_pool_mask mtx_pool_header.mtxpool_mask #define mtx_pool_shift mtx_pool_header.mtxpool_shift #define mtx_pool_next mtx_pool_header.mtxpool_next struct mtx_pool *mtxpool_sleep; #if UINTPTR_MAX == UINT64_MAX /* 64 bits */ # define POINTER_BITS 64 # define HASH_MULTIPLIER 11400714819323198485u /* (2^64)*(sqrt(5)-1)/2 */ #else /* assume 32 bits */ # define POINTER_BITS 32 # define HASH_MULTIPLIER 2654435769u /* (2^32)*(sqrt(5)-1)/2 */ #endif /* * Return the (shared) pool mutex associated with the specified address. * The returned mutex is a leaf level mutex, meaning that if you obtain it * you cannot obtain any other mutexes until you release it. You can * legally msleep() on the mutex.
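 *
 * A typical use, as a hedged sketch (the softc pointer "sc" is
 * hypothetical):
 *
 *	struct mtx *mp;
 *
 *	mp = mtx_pool_find(mtxpool_sleep, sc);
 *	mtx_lock(mp);
 *	(short leaf critical section, possibly msleep(sc, mp, ...))
 *	mtx_unlock(mp);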
*/ struct mtx * mtx_pool_find(struct mtx_pool *pool, void *ptr) { int p; KASSERT(pool != NULL, ("_mtx_pool_find(): null pool")); /* * Fibonacci hash, see Knuth's * _Art of Computer Programming, Volume 3 / Sorting and Searching_ */ p = ((HASH_MULTIPLIER * (uintptr_t)ptr) >> pool->mtx_pool_shift) & pool->mtx_pool_mask; return (&pool->mtx_pool_ary[p]); } static void mtx_pool_initialize(struct mtx_pool *pool, const char *mtx_name, int pool_size, int opts) { int i, maskbits; pool->mtx_pool_size = pool_size; pool->mtx_pool_mask = pool_size - 1; for (i = 1, maskbits = 0; (i & pool_size) == 0; i = i << 1) maskbits++; pool->mtx_pool_shift = POINTER_BITS - maskbits; pool->mtx_pool_next = 0; for (i = 0; i < pool_size; ++i) mtx_init(&pool->mtx_pool_ary[i], mtx_name, NULL, opts); } struct mtx_pool * mtx_pool_create(const char *mtx_name, int pool_size, int opts) { struct mtx_pool *pool; if (pool_size <= 0 || !powerof2(pool_size)) { printf("WARNING: %s pool size is not a power of 2.\n", mtx_name); pool_size = 128; } pool = malloc(sizeof (struct mtx_pool) + ((pool_size - 1) * sizeof (struct mtx)), M_MTXPOOL, M_WAITOK | M_ZERO); mtx_pool_initialize(pool, mtx_name, pool_size, opts); return pool; } void mtx_pool_destroy(struct mtx_pool **poolp) { int i; struct mtx_pool *pool = *poolp; for (i = pool->mtx_pool_size - 1; i >= 0; --i) mtx_destroy(&pool->mtx_pool_ary[i]); free(pool, M_MTXPOOL); *poolp = NULL; } static void mtx_pool_setup_dynamic(void *dummy __unused) { mtxpool_sleep = mtx_pool_create("sleep mtxpool", MTX_POOL_SLEEP_SIZE, MTX_DEF); } /* * Obtain a (shared) mutex from the pool. The returned mutex is a leaf * level mutex, meaning that if you obtain it you cannot obtain any other * mutexes until you release it. You can legally msleep() on the mutex. */ struct mtx * mtx_pool_alloc(struct mtx_pool *pool) { int i; KASSERT(pool != NULL, ("mtx_pool_alloc(): null pool")); /* * mtx_pool_next is unprotected against multiple accesses, * but simultaneous access by two CPUs should not be very * harmful. */ i = pool->mtx_pool_next; pool->mtx_pool_next = (i + 1) & pool->mtx_pool_mask; return (&pool->mtx_pool_ary[i]); } SYSINIT(mtxpooli2, SI_SUB_MTX_POOL_DYNAMIC, SI_ORDER_FIRST, mtx_pool_setup_dynamic, NULL); Index: head/sys/kern/kern_mutex.c =================================================================== --- head/sys/kern/kern_mutex.c (revision 326270) +++ head/sys/kern/kern_mutex.c (revision 326271) @@ -1,1270 +1,1272 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Machine independent bits of mutex implementation. */ #include __FBSDID("$FreeBSD$"); #include "opt_adaptive_mutexes.h" #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES) #define ADAPTIVE_MUTEXES #endif #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , lock, failed); #endif /* * Return the mutex address when the lock cookie address is provided. * This functionality assumes that struct mtx* have a member named mtx_lock. */ #define mtxlock2mtx(c) (__containerof(c, struct mtx, mtx_lock)) /* * Internal utility macros. */ #define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED) #define mtx_destroyed(m) ((m)->mtx_lock == MTX_DESTROYED) static void assert_mtx(const struct lock_object *lock, int what); #ifdef DDB static void db_show_mtx(const struct lock_object *lock); #endif static void lock_mtx(struct lock_object *lock, uintptr_t how); static void lock_spin(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_mtx(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_mtx(struct lock_object *lock); static uintptr_t unlock_spin(struct lock_object *lock); /* * Lock classes for sleep and spin mutexes. 
*/ struct lock_class lock_class_mtx_sleep = { .lc_name = "sleep mutex", .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE, .lc_assert = assert_mtx, #ifdef DDB .lc_ddb_show = db_show_mtx, #endif .lc_lock = lock_mtx, .lc_unlock = unlock_mtx, #ifdef KDTRACE_HOOKS .lc_owner = owner_mtx, #endif }; struct lock_class lock_class_mtx_spin = { .lc_name = "spin mutex", .lc_flags = LC_SPINLOCK | LC_RECURSABLE, .lc_assert = assert_mtx, #ifdef DDB .lc_ddb_show = db_show_mtx, #endif .lc_lock = lock_spin, .lc_unlock = unlock_spin, #ifdef KDTRACE_HOOKS .lc_owner = owner_mtx, #endif }; #ifdef ADAPTIVE_MUTEXES static SYSCTL_NODE(_debug, OID_AUTO, mtx, CTLFLAG_RD, NULL, "mtx debugging"); static struct lock_delay_config __read_frequently mtx_delay; SYSCTL_INT(_debug_mtx, OID_AUTO, delay_base, CTLFLAG_RW, &mtx_delay.base, 0, ""); SYSCTL_INT(_debug_mtx, OID_AUTO, delay_max, CTLFLAG_RW, &mtx_delay.max, 0, ""); LOCK_DELAY_SYSINIT_DEFAULT(mtx_delay); #endif static SYSCTL_NODE(_debug, OID_AUTO, mtx_spin, CTLFLAG_RD, NULL, "mtx spin debugging"); static struct lock_delay_config __read_frequently mtx_spin_delay; SYSCTL_INT(_debug_mtx_spin, OID_AUTO, delay_base, CTLFLAG_RW, &mtx_spin_delay.base, 0, ""); SYSCTL_INT(_debug_mtx_spin, OID_AUTO, delay_max, CTLFLAG_RW, &mtx_spin_delay.max, 0, ""); LOCK_DELAY_SYSINIT_DEFAULT(mtx_spin_delay); /* * System-wide mutexes */ struct mtx blocked_lock; struct mtx __exclusive_cache_line Giant; void assert_mtx(const struct lock_object *lock, int what) { mtx_assert((const struct mtx *)lock, what); } void lock_mtx(struct lock_object *lock, uintptr_t how) { mtx_lock((struct mtx *)lock); } void lock_spin(struct lock_object *lock, uintptr_t how) { panic("spin locks can only use msleep_spin"); } uintptr_t unlock_mtx(struct lock_object *lock) { struct mtx *m; m = (struct mtx *)lock; mtx_assert(m, MA_OWNED | MA_NOTRECURSED); mtx_unlock(m); return (0); } uintptr_t unlock_spin(struct lock_object *lock) { panic("spin locks can only use msleep_spin"); } #ifdef KDTRACE_HOOKS int owner_mtx(const struct lock_object *lock, struct thread **owner) { const struct mtx *m; uintptr_t x; m = (const struct mtx *)lock; x = m->mtx_lock; *owner = (struct thread *)(x & ~MTX_FLAGMASK); return (*owner != NULL); } #endif /* * Function versions of the inlined __mtx_* macros. These are used by * modules and can also be called from assembly language if needed. 
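 * For instance, when the inline fast path in sys/mutex.h cannot be
 * used (e.g. from a kernel module), mtx_lock(m) is expected to expand
 * through mtx_lock_flags() down to __mtx_lock_flags(&(m)->mtx_lock, 0,
 * file, line); the unlock and spin variants map onto their
 * __mtx_*_flags counterparts the same way.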
*/ void __mtx_lock_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; uintptr_t tid, v; m = mtxlock2mtx(c); KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() || !TD_IS_IDLETHREAD(curthread), ("mtx_lock() by idle thread %p on sleep mutex %s @ %s:%d", curthread, m->lock_object.lo_name, file, line)); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_lock() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep, ("mtx_lock() of spin mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); WITNESS_CHECKORDER(&m->lock_object, (opts & ~MTX_RECURSE) | LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); tid = (uintptr_t)curthread; v = MTX_UNOWNED; if (!_mtx_obtain_lock_fetch(m, &v, tid)) _mtx_lock_sleep(m, v, opts, file, line); else LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(adaptive__acquire, m, 0, 0, file, line); LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file, line); WITNESS_LOCK(&m->lock_object, (opts & ~MTX_RECURSE) | LOP_EXCLUSIVE, file, line); TD_LOCKS_INC(curthread); } void __mtx_unlock_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; m = mtxlock2mtx(c); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_unlock() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep, ("mtx_unlock() of spin mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file, line); mtx_assert(m, MA_OWNED); #ifdef LOCK_PROFILING __mtx_unlock_sleep(c, (uintptr_t)curthread, opts, file, line); #else __mtx_unlock(m, curthread, opts, file, line); #endif TD_LOCKS_DEC(curthread); } void __mtx_lock_spin_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; #ifdef SMP uintptr_t tid, v; #endif m = mtxlock2mtx(c); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_lock_spin() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin, ("mtx_lock_spin() of sleep mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); if (mtx_owned(m)) KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0 || (opts & MTX_RECURSE) != 0, ("mtx_lock_spin: recursed on non-recursive mutex %s @ %s:%d\n", m->lock_object.lo_name, file, line)); opts &= ~MTX_RECURSE; WITNESS_CHECKORDER(&m->lock_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); #ifdef SMP spinlock_enter(); tid = (uintptr_t)curthread; v = MTX_UNOWNED; if (!_mtx_obtain_lock_fetch(m, &v, tid)) _mtx_lock_spin(m, v, opts, file, line); else LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(spin__acquire, m, 0, 0, file, line); #else __mtx_lock_spin(m, curthread, opts, file, line); #endif LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file, line); WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); } int __mtx_trylock_spin_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; if (SCHEDULER_STOPPED()) return (1); m = mtxlock2mtx(c); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_trylock_spin() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin, ("mtx_trylock_spin() of sleep mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); KASSERT((opts & MTX_RECURSE) == 0, ("mtx_trylock_spin: unsupp. 
opt MTX_RECURSE on mutex %s @ %s:%d\n", m->lock_object.lo_name, file, line)); if (__mtx_trylock_spin(m, curthread, opts, file, line)) { LOCK_LOG_TRY("LOCK", &m->lock_object, opts, 1, file, line); WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); return (1); } LOCK_LOG_TRY("LOCK", &m->lock_object, opts, 0, file, line); return (0); } void __mtx_unlock_spin_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; m = mtxlock2mtx(c); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_unlock_spin() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin, ("mtx_unlock_spin() of sleep mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file, line); mtx_assert(m, MA_OWNED); __mtx_unlock_spin(m); } /* * The important part of mtx_trylock{,_flags}() * Tries to acquire lock `m.' If this function is called on a mutex that * is already owned, it will recursively acquire the lock. */ int _mtx_trylock_flags_int(struct mtx *m, int opts LOCK_FILE_LINE_ARG_DEF) { struct thread *td; uintptr_t tid, v; #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif int rval; bool recursed; td = curthread; tid = (uintptr_t)td; if (SCHEDULER_STOPPED_TD(td)) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(td), ("mtx_trylock() by idle thread %p on sleep mutex %s @ %s:%d", curthread, m->lock_object.lo_name, file, line)); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_trylock() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep, ("mtx_trylock() of spin mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); rval = 1; recursed = false; v = MTX_UNOWNED; for (;;) { if (_mtx_obtain_lock_fetch(m, &v, tid)) break; if (v == MTX_UNOWNED) continue; if (v == tid && ((m->lock_object.lo_flags & LO_RECURSABLE) != 0 || (opts & MTX_RECURSE) != 0)) { m->mtx_recurse++; atomic_set_ptr(&m->mtx_lock, MTX_RECURSED); recursed = true; break; } rval = 0; break; } opts &= ~MTX_RECURSE; LOCK_LOG_TRY("LOCK", &m->lock_object, opts, rval, file, line); if (rval) { WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); TD_LOCKS_INC(curthread); if (!recursed) LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(adaptive__acquire, m, contested, waittime, file, line); } return (rval); } int _mtx_trylock_flags_(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; m = mtxlock2mtx(c); return (_mtx_trylock_flags_int(m, opts LOCK_FILE_LINE_ARG)); } /* * __mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock. * * We call this if the lock is either contested (i.e. we need to go to * sleep waiting for it), or if we need to recurse on it. 
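 * Under ADAPTIVE_MUTEXES the loop below first spins (via lock_delay())
 * for as long as the lock owner is running on another CPU, and only
 * blocks on a turnstile once the owner is off-CPU and the re-checks
 * of the lock state while setting up the turnstile come up empty.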
*/ #if LOCK_DEBUG > 0 void __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t v, int opts, const char *file, int line) #else void __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t v) #endif { struct thread *td; struct mtx *m; struct turnstile *ts; uintptr_t tid; struct thread *owner; #ifdef KTR int cont_logged = 0; #endif #ifdef LOCK_PROFILING int contested = 0; uint64_t waittime = 0; #endif #if defined(ADAPTIVE_MUTEXES) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) int doing_lockprof; #endif td = curthread; tid = (uintptr_t)td; if (SCHEDULER_STOPPED_TD(td)) return; #if defined(ADAPTIVE_MUTEXES) lock_delay_arg_init(&lda, &mtx_delay); #elif defined(KDTRACE_HOOKS) lock_delay_arg_init(&lda, NULL); #endif m = mtxlock2mtx(c); if (__predict_false(v == MTX_UNOWNED)) v = MTX_READ_VALUE(m); if (__predict_false(lv_mtx_owner(v) == td)) { KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0 || (opts & MTX_RECURSE) != 0, ("_mtx_lock_sleep: recursed on non-recursive mutex %s @ %s:%d\n", m->lock_object.lo_name, file, line)); #if LOCK_DEBUG > 0 opts &= ~MTX_RECURSE; #endif m->mtx_recurse++; atomic_set_ptr(&m->mtx_lock, MTX_RECURSED); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m); return; } #if LOCK_DEBUG > 0 opts &= ~MTX_RECURSE; #endif #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR4(KTR_LOCK, "_mtx_lock_sleep: %s contested (lock=%p) at %s:%d", m->lock_object.lo_name, (void *)m->mtx_lock, file, line); #ifdef LOCK_PROFILING doing_lockprof = 1; #elif defined(KDTRACE_HOOKS) doing_lockprof = lockstat_enabled; if (__predict_false(doing_lockprof)) all_time -= lockstat_nsecs(&m->lock_object); #endif for (;;) { if (v == MTX_UNOWNED) { if (_mtx_obtain_lock_fetch(m, &v, tid)) break; continue; } #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef ADAPTIVE_MUTEXES /* * If the owner is running on another CPU, spin until the * owner stops running or the state of the lock changes. */ owner = lv_mtx_owner(v); if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&m->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, m, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname((struct thread *)tid), "spinning", "lockname:\"%s\"", m->lock_object.lo_name); do { lock_delay(&lda); v = MTX_READ_VALUE(m); owner = lv_mtx_owner(v); } while (v != MTX_UNOWNED && TD_IS_RUNNING(owner)); KTR_STATE0(KTR_SCHED, "thread", sched_tdname((struct thread *)tid), "running"); continue; } #endif ts = turnstile_trywait(&m->lock_object); v = MTX_READ_VALUE(m); /* * Check if the lock has been released while spinning for * the turnstile chain lock. */ if (v == MTX_UNOWNED) { turnstile_cancel(ts); continue; } #ifdef ADAPTIVE_MUTEXES /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the turnstile * chain lock. If so, drop the turnstile lock and try * again. */ owner = lv_mtx_owner(v); if (TD_IS_RUNNING(owner)) { turnstile_cancel(ts); continue; } #endif /* * If the mutex isn't already contested and a failure occurs * setting the contested bit, the mutex was either released * or the state of the MTX_RECURSED bit changed. 
*/ if ((v & MTX_CONTESTED) == 0 && !atomic_cmpset_ptr(&m->mtx_lock, v, v | MTX_CONTESTED)) { turnstile_cancel(ts); v = MTX_READ_VALUE(m); continue; } /* * We definitely must sleep for this lock. */ mtx_assert(m, MA_NOTOWNED); #ifdef KTR if (!cont_logged) { CTR6(KTR_CONTENTION, "contention: %p at %s:%d wants %s, taken by %s:%d", (void *)tid, file, line, m->lock_object.lo_name, WITNESS_FILE(&m->lock_object), WITNESS_LINE(&m->lock_object)); cont_logged = 1; } #endif /* * Block on the turnstile. */ #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&m->lock_object); #endif #ifndef ADAPTIVE_MUTEXES owner = mtx_owner(m); #endif MPASS(owner == mtx_owner(m)); turnstile_wait(ts, owner, TS_EXCLUSIVE_QUEUE); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&m->lock_object); sleep_cnt++; #endif v = MTX_READ_VALUE(m); } #ifdef KTR if (cont_logged) { CTR4(KTR_CONTENTION, "contention end: %s acquired by %p at %s:%d", m->lock_object.lo_name, (void *)tid, file, line); } #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!doing_lockprof)) return; #endif #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&m->lock_object); #endif LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(adaptive__acquire, m, contested, waittime, file, line); #ifdef KDTRACE_HOOKS if (sleep_time) LOCKSTAT_RECORD1(adaptive__block, m, sleep_time); /* * Only record the loops spinning and not sleeping. */ if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD1(adaptive__spin, m, all_time - sleep_time); #endif } static void _mtx_lock_spin_failed(struct mtx *m) { struct thread *td; td = mtx_owner(m); /* If the mutex is unlocked, try again. */ if (td == NULL) return; printf( "spin lock %p (%s) held by %p (tid %d) too long\n", m, m->lock_object.lo_name, td, td->td_tid); #ifdef WITNESS witness_display_spinlock(&m->lock_object, td, printf); #endif panic("spin lock held too long"); } #ifdef SMP /* * _mtx_lock_spin_cookie: the tougher part of acquiring an MTX_SPIN lock. * * This is only called if we need to actually spin for the lock. Recursion * is handled inline. */ #if LOCK_DEBUG > 0 void _mtx_lock_spin_cookie(volatile uintptr_t *c, uintptr_t v, int opts, const char *file, int line) #else void _mtx_lock_spin_cookie(volatile uintptr_t *c, uintptr_t v) #endif { struct mtx *m; struct lock_delay_arg lda; uintptr_t tid; #ifdef LOCK_PROFILING int contested = 0; uint64_t waittime = 0; #endif #ifdef KDTRACE_HOOKS int64_t spin_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) int doing_lockprof; #endif tid = (uintptr_t)curthread; m = mtxlock2mtx(c); if (__predict_false(v == MTX_UNOWNED)) v = MTX_READ_VALUE(m); if (__predict_false(v == tid)) { m->mtx_recurse++; return; } if (SCHEDULER_STOPPED()) return; lock_delay_arg_init(&lda, &mtx_spin_delay); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m); KTR_STATE1(KTR_SCHED, "thread", sched_tdname((struct thread *)tid), "spinning", "lockname:\"%s\"", m->lock_object.lo_name); #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime); #ifdef LOCK_PROFILING doing_lockprof = 1; #elif defined(KDTRACE_HOOKS) doing_lockprof = lockstat_enabled; if (__predict_false(doing_lockprof)) spin_time -= lockstat_nsecs(&m->lock_object); #endif for (;;) { if (v == MTX_UNOWNED) { if (_mtx_obtain_lock_fetch(m, &v, tid)) break; continue; } /* Give interrupts a chance while we spin. 
*/ spinlock_exit(); do { if (lda.spin_cnt < 10000000) { lock_delay(&lda); } else { lda.spin_cnt++; if (lda.spin_cnt < 60000000 || kdb_active || panicstr != NULL) DELAY(1); else _mtx_lock_spin_failed(m); cpu_spinwait(); } v = MTX_READ_VALUE(m); } while (v != MTX_UNOWNED); spinlock_enter(); } if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m); KTR_STATE0(KTR_SCHED, "thread", sched_tdname((struct thread *)tid), "running"); #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!doing_lockprof)) return; #endif #ifdef KDTRACE_HOOKS spin_time += lockstat_nsecs(&m->lock_object); #endif LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(spin__acquire, m, contested, waittime, file, line); #ifdef KDTRACE_HOOKS if (lda.spin_cnt != 0) LOCKSTAT_RECORD1(spin__spin, m, spin_time); #endif } #endif /* SMP */ #ifdef INVARIANTS static void thread_lock_validate(struct mtx *m, int opts, const char *file, int line) { KASSERT(m->mtx_lock != MTX_DESTROYED, ("thread_lock() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin, ("thread_lock() of sleep mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); if (mtx_owned(m)) KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0, ("thread_lock: recursed on non-recursive mutex %s @ %s:%d\n", m->lock_object.lo_name, file, line)); WITNESS_CHECKORDER(&m->lock_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); } #else #define thread_lock_validate(m, opts, file, line) do { } while (0) #endif #ifndef LOCK_PROFILING #if LOCK_DEBUG > 0 void _thread_lock(struct thread *td, int opts, const char *file, int line) #else void _thread_lock(struct thread *td) #endif { struct mtx *m; uintptr_t tid, v; tid = (uintptr_t)curthread; if (__predict_false(LOCKSTAT_PROFILE_ENABLED(spin__acquire))) goto slowpath_noirq; spinlock_enter(); m = td->td_lock; thread_lock_validate(m, 0, file, line); v = MTX_READ_VALUE(m); if (__predict_true(v == MTX_UNOWNED)) { if (__predict_false(!_mtx_obtain_lock(m, tid))) goto slowpath_unlocked; } else if (v == tid) { m->mtx_recurse++; } else goto slowpath_unlocked; if (__predict_true(m == td->td_lock)) { WITNESS_LOCK(&m->lock_object, LOP_EXCLUSIVE, file, line); return; } if (m->mtx_recurse != 0) m->mtx_recurse--; else _mtx_release_lock_quick(m); slowpath_unlocked: spinlock_exit(); slowpath_noirq: #if LOCK_DEBUG > 0 thread_lock_flags_(td, opts, file, line); #else thread_lock_flags_(td, 0, 0, 0); #endif } #endif void thread_lock_flags_(struct thread *td, int opts, const char *file, int line) { struct mtx *m; uintptr_t tid, v; struct lock_delay_arg lda; #ifdef LOCK_PROFILING int contested = 0; uint64_t waittime = 0; #endif #ifdef KDTRACE_HOOKS int64_t spin_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) int doing_lockprof = 1; #endif tid = (uintptr_t)curthread; if (SCHEDULER_STOPPED()) { /* * Ensure that spinlock sections are balanced even when the * scheduler is stopped, since we may otherwise inadvertently * re-enable interrupts while dumping core. 
*/ spinlock_enter(); return; } lock_delay_arg_init(&lda, &mtx_spin_delay); #ifdef LOCK_PROFILING doing_lockprof = 1; #elif defined(KDTRACE_HOOKS) doing_lockprof = lockstat_enabled; if (__predict_false(doing_lockprof)) spin_time -= lockstat_nsecs(&td->td_lock->lock_object); #endif for (;;) { retry: v = MTX_UNOWNED; spinlock_enter(); m = td->td_lock; thread_lock_validate(m, opts, file, line); for (;;) { if (_mtx_obtain_lock_fetch(m, &v, tid)) break; if (v == MTX_UNOWNED) continue; if (v == tid) { m->mtx_recurse++; break; } #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime); /* Give interrupts a chance while we spin. */ spinlock_exit(); do { if (lda.spin_cnt < 10000000) { lock_delay(&lda); } else { lda.spin_cnt++; if (lda.spin_cnt < 60000000 || kdb_active || panicstr != NULL) DELAY(1); else _mtx_lock_spin_failed(m); cpu_spinwait(); } if (m != td->td_lock) goto retry; v = MTX_READ_VALUE(m); } while (v != MTX_UNOWNED); spinlock_enter(); } if (m == td->td_lock) break; __mtx_unlock_spin(m); /* does spinlock_exit() */ } LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file, line); WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!doing_lockprof)) return; #endif #ifdef KDTRACE_HOOKS spin_time += lockstat_nsecs(&m->lock_object); #endif if (m->mtx_recurse == 0) LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(spin__acquire, m, contested, waittime, file, line); #ifdef KDTRACE_HOOKS if (lda.spin_cnt != 0) LOCKSTAT_RECORD1(thread__spin, m, spin_time); #endif } struct mtx * thread_lock_block(struct thread *td) { struct mtx *lock; THREAD_LOCK_ASSERT(td, MA_OWNED); lock = td->td_lock; td->td_lock = &blocked_lock; mtx_unlock_spin(lock); return (lock); } void thread_lock_unblock(struct thread *td, struct mtx *new) { mtx_assert(new, MA_OWNED); MPASS(td->td_lock == &blocked_lock); atomic_store_rel_ptr((volatile void *)&td->td_lock, (uintptr_t)new); } void thread_lock_set(struct thread *td, struct mtx *new) { struct mtx *lock; mtx_assert(new, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); lock = td->td_lock; td->td_lock = new; mtx_unlock_spin(lock); } /* * __mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock. * * We are only called here if the lock is recursed, contested (i.e. we * need to wake up a blocked thread) or lockstat probe is active. */ #if LOCK_DEBUG > 0 void __mtx_unlock_sleep(volatile uintptr_t *c, uintptr_t v, int opts, const char *file, int line) #else void __mtx_unlock_sleep(volatile uintptr_t *c, uintptr_t v) #endif { struct mtx *m; struct turnstile *ts; uintptr_t tid; if (SCHEDULER_STOPPED()) return; tid = (uintptr_t)curthread; m = mtxlock2mtx(c); if (__predict_false(v == tid)) v = MTX_READ_VALUE(m); if (__predict_false(v & MTX_RECURSED)) { if (--(m->mtx_recurse) == 0) atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m); return; } LOCKSTAT_PROFILE_RELEASE_LOCK(adaptive__release, m); if (v == tid && _mtx_release_lock(m, tid)) return; /* * We have to lock the chain before the turnstile so this turnstile * can be removed from the hash list if it is empty. 
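 *
 * One more note on thread_lock_flags_() above before the unlock path:
 * its retry loop exists because td_lock is a pointer that
 * thread_lock_block()/thread_lock_unblock() may switch while we spin,
 * so the lock must be re-validated after it is acquired. The generic
 * shape of that idiom, as a C11 sketch (all sk_* names hypothetical):
 */

#if 0	/* illustrative sketch, not compiled into the kernel */
#include <stdatomic.h>

struct sk_lock;
void	sk_lock_acquire(struct sk_lock *);
void	sk_lock_release(struct sk_lock *);

struct sk_obj {
	_Atomic(struct sk_lock *) lockp;	/* may be swapped at any time */
};

static void
sketch_lock_obj(struct sk_obj *o)
{
	struct sk_lock *l;

	for (;;) {
		l = atomic_load(&o->lockp);
		sk_lock_acquire(l);
		/* Now that l is held: is it still the lock protecting o? */
		if (l == atomic_load(&o->lockp))
			return;			/* yes, we are done */
		sk_lock_release(l);		/* no, it moved; retry */
	}
}
#endif

/*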
*/ turnstile_chain_lock(&m->lock_object); _mtx_release_lock_quick(m); ts = turnstile_lookup(&m->lock_object); MPASS(ts != NULL); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m); turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE); /* * This turnstile is now no longer associated with the mutex. We can * unlock the chain lock so a new turnstile may take its place. */ turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); turnstile_chain_unlock(&m->lock_object); } /* * All the unlocking of MTX_SPIN locks is done inline. * See the __mtx_unlock_spin() macro for the details. */ /* * The backing function for the INVARIANTS-enabled mtx_assert() */ #ifdef INVARIANT_SUPPORT void __mtx_assert(const volatile uintptr_t *c, int what, const char *file, int line) { const struct mtx *m; if (panicstr != NULL || dumping || SCHEDULER_STOPPED()) return; m = mtxlock2mtx(c); switch (what) { case MA_OWNED: case MA_OWNED | MA_RECURSED: case MA_OWNED | MA_NOTRECURSED: if (!mtx_owned(m)) panic("mutex %s not owned at %s:%d", m->lock_object.lo_name, file, line); if (mtx_recursed(m)) { if ((what & MA_NOTRECURSED) != 0) panic("mutex %s recursed at %s:%d", m->lock_object.lo_name, file, line); } else if ((what & MA_RECURSED) != 0) { panic("mutex %s unrecursed at %s:%d", m->lock_object.lo_name, file, line); } break; case MA_NOTOWNED: if (mtx_owned(m)) panic("mutex %s owned at %s:%d", m->lock_object.lo_name, file, line); break; default: panic("unknown mtx_assert at %s:%d", file, line); } } #endif /* * General init routine used by the MTX_SYSINIT() macro. */ void mtx_sysinit(void *arg) { struct mtx_args *margs = arg; mtx_init((struct mtx *)margs->ma_mtx, margs->ma_desc, NULL, margs->ma_opts); } /* * Mutex initialization routine; initialize lock `m' with the options * contained in `opts' and name `name.' The optional * lock type `type' is used as a general lock category name for use with * witness. */ void _mtx_init(volatile uintptr_t *c, const char *name, const char *type, int opts) { struct mtx *m; struct lock_class *class; int flags; m = mtxlock2mtx(c); MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE | MTX_NOWITNESS | MTX_DUPOK | MTX_NOPROFILE | MTX_NEW)) == 0); ASSERT_ATOMIC_LOAD_PTR(m->mtx_lock, ("%s: mtx_lock not aligned for %s: %p", __func__, name, &m->mtx_lock)); /* Determine lock class and lock flags. */ if (opts & MTX_SPIN) class = &lock_class_mtx_spin; else class = &lock_class_mtx_sleep; flags = 0; if (opts & MTX_QUIET) flags |= LO_QUIET; if (opts & MTX_RECURSE) flags |= LO_RECURSABLE; if ((opts & MTX_NOWITNESS) == 0) flags |= LO_WITNESS; if (opts & MTX_DUPOK) flags |= LO_DUPOK; if (opts & MTX_NOPROFILE) flags |= LO_NOPROFILE; if (opts & MTX_NEW) flags |= LO_NEW; /* Initialize mutex. */ lock_init(&m->lock_object, class, name, type, flags); m->mtx_lock = MTX_UNOWNED; m->mtx_recurse = 0; } /* * Remove lock `m' from all_mtx queue. We don't allow MTX_QUIET to be * passed in as a flag here because if the corresponding mtx_init() was * called with MTX_QUIET set, then it will already be set in the mutex's * flags. */ void _mtx_destroy(volatile uintptr_t *c) { struct mtx *m; m = mtxlock2mtx(c); if (!mtx_owned(m)) MPASS(mtx_unowned(m)); else { MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0); /* Perform the non-mtx related part of mtx_unlock_spin(). */ if (LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin) spinlock_exit(); else TD_LOCKS_DEC(curthread); lock_profile_release_lock(&m->lock_object); /* Tell witness this isn't locked to make it happy.
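 *
 * For reference, a typical consumer of the init/lock/unlock API above
 * looks like the sketch below (the example_* names are invented for
 * this illustration; MTX_SYSINIT() runs the initialization at boot):
 */

#if 0	/* illustrative sketch, not part of this file */
static struct mtx example_mtx;
MTX_SYSINIT(example_mtx, &example_mtx, "example", MTX_DEF);

static int example_counter;

static void
example_bump(void)
{
	mtx_lock(&example_mtx);
	example_counter++;		/* protected by example_mtx */
	mtx_unlock(&example_mtx);
}
#endif

/*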
*/ WITNESS_UNLOCK(&m->lock_object, LOP_EXCLUSIVE, __FILE__, __LINE__); } m->mtx_lock = MTX_DESTROYED; lock_destroy(&m->lock_object); } /* * Initialize the mutex code and system mutexes. This is called from the MD * startup code prior to mi_startup(). The per-CPU data space needs to be * set up before this is called. */ void mutex_init(void) { /* Setup turnstiles so that sleep mutexes work. */ init_turnstiles(); /* * Initialize mutexes. */ mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE); mtx_init(&blocked_lock, "blocked lock", NULL, MTX_SPIN); blocked_lock.mtx_lock = 0xdeadc0de; /* Always blocked. */ mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); mtx_init(&proc0.p_slock, "process slock", NULL, MTX_SPIN); mtx_init(&proc0.p_statmtx, "pstatl", NULL, MTX_SPIN); mtx_init(&proc0.p_itimmtx, "pitiml", NULL, MTX_SPIN); mtx_init(&proc0.p_profmtx, "pprofl", NULL, MTX_SPIN); mtx_init(&devmtx, "cdev", NULL, MTX_DEF); mtx_lock(&Giant); } #ifdef DDB void db_show_mtx(const struct lock_object *lock) { struct thread *td; const struct mtx *m; m = (const struct mtx *)lock; db_printf(" flags: {"); if (LOCK_CLASS(lock) == &lock_class_mtx_spin) db_printf("SPIN"); else db_printf("DEF"); if (m->lock_object.lo_flags & LO_RECURSABLE) db_printf(", RECURSE"); if (m->lock_object.lo_flags & LO_DUPOK) db_printf(", DUPOK"); db_printf("}\n"); db_printf(" state: {"); if (mtx_unowned(m)) db_printf("UNOWNED"); else if (mtx_destroyed(m)) db_printf("DESTROYED"); else { db_printf("OWNED"); if (m->mtx_lock & MTX_CONTESTED) db_printf(", CONTESTED"); if (m->mtx_lock & MTX_RECURSED) db_printf(", RECURSED"); } db_printf("}\n"); if (!mtx_unowned(m) && !mtx_destroyed(m)) { td = mtx_owner(m); db_printf(" owner: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); if (mtx_recursed(m)) db_printf(" recursed: %d\n", m->mtx_recurse); } } #endif Index: head/sys/kern/kern_osd.c =================================================================== --- head/sys/kern/kern_osd.c (revision 326270) +++ head/sys/kern/kern_osd.c (revision 326271) @@ -1,443 +1,445 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* OSD (Object Specific Data) */ /* * Lock key: * (m) osd_module_lock * (o) osd_object_lock * (l) osd_list_lock */ struct osd_master { struct sx osd_module_lock; struct rmlock osd_object_lock; struct mtx osd_list_lock; LIST_HEAD(, osd) osd_list; /* (l) */ osd_destructor_t *osd_destructors; /* (o) */ osd_method_t *osd_methods; /* (m) */ u_int osd_ntslots; /* (m) */ const u_int osd_nmethods; }; static MALLOC_DEFINE(M_OSD, "osd", "Object Specific Data"); static int osd_debug = 0; SYSCTL_INT(_debug, OID_AUTO, osd, CTLFLAG_RWTUN, &osd_debug, 0, "OSD debug level"); #define OSD_DEBUG(...) do { \ if (osd_debug) { \ printf("OSD (%s:%u): ", __func__, __LINE__); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) static void do_osd_del(u_int type, struct osd *osd, u_int slot, int list_locked); /* * List of objects with OSD. */ struct osd_master osdm[OSD_LAST + 1] = { [OSD_JAIL] = { .osd_nmethods = PR_MAXMETHOD }, }; static void osd_default_destructor(void *value __unused) { /* Do nothing. */ } int osd_register(u_int type, osd_destructor_t destructor, osd_method_t *methods) { void *newptr; u_int i, m; KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); /* * If no destructor is given, use default one. We need to use some * destructor, because NULL destructor means unused slot. */ if (destructor == NULL) destructor = osd_default_destructor; sx_xlock(&osdm[type].osd_module_lock); /* * First, we try to find unused slot. */ for (i = 0; i < osdm[type].osd_ntslots; i++) { if (osdm[type].osd_destructors[i] == NULL) { OSD_DEBUG("Unused slot found (type=%u, slot=%u).", type, i); break; } } /* * If no unused slot was found, allocate one. */ if (i == osdm[type].osd_ntslots) { osdm[type].osd_ntslots++; if (osdm[type].osd_nmethods != 0) osdm[type].osd_methods = realloc(osdm[type].osd_methods, sizeof(osd_method_t) * osdm[type].osd_ntslots * osdm[type].osd_nmethods, M_OSD, M_WAITOK); newptr = malloc(sizeof(osd_destructor_t) * osdm[type].osd_ntslots, M_OSD, M_WAITOK); rm_wlock(&osdm[type].osd_object_lock); bcopy(osdm[type].osd_destructors, newptr, sizeof(osd_destructor_t) * i); free(osdm[type].osd_destructors, M_OSD); osdm[type].osd_destructors = newptr; rm_wunlock(&osdm[type].osd_object_lock); OSD_DEBUG("New slot allocated (type=%u, slot=%u).", type, i + 1); } osdm[type].osd_destructors[i] = destructor; if (osdm[type].osd_nmethods != 0) { for (m = 0; m < osdm[type].osd_nmethods; m++) osdm[type].osd_methods[i * osdm[type].osd_nmethods + m] = methods != NULL ? methods[m] : NULL; } sx_xunlock(&osdm[type].osd_module_lock); return (i + 1); } void osd_deregister(u_int type, u_int slot) { struct osd *osd, *tosd; KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); KASSERT(slot > 0, ("Invalid slot.")); KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot.")); sx_xlock(&osdm[type].osd_module_lock); rm_wlock(&osdm[type].osd_object_lock); /* * Free all OSD for the given slot. */ mtx_lock(&osdm[type].osd_list_lock); LIST_FOREACH_SAFE(osd, &osdm[type].osd_list, osd_next, tosd) do_osd_del(type, osd, slot, 1); mtx_unlock(&osdm[type].osd_list_lock); /* * Set destructor to NULL to free the slot. 
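 *
 * Seen from a consumer, the slot returned by osd_register() above is
 * used roughly as follows (sketch only; the my_* names are invented,
 * and OSD_THREAD is just one of the valid types):
 */

#if 0	/* illustrative sketch, not part of this file */
static u_int	my_slot;

static void
my_dtor(void *value)
{
	free(value, M_TEMP);	/* destructor runs when the data is deleted */
}

static void
my_init(void)
{
	my_slot = osd_register(OSD_THREAD, my_dtor, NULL);
}

static int
my_attach(struct osd *osd, void *value)
{
	/* May allocate the slot array and can thus fail with ENOMEM. */
	return (osd_set(OSD_THREAD, osd, my_slot, value));
}
#endif

/*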
*/ osdm[type].osd_destructors[slot - 1] = NULL; if (slot == osdm[type].osd_ntslots) { osdm[type].osd_ntslots--; osdm[type].osd_destructors = realloc(osdm[type].osd_destructors, sizeof(osd_destructor_t) * osdm[type].osd_ntslots, M_OSD, M_NOWAIT | M_ZERO); if (osdm[type].osd_nmethods != 0) osdm[type].osd_methods = realloc(osdm[type].osd_methods, sizeof(osd_method_t) * osdm[type].osd_ntslots * osdm[type].osd_nmethods, M_OSD, M_NOWAIT | M_ZERO); /* * We always reallocate to smaller size, so we assume it will * always succeed. */ KASSERT(osdm[type].osd_destructors != NULL && (osdm[type].osd_nmethods == 0 || osdm[type].osd_methods != NULL), ("realloc() failed")); OSD_DEBUG("Deregistration of the last slot (type=%u, slot=%u).", type, slot); } else { OSD_DEBUG("Slot deregistration (type=%u, slot=%u).", type, slot); } rm_wunlock(&osdm[type].osd_object_lock); sx_xunlock(&osdm[type].osd_module_lock); } int osd_set(u_int type, struct osd *osd, u_int slot, void *value) { return (osd_set_reserved(type, osd, slot, NULL, value)); } void ** osd_reserve(u_int slot) { KASSERT(slot > 0, ("Invalid slot.")); OSD_DEBUG("Reserving slot array (slot=%u).", slot); return (malloc(sizeof(void *) * slot, M_OSD, M_WAITOK | M_ZERO)); } int osd_set_reserved(u_int type, struct osd *osd, u_int slot, void **rsv, void *value) { struct rm_priotracker tracker; KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); KASSERT(slot > 0, ("Invalid slot.")); KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot.")); rm_rlock(&osdm[type].osd_object_lock, &tracker); if (slot > osd->osd_nslots) { void **newptr; if (value == NULL) { OSD_DEBUG( "Not allocating null slot (type=%u, slot=%u).", type, slot); rm_runlock(&osdm[type].osd_object_lock, &tracker); if (rsv) osd_free_reserved(rsv); return (0); } /* * Too few slots allocated here, so we need to extend or create * the array. */ if (rsv) { /* * Use the reserve passed in (assumed to be * the right size). */ newptr = rsv; if (osd->osd_nslots != 0) { memcpy(newptr, osd->osd_slots, sizeof(void *) * osd->osd_nslots); free(osd->osd_slots, M_OSD); } } else { newptr = realloc(osd->osd_slots, sizeof(void *) * slot, M_OSD, M_NOWAIT | M_ZERO); if (newptr == NULL) { rm_runlock(&osdm[type].osd_object_lock, &tracker); return (ENOMEM); } } if (osd->osd_nslots == 0) { /* * First OSD for this object, so we need to put it * onto the list. 
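 *
 * The rsv argument handled above enables a two-step idiom: reserve the
 * slot array with osd_reserve() (which may sleep) while sleeping is
 * still allowed, then complete the assignment in a context that must
 * not sleep. A sketch under that reading (my_* names invented):
 */

#if 0	/* illustrative sketch, not part of this file */
static int
my_set_nosleep(struct osd *osd, u_int slot, void *value)
{
	void **rsv;

	rsv = osd_reserve(slot);	/* M_WAITOK allocation, may sleep */
	/* ... enter the region where sleeping is forbidden ... */
	return (osd_set_reserved(OSD_THREAD, osd, slot, rsv, value));
	/* osd_set_reserved() consumes or frees rsv on every path. */
}
#endif

/*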
*/ mtx_lock(&osdm[type].osd_list_lock); LIST_INSERT_HEAD(&osdm[type].osd_list, osd, osd_next); mtx_unlock(&osdm[type].osd_list_lock); OSD_DEBUG("Setting first slot (type=%u).", type); } else OSD_DEBUG("Growing slots array (type=%u).", type); osd->osd_slots = newptr; osd->osd_nslots = slot; } else if (rsv) osd_free_reserved(rsv); OSD_DEBUG("Setting slot value (type=%u, slot=%u, value=%p).", type, slot, value); osd->osd_slots[slot - 1] = value; rm_runlock(&osdm[type].osd_object_lock, &tracker); return (0); } void osd_free_reserved(void **rsv) { OSD_DEBUG("Discarding reserved slot array."); free(rsv, M_OSD); } void * osd_get(u_int type, struct osd *osd, u_int slot) { struct rm_priotracker tracker; void *value; KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); KASSERT(slot > 0, ("Invalid slot.")); KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot.")); rm_rlock(&osdm[type].osd_object_lock, &tracker); if (slot > osd->osd_nslots) { value = NULL; OSD_DEBUG("Slot doesn't exist (type=%u, slot=%u).", type, slot); } else { value = osd->osd_slots[slot - 1]; OSD_DEBUG("Returning slot value (type=%u, slot=%u, value=%p).", type, slot, value); } rm_runlock(&osdm[type].osd_object_lock, &tracker); return (value); } void osd_del(u_int type, struct osd *osd, u_int slot) { struct rm_priotracker tracker; rm_rlock(&osdm[type].osd_object_lock, &tracker); do_osd_del(type, osd, slot, 0); rm_runlock(&osdm[type].osd_object_lock, &tracker); } static void do_osd_del(u_int type, struct osd *osd, u_int slot, int list_locked) { int i; KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); KASSERT(slot > 0, ("Invalid slot.")); KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot.")); OSD_DEBUG("Deleting slot (type=%u, slot=%u).", type, slot); if (slot > osd->osd_nslots) { OSD_DEBUG("Slot doesn't exist (type=%u, slot=%u).", type, slot); return; } if (osd->osd_slots[slot - 1] != NULL) { osdm[type].osd_destructors[slot - 1](osd->osd_slots[slot - 1]); osd->osd_slots[slot - 1] = NULL; } for (i = osd->osd_nslots - 1; i >= 0; i--) { if (osd->osd_slots[i] != NULL) { OSD_DEBUG("Slot still has a value (type=%u, slot=%u).", type, i + 1); break; } } if (i == -1) { /* No values left for this object. */ OSD_DEBUG("No more slots left (type=%u).", type); if (!list_locked) mtx_lock(&osdm[type].osd_list_lock); LIST_REMOVE(osd, osd_next); if (!list_locked) mtx_unlock(&osdm[type].osd_list_lock); free(osd->osd_slots, M_OSD); osd->osd_slots = NULL; osd->osd_nslots = 0; } else if (slot == osd->osd_nslots) { /* This was the last slot. */ osd->osd_slots = realloc(osd->osd_slots, sizeof(void *) * (i + 1), M_OSD, M_NOWAIT | M_ZERO); /* * We always reallocate to smaller size, so we assume it will * always succeed. */ KASSERT(osd->osd_slots != NULL, ("realloc() failed")); osd->osd_nslots = i + 1; OSD_DEBUG("Reducing slots array to %u (type=%u).", osd->osd_nslots, type); } } int osd_call(u_int type, u_int method, void *obj, void *data) { osd_method_t methodfun; int error, i; KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); KASSERT(method < osdm[type].osd_nmethods, ("Invalid method.")); /* * Call this method for every slot that defines it, stopping if an * error is encountered. 
*/ error = 0; sx_slock(&osdm[type].osd_module_lock); for (i = 0; i < osdm[type].osd_ntslots; i++) { methodfun = osdm[type].osd_methods[i * osdm[type].osd_nmethods + method]; if (methodfun != NULL && (error = methodfun(obj, data)) != 0) break; } sx_sunlock(&osdm[type].osd_module_lock); return (error); } void osd_exit(u_int type, struct osd *osd) { struct rm_priotracker tracker; u_int i; KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); if (osd->osd_nslots == 0) { KASSERT(osd->osd_slots == NULL, ("Non-null osd_slots.")); /* No OSD attached, just leave. */ return; } rm_rlock(&osdm[type].osd_object_lock, &tracker); for (i = 1; i <= osd->osd_nslots; i++) { if (osdm[type].osd_destructors[i - 1] != NULL) do_osd_del(type, osd, i, 0); else OSD_DEBUG("Unused slot (type=%u, slot=%u).", type, i); } rm_runlock(&osdm[type].osd_object_lock, &tracker); OSD_DEBUG("Object exit (type=%u).", type); } static void osd_init(void *arg __unused) { u_int i; for (i = OSD_FIRST; i <= OSD_LAST; i++) { sx_init(&osdm[i].osd_module_lock, "osd_module"); rm_init(&osdm[i].osd_object_lock, "osd_object"); mtx_init(&osdm[i].osd_list_lock, "osd_list", NULL, MTX_DEF); LIST_INIT(&osdm[i].osd_list); osdm[i].osd_destructors = NULL; osdm[i].osd_ntslots = 0; osdm[i].osd_methods = NULL; } } SYSINIT(osd, SI_SUB_LOCK, SI_ORDER_ANY, osd_init, NULL); Index: head/sys/kern/kern_physio.c =================================================================== --- head/sys/kern/kern_physio.c (revision 326270) +++ head/sys/kern/kern_physio.c (revision 326271) @@ -1,225 +1,227 @@ /*- + * SPDX-License-Identifier: BSD-4-Clause + * * Copyright (c) 1994 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include int physio(struct cdev *dev, struct uio *uio, int ioflag) { struct cdevsw *csw; struct buf *pbuf; struct bio *bp; struct vm_page **pages; caddr_t sa; u_int iolen, poff; int error, i, npages, maxpages; vm_prot_t prot; csw = dev->si_devsw; /* check if character device is being destroyed */ if (csw == NULL) return (ENXIO); /* XXX: sanity check */ if(dev->si_iosize_max < PAGE_SIZE) { printf("WARNING: %s si_iosize_max=%d, using DFLTPHYS.\n", devtoname(dev), dev->si_iosize_max); dev->si_iosize_max = DFLTPHYS; } /* * If the driver does not want I/O to be split, that means that we * need to reject any requests that will not fit into one buffer. */ if (dev->si_flags & SI_NOSPLIT && (uio->uio_resid > dev->si_iosize_max || uio->uio_resid > MAXPHYS || uio->uio_iovcnt > 1)) { /* * Tell the user why his I/O was rejected. 
*/ if (uio->uio_resid > dev->si_iosize_max) uprintf("%s: request size=%zd > si_iosize_max=%d; " "cannot split request\n", devtoname(dev), uio->uio_resid, dev->si_iosize_max); if (uio->uio_resid > MAXPHYS) uprintf("%s: request size=%zd > MAXPHYS=%d; " "cannot split request\n", devtoname(dev), uio->uio_resid, MAXPHYS); if (uio->uio_iovcnt > 1) uprintf("%s: request vectors=%d > 1; " "cannot split request\n", devtoname(dev), uio->uio_iovcnt); return (EFBIG); } /* * Keep the process UPAGES from being swapped. Processes swapped * out while holding pbufs, used by swapper, may lead to deadlock. */ PHOLD(curproc); bp = g_alloc_bio(); if (uio->uio_segflg != UIO_USERSPACE) { pbuf = NULL; pages = NULL; } else if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) { pbuf = NULL; maxpages = btoc(MIN(uio->uio_resid, MAXPHYS)) + 1; pages = malloc(sizeof(*pages) * maxpages, M_DEVBUF, M_WAITOK); } else { pbuf = getpbuf(NULL); sa = pbuf->b_data; maxpages = btoc(MAXPHYS); pages = pbuf->b_pages; } prot = VM_PROT_READ; if (uio->uio_rw == UIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ error = 0; for (i = 0; i < uio->uio_iovcnt; i++) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); if (uio->uio_rw == UIO_READ) { racct_add_force(curproc, RACCT_READBPS, uio->uio_iov[i].iov_len); racct_add_force(curproc, RACCT_READIOPS, 1); } else { racct_add_force(curproc, RACCT_WRITEBPS, uio->uio_iov[i].iov_len); racct_add_force(curproc, RACCT_WRITEIOPS, 1); } PROC_UNLOCK(curproc); } #endif /* RACCT */ while (uio->uio_iov[i].iov_len) { g_reset_bio(bp); if (uio->uio_rw == UIO_READ) { bp->bio_cmd = BIO_READ; curthread->td_ru.ru_inblock++; } else { bp->bio_cmd = BIO_WRITE; curthread->td_ru.ru_oublock++; } bp->bio_offset = uio->uio_offset; bp->bio_data = uio->uio_iov[i].iov_base; bp->bio_length = uio->uio_iov[i].iov_len; if (bp->bio_length > dev->si_iosize_max) bp->bio_length = dev->si_iosize_max; if (bp->bio_length > MAXPHYS) bp->bio_length = MAXPHYS; /* * Make sure the pbuf can map the request. * The pbuf has kvasize = MAXPHYS, so a request * larger than MAXPHYS - PAGE_SIZE must be * page aligned or it will be fragmented. 
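 *
 * In numbers, assuming MAXPHYS = 128KB and PAGE_SIZE = 4KB: a request
 * that starts in the middle of a page (poff != 0) can map at most
 * 128KB - 4KB = 124KB into one pbuf. The trim below, restated as a
 * stand-alone sketch:
 */

#if 0	/* illustrative sketch, not part of this file */
static size_t
sketch_trim_length(size_t length, size_t poff, size_t kvasize)
{
	if (length + poff > kvasize) {
		length = kvasize;
		if (poff != 0)
			length -= PAGE_SIZE;	/* keep mappings page aligned */
	}
	return (length);
}
#endif

/*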
*/ poff = (vm_offset_t)bp->bio_data & PAGE_MASK; if (pbuf && bp->bio_length + poff > pbuf->b_kvasize) { if (dev->si_flags & SI_NOSPLIT) { uprintf("%s: request ptr %p is not " "on a page boundary; cannot split " "request\n", devtoname(dev), bp->bio_data); error = EFBIG; goto doerror; } bp->bio_length = pbuf->b_kvasize; if (poff != 0) bp->bio_length -= PAGE_SIZE; } bp->bio_bcount = bp->bio_length; bp->bio_dev = dev; if (pages) { if ((npages = vm_fault_quick_hold_pages( &curproc->p_vmspace->vm_map, (vm_offset_t)bp->bio_data, bp->bio_length, prot, pages, maxpages)) < 0) { error = EFAULT; goto doerror; } if (pbuf) { pmap_qenter((vm_offset_t)sa, pages, npages); bp->bio_data = sa + poff; } else { bp->bio_ma = pages; bp->bio_ma_n = npages; bp->bio_ma_offset = poff; bp->bio_data = unmapped_buf; bp->bio_flags |= BIO_UNMAPPED; } } csw->d_strategy(bp); if (uio->uio_rw == UIO_READ) biowait(bp, "physrd"); else biowait(bp, "physwr"); if (pages) { if (pbuf) pmap_qremove((vm_offset_t)sa, npages); vm_page_unhold_pages(pages, npages); } iolen = bp->bio_length - bp->bio_resid; if (iolen == 0 && !(bp->bio_flags & BIO_ERROR)) goto doerror; /* EOF */ uio->uio_iov[i].iov_len -= iolen; uio->uio_iov[i].iov_base = (char *)uio->uio_iov[i].iov_base + iolen; uio->uio_resid -= iolen; uio->uio_offset += iolen; if (bp->bio_flags & BIO_ERROR) { error = bp->bio_error; goto doerror; } } } doerror: if (pbuf) relpbuf(pbuf, NULL); else if (pages) free(pages, M_DEVBUF); g_destroy_bio(bp); PRELE(curproc); return (error); } Index: head/sys/kern/kern_pmc.c =================================================================== --- head/sys/kern/kern_pmc.c (revision 326270) +++ head/sys/kern/kern_pmc.c (revision 326271) @@ -1,344 +1,346 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003-2008 Joseph Koshy * Copyright (c) 2007 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS FEATURE(hwpmc_hooks, "Kernel support for HW PMC"); #define PMC_KERNEL_VERSION PMC_VERSION #else #define PMC_KERNEL_VERSION 0 #endif MALLOC_DECLARE(M_PMCHOOKS); MALLOC_DEFINE(M_PMCHOOKS, "pmchooks", "Memory space for PMC hooks"); const int pmc_kernel_version = PMC_KERNEL_VERSION; /* Hook variable. */ int __read_mostly (*pmc_hook)(struct thread *td, int function, void *arg) = NULL; /* Interrupt handler */ int __read_mostly (*pmc_intr)(int cpu, struct trapframe *tf) = NULL; /* Bitmask of CPUs requiring servicing at hardclock time */ volatile cpuset_t pmc_cpumask; /* * A global count of SS mode PMCs. When non-zero, this means that * we have processes that are sampling the system as a whole. */ volatile int pmc_ss_count; /* * Since PMC(4) may not be loaded in the current kernel, the * convention followed is that a non-NULL value of 'pmc_hook' implies * the presence of this kernel module. * * This requires us to protect 'pmc_hook' with a * shared (sx) lock -- thus making the process of calling into PMC(4) * somewhat more expensive than a simple 'if' check and indirect call. */ struct sx pmc_sx; /* * PMC Soft per cpu trapframe. */ struct trapframe pmc_tf[MAXCPU]; /* * PMC Soft use a global table to store registered events. */ SYSCTL_NODE(_kern, OID_AUTO, hwpmc, CTLFLAG_RW, 0, "HWPMC parameters"); static int pmc_softevents = 16; SYSCTL_INT(_kern_hwpmc, OID_AUTO, softevents, CTLFLAG_RDTUN, &pmc_softevents, 0, "maximum number of soft events"); struct mtx pmc_softs_mtx; int pmc_softs_count; struct pmc_soft **pmc_softs; MTX_SYSINIT(pmc_soft_mtx, &pmc_softs_mtx, "pmc-softs", MTX_SPIN); static void pmc_init_sx(void) { sx_init_flags(&pmc_sx, "pmc-sx", SX_NOWITNESS); } SYSINIT(pmcsx, SI_SUB_LOCK, SI_ORDER_MIDDLE, pmc_init_sx, NULL); /* * Helper functions. */ /* * A note on the CPU numbering scheme used by the hwpmc(4) driver. * * CPUs are denoted using numbers in the range 0..[pmc_cpu_max()-1]. * CPUs could be numbered "sparsely" in this range; the predicate * `pmc_cpu_is_present()' is used to test whether a given CPU is * physically present. * * Further, a CPU that is physically present may be administratively * disabled or otherwise unavailable for use by hwpmc(4). The * `pmc_cpu_is_active()' predicate tests for CPU usability. An * "active" CPU participates in thread scheduling and can field * interrupts raised by PMC hardware. * * On systems with hyperthreaded CPUs, multiple logical CPUs may share * PMC hardware resources. For such processors one logical CPU is * denoted as the primary owner of the in-CPU PMC resources. The * pmc_cpu_is_primary() predicate is used to distinguish this primary * CPU from the others. */ int pmc_cpu_is_active(int cpu) { #ifdef SMP return (pmc_cpu_is_present(cpu) && !CPU_ISSET(cpu, &hlt_cpus_mask)); #else return (1); #endif } /* Deprecated. */ int pmc_cpu_is_disabled(int cpu) { return (!pmc_cpu_is_active(cpu)); } int pmc_cpu_is_present(int cpu) { #ifdef SMP return (!CPU_ABSENT(cpu)); #else return (1); #endif } int pmc_cpu_is_primary(int cpu) { #ifdef SMP return (!CPU_ISSET(cpu, &logical_cpus_mask)); #else return (1); #endif } /* * Return the maximum CPU number supported by the system. The return * value is used for scaling internal data structures and for runtime * checks. 
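 *
 * Tying the predicates together, a caller that wants to visit every
 * schedulable CPU would follow this hedged pattern (sketch_* invented):
 */

#if 0	/* illustrative sketch, not part of this file */
static void
sketch_foreach_active_cpu(void (*fn)(int))
{
	int cpu;

	for (cpu = 0; cpu < (int)pmc_cpu_max(); cpu++) {
		if (!pmc_cpu_is_active(cpu))
			continue;	/* absent, halted or disabled CPU */
		fn(cpu);
	}
}
#endif

/*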
*/ unsigned int pmc_cpu_max(void) { #ifdef SMP return (mp_maxid+1); #else return (1); #endif } #ifdef INVARIANTS /* * Return the count of CPUs in the `active' state in the system. */ int pmc_cpu_max_active(void) { #ifdef SMP /* * When support for CPU hot-plugging is added to the kernel, * this function would change to return the current number * of "active" CPUs. */ return (mp_ncpus); #else return (1); #endif } #endif /* * Cleanup event name: * - remove duplicate '_' * - all uppercase */ static void pmc_soft_namecleanup(char *name) { char *p, *q; p = q = name; for ( ; *p == '_' ; p++) ; for ( ; *p ; p++) { if (*p == '_' && (*(p + 1) == '_' || *(p + 1) == '\0')) continue; else *q++ = toupper(*p); } *q = '\0'; } void pmc_soft_ev_register(struct pmc_soft *ps) { static int warned = 0; int n; ps->ps_running = 0; ps->ps_ev.pm_ev_code = 0; /* invalid */ pmc_soft_namecleanup(ps->ps_ev.pm_ev_name); mtx_lock_spin(&pmc_softs_mtx); if (pmc_softs_count >= pmc_softevents) { /* * XXX Reusing events can enter a race condition where * new allocated event will be used as an old one. */ for (n = 0; n < pmc_softevents; n++) if (pmc_softs[n] == NULL) break; if (n == pmc_softevents) { mtx_unlock_spin(&pmc_softs_mtx); if (!warned) { printf("hwpmc: too many soft events, " "increase kern.hwpmc.softevents tunable\n"); warned = 1; } return; } ps->ps_ev.pm_ev_code = PMC_EV_SOFT_FIRST + n; pmc_softs[n] = ps; } else { ps->ps_ev.pm_ev_code = PMC_EV_SOFT_FIRST + pmc_softs_count; pmc_softs[pmc_softs_count++] = ps; } mtx_unlock_spin(&pmc_softs_mtx); } void pmc_soft_ev_deregister(struct pmc_soft *ps) { KASSERT(ps != NULL, ("pmc_soft_deregister: called with NULL")); mtx_lock_spin(&pmc_softs_mtx); if (ps->ps_ev.pm_ev_code != 0 && (ps->ps_ev.pm_ev_code - PMC_EV_SOFT_FIRST) < pmc_softevents) { KASSERT((int)ps->ps_ev.pm_ev_code >= PMC_EV_SOFT_FIRST && (int)ps->ps_ev.pm_ev_code <= PMC_EV_SOFT_LAST, ("pmc_soft_deregister: invalid event value")); pmc_softs[ps->ps_ev.pm_ev_code - PMC_EV_SOFT_FIRST] = NULL; } mtx_unlock_spin(&pmc_softs_mtx); } struct pmc_soft * pmc_soft_ev_acquire(enum pmc_event ev) { struct pmc_soft *ps; if (ev == 0 || (ev - PMC_EV_SOFT_FIRST) >= pmc_softevents) return NULL; KASSERT((int)ev >= PMC_EV_SOFT_FIRST && (int)ev <= PMC_EV_SOFT_LAST, ("event out of range")); mtx_lock_spin(&pmc_softs_mtx); ps = pmc_softs[ev - PMC_EV_SOFT_FIRST]; if (ps == NULL) mtx_unlock_spin(&pmc_softs_mtx); return ps; } void pmc_soft_ev_release(struct pmc_soft *ps) { mtx_unlock_spin(&pmc_softs_mtx); } /* * Initialise hwpmc. */ static void init_hwpmc(void *dummy __unused) { if (pmc_softevents <= 0 || pmc_softevents > PMC_EV_DYN_COUNT) { (void) printf("hwpmc: tunable \"softevents\"=%d out of " "range.\n", pmc_softevents); pmc_softevents = PMC_EV_DYN_COUNT; } pmc_softs = malloc(pmc_softevents * sizeof(struct pmc_soft *), M_PMCHOOKS, M_NOWAIT|M_ZERO); KASSERT(pmc_softs != NULL, ("cannot allocate soft events table")); } SYSINIT(hwpmc, SI_SUB_KDTRACE, SI_ORDER_FIRST, init_hwpmc, NULL); Index: head/sys/kern/kern_poll.c =================================================================== --- head/sys/kern/kern_poll.c (revision 326270) +++ head/sys/kern/kern_poll.c (revision 326271) @@ -1,574 +1,576 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2001-2002 Luigi Rizzo * * Supported by: the Xorp Project (www.xorp.org) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_device_polling.h" #include #include #include #include #include #include #include #include /* needed by net/if.h */ #include #include #include #include #include #include /* for NETISR_POLL */ #include void hardclock_device_poll(void); /* hook from hardclock */ static struct mtx poll_mtx; /* * Polling support for [network] device drivers. * * Drivers which support this feature can register with the * polling code. * * If registration is successful, the driver must disable interrupts, * and further I/O is performed through the handler, which is invoked * (at least once per clock tick) with 3 arguments: the "arg" passed at * register time (a struct ifnet pointer), a command, and a "count" limit. * * The command can be one of the following: * POLL_ONLY: quick move of "count" packets from input/output queues. * POLL_AND_CHECK_STATUS: as above, plus check status registers or do * other more expensive operations. This command is issued periodically * but less frequently than POLL_ONLY. * * The count limit specifies how much work the handler can do during the * call -- typically this is the number of packets to be received, or * transmitted, etc. (drivers are free to interpret this number, as long * as the max time spent in the function grows roughly linearly with the * count). * * Polling is enabled and disabled via setting the IFCAP_POLLING flag on * the interface. The driver ioctl handler should register the interface * with polling and disable interrupts, if registration was successful. * * A second variable controls the sharing of CPU between polling/kernel * network processing, and other activities (typically userlevel tasks): * kern.polling.user_frac (between 0 and 100, default 50) sets the share * of CPU allocated to user tasks. CPU is allocated proportionally to the * shares, by dynamically adjusting the "count" (poll_burst). * * Other parameters should be left to their default values.
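 *
 * As an illustration of the handler contract just described, a
 * driver-side handler might look like the sketch below; every mydrv_*
 * name is invented, and the return value is taken to be the number of
 * packets processed:
 */

#if 0	/* illustrative sketch, not part of this file */
static int
mydrv_poll(if_t ifp, enum poll_cmd cmd, int count)
{
	int rx_done;

	rx_done = mydrv_rxeof(ifp, count);	/* at most "count" packets */
	mydrv_txeof(ifp);
	if (cmd == POLL_AND_CHECK_STATUS)
		mydrv_check_status(ifp);	/* the expensive, rare work */
	return (rx_done);
}

/*
 * The driver's ioctl path would pair this with ether_poll_register()
 * when IFCAP_POLLING is switched on, and only then mask its interrupts.
 */
#endif

/*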
* The following constraints hold * * 1 <= poll_each_burst <= poll_burst <= poll_burst_max * MIN_POLL_BURST_MAX <= poll_burst_max <= MAX_POLL_BURST_MAX */ #define MIN_POLL_BURST_MAX 10 #define MAX_POLL_BURST_MAX 20000 static uint32_t poll_burst = 5; static uint32_t poll_burst_max = 150; /* good for 100Mbit net and HZ=1000 */ static uint32_t poll_each_burst = 5; static SYSCTL_NODE(_kern, OID_AUTO, polling, CTLFLAG_RW, 0, "Device polling parameters"); SYSCTL_UINT(_kern_polling, OID_AUTO, burst, CTLFLAG_RD, &poll_burst, 0, "Current polling burst size"); static int netisr_poll_scheduled; static int netisr_pollmore_scheduled; static int poll_shutting_down; static int poll_burst_max_sysctl(SYSCTL_HANDLER_ARGS) { uint32_t val = poll_burst_max; int error; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); if (val < MIN_POLL_BURST_MAX || val > MAX_POLL_BURST_MAX) return (EINVAL); mtx_lock(&poll_mtx); poll_burst_max = val; if (poll_burst > poll_burst_max) poll_burst = poll_burst_max; if (poll_each_burst > poll_burst_max) poll_each_burst = MIN_POLL_BURST_MAX; mtx_unlock(&poll_mtx); return (0); } SYSCTL_PROC(_kern_polling, OID_AUTO, burst_max, CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(uint32_t), poll_burst_max_sysctl, "I", "Max Polling burst size"); static int poll_each_burst_sysctl(SYSCTL_HANDLER_ARGS) { uint32_t val = poll_each_burst; int error; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); if (val < 1) return (EINVAL); mtx_lock(&poll_mtx); if (val > poll_burst_max) { mtx_unlock(&poll_mtx); return (EINVAL); } poll_each_burst = val; mtx_unlock(&poll_mtx); return (0); } SYSCTL_PROC(_kern_polling, OID_AUTO, each_burst, CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(uint32_t), poll_each_burst_sysctl, "I", "Max size of each burst"); static uint32_t poll_in_idle_loop=0; /* do we poll in idle loop ? 
*/ SYSCTL_UINT(_kern_polling, OID_AUTO, idle_poll, CTLFLAG_RW, &poll_in_idle_loop, 0, "Enable device polling in idle loop"); static uint32_t user_frac = 50; static int user_frac_sysctl(SYSCTL_HANDLER_ARGS) { uint32_t val = user_frac; int error; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); if (val > 99) return (EINVAL); mtx_lock(&poll_mtx); user_frac = val; mtx_unlock(&poll_mtx); return (0); } SYSCTL_PROC(_kern_polling, OID_AUTO, user_frac, CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(uint32_t), user_frac_sysctl, "I", "Desired user fraction of cpu time"); static uint32_t reg_frac_count = 0; static uint32_t reg_frac = 20 ; static int reg_frac_sysctl(SYSCTL_HANDLER_ARGS) { uint32_t val = reg_frac; int error; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); if (val < 1 || val > hz) return (EINVAL); mtx_lock(&poll_mtx); reg_frac = val; if (reg_frac_count >= reg_frac) reg_frac_count = 0; mtx_unlock(&poll_mtx); return (0); } SYSCTL_PROC(_kern_polling, OID_AUTO, reg_frac, CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(uint32_t), reg_frac_sysctl, "I", "Every this many cycles check registers"); static uint32_t short_ticks; SYSCTL_UINT(_kern_polling, OID_AUTO, short_ticks, CTLFLAG_RD, &short_ticks, 0, "Hardclock ticks shorter than they should be"); static uint32_t lost_polls; SYSCTL_UINT(_kern_polling, OID_AUTO, lost_polls, CTLFLAG_RD, &lost_polls, 0, "How many times we would have lost a poll tick"); static uint32_t pending_polls; SYSCTL_UINT(_kern_polling, OID_AUTO, pending_polls, CTLFLAG_RD, &pending_polls, 0, "Do we need to poll again"); static int residual_burst = 0; SYSCTL_INT(_kern_polling, OID_AUTO, residual_burst, CTLFLAG_RD, &residual_burst, 0, "# of residual cycles in burst"); static uint32_t poll_handlers; /* next free entry in pr[]. */ SYSCTL_UINT(_kern_polling, OID_AUTO, handlers, CTLFLAG_RD, &poll_handlers, 0, "Number of registered poll handlers"); static uint32_t phase; SYSCTL_UINT(_kern_polling, OID_AUTO, phase, CTLFLAG_RD, &phase, 0, "Polling phase"); static uint32_t suspect; SYSCTL_UINT(_kern_polling, OID_AUTO, suspect, CTLFLAG_RD, &suspect, 0, "suspect event"); static uint32_t stalled; SYSCTL_UINT(_kern_polling, OID_AUTO, stalled, CTLFLAG_RD, &stalled, 0, "potential stalls"); static uint32_t idlepoll_sleeping; /* idlepoll is sleeping */ SYSCTL_UINT(_kern_polling, OID_AUTO, idlepoll_sleeping, CTLFLAG_RD, &idlepoll_sleeping, 0, "idlepoll is sleeping"); #define POLL_LIST_LEN 128 struct pollrec { poll_handler_t *handler; struct ifnet *ifp; }; static struct pollrec pr[POLL_LIST_LEN]; static void poll_shutdown(void *arg, int howto) { poll_shutting_down = 1; } static void init_device_poll(void) { mtx_init(&poll_mtx, "polling", NULL, MTX_DEF); EVENTHANDLER_REGISTER(shutdown_post_sync, poll_shutdown, NULL, SHUTDOWN_PRI_LAST); } SYSINIT(device_poll, SI_SUB_SOFTINTR, SI_ORDER_MIDDLE, init_device_poll, NULL); /* * Hook from hardclock. Tries to schedule a netisr, but keeps track * of lost ticks due to the previous handler taking too long. * Normally, this should not happen, because polling handler should * run for a short time. However, in some cases (e.g. when there are * changes in link status etc.) the drivers take a very long time * (even in the order of milliseconds) to reset and reconfigure the * device, causing apparent lost polls. 
* * The first part of the code is just for debugging purposes, and tries * to count how often hardclock ticks are shorter than they should, * meaning either stray interrupts or delayed events. */ void hardclock_device_poll(void) { static struct timeval prev_t, t; int delta; if (poll_handlers == 0 || poll_shutting_down) return; microuptime(&t); delta = (t.tv_usec - prev_t.tv_usec) + (t.tv_sec - prev_t.tv_sec)*1000000; if (delta * hz < 500000) short_ticks++; else prev_t = t; if (pending_polls > 100) { /* * Too much, assume it has stalled (not always true * see comment above). */ stalled++; pending_polls = 0; phase = 0; } if (phase <= 2) { if (phase != 0) suspect++; phase = 1; netisr_poll_scheduled = 1; netisr_pollmore_scheduled = 1; netisr_sched_poll(); phase = 2; } if (pending_polls++ > 0) lost_polls++; } /* * ether_poll is called from the idle loop. */ static void ether_poll(int count) { int i; mtx_lock(&poll_mtx); if (count > poll_each_burst) count = poll_each_burst; for (i = 0 ; i < poll_handlers ; i++) pr[i].handler(pr[i].ifp, POLL_ONLY, count); mtx_unlock(&poll_mtx); } /* * netisr_pollmore is called after other netisr's, possibly scheduling * another NETISR_POLL call, or adapting the burst size for the next cycle. * * It is very bad to fetch large bursts of packets from a single card at once, * because the burst could take a long time to be completely processed, or * could saturate the intermediate queue (ipintrq or similar) leading to * losses or unfairness. To reduce the problem, and also to account better for * time spent in network-related processing, we split the burst in smaller * chunks of fixed size, giving control to the other netisr's between chunks. * This helps in improving the fairness, reducing livelock (because we * emulate more closely the "process to completion" that we have with * fastforwarding) and accounting for the work performed in low level * handling and forwarding. */ static struct timeval poll_start_t; void netisr_pollmore() { struct timeval t; int kern_load; if (poll_handlers == 0) return; mtx_lock(&poll_mtx); if (!netisr_pollmore_scheduled) { mtx_unlock(&poll_mtx); return; } netisr_pollmore_scheduled = 0; phase = 5; if (residual_burst > 0) { netisr_poll_scheduled = 1; netisr_pollmore_scheduled = 1; netisr_sched_poll(); mtx_unlock(&poll_mtx); /* will run immediately on return, followed by netisrs */ return; } /* here we can account time spent in netisr's in this tick */ microuptime(&t); kern_load = (t.tv_usec - poll_start_t.tv_usec) + (t.tv_sec - poll_start_t.tv_sec)*1000000; /* us */ kern_load = (kern_load * hz) / 10000; /* 0..100 */ if (kern_load > (100 - user_frac)) { /* try decrease ticks */ if (poll_burst > 1) poll_burst--; } else { if (poll_burst < poll_burst_max) poll_burst++; } pending_polls--; if (pending_polls == 0) /* we are done */ phase = 0; else { /* * Last cycle was long and caused us to miss one or more * hardclock ticks. Restart processing again, but slightly * reduce the burst size to prevent that this happens again. */ poll_burst -= (poll_burst / 8); if (poll_burst < 1) poll_burst = 1; netisr_poll_scheduled = 1; netisr_pollmore_scheduled = 1; netisr_sched_poll(); phase = 6; } mtx_unlock(&poll_mtx); } /* * netisr_poll is typically scheduled once per tick. 
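 *
 * A worked instance of the kern_load arithmetic in netisr_pollmore()
 * above: with hz = 1000 one tick is 1000us, so 300us spent in the
 * netisrs gives kern_load = 300 * 1000 / 10000 = 30 (percent). With
 * the default user_frac = 50 that is not above 100 - 50, so poll_burst
 * may grow; at 600us it would be 60 and the burst would shrink. Sketch:
 */

#if 0	/* illustrative sketch, not part of this file */
static int
sketch_kern_load(int us_in_netisrs, int hz_value)
{
	return (us_in_netisrs * hz_value / 10000);	/* 0..100 scale */
}
#endif

/*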
*/ void netisr_poll(void) { int i, cycles; enum poll_cmd arg = POLL_ONLY; if (poll_handlers == 0) return; mtx_lock(&poll_mtx); if (!netisr_poll_scheduled) { mtx_unlock(&poll_mtx); return; } netisr_poll_scheduled = 0; phase = 3; if (residual_burst == 0) { /* first call in this tick */ microuptime(&poll_start_t); if (++reg_frac_count == reg_frac) { arg = POLL_AND_CHECK_STATUS; reg_frac_count = 0; } residual_burst = poll_burst; } cycles = (residual_burst < poll_each_burst) ? residual_burst : poll_each_burst; residual_burst -= cycles; for (i = 0 ; i < poll_handlers ; i++) pr[i].handler(pr[i].ifp, arg, cycles); phase = 4; mtx_unlock(&poll_mtx); } /* * Try to register routine for polling. Returns 0 if successful * (and polling should be enabled), error code otherwise. * A device is not supposed to register itself multiple times. * * This is called from within the *_ioctl() functions. */ int ether_poll_register(poll_handler_t *h, if_t ifp) { int i; KASSERT(h != NULL, ("%s: handler is NULL", __func__)); KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__)); mtx_lock(&poll_mtx); if (poll_handlers >= POLL_LIST_LEN) { /* * List full, cannot register more entries. * This should never happen; if it does, it is probably a * broken driver trying to register multiple times. Checking * this at runtime is expensive, and won't solve the problem * anyways, so just report a few times and then give up. */ static int verbose = 10 ; if (verbose >0) { log(LOG_ERR, "poll handlers list full, " "maybe a broken driver ?\n"); verbose--; } mtx_unlock(&poll_mtx); return (ENOMEM); /* no polling for you */ } for (i = 0 ; i < poll_handlers ; i++) if (pr[i].ifp == ifp && pr[i].handler != NULL) { mtx_unlock(&poll_mtx); log(LOG_DEBUG, "ether_poll_register: %s: handler" " already registered\n", ifp->if_xname); return (EEXIST); } pr[poll_handlers].handler = h; pr[poll_handlers].ifp = ifp; poll_handlers++; mtx_unlock(&poll_mtx); if (idlepoll_sleeping) wakeup(&idlepoll_sleeping); return (0); } /* * Remove interface from the polling list. Called from *_ioctl(), too. */ int ether_poll_deregister(if_t ifp) { int i; KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__)); mtx_lock(&poll_mtx); for (i = 0 ; i < poll_handlers ; i++) if (pr[i].ifp == ifp) /* found it */ break; if (i == poll_handlers) { log(LOG_DEBUG, "ether_poll_deregister: %s: not found!\n", ifp->if_xname); mtx_unlock(&poll_mtx); return (ENOENT); } poll_handlers--; if (i < poll_handlers) { /* Last entry replaces this one. 
*/ pr[i].handler = pr[poll_handlers].handler; pr[i].ifp = pr[poll_handlers].ifp; } mtx_unlock(&poll_mtx); return (0); } static void poll_idle(void) { struct thread *td = curthread; struct rtprio rtp; rtp.prio = RTP_PRIO_MAX; /* lowest priority */ rtp.type = RTP_PRIO_IDLE; PROC_SLOCK(td->td_proc); rtp_to_pri(&rtp, td); PROC_SUNLOCK(td->td_proc); for (;;) { if (poll_in_idle_loop && poll_handlers > 0) { idlepoll_sleeping = 0; ether_poll(poll_each_burst); thread_lock(td); mi_switch(SW_VOL, NULL); thread_unlock(td); } else { idlepoll_sleeping = 1; tsleep(&idlepoll_sleeping, 0, "pollid", hz * 3); } } } static struct proc *idlepoll; static struct kproc_desc idlepoll_kp = { "idlepoll", poll_idle, &idlepoll }; SYSINIT(idlepoll, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, kproc_start, &idlepoll_kp); Index: head/sys/kern/kern_priv.c =================================================================== --- head/sys/kern/kern_priv.c (revision 326270) +++ head/sys/kern/kern_priv.c (revision 326271) @@ -1,181 +1,183 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2006 nCircle Network Security, Inc. * Copyright (c) 2009 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert N. M. Watson for the TrustedBSD * Project under contract to nCircle Network Security, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY, * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include /* * `suser_enabled' (which can be set by the security.bsd.suser_enabled * sysctl) determines whether the system 'super-user' policy is in effect. If * it is nonzero, an effective uid of 0 connotes special privilege, * overriding many mandatory and discretionary protections. If it is zero, * uid 0 is offered no special privilege in the kernel security policy. * Setting it to zero may seriously impact the functionality of many existing * userland programs, and should not be done without careful consideration of * the consequences. 
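 *
 * Call sites never test for uid 0 themselves; they name a specific
 * privilege and let the policy chain below decide. A hedged sketch of
 * the canonical usage (the function name is invented; PRIV_VM_MLOCK is
 * one of the real privilege values):
 */

#if 0	/* illustrative sketch, not part of this file */
static int
sketch_privileged_op(struct thread *td)
{
	int error;

	error = priv_check(td, PRIV_VM_MLOCK);
	if (error != 0)
		return (error);	/* EPERM unless some policy granted it */
	/* ... the privileged work would go here ... */
	return (0);
}
#endif

/*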
*/ static int suser_enabled = 1; SYSCTL_INT(_security_bsd, OID_AUTO, suser_enabled, CTLFLAG_RWTUN, &suser_enabled, 0, "processes with uid 0 have privilege"); static int unprivileged_mlock = 1; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_mlock, CTLFLAG_RWTUN, &unprivileged_mlock, 0, "Allow non-root users to call mlock(2)"); SDT_PROVIDER_DEFINE(priv); SDT_PROBE_DEFINE1(priv, kernel, priv_check, priv__ok, "int"); SDT_PROBE_DEFINE1(priv, kernel, priv_check, priv__err, "int"); /* * Check a credential for privilege. Lots of good reasons to deny privilege; * only a few to grant it. */ int priv_check_cred(struct ucred *cred, int priv, int flags) { int error; KASSERT(PRIV_VALID(priv), ("priv_check_cred: invalid privilege %d", priv)); /* * We first evaluate policies that may deny the granting of * privilege unilaterally. */ #ifdef MAC error = mac_priv_check(cred, priv); if (error) goto out; #endif /* * Jail policy will restrict certain privileges that may otherwise * be granted. */ error = prison_priv_check(cred, priv); if (error) goto out; if (unprivileged_mlock) { /* * Allow unprivileged users to call mlock(2)/munlock(2) and * mlockall(2)/munlockall(2). */ switch (priv) { case PRIV_VM_MLOCK: case PRIV_VM_MUNLOCK: error = 0; goto out; } } /* * Having determined if privilege is restricted by various policies, * now determine if privilege is granted. At this point, any policy * may grant privilege. For now, we allow short-circuit boolean * evaluation, so we may not call all policies. Perhaps we should. * * Superuser policy grants privilege based on the effective (or in * the case of specific privileges, real) uid being 0. We allow the * superuser policy to be globally disabled, although this is * currently of limited utility. */ if (suser_enabled) { switch (priv) { case PRIV_MAXFILES: case PRIV_MAXPROC: case PRIV_PROC_LIMIT: if (cred->cr_ruid == 0) { error = 0; goto out; } break; default: if (cred->cr_uid == 0) { error = 0; goto out; } break; } } /* * Writes to kernel/physical memory are a typical root-only operation, * but non-root users are expected to be able to read it (provided they * have permission to access /dev/[k]mem). */ if (priv == PRIV_KMEM_READ) { error = 0; goto out; } /* * Now check with MAC, if enabled, to see if a policy module grants * privilege. */ #ifdef MAC if (mac_priv_grant(cred, priv) == 0) { error = 0; goto out; } #endif /* * The default is deny, so if no policies have granted it, reject * with a privilege error here. */ error = EPERM; out: if (error) SDT_PROBE1(priv, kernel, priv_check, priv__err, priv); else SDT_PROBE1(priv, kernel, priv_check, priv__ok, priv); return (error); } int priv_check(struct thread *td, int priv) { KASSERT(td == curthread, ("priv_check: td != curthread")); return (priv_check_cred(td->td_ucred, priv, 0)); } Index: head/sys/kern/kern_racct.c =================================================================== --- head/sys/kern/kern_racct.c (revision 326270) +++ head/sys/kern/kern_racct.c (revision 326271) @@ -1,1342 +1,1344 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RCTL #include #endif #ifdef RACCT FEATURE(racct, "Resource Accounting"); /* * Do not block processes that have their %cpu usage <= pcpu_threshold. */ static int pcpu_threshold = 1; #ifdef RACCT_DEFAULT_TO_DISABLED int racct_enable = 0; #else int racct_enable = 1; #endif SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting"); SYSCTL_UINT(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable, 0, "Enable RACCT/RCTL"); SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold, 0, "Processes with higher %cpu usage than this value can be throttled."); /* * How many seconds it takes to use the scheduler %cpu calculations. When a * process starts, we compute its %cpu usage by dividing its runtime by the * process wall clock time. After RACCT_PCPU_SECS pass, we use the value * provided by the scheduler. 
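 *
 * Before that point the estimate is plain arithmetic: runtime divided
 * by wall clock time. Assuming RACCT_PCTCPU's RACCT_IN_MILLIONS flag
 * means "percent scaled by 10^6" (an assumption of this sketch, as is
 * the helper name), the early-life value would be:
 */

#if 0	/* illustrative sketch, not part of this file */
static uint64_t
sketch_early_pctcpu(uint64_t runtime_us, uint64_t walltime_us)
{
	if (walltime_us == 0)
		return (0);
	/* Overflow is ignored here; this is only an illustration. */
	return (runtime_us * 100 * 1000000 / walltime_us);
}
#endif

/*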
*/ #define RACCT_PCPU_SECS 3 struct mtx racct_lock; MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF); static uma_zone_t racct_zone; static void racct_sub_racct(struct racct *dest, const struct racct *src); static void racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount); static void racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount); SDT_PROVIDER_DEFINE(racct); SDT_PROBE_DEFINE3(racct, , rusage, add, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, add__failure, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, add__buf, "struct proc *", "const struct buf *", "int"); SDT_PROBE_DEFINE3(racct, , rusage, add__cred, "struct ucred *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, add__force, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, set, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, set__failure, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, set__force, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, sub, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, sub__cred, "struct ucred *", "int", "uint64_t"); SDT_PROBE_DEFINE1(racct, , racct, create, "struct racct *"); SDT_PROBE_DEFINE1(racct, , racct, destroy, "struct racct *"); SDT_PROBE_DEFINE2(racct, , racct, join, "struct racct *", "struct racct *"); SDT_PROBE_DEFINE2(racct, , racct, join__failure, "struct racct *", "struct racct *"); SDT_PROBE_DEFINE2(racct, , racct, leave, "struct racct *", "struct racct *"); int racct_types[] = { [RACCT_CPU] = RACCT_IN_MILLIONS, [RACCT_DATA] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_STACK] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_CORE] = RACCT_DENIABLE, [RACCT_RSS] = RACCT_RECLAIMABLE, [RACCT_MEMLOCK] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_NPROC] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_NOFILE] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_VMEM] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_NPTS] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_SWAP] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NTHR] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_MSGQQUEUED] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_MSGQSIZE] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NMSGQ] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NSEM] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NSEMOP] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_NSHM] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_SHMSIZE] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_WALLCLOCK] = RACCT_IN_MILLIONS, [RACCT_PCTCPU] = RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS, [RACCT_READBPS] = RACCT_DECAYING, [RACCT_WRITEBPS] = RACCT_DECAYING, [RACCT_READIOPS] = RACCT_DECAYING, [RACCT_WRITEIOPS] = RACCT_DECAYING }; static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE; #ifdef SCHED_4BSD /* * Contains intermediate values for %cpu calculations to avoid using floating * point in the kernel. * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20) * It is needed only for the 4BSD scheduler, because in ULE ccpu equals * zero, so the calculations are more straightforward.
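 * As a worked example (taken from the table below), ccpu_exp[20] is * FSCALE * exp(-20/20) = FSCALE * 0.36787..., i.e. after 20 seconds an old * %cpu sample retains roughly 37% of its original weight.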
*/ fixpt_t ccpu_exp[] = { [0] = FSCALE * 1, [1] = FSCALE * 0.95122942450071400909, [2] = FSCALE * 0.90483741803595957316, [3] = FSCALE * 0.86070797642505780722, [4] = FSCALE * 0.81873075307798185866, [5] = FSCALE * 0.77880078307140486824, [6] = FSCALE * 0.74081822068171786606, [7] = FSCALE * 0.70468808971871343435, [8] = FSCALE * 0.67032004603563930074, [9] = FSCALE * 0.63762815162177329314, [10] = FSCALE * 0.60653065971263342360, [11] = FSCALE * 0.57694981038048669531, [12] = FSCALE * 0.54881163609402643262, [13] = FSCALE * 0.52204577676101604789, [14] = FSCALE * 0.49658530379140951470, [15] = FSCALE * 0.47236655274101470713, [16] = FSCALE * 0.44932896411722159143, [17] = FSCALE * 0.42741493194872666992, [18] = FSCALE * 0.40656965974059911188, [19] = FSCALE * 0.38674102345450120691, [20] = FSCALE * 0.36787944117144232159, [21] = FSCALE * 0.34993774911115535467, [22] = FSCALE * 0.33287108369807955328, [23] = FSCALE * 0.31663676937905321821, [24] = FSCALE * 0.30119421191220209664, [25] = FSCALE * 0.28650479686019010032, [26] = FSCALE * 0.27253179303401260312, [27] = FSCALE * 0.25924026064589150757, [28] = FSCALE * 0.24659696394160647693, [29] = FSCALE * 0.23457028809379765313, [30] = FSCALE * 0.22313016014842982893, [31] = FSCALE * 0.21224797382674305771, [32] = FSCALE * 0.20189651799465540848, [33] = FSCALE * 0.19204990862075411423, [34] = FSCALE * 0.18268352405273465022, [35] = FSCALE * 0.17377394345044512668, [36] = FSCALE * 0.16529888822158653829, [37] = FSCALE * 0.15723716631362761621, [38] = FSCALE * 0.14956861922263505264, [39] = FSCALE * 0.14227407158651357185, [40] = FSCALE * 0.13533528323661269189, [41] = FSCALE * 0.12873490358780421886, [42] = FSCALE * 0.12245642825298191021, [43] = FSCALE * 0.11648415777349695786, [44] = FSCALE * 0.11080315836233388333, [45] = FSCALE * 0.10539922456186433678, [46] = FSCALE * 0.10025884372280373372, [47] = FSCALE * 0.09536916221554961888, [48] = FSCALE * 0.09071795328941250337, [49] = FSCALE * 0.08629358649937051097, [50] = FSCALE * 0.08208499862389879516, [51] = FSCALE * 0.07808166600115315231, [52] = FSCALE * 0.07427357821433388042, [53] = FSCALE * 0.07065121306042958674, [54] = FSCALE * 0.06720551273974976512, [55] = FSCALE * 0.06392786120670757270, [56] = FSCALE * 0.06081006262521796499, [57] = FSCALE * 0.05784432087483846296, [58] = FSCALE * 0.05502322005640722902, [59] = FSCALE * 0.05233970594843239308, [60] = FSCALE * 0.04978706836786394297, [61] = FSCALE * 0.04735892439114092119, [62] = FSCALE * 0.04504920239355780606, [63] = FSCALE * 0.04285212686704017991, [64] = FSCALE * 0.04076220397836621516, [65] = FSCALE * 0.03877420783172200988, [66] = FSCALE * 0.03688316740124000544, [67] = FSCALE * 0.03508435410084502588, [68] = FSCALE * 0.03337326996032607948, [69] = FSCALE * 0.03174563637806794323, [70] = FSCALE * 0.03019738342231850073, [71] = FSCALE * 0.02872463965423942912, [72] = FSCALE * 0.02732372244729256080, [73] = FSCALE * 0.02599112877875534358, [74] = FSCALE * 0.02472352647033939120, [75] = FSCALE * 0.02351774585600910823, [76] = FSCALE * 0.02237077185616559577, [77] = FSCALE * 0.02127973643837716938, [78] = FSCALE * 0.02024191144580438847, [79] = FSCALE * 0.01925470177538692429, [80] = FSCALE * 0.01831563888873418029, [81] = FSCALE * 0.01742237463949351138, [82] = FSCALE * 0.01657267540176124754, [83] = FSCALE * 0.01576441648485449082, [84] = FSCALE * 0.01499557682047770621, [85] = FSCALE * 0.01426423390899925527, [86] = FSCALE * 0.01356855901220093175, [87] = FSCALE * 0.01290681258047986886, [88] = FSCALE * 0.01227733990306844117, 
[89] = FSCALE * 0.01167856697039544521, [90] = FSCALE * 0.01110899653824230649, [91] = FSCALE * 0.01056720438385265337, [92] = FSCALE * 0.01005183574463358164, [93] = FSCALE * 0.00956160193054350793, [94] = FSCALE * 0.00909527710169581709, [95] = FSCALE * 0.00865169520312063417, [96] = FSCALE * 0.00822974704902002884, [97] = FSCALE * 0.00782837754922577143, [98] = FSCALE * 0.00744658307092434051, [99] = FSCALE * 0.00708340892905212004, [100] = FSCALE * 0.00673794699908546709, [101] = FSCALE * 0.00640933344625638184, [102] = FSCALE * 0.00609674656551563610, [103] = FSCALE * 0.00579940472684214321, [104] = FSCALE * 0.00551656442076077241, [105] = FSCALE * 0.00524751839918138427, [106] = FSCALE * 0.00499159390691021621, [107] = FSCALE * 0.00474815099941147558, [108] = FSCALE * 0.00451658094261266798, [109] = FSCALE * 0.00429630469075234057, [110] = FSCALE * 0.00408677143846406699, }; #endif #define CCPU_EXP_MAX 110 /* * This function is analogous to the getpcpu() function in the ps(1) command. * They should both calculate in the same way so that the racct %cpu * calculations are consistent with the values shown by the ps(1) tool. * The calculations are more complex in the 4BSD scheduler because of the value * of the ccpu variable. In ULE it is defined to be zero, which saves us some * work. */ static uint64_t racct_getpcpu(struct proc *p, u_int pcpu) { u_int swtime; #ifdef SCHED_4BSD fixpt_t pctcpu, pctcpu_next; #endif #ifdef SMP struct pcpu *pc; int found; #endif fixpt_t p_pctcpu; struct thread *td; ASSERT_RACCT_ENABLED(); /* * If the process is swapped out, we count its %cpu usage as zero. * This behaviour is consistent with the userland ps(1) tool. */ if ((p->p_flag & P_INMEM) == 0) return (0); swtime = (ticks - p->p_swtick) / hz; /* * For short-lived processes, sched_pctcpu() returns small * values even for cpu intensive processes. Therefore we use * our own estimate in this case. */ if (swtime < RACCT_PCPU_SECS) return (pcpu); p_pctcpu = 0; FOREACH_THREAD_IN_PROC(p, td) { if (td == PCPU_GET(idlethread)) continue; #ifdef SMP found = 0; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (td == pc->pc_idlethread) { found = 1; break; } } if (found) continue; #endif thread_lock(td); #ifdef SCHED_4BSD pctcpu = sched_pctcpu(td); /* Count also the yet unfinished second. */ pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT; pctcpu_next += sched_pctcpu_delta(td); p_pctcpu += max(pctcpu, pctcpu_next); #else /* * In ULE the %cpu statistics are updated on every * sched_pctcpu() call. So special calculations to * account for the latest (unfinished) second are * not needed. */ p_pctcpu += sched_pctcpu(td); #endif thread_unlock(td); } #ifdef SCHED_4BSD if (swtime <= CCPU_EXP_MAX) return ((100 * (uint64_t)p_pctcpu * 1000000) / (FSCALE - ccpu_exp[swtime])); #endif return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE); } static void racct_add_racct(struct racct *dest, const struct racct *src) { int i; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * Update resource usage in dest. */ for (i = 0; i <= RACCT_MAX; i++) { KASSERT(dest->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: dest < 0", __func__, i)); KASSERT(src->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: src < 0", __func__, i)); dest->r_resources[i] += src->r_resources[i]; } } static void racct_sub_racct(struct racct *dest, const struct racct *src) { int i; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * Update resource usage in dest.
*/ for (i = 0; i <= RACCT_MAX; i++) { if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) { KASSERT(dest->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: dest < 0", __func__, i)); KASSERT(src->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: src < 0", __func__, i)); KASSERT(src->r_resources[i] <= dest->r_resources[i], ("%s: resource %d propagation meltdown: src > dest", __func__, i)); } if (RACCT_CAN_DROP(i)) { dest->r_resources[i] -= src->r_resources[i]; if (dest->r_resources[i] < 0) dest->r_resources[i] = 0; } } } void racct_create(struct racct **racctp) { if (!racct_enable) return; SDT_PROBE1(racct, , racct, create, racctp); KASSERT(*racctp == NULL, ("racct already allocated")); *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO); } static void racct_destroy_locked(struct racct **racctp) { struct racct *racct; int i; ASSERT_RACCT_ENABLED(); SDT_PROBE1(racct, , racct, destroy, racctp); RACCT_LOCK_ASSERT(); KASSERT(racctp != NULL, ("NULL racctp")); KASSERT(*racctp != NULL, ("NULL racct")); racct = *racctp; for (i = 0; i <= RACCT_MAX; i++) { if (RACCT_IS_SLOPPY(i)) continue; if (!RACCT_IS_RECLAIMABLE(i)) continue; KASSERT(racct->r_resources[i] == 0, ("destroying non-empty racct: " "%ju allocated for resource %d\n", racct->r_resources[i], i)); } uma_zfree(racct_zone, racct); *racctp = NULL; } void racct_destroy(struct racct **racct) { if (!racct_enable) return; RACCT_LOCK(); racct_destroy_locked(racct); RACCT_UNLOCK(); } /* * Increase consumption of 'resource' by 'amount' for 'racct', * but not its parents. Unlike in other cases, 'amount' here * may be less than zero. */ static void racct_adjust_resource(struct racct *racct, int resource, int64_t amount) { ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); KASSERT(racct != NULL, ("NULL racct")); racct->r_resources[resource] += amount; if (racct->r_resources[resource] < 0) { KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource), ("%s: resource %d usage < 0", __func__, resource)); racct->r_resources[resource] = 0; } /* * There are some cases where the racct %cpu resource would grow * beyond 100% per core. For example in racct_proc_exit() we add * the process %cpu usage to the ucred racct containers. If too * many processes terminated in a short time span, the ucred %cpu * resource could grow too much. Also, the 4BSD scheduler sometimes * returns more than 100% cpu usage for a thread. So we set a sane * boundary here to 100% * the maximum number of CPUs. */ if ((resource == RACCT_PCTCPU) && (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU)) racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU; } static int racct_add_locked(struct proc *p, int resource, uint64_t amount, int force) { #ifdef RCTL int error; #endif ASSERT_RACCT_ENABLED(); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); #ifdef RCTL error = rctl_enforce(p, resource, amount); if (error && !force && RACCT_IS_DENIABLE(resource)) { SDT_PROBE3(racct, , rusage, add__failure, p, resource, amount); return (error); } #endif racct_adjust_resource(p->p_racct, resource, amount); racct_add_cred_locked(p->p_ucred, resource, amount); return (0); } /* * Increase allocation of 'resource' by 'amount' for process 'p'. * Return 0 if it's below limits, or errno, if it's not.
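 * A hypothetical caller sketch (the resource and errno are picked for * illustration only): * *	error = racct_add(p, RACCT_NPTS, 1); *	if (error != 0) *		return (EAGAIN); * * with the caller mapping the generic error to one appropriate for the * syscall at hand.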
*/ int racct_add(struct proc *p, int resource, uint64_t amount) { int error; if (!racct_enable) return (0); SDT_PROBE3(racct, , rusage, add, p, resource, amount); RACCT_LOCK(); error = racct_add_locked(p, resource, amount, 0); RACCT_UNLOCK(); return (error); } /* * Increase allocation of 'resource' by 'amount' for process 'p'. * Doesn't check for limits and never fails. */ void racct_add_force(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, add__force, p, resource, amount); RACCT_LOCK(); racct_add_locked(p, resource, amount, 1); RACCT_UNLOCK(); } static void racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) { struct prison *pr; ASSERT_RACCT_ENABLED(); racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, amount); for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, amount); racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, amount); } /* * Increase allocation of 'resource' by 'amount' for credential 'cred'. * Doesn't check for limits and never fails. */ void racct_add_cred(struct ucred *cred, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, add__cred, cred, resource, amount); RACCT_LOCK(); racct_add_cred_locked(cred, resource, amount); RACCT_UNLOCK(); } /* * Account for disk IO resource consumption. Checks for limits, * but never fails, due to disk limits being undeniable. */ void racct_add_buf(struct proc *p, const struct buf *bp, int is_write) { ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); SDT_PROBE3(racct, , rusage, add__buf, p, bp, is_write); RACCT_LOCK(); if (is_write) { racct_add_locked(curproc, RACCT_WRITEBPS, bp->b_bcount, 1); racct_add_locked(curproc, RACCT_WRITEIOPS, 1, 1); } else { racct_add_locked(curproc, RACCT_READBPS, bp->b_bcount, 1); racct_add_locked(curproc, RACCT_READIOPS, 1, 1); } RACCT_UNLOCK(); } static int racct_set_locked(struct proc *p, int resource, uint64_t amount, int force) { int64_t old_amount, decayed_amount, diff_proc, diff_cred; #ifdef RCTL int error; #endif ASSERT_RACCT_ENABLED(); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); old_amount = p->p_racct->r_resources[resource]; /* * The diffs may be negative. */ diff_proc = amount - old_amount; if (resource == RACCT_PCTCPU) { /* * Resources in per-credential racct containers may decay. * If this is the case, we need to calculate the difference * between the new amount and the proportional value of the * old amount that has decayed in the ucred racct containers. */ decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; diff_cred = amount - decayed_amount; } else diff_cred = diff_proc; #ifdef notyet KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource), ("%s: usage of non-droppable resource %d dropping", __func__, resource)); #endif #ifdef RCTL if (diff_proc > 0) { error = rctl_enforce(p, resource, diff_proc); if (error && !force && RACCT_IS_DENIABLE(resource)) { SDT_PROBE3(racct, , rusage, set__failure, p, resource, amount); return (error); } } #endif racct_adjust_resource(p->p_racct, resource, diff_proc); if (diff_cred > 0) racct_add_cred_locked(p->p_ucred, resource, diff_cred); else if (diff_cred < 0) racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); return (0); } /* * Set allocation of 'resource' to 'amount' for process 'p'. * Return 0 if it's below limits, or errno, if it's not. 
* * Note that decreasing the allocation always returns 0, * even if it's above the limit. */ int racct_set(struct proc *p, int resource, uint64_t amount) { int error; if (!racct_enable) return (0); SDT_PROBE3(racct, , rusage, set, p, resource, amount); RACCT_LOCK(); error = racct_set_locked(p, resource, amount, 0); RACCT_UNLOCK(); return (error); } void racct_set_force(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, set__force, p, resource, amount); RACCT_LOCK(); racct_set_locked(p, resource, amount, 1); RACCT_UNLOCK(); } /* * Returns amount of 'resource' the process 'p' can keep allocated. * Allocating more than that would be denied, unless the resource * is marked undeniable. Amount of already allocated resource does * not matter. */ uint64_t racct_get_limit(struct proc *p, int resource) { #ifdef RCTL uint64_t available; if (!racct_enable) return (UINT64_MAX); RACCT_LOCK(); available = rctl_get_limit(p, resource); RACCT_UNLOCK(); return (available); #else return (UINT64_MAX); #endif } /* * Returns amount of 'resource' the process 'p' can keep allocated. * Allocating more than that would be denied, unless the resource * is marked undeniable. Amount of already allocated resource does * matter. */ uint64_t racct_get_available(struct proc *p, int resource) { #ifdef RCTL uint64_t available; if (!racct_enable) return (UINT64_MAX); RACCT_LOCK(); available = rctl_get_available(p, resource); RACCT_UNLOCK(); return (available); #else return (UINT64_MAX); #endif } /* * Returns amount of the %cpu resource that process 'p' can add to its %cpu * utilization. Adding more than that would lead to the process being * throttled. */ static int64_t racct_pcpu_available(struct proc *p) { #ifdef RCTL uint64_t available; ASSERT_RACCT_ENABLED(); RACCT_LOCK(); available = rctl_pcpu_available(p); RACCT_UNLOCK(); return (available); #else return (INT64_MAX); #endif } /* * Decrease allocation of 'resource' by 'amount' for process 'p'. */ void racct_sub(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, sub, p, resource, amount); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(RACCT_CAN_DROP(resource), ("%s: called for non-droppable resource %d", __func__, resource)); RACCT_LOCK(); KASSERT(amount <= p->p_racct->r_resources[resource], ("%s: freeing %ju of resource %d, which is more " "than allocated %jd for %s (pid %d)", __func__, amount, resource, (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid)); racct_adjust_resource(p->p_racct, resource, -amount); racct_sub_cred_locked(p->p_ucred, resource, amount); RACCT_UNLOCK(); } static void racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) { struct prison *pr; ASSERT_RACCT_ENABLED(); racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, -amount); for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, -amount); racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, -amount); } /* * Decrease allocation of 'resource' by 'amount' for credential 'cred'.
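 * This is the inverse of racct_add_cred(); a caller is expected to subtract * no more than it previously added (a usage note; the KASSERT below that * would enforce this is still under #ifdef notyet).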
*/ void racct_sub_cred(struct ucred *cred, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, sub__cred, cred, resource, amount); #ifdef notyet KASSERT(RACCT_CAN_DROP(resource), ("%s: called for resource %d which can not drop", __func__, resource)); #endif RACCT_LOCK(); racct_sub_cred_locked(cred, resource, amount); RACCT_UNLOCK(); } /* * Inherit resource usage information from the parent process. */ int racct_proc_fork(struct proc *parent, struct proc *child) { int i, error = 0; if (!racct_enable) return (0); /* * Create racct for the child process. */ racct_create(&child->p_racct); PROC_LOCK(parent); PROC_LOCK(child); RACCT_LOCK(); #ifdef RCTL error = rctl_proc_fork(parent, child); if (error != 0) goto out; #endif /* Init process cpu time. */ child->p_prev_runtime = 0; child->p_throttled = 0; /* * Inherit resource usage. */ for (i = 0; i <= RACCT_MAX; i++) { if (parent->p_racct->r_resources[i] == 0 || !RACCT_IS_INHERITABLE(i)) continue; error = racct_set_locked(child, i, parent->p_racct->r_resources[i], 0); if (error != 0) goto out; } error = racct_add_locked(child, RACCT_NPROC, 1, 0); error += racct_add_locked(child, RACCT_NTHR, 1, 0); out: RACCT_UNLOCK(); PROC_UNLOCK(child); PROC_UNLOCK(parent); if (error != 0) racct_proc_exit(child); return (error); } /* * Called at the end of fork1(), to handle rules that require the process * to be fully initialized. */ void racct_proc_fork_done(struct proc *child) { if (!racct_enable) return; PROC_LOCK_ASSERT(child, MA_OWNED); #ifdef RCTL RACCT_LOCK(); rctl_enforce(child, RACCT_NPROC, 0); rctl_enforce(child, RACCT_NTHR, 0); RACCT_UNLOCK(); #endif } void racct_proc_exit(struct proc *p) { struct timeval wallclock; uint64_t pct_estimate, pct, runtime; int i; if (!racct_enable) return; PROC_LOCK(p); /* * We don't need to calculate rux, proc_reap() has already done this. */ runtime = cputick2usec(p->p_rux.rux_runtime); #ifdef notyet KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); #else if (runtime < p->p_prev_runtime) runtime = p->p_prev_runtime; #endif microuptime(&wallclock); timevalsub(&wallclock, &p->p_stats->p_start); if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { pct_estimate = (1000000 * runtime * 100) / ((uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec); } else pct_estimate = 0; pct = racct_getpcpu(p, pct_estimate); RACCT_LOCK(); racct_set_locked(p, RACCT_CPU, runtime, 0); racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct); KASSERT(p->p_racct->r_resources[RACCT_RSS] == 0, ("process reaped with %ju allocated for RSS\n", p->p_racct->r_resources[RACCT_RSS])); for (i = 0; i <= RACCT_MAX; i++) { if (p->p_racct->r_resources[i] == 0) continue; if (!RACCT_IS_RECLAIMABLE(i)) continue; racct_set_locked(p, i, 0, 0); } #ifdef RCTL rctl_racct_release(p->p_racct); #endif racct_destroy_locked(&p->p_racct); RACCT_UNLOCK(); PROC_UNLOCK(p); } /* * Called after credentials change, to move resource utilisation * between raccts. 
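 * For example (an illustrative scenario), when a process setuid()s from * uid A to uid B, its usage is subtracted from A's uidinfo racct and added * to B's; the same applies to the login class and the prison chain when * those change.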
*/ void racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred, struct ucred *newcred) { struct uidinfo *olduip, *newuip; struct loginclass *oldlc, *newlc; struct prison *oldpr, *newpr, *pr; if (!racct_enable) return; PROC_LOCK_ASSERT(p, MA_NOTOWNED); newuip = newcred->cr_ruidinfo; olduip = oldcred->cr_ruidinfo; newlc = newcred->cr_loginclass; oldlc = oldcred->cr_loginclass; newpr = newcred->cr_prison; oldpr = oldcred->cr_prison; RACCT_LOCK(); if (newuip != olduip) { racct_sub_racct(olduip->ui_racct, p->p_racct); racct_add_racct(newuip->ui_racct, p->p_racct); } if (newlc != oldlc) { racct_sub_racct(oldlc->lc_racct, p->p_racct); racct_add_racct(newlc->lc_racct, p->p_racct); } if (newpr != oldpr) { for (pr = oldpr; pr != NULL; pr = pr->pr_parent) racct_sub_racct(pr->pr_prison_racct->prr_racct, p->p_racct); for (pr = newpr; pr != NULL; pr = pr->pr_parent) racct_add_racct(pr->pr_prison_racct->prr_racct, p->p_racct); } RACCT_UNLOCK(); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); #endif } void racct_move(struct racct *dest, struct racct *src) { ASSERT_RACCT_ENABLED(); RACCT_LOCK(); racct_add_racct(dest, src); racct_sub_racct(src, src); RACCT_UNLOCK(); } /* * Make the process sleep in userret() for 'timeout' ticks. Setting * timeout to -1 makes it sleep until woken up by racct_proc_wakeup(). */ void racct_proc_throttle(struct proc *p, int timeout) { struct thread *td; #ifdef SMP int cpuid; #endif KASSERT(timeout != 0, ("timeout %d", timeout)); ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); /* * Do not block kernel processes. Also do not block processes with * low %cpu utilization to improve interactivity. */ if ((p->p_flag & (P_SYSTEM | P_KPROC)) != 0) return; if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout)) return; p->p_throttled = timeout; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); switch (td->td_state) { case TDS_RUNQ: /* * If the thread is on the scheduler run-queue, we cannot * just remove it from there. So we set the * TDF_NEEDRESCHED flag for the thread, so that once it is * running, it is taken off the cpu as soon as possible. */ td->td_flags |= TDF_NEEDRESCHED; break; case TDS_RUNNING: /* * If the thread is running, we request a context * switch for it by setting the TDF_NEEDRESCHED flag. */ td->td_flags |= TDF_NEEDRESCHED; #ifdef SMP cpuid = td->td_oncpu; if ((cpuid != NOCPU) && (td != curthread)) ipi_cpu(cpuid, IPI_AST); #endif break; default: break; } thread_unlock(td); } } static void racct_proc_wakeup(struct proc *p) { ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_throttled != 0) { p->p_throttled = 0; wakeup(p->p_racct); } } static void racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2) { int64_t r_old, r_new; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); #ifdef RCTL rctl_throttle_decay(racct, RACCT_READBPS); rctl_throttle_decay(racct, RACCT_WRITEBPS); rctl_throttle_decay(racct, RACCT_READIOPS); rctl_throttle_decay(racct, RACCT_WRITEIOPS); #endif r_old = racct->r_resources[RACCT_PCTCPU]; /* If there is nothing to decay, just exit.
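 * Otherwise the value is multiplied by RACCT_DECAY_FACTOR (0.3 of FSCALE), * so each pass keeps 30% of the previous %cpu reading; e.g. a hypothetical * reading of 100% decays to 30%, then 9%, and so on.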
*/ if (r_old <= 0) return; r_new = r_old * RACCT_DECAY_FACTOR / FSCALE; racct->r_resources[RACCT_PCTCPU] = r_new; } static void racct_decay_pre(void) { RACCT_LOCK(); } static void racct_decay_post(void) { RACCT_UNLOCK(); } static void racct_decay(void) { ASSERT_RACCT_ENABLED(); ui_racct_foreach(racct_decay_callback, racct_decay_pre, racct_decay_post, NULL, NULL); loginclass_racct_foreach(racct_decay_callback, racct_decay_pre, racct_decay_post, NULL, NULL); prison_racct_foreach(racct_decay_callback, racct_decay_pre, racct_decay_post, NULL, NULL); } static void racctd(void) { struct thread *td; struct proc *p; struct timeval wallclock; uint64_t pct, pct_estimate, runtime; ASSERT_RACCT_ENABLED(); for (;;) { racct_decay(); sx_slock(&allproc_lock); LIST_FOREACH(p, &zombproc, p_list) { PROC_LOCK(p); racct_set(p, RACCT_PCTCPU, 0); PROC_UNLOCK(p); } FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NORMAL) { PROC_UNLOCK(p); continue; } microuptime(&wallclock); timevalsub(&wallclock, &p->p_stats->p_start); PROC_STATLOCK(p); FOREACH_THREAD_IN_PROC(p, td) ruxagg(p, td); runtime = cputick2usec(p->p_rux.rux_runtime); PROC_STATUNLOCK(p); #ifdef notyet KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); #else if (runtime < p->p_prev_runtime) runtime = p->p_prev_runtime; #endif p->p_prev_runtime = runtime; if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { pct_estimate = (1000000 * runtime * 100) / ((uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec); } else pct_estimate = 0; pct = racct_getpcpu(p, pct_estimate); RACCT_LOCK(); #ifdef RCTL rctl_throttle_decay(p->p_racct, RACCT_READBPS); rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS); rctl_throttle_decay(p->p_racct, RACCT_READIOPS); rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS); #endif racct_set_locked(p, RACCT_PCTCPU, pct, 1); racct_set_locked(p, RACCT_CPU, runtime, 0); racct_set_locked(p, RACCT_WALLCLOCK, (uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec, 0); RACCT_UNLOCK(); PROC_UNLOCK(p); } /* * To ensure that processes are throttled in a fair way, we need * to iterate over all processes again and check the limits * for %cpu resource only after ucred racct containers have been * properly filled. */ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NORMAL) { PROC_UNLOCK(p); continue; } if (racct_pcpu_available(p) <= 0) { if (p->p_racct->r_resources[RACCT_PCTCPU] > pcpu_threshold) racct_proc_throttle(p, -1); } else if (p->p_throttled == -1) { racct_proc_wakeup(p); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); pause("-", hz); } } static struct kproc_desc racctd_kp = { "racctd", racctd, NULL }; static void racctd_init(void) { if (!racct_enable) return; kproc_start(&racctd_kp); } SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL); static void racct_init(void) { if (!racct_enable) return; racct_zone = uma_zcreate("racct", sizeof(struct racct), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * XXX: Move this somewhere. */ prison0.pr_prison_racct = prison_racct_find("0"); } SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL); #endif /* !RACCT */ Index: head/sys/kern/kern_rangelock.c =================================================================== --- head/sys/kern/kern_rangelock.c (revision 326270) +++ head/sys/kern/kern_rangelock.c (revision 326271) @@ -1,248 +1,250 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2009 Konstantin Belousov * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include struct rl_q_entry { TAILQ_ENTRY(rl_q_entry) rl_q_link; off_t rl_q_start, rl_q_end; int rl_q_flags; }; static uma_zone_t rl_entry_zone; static void rangelock_sys_init(void) { rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(vfs, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL); static struct rl_q_entry * rlqentry_alloc(void) { return (uma_zalloc(rl_entry_zone, M_WAITOK)); } void rlqentry_free(struct rl_q_entry *rleq) { uma_zfree(rl_entry_zone, rleq); } void rangelock_init(struct rangelock *lock) { TAILQ_INIT(&lock->rl_waiters); lock->rl_currdep = NULL; } void rangelock_destroy(struct rangelock *lock) { KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters")); } /* * Two entries are compatible if their ranges do not overlap, or both * entries are for read. */ static int ranges_overlap(const struct rl_q_entry *e1, const struct rl_q_entry *e2) { if (e1->rl_q_start < e2->rl_q_end && e1->rl_q_end > e2->rl_q_start) return (1); return (0); } /* * Recalculate the lock->rl_currdep after an unlock. */ static void rangelock_calc_block(struct rangelock *lock) { struct rl_q_entry *entry, *nextentry, *entry1; for (entry = lock->rl_currdep; entry != NULL; entry = nextentry) { nextentry = TAILQ_NEXT(entry, rl_q_link); if (entry->rl_q_flags & RL_LOCK_READ) { /* Reads must not overlap with granted writes. */ for (entry1 = TAILQ_FIRST(&lock->rl_waiters); !(entry1->rl_q_flags & RL_LOCK_READ); entry1 = TAILQ_NEXT(entry1, rl_q_link)) { if (ranges_overlap(entry, entry1)) goto out; } } else { /* Write must not overlap with any granted locks. */ for (entry1 = TAILQ_FIRST(&lock->rl_waiters); entry1 != entry; entry1 = TAILQ_NEXT(entry1, rl_q_link)) { if (ranges_overlap(entry, entry1)) goto out; } /* Move grantable write locks to the front. */ TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link); TAILQ_INSERT_HEAD(&lock->rl_waiters, entry, rl_q_link); } /* Grant this lock. 
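 * Granting sets RL_LOCK_GRANTED on the entry and wakes up the thread * sleeping on it in rangelock_enqueue().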
*/ entry->rl_q_flags |= RL_LOCK_GRANTED; wakeup(entry); } out: lock->rl_currdep = entry; } static void rangelock_unlock_locked(struct rangelock *lock, struct rl_q_entry *entry, struct mtx *ilk) { MPASS(lock != NULL && entry != NULL && ilk != NULL); mtx_assert(ilk, MA_OWNED); KASSERT(entry != lock->rl_currdep, ("stuck currdep")); TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link); rangelock_calc_block(lock); mtx_unlock(ilk); if (curthread->td_rlqe == NULL) curthread->td_rlqe = entry; else rlqentry_free(entry); } void rangelock_unlock(struct rangelock *lock, void *cookie, struct mtx *ilk) { MPASS(lock != NULL && cookie != NULL && ilk != NULL); mtx_lock(ilk); rangelock_unlock_locked(lock, cookie, ilk); } /* * Unlock a sub-range of the granted lock. */ void * rangelock_unlock_range(struct rangelock *lock, void *cookie, off_t start, off_t end, struct mtx *ilk) { struct rl_q_entry *entry; MPASS(lock != NULL && cookie != NULL && ilk != NULL); entry = cookie; KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED, ("Unlocking non-granted lock")); KASSERT(entry->rl_q_start == start, ("wrong start")); KASSERT(entry->rl_q_end >= end, ("wrong end")); mtx_lock(ilk); if (entry->rl_q_end == end) { rangelock_unlock_locked(lock, cookie, ilk); return (NULL); } entry->rl_q_end = end; rangelock_calc_block(lock); mtx_unlock(ilk); return (cookie); } /* * Add the lock request to the queue of pending requests for the * rangelock. Sleep until the request can be granted. */ static void * rangelock_enqueue(struct rangelock *lock, off_t start, off_t end, int mode, struct mtx *ilk) { struct rl_q_entry *entry; struct thread *td; MPASS(lock != NULL && ilk != NULL); td = curthread; if (td->td_rlqe != NULL) { entry = td->td_rlqe; td->td_rlqe = NULL; } else entry = rlqentry_alloc(); MPASS(entry != NULL); entry->rl_q_flags = mode; entry->rl_q_start = start; entry->rl_q_end = end; mtx_lock(ilk); /* * XXXKIB TODO. Check that a thread does not try to enqueue a * lock that is incompatible with another request from the same * thread. */ TAILQ_INSERT_TAIL(&lock->rl_waiters, entry, rl_q_link); if (lock->rl_currdep == NULL) lock->rl_currdep = entry; rangelock_calc_block(lock); while (!(entry->rl_q_flags & RL_LOCK_GRANTED)) msleep(entry, ilk, 0, "range", 0); mtx_unlock(ilk); return (entry); } void * rangelock_rlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk) { return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk)); } void * rangelock_wlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk) { return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk)); } Index: head/sys/kern/kern_rctl.c =================================================================== --- head/sys/kern/kern_rctl.c (revision 326270) +++ head/sys/kern/kern_rctl.c (revision 326271) @@ -1,2233 +1,2235 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RCTL #ifndef RACCT #error "The RCTL option requires the RACCT option" #endif FEATURE(rctl, "Resource Limits"); #define HRF_DEFAULT 0 #define HRF_DONT_INHERIT 1 #define HRF_DONT_ACCUMULATE 2 #define RCTL_MAX_INBUFSIZE 4 * 1024 #define RCTL_MAX_OUTBUFSIZE 16 * 1024 * 1024 #define RCTL_LOG_BUFSIZE 128 #define RCTL_PCPU_SHIFT (10 * 1000000) static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE; static int rctl_log_rate_limit = 10; static int rctl_devctl_rate_limit = 10; /* * Values below are initialized in rctl_init(). */ static int rctl_throttle_min = -1; static int rctl_throttle_max = -1; static int rctl_throttle_pct = -1; static int rctl_throttle_pct2 = -1; static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN, &rctl_maxbufsize, 0, "Maximum output buffer size"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW, &rctl_log_rate_limit, 0, "Maximum number of log messages per second"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN, &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second"); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min, CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_min_sysctl, "IU", "Shortest throttling duration, in hz"); TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max, CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_max_sysctl, "IU", "Longest throttling duration, in hz"); TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct, CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct_sysctl, "IU", "Throttling penalty for process consumption, in percent"); TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2, CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct2_sysctl, "IU", "Throttling penalty for container consumption, in percent"); TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2); /* * 'rctl_rule_link' connects a rule with every racct it's related to. 
* For example, rule 'user:X:openfiles:deny=N/process' is linked * with uidinfo for user X, and to each process of that user. */ struct rctl_rule_link { LIST_ENTRY(rctl_rule_link) rrl_next; struct rctl_rule *rrl_rule; int rrl_exceeded; }; struct dict { const char *d_name; int d_value; }; static struct dict subjectnames[] = { { "process", RCTL_SUBJECT_TYPE_PROCESS }, { "user", RCTL_SUBJECT_TYPE_USER }, { "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS }, { "jail", RCTL_SUBJECT_TYPE_JAIL }, { NULL, -1 }}; static struct dict resourcenames[] = { { "cputime", RACCT_CPU }, { "datasize", RACCT_DATA }, { "stacksize", RACCT_STACK }, { "coredumpsize", RACCT_CORE }, { "memoryuse", RACCT_RSS }, { "memorylocked", RACCT_MEMLOCK }, { "maxproc", RACCT_NPROC }, { "openfiles", RACCT_NOFILE }, { "vmemoryuse", RACCT_VMEM }, { "pseudoterminals", RACCT_NPTS }, { "swapuse", RACCT_SWAP }, { "nthr", RACCT_NTHR }, { "msgqqueued", RACCT_MSGQQUEUED }, { "msgqsize", RACCT_MSGQSIZE }, { "nmsgq", RACCT_NMSGQ }, { "nsem", RACCT_NSEM }, { "nsemop", RACCT_NSEMOP }, { "nshm", RACCT_NSHM }, { "shmsize", RACCT_SHMSIZE }, { "wallclock", RACCT_WALLCLOCK }, { "pcpu", RACCT_PCTCPU }, { "readbps", RACCT_READBPS }, { "writebps", RACCT_WRITEBPS }, { "readiops", RACCT_READIOPS }, { "writeiops", RACCT_WRITEIOPS }, { NULL, -1 }}; static struct dict actionnames[] = { { "sighup", RCTL_ACTION_SIGHUP }, { "sigint", RCTL_ACTION_SIGINT }, { "sigquit", RCTL_ACTION_SIGQUIT }, { "sigill", RCTL_ACTION_SIGILL }, { "sigtrap", RCTL_ACTION_SIGTRAP }, { "sigabrt", RCTL_ACTION_SIGABRT }, { "sigemt", RCTL_ACTION_SIGEMT }, { "sigfpe", RCTL_ACTION_SIGFPE }, { "sigkill", RCTL_ACTION_SIGKILL }, { "sigbus", RCTL_ACTION_SIGBUS }, { "sigsegv", RCTL_ACTION_SIGSEGV }, { "sigsys", RCTL_ACTION_SIGSYS }, { "sigpipe", RCTL_ACTION_SIGPIPE }, { "sigalrm", RCTL_ACTION_SIGALRM }, { "sigterm", RCTL_ACTION_SIGTERM }, { "sigurg", RCTL_ACTION_SIGURG }, { "sigstop", RCTL_ACTION_SIGSTOP }, { "sigtstp", RCTL_ACTION_SIGTSTP }, { "sigchld", RCTL_ACTION_SIGCHLD }, { "sigttin", RCTL_ACTION_SIGTTIN }, { "sigttou", RCTL_ACTION_SIGTTOU }, { "sigio", RCTL_ACTION_SIGIO }, { "sigxcpu", RCTL_ACTION_SIGXCPU }, { "sigxfsz", RCTL_ACTION_SIGXFSZ }, { "sigvtalrm", RCTL_ACTION_SIGVTALRM }, { "sigprof", RCTL_ACTION_SIGPROF }, { "sigwinch", RCTL_ACTION_SIGWINCH }, { "siginfo", RCTL_ACTION_SIGINFO }, { "sigusr1", RCTL_ACTION_SIGUSR1 }, { "sigusr2", RCTL_ACTION_SIGUSR2 }, { "sigthr", RCTL_ACTION_SIGTHR }, { "deny", RCTL_ACTION_DENY }, { "log", RCTL_ACTION_LOG }, { "devctl", RCTL_ACTION_DEVCTL }, { "throttle", RCTL_ACTION_THROTTLE }, { NULL, -1 }}; static void rctl_init(void); SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL); static uma_zone_t rctl_rule_zone; static uma_zone_t rctl_rule_link_zone; static int rctl_rule_fully_specified(const struct rctl_rule *rule); static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule); static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits"); static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_min; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 1 || val > rctl_throttle_max) return (EINVAL); RACCT_LOCK(); rctl_throttle_min = val; RACCT_UNLOCK(); return (0); } static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_max; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < rctl_throttle_min) return (EINVAL); RACCT_LOCK(); rctl_throttle_max = val; RACCT_UNLOCK(); return 
(0); } static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_pct; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 0) return (EINVAL); RACCT_LOCK(); rctl_throttle_pct = val; RACCT_UNLOCK(); return (0); } static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_pct2; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 0) return (EINVAL); RACCT_LOCK(); rctl_throttle_pct2 = val; RACCT_UNLOCK(); return (0); } static const char * rctl_subject_type_name(int subject) { int i; for (i = 0; subjectnames[i].d_name != NULL; i++) { if (subjectnames[i].d_value == subject) return (subjectnames[i].d_name); } panic("rctl_subject_type_name: unknown subject type %d", subject); } static const char * rctl_action_name(int action) { int i; for (i = 0; actionnames[i].d_name != NULL; i++) { if (actionnames[i].d_value == action) return (actionnames[i].d_name); } panic("rctl_action_name: unknown action %d", action); } const char * rctl_resource_name(int resource) { int i; for (i = 0; resourcenames[i].d_name != NULL; i++) { if (resourcenames[i].d_value == resource) return (resourcenames[i].d_name); } panic("rctl_resource_name: unknown resource %d", resource); } static struct racct * rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule) { struct ucred *cred = p->p_ucred; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); switch (rule->rr_per) { case RCTL_SUBJECT_TYPE_PROCESS: return (p->p_racct); case RCTL_SUBJECT_TYPE_USER: return (cred->cr_ruidinfo->ui_racct); case RCTL_SUBJECT_TYPE_LOGINCLASS: return (cred->cr_loginclass->lc_racct); case RCTL_SUBJECT_TYPE_JAIL: return (cred->cr_prison->pr_prison_racct->prr_racct); default: panic("%s: unknown per %d", __func__, rule->rr_per); } } /* * Return the amount of resource that can be allocated by 'p' before * hitting 'rule'. */ static int64_t rctl_available_resource(const struct proc *p, const struct rctl_rule *rule) { const struct racct *racct; int64_t available; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); racct = rctl_proc_rule_to_racct(p, rule); available = rule->rr_amount - racct->r_resources[rule->rr_resource]; return (available); } /* * Called every second for proc, uidinfo, loginclass, and jail containers. * If the limit isn't exceeded, it decreases the usage amount to zero. * Otherwise, it decreases it by the value of the limit. This way * resource consumption exceeding the limit "carries over" to the next * period. */ void rctl_throttle_decay(struct racct *racct, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t minavailable; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); minavailable = INT64_MAX; LIST_FOREACH(link, &racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_THROTTLE) continue; if (rule->rr_amount < minavailable) minavailable = rule->rr_amount; } if (racct->r_resources[resource] < minavailable) { racct->r_resources[resource] = 0; } else { /* * Cap utilization counter at ten times the limit. Otherwise, * if we changed the rule lowering the allowed amount, it could * take an unreasonably long time for the accumulated resource * usage to drop. */ if (racct->r_resources[resource] > minavailable * 10) racct->r_resources[resource] = minavailable * 10; racct->r_resources[resource] -= minavailable; } } /* * Special version of rctl_get_available() for the %CPU resource.
* We slightly cheat here and return less than we normally would. */ int64_t rctl_pcpu_available(const struct proc *p) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t available, minavailable, limit; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); minavailable = INT64_MAX; limit = 0; LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != RACCT_PCTCPU) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; available = rctl_available_resource(p, rule); if (available < minavailable) { minavailable = available; limit = rule->rr_amount; } } /* * Return slightly less than actual value of the available * %cpu resource. This makes %cpu throttling more aggressive * and lets us act before the limits are actually exceeded. */ if (limit != 0) { if (limit > 2 * RCTL_PCPU_SHIFT) minavailable -= RCTL_PCPU_SHIFT; else minavailable -= (limit / 2); } return (minavailable); } static uint64_t xadd(uint64_t a, uint64_t b) { uint64_t c; c = a + b; /* * Detect overflow. */ if (c < a || c < b) return (UINT64_MAX); return (c); } static uint64_t xmul(uint64_t a, uint64_t b) { if (b != 0 && a > UINT64_MAX / b) return (UINT64_MAX); return (a * b); } /* * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition * to what it keeps allocated now. Returns non-zero if the allocation should * be denied, 0 otherwise. */ int rctl_enforce(struct proc *p, int resource, uint64_t amount) { static struct timeval log_lasttime, devctl_lasttime; static int log_curtime = 0, devctl_curtime = 0; struct rctl_rule *rule; struct rctl_rule_link *link; struct sbuf sb; char *buf; int64_t available; uint64_t sleep_ms, sleep_ratio; int should_deny = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; available = rctl_available_resource(p, rule); if (available >= (int64_t)amount) { link->rrl_exceeded = 0; continue; } switch (rule->rr_action) { case RCTL_ACTION_DENY: should_deny = 1; continue; case RCTL_ACTION_LOG: /* * If rrl_exceeded != 0, it means we've already * logged a warning for this process. */ if (link->rrl_exceeded != 0) continue; /* * If the process state is not fully initialized yet, * we can't access most of the required fields, e.g. * p->p_comm. This happens when called from fork1(). * Ignore this rule for now; it will be processed just * after fork, when called from racct_proc_fork_done().
*/ if (p->p_state != PRS_NORMAL) continue; if (!ppsratecheck(&log_lasttime, &log_curtime, rctl_log_rate_limit)) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); continue; } sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN); rctl_rule_to_sbuf(&sb, rule); sbuf_finish(&sb); printf("rctl: rule \"%s\" matched by pid %d " "(%s), uid %d, jail %s\n", sbuf_data(&sb), p->p_pid, p->p_comm, p->p_ucred->cr_uid, p->p_ucred->cr_prison->pr_prison_racct->prr_name); sbuf_delete(&sb); free(buf, M_RCTL); link->rrl_exceeded = 1; continue; case RCTL_ACTION_DEVCTL: if (link->rrl_exceeded != 0) continue; if (p->p_state != PRS_NORMAL) continue; if (!ppsratecheck(&devctl_lasttime, &devctl_curtime, rctl_devctl_rate_limit)) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); continue; } sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN); sbuf_printf(&sb, "rule="); rctl_rule_to_sbuf(&sb, rule); sbuf_printf(&sb, " pid=%d ruid=%d jail=%s", p->p_pid, p->p_ucred->cr_ruid, p->p_ucred->cr_prison->pr_prison_racct->prr_name); sbuf_finish(&sb); devctl_notify_f("RCTL", "rule", "matched", sbuf_data(&sb), M_NOWAIT); sbuf_delete(&sb); free(buf, M_RCTL); link->rrl_exceeded = 1; continue; case RCTL_ACTION_THROTTLE: if (p->p_state != PRS_NORMAL) continue; /* * Make the process sleep for a fraction of a second, * proportional to the ratio of the process' resource * utilization compared to the limit. The point is * to penalize resource hogs: processes that consume * more of the available resources sleep for longer. * * We're trying to defer division until the very end, * to minimize the rounding effects. The following * calculation could have been written in a clearer * way like this: * * sleep_ms = hz * p->p_racct->r_resources[resource] / * rule->rr_amount; * sleep_ms *= rctl_throttle_pct / 100; * if (sleep_ms < rctl_throttle_min) * sleep_ms = rctl_throttle_min; * */ sleep_ms = xmul(hz, p->p_racct->r_resources[resource]); sleep_ms = xmul(sleep_ms, rctl_throttle_pct) / 100; if (sleep_ms < rctl_throttle_min * rule->rr_amount) sleep_ms = rctl_throttle_min * rule->rr_amount; /* * Multiply that by the ratio of the resource * consumption for the container compared to the limit, * squared. In other words, a process in a container * that is two times over the limit will be throttled * four times as much for hitting the same rule. The * point is to penalize processes more if the container * itself (e.g. a certain UID or jail) is above the limit. */ if (available < 0) sleep_ratio = -available / rule->rr_amount; else sleep_ratio = 0; sleep_ratio = xmul(sleep_ratio, sleep_ratio); sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100; sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio)); /* * Finally the division.
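 * A worked example with hypothetical numbers: hz = 1000, * rctl_throttle_pct = 100, process usage at twice rr_amount and no * container penalty (sleep_ratio = 0) yield sleep_ms = 2 * hz ticks after * the division, subject to the rctl_throttle_max clamp below.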
*/ sleep_ms /= rule->rr_amount; if (sleep_ms > rctl_throttle_max) sleep_ms = rctl_throttle_max; #if 0 printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n", __func__, p->p_pid, p->p_comm, p->p_racct->r_resources[resource], rule->rr_amount, (uintmax_t)sleep_ms, (uintmax_t)sleep_ratio, (intmax_t)available); #endif KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n", __func__, (uintmax_t)sleep_ms, rctl_throttle_min)); racct_proc_throttle(p, sleep_ms); continue; default: if (link->rrl_exceeded != 0) continue; if (p->p_state != PRS_NORMAL) continue; KASSERT(rule->rr_action > 0 && rule->rr_action <= RCTL_ACTION_SIGNAL_MAX, ("rctl_enforce: unknown action %d", rule->rr_action)); /* * We're using the fact that RCTL_ACTION_SIG* values * are equal to their counterparts from sys/signal.h. */ kern_psignal(p, rule->rr_action); link->rrl_exceeded = 1; continue; } } if (should_deny) { /* * Return fake error code; the caller should change it * into one proper for the situation - EFBIG, ENOMEM etc. */ return (EDOOFUS); } return (0); } uint64_t rctl_get_limit(struct proc *p, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; uint64_t amount = UINT64_MAX; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; if (rule->rr_amount < amount) amount = rule->rr_amount; } return (amount); } uint64_t rctl_get_available(struct proc *p, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t available, minavailable, allocated; minavailable = INT64_MAX; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; available = rctl_available_resource(p, rule); if (available < minavailable) minavailable = available; } /* * XXX: Think about this _hard_.
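 * What the code below computes: the headroom remaining under the strictest * deny rule plus the process' current allocation, guarded against int64 * overflow and clamped to be non-negative.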
*/ allocated = p->p_racct->r_resources[resource]; if (minavailable < INT64_MAX - allocated) minavailable += allocated; if (minavailable < 0) minavailable = 0; return (minavailable); } static int rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter) { ASSERT_RACCT_ENABLED(); if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) { if (rule->rr_subject_type != filter->rr_subject_type) return (0); switch (filter->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: if (filter->rr_subject.rs_proc != NULL && rule->rr_subject.rs_proc != filter->rr_subject.rs_proc) return (0); break; case RCTL_SUBJECT_TYPE_USER: if (filter->rr_subject.rs_uip != NULL && rule->rr_subject.rs_uip != filter->rr_subject.rs_uip) return (0); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (filter->rr_subject.rs_loginclass != NULL && rule->rr_subject.rs_loginclass != filter->rr_subject.rs_loginclass) return (0); break; case RCTL_SUBJECT_TYPE_JAIL: if (filter->rr_subject.rs_prison_racct != NULL && rule->rr_subject.rs_prison_racct != filter->rr_subject.rs_prison_racct) return (0); break; default: panic("rctl_rule_matches: unknown subject type %d", filter->rr_subject_type); } } if (filter->rr_resource != RACCT_UNDEFINED) { if (rule->rr_resource != filter->rr_resource) return (0); } if (filter->rr_action != RCTL_ACTION_UNDEFINED) { if (rule->rr_action != filter->rr_action) return (0); } if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) { if (rule->rr_amount != filter->rr_amount) return (0); } if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) { if (rule->rr_per != filter->rr_per) return (0); } return (1); } static int str2value(const char *str, int *value, struct dict *table) { int i; if (value == NULL) return (EINVAL); for (i = 0; table[i].d_name != NULL; i++) { if (strcasecmp(table[i].d_name, str) == 0) { *value = table[i].d_value; return (0); } } return (EINVAL); } static int str2id(const char *str, id_t *value) { char *end; if (str == NULL) return (EINVAL); *value = strtoul(str, &end, 10); if ((size_t)(end - str) != strlen(str)) return (EINVAL); return (0); } static int str2int64(const char *str, int64_t *value) { char *end; if (str == NULL) return (EINVAL); *value = strtoul(str, &end, 10); if ((size_t)(end - str) != strlen(str)) return (EINVAL); if (*value < 0) return (ERANGE); return (0); } /* * Connect the rule to the racct, increasing refcount for the rule. */ static void rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); rctl_rule_acquire(rule); link = uma_zalloc(rctl_rule_link_zone, M_WAITOK); link->rrl_rule = rule; link->rrl_exceeded = 0; RACCT_LOCK(); LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next); RACCT_UNLOCK(); } static int rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); RACCT_LOCK_ASSERT(); link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT); if (link == NULL) return (ENOMEM); rctl_rule_acquire(rule); link->rrl_rule = rule; link->rrl_exceeded = 0; LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next); return (0); } /* * Remove limits for all rules matching the filter and release * the refcounts for the rules, possibly freeing them. Returns * the number of limit structures removed.
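 * For instance, a filter parsed from the rctl(8)-style string "user:1001:" * (an illustrative value) matches every rule this racct carries on behalf * of uid 1001, so all of them would be removed.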
*/ static int rctl_racct_remove_rules(struct racct *racct, const struct rctl_rule *filter) { struct rctl_rule_link *link, *linktmp; int removed = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) { if (!rctl_rule_matches(link->rrl_rule, filter)) continue; LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); removed++; } return (removed); } static void rctl_rule_acquire_subject(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct != NULL) prison_racct_hold(rule->rr_subject.rs_prison_racct); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip != NULL) uihold(rule->rr_subject.rs_uip); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass != NULL) loginclass_hold(rule->rr_subject.rs_loginclass); break; default: panic("rctl_rule_acquire_subject: unknown subject type %d", rule->rr_subject_type); } } static void rctl_rule_release_subject(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct != NULL) prison_racct_free(rule->rr_subject.rs_prison_racct); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip != NULL) uifree(rule->rr_subject.rs_uip); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass != NULL) loginclass_free(rule->rr_subject.rs_loginclass); break; default: panic("rctl_rule_release_subject: unknown subject type %d", rule->rr_subject_type); } } struct rctl_rule * rctl_rule_alloc(int flags) { struct rctl_rule *rule; ASSERT_RACCT_ENABLED(); rule = uma_zalloc(rctl_rule_zone, flags); if (rule == NULL) return (NULL); rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; rule->rr_subject.rs_proc = NULL; rule->rr_subject.rs_uip = NULL; rule->rr_subject.rs_loginclass = NULL; rule->rr_subject.rs_prison_racct = NULL; rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; rule->rr_resource = RACCT_UNDEFINED; rule->rr_action = RCTL_ACTION_UNDEFINED; rule->rr_amount = RCTL_AMOUNT_UNDEFINED; refcount_init(&rule->rr_refcount, 1); return (rule); } struct rctl_rule * rctl_rule_duplicate(const struct rctl_rule *rule, int flags) { struct rctl_rule *copy; ASSERT_RACCT_ENABLED(); copy = uma_zalloc(rctl_rule_zone, flags); if (copy == NULL) return (NULL); copy->rr_subject_type = rule->rr_subject_type; copy->rr_subject.rs_proc = rule->rr_subject.rs_proc; copy->rr_subject.rs_uip = rule->rr_subject.rs_uip; copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass; copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct; copy->rr_per = rule->rr_per; copy->rr_resource = rule->rr_resource; copy->rr_action = rule->rr_action; copy->rr_amount = rule->rr_amount; refcount_init(&copy->rr_refcount, 1); rctl_rule_acquire_subject(copy); return (copy); } void rctl_rule_acquire(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); refcount_acquire(&rule->rr_refcount); } static void rctl_rule_free(void *context, int pending) { struct rctl_rule *rule; rule = (struct rctl_rule *)context; ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0")); /* * We don't need locking here; rule is guaranteed to be inaccessible.
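A hedged outline of the reference-count lifecycle implied by the routines above (rctl_rule_release() follows just below):

/*
 * Illustrative sequence, not from the source:
 *
 *	rule = rctl_rule_alloc(M_WAITOK);   refcount == 1 (caller's ref)
 *	rctl_racct_add_rule(racct, rule);   acquires -> refcount == 2
 *	rctl_rule_release(rule);            caller's drop -> refcount == 1
 *	...rule later unlinked from racct... last drop -> deferred free
 */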
*/ rctl_rule_release_subject(rule); uma_zfree(rctl_rule_zone, rule); } void rctl_rule_release(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); if (refcount_release(&rule->rr_refcount)) { /* * rctl_rule_release() is often called when iterating * over all the uidinfo structures in the system, * holding uihashtbl_lock. Since rctl_rule_free() * might end up calling uifree(), this would lead * to lock recursion. Use taskqueue to avoid this. */ TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule); taskqueue_enqueue(taskqueue_thread, &rule->rr_task); } } static int rctl_rule_fully_specified(const struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: return (0); case RCTL_SUBJECT_TYPE_PROCESS: if (rule->rr_subject.rs_proc == NULL) return (0); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip == NULL) return (0); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass == NULL) return (0); break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct == NULL) return (0); break; default: panic("rctl_rule_fully_specified: unknown subject type %d", rule->rr_subject_type); } if (rule->rr_resource == RACCT_UNDEFINED) return (0); if (rule->rr_action == RCTL_ACTION_UNDEFINED) return (0); if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED) return (0); if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED) return (0); return (1); } static int rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep) { struct rctl_rule *rule; char *subjectstr, *subject_idstr, *resourcestr, *actionstr, *amountstr, *perstr; id_t id; int error = 0; ASSERT_RACCT_ENABLED(); rule = rctl_rule_alloc(M_WAITOK); subjectstr = strsep(&rulestr, ":"); subject_idstr = strsep(&rulestr, ":"); resourcestr = strsep(&rulestr, ":"); actionstr = strsep(&rulestr, "=/"); amountstr = strsep(&rulestr, "/"); perstr = rulestr; if (subjectstr == NULL || subjectstr[0] == '\0') rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; else { error = str2value(subjectstr, &rule->rr_subject_type, subjectnames); if (error != 0) goto out; } if (subject_idstr == NULL || subject_idstr[0] == '\0') { rule->rr_subject.rs_proc = NULL; rule->rr_subject.rs_uip = NULL; rule->rr_subject.rs_loginclass = NULL; rule->rr_subject.rs_prison_racct = NULL; } else { switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: error = EINVAL; goto out; case RCTL_SUBJECT_TYPE_PROCESS: error = str2id(subject_idstr, &id); if (error != 0) goto out; sx_assert(&allproc_lock, SA_LOCKED); rule->rr_subject.rs_proc = pfind(id); if (rule->rr_subject.rs_proc == NULL) { error = ESRCH; goto out; } PROC_UNLOCK(rule->rr_subject.rs_proc); break; case RCTL_SUBJECT_TYPE_USER: error = str2id(subject_idstr, &id); if (error != 0) goto out; rule->rr_subject.rs_uip = uifind(id); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: rule->rr_subject.rs_loginclass = loginclass_find(subject_idstr); if (rule->rr_subject.rs_loginclass == NULL) { error = ENAMETOOLONG; goto out; } break; case RCTL_SUBJECT_TYPE_JAIL: rule->rr_subject.rs_prison_racct = prison_racct_find(subject_idstr); if (rule->rr_subject.rs_prison_racct == NULL) { error = ENAMETOOLONG; goto out; } break; default: panic("rctl_string_to_rule: unknown subject type %d", rule->rr_subject_type); } } if (resourcestr == NULL || resourcestr[0] == '\0') rule->rr_resource = RACCT_UNDEFINED; else { error = str2value(resourcestr, &rule->rr_resource, resourcenames); if (error != 0) goto out; } if (actionstr == 
NULL || actionstr[0] == '\0') rule->rr_action = RCTL_ACTION_UNDEFINED; else { error = str2value(actionstr, &rule->rr_action, actionnames); if (error != 0) goto out; } if (amountstr == NULL || amountstr[0] == '\0') rule->rr_amount = RCTL_AMOUNT_UNDEFINED; else { error = str2int64(amountstr, &rule->rr_amount); if (error != 0) goto out; if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) { if (rule->rr_amount > INT64_MAX / 1000000) { error = ERANGE; goto out; } rule->rr_amount *= 1000000; } } if (perstr == NULL || perstr[0] == '\0') rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; else { error = str2value(perstr, &rule->rr_per, subjectnames); if (error != 0) goto out; } out: if (error == 0) *rulep = rule; else rctl_rule_release(rule); return (error); } /* * Link a rule with all the subjects it applies to. */ int rctl_rule_add(struct rctl_rule *rule) { struct proc *p; struct ucred *cred; struct uidinfo *uip; struct prison *pr; struct prison_racct *prr; struct loginclass *lc; struct rctl_rule *rule2; int match; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); /* * Some rules just don't make sense, like "deny" rule for an undeniable * resource. The exception are the RSS and %CPU resources - they are * not deniable in the racct sense, but the limit is enforced in * a different way. */ if (rule->rr_action == RCTL_ACTION_DENY && !RACCT_IS_DENIABLE(rule->rr_resource) && rule->rr_resource != RACCT_RSS && rule->rr_resource != RACCT_PCTCPU) { return (EOPNOTSUPP); } if (rule->rr_action == RCTL_ACTION_THROTTLE && !RACCT_IS_DECAYING(rule->rr_resource)) { return (EOPNOTSUPP); } if (rule->rr_action == RCTL_ACTION_THROTTLE && rule->rr_resource == RACCT_PCTCPU) { return (EOPNOTSUPP); } if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS && RACCT_IS_SLOPPY(rule->rr_resource)) { return (EOPNOTSUPP); } /* * Make sure there are no duplicated rules. Also, for the "deny" * rules, remove ones differing only by "amount". */ if (rule->rr_action == RCTL_ACTION_DENY) { rule2 = rctl_rule_duplicate(rule, M_WAITOK); rule2->rr_amount = RCTL_AMOUNT_UNDEFINED; rctl_rule_remove(rule2); rctl_rule_release(rule2); } else rctl_rule_remove(rule); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: p = rule->rr_subject.rs_proc; KASSERT(p != NULL, ("rctl_rule_add: NULL proc")); rctl_racct_add_rule(p->p_racct, rule); /* * In case of per-process rule, we don't have anything more * to do. */ return (0); case RCTL_SUBJECT_TYPE_USER: uip = rule->rr_subject.rs_uip; KASSERT(uip != NULL, ("rctl_rule_add: NULL uip")); rctl_racct_add_rule(uip->ui_racct, rule); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: lc = rule->rr_subject.rs_loginclass; KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass")); rctl_racct_add_rule(lc->lc_racct, rule); break; case RCTL_SUBJECT_TYPE_JAIL: prr = rule->rr_subject.rs_prison_racct; KASSERT(prr != NULL, ("rctl_rule_add: NULL pr")); rctl_racct_add_rule(prr->prr_racct, rule); break; default: panic("rctl_rule_add: unknown subject type %d", rule->rr_subject_type); } /* * Now go through all the processes and add the new rule to the ones * it applies to. 
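The strsep() separators in rctl_string_to_rule() above imply the grammar subject:subject-id:resource:action=amount/per, matching rctl(8). A few hedged examples (subjects and amounts invented; amounts must be plain integers at this layer, since str2int64() uses strtoul() and unit suffixes are expanded by rctl(8) before the syscall):

/* Deny user 1001 more than 1 GB of virtual memory. */
const char *r1 = "user:1001:vmemoryuse:deny=1073741824";

/* SIGTERM processes of login class "users" above 100 processes; the
 * "/per" part is optional and defaults to the subject type. */
const char *r2 = "loginclass:users:maxproc:sigterm=100";

/* Log when jail "www" exceeds 512 MB of resident memory. */
const char *r3 = "jail:www:memoryuse:log=536870912/jail";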
*/ sx_assert(&allproc_lock, SA_LOCKED); FOREACH_PROC_IN_SYSTEM(p) { cred = p->p_ucred; switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_USER: if (cred->cr_uidinfo == rule->rr_subject.rs_uip || cred->cr_ruidinfo == rule->rr_subject.rs_uip) break; continue; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (cred->cr_loginclass == rule->rr_subject.rs_loginclass) break; continue; case RCTL_SUBJECT_TYPE_JAIL: match = 0; for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) { if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) { match = 1; break; } } if (match) break; continue; default: panic("rctl_rule_add: unknown subject type %d", rule->rr_subject_type); } rctl_racct_add_rule(p->p_racct, rule); } return (0); } static void rctl_rule_pre_callback(void) { RACCT_LOCK(); } static void rctl_rule_post_callback(void) { RACCT_UNLOCK(); } static void rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3) { struct rctl_rule *filter = (struct rctl_rule *)arg2; int found = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); found += rctl_racct_remove_rules(racct, filter); *((int *)arg3) += found; } /* * Remove all rules that match the filter. */ int rctl_rule_remove(struct rctl_rule *filter) { struct proc *p; int found = 0; ASSERT_RACCT_ENABLED(); if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS && filter->rr_subject.rs_proc != NULL) { p = filter->rr_subject.rs_proc; RACCT_LOCK(); found = rctl_racct_remove_rules(p->p_racct, filter); RACCT_UNLOCK(); if (found) return (0); return (ESRCH); } loginclass_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); ui_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); prison_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); sx_assert(&allproc_lock, SA_LOCKED); RACCT_LOCK(); FOREACH_PROC_IN_SYSTEM(p) { found += rctl_racct_remove_rules(p->p_racct, filter); } RACCT_UNLOCK(); if (found) return (0); return (ESRCH); } /* * Appends a rule to the sbuf. */ static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule) { int64_t amount; ASSERT_RACCT_ENABLED(); sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type)); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: if (rule->rr_subject.rs_proc == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%d:", rule->rr_subject.rs_proc->p_pid); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%d:", rule->rr_subject.rs_uip->ui_uid); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%s:", rule->rr_subject.rs_loginclass->lc_name); break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%s:", rule->rr_subject.rs_prison_racct->prr_name); break; default: panic("rctl_rule_to_sbuf: unknown subject type %d", rule->rr_subject_type); } amount = rule->rr_amount; if (amount != RCTL_AMOUNT_UNDEFINED && RACCT_IS_IN_MILLIONS(rule->rr_resource)) amount /= 1000000; sbuf_printf(sb, "%s:%s=%jd", rctl_resource_name(rule->rr_resource), rctl_action_name(rule->rr_action), amount); if (rule->rr_per != rule->rr_subject_type) sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per)); } /* * Routine used by RCTL syscalls to read in input string. 
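rctl_rule_to_sbuf() above emits the same grammar the parser consumes, so rules round-trip, with two quirks visible in the code: amounts for RACCT_IS_IN_MILLIONS resources are scaled back down on output, and the /per suffix is printed only when it differs from the subject type. A hedged illustration (values invented):

/*
 * Illustrative only:
 *
 *	rr_resource RACCT_PCTCPU, rr_amount 50000000 (stored x 10^6)
 *		-> "user:1001:pcpu:deny=50"
 *
 *	loginclass subject with rr_per == RCTL_SUBJECT_TYPE_USER
 *		-> "loginclass:users:vmemoryuse:deny=1073741824/user"
 */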
*/ static int rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen) { char *str; int error; ASSERT_RACCT_ENABLED(); if (inbuflen <= 0) return (EINVAL); if (inbuflen > RCTL_MAX_INBUFSIZE) return (E2BIG); str = malloc(inbuflen + 1, M_RCTL, M_WAITOK); error = copyinstr(inbufp, str, inbuflen, NULL); if (error != 0) { free(str, M_RCTL); return (error); } *inputstr = str; return (0); } /* * Routine used by RCTL syscalls to write out output string. */ static int rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen) { int error; ASSERT_RACCT_ENABLED(); if (outputsbuf == NULL) return (0); sbuf_finish(outputsbuf); if (outbuflen < sbuf_len(outputsbuf) + 1) { sbuf_delete(outputsbuf); return (ERANGE); } error = copyout(sbuf_data(outputsbuf), outbufp, sbuf_len(outputsbuf) + 1); sbuf_delete(outputsbuf); return (error); } static struct sbuf * rctl_racct_to_sbuf(struct racct *racct, int sloppy) { struct sbuf *sb; int64_t amount; int i; ASSERT_RACCT_ENABLED(); sb = sbuf_new_auto(); for (i = 0; i <= RACCT_MAX; i++) { if (sloppy == 0 && RACCT_IS_SLOPPY(i)) continue; RACCT_LOCK(); amount = racct->r_resources[i]; RACCT_UNLOCK(); if (RACCT_IS_IN_MILLIONS(i)) amount /= 1000000; sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount); } sbuf_setpos(sb, sbuf_len(sb) - 1); return (sb); } int sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap) { struct rctl_rule *filter; struct sbuf *outputsbuf = NULL; struct proc *p; struct uidinfo *uip; struct loginclass *lc; struct prison_racct *prr; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_RACCT); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } switch (filter->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: p = filter->rr_subject.rs_proc; if (p == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0); break; case RCTL_SUBJECT_TYPE_USER: uip = filter->rr_subject.rs_uip; if (uip == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: lc = filter->rr_subject.rs_loginclass; if (lc == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1); break; case RCTL_SUBJECT_TYPE_JAIL: prr = filter->rr_subject.rs_prison_racct; if (prr == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1); break; default: error = EINVAL; } out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); if (error != 0) return (error); error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen); return (error); } static void rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3) { struct rctl_rule *filter = (struct rctl_rule *)arg2; struct rctl_rule_link *link; struct sbuf *sb = (struct sbuf *)arg3; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); LIST_FOREACH(link, &racct->r_rule_links, rrl_next) { if (!rctl_rule_matches(link->rrl_rule, filter)) continue; rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } } int sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) { struct sbuf *sb; struct rctl_rule *filter; struct rctl_rule_link *link; struct proc *p; char *inputstr, *buf; size_t bufsize; int error; if (!racct_enable) return (ENOSYS); error = 
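rctl_racct_to_sbuf() above produces a comma-separated resource=amount list. A hedged userland sketch of retrieving it through rctl_get_racct(2) (subject and buffer size invented; requires racct to be enabled and the PRIV_RCTL_GET_RACCT privilege):

#include <sys/types.h>
#include <sys/rctl.h>
#include <err.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *filter = "user:1001";	/* hypothetical subject */
	char buf[4096];				/* assumed large enough */

	if (rctl_get_racct(filter, strlen(filter) + 1, buf,
	    sizeof(buf)) != 0)
		err(1, "rctl_get_racct");
	printf("%s\n", buf);	/* e.g. "cputime=0,...,vmemoryuse=..." */
	return (0);
}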
priv_check(td, PRIV_RCTL_GET_RULES); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { sx_sunlock(&allproc_lock); return (E2BIG); } buf = malloc(bufsize, M_RCTL, M_WAITOK); sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); KASSERT(sb != NULL, ("sbuf_new failed")); FOREACH_PROC_IN_SYSTEM(p) { RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { /* * Non-process rules will be added to the buffer later. * Adding them here would result in duplicated output. */ if (link->rrl_rule->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) continue; if (!rctl_rule_matches(link->rrl_rule, filter)) continue; rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } RACCT_UNLOCK(); } loginclass_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); ui_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); prison_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); if (sbuf_error(sb) == ENOMEM) { error = ERANGE; goto out; } /* * Remove trailing ",". */ if (sbuf_len(sb) > 0) sbuf_setpos(sb, sbuf_len(sb) - 1); error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); free(buf, M_RCTL); return (error); } int sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) { struct sbuf *sb; struct rctl_rule *filter; struct rctl_rule_link *link; char *inputstr, *buf; size_t bufsize; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_LIMITS); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EINVAL); } if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EOPNOTSUPP); } if (filter->rr_subject.rs_proc == NULL) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EINVAL); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (E2BIG); } buf = malloc(bufsize, M_RCTL, M_WAITOK); sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); KASSERT(sb != NULL, ("sbuf_new failed")); RACCT_LOCK(); LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links, rrl_next) { rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } RACCT_UNLOCK(); if (sbuf_error(sb) == ENOMEM) { error = ERANGE; sbuf_delete(sb); goto out; } /* * Remove trailing ",". 
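Because the syscall reports ERANGE when the output buffer is too small (including the sbuf ENOMEM case mapped above), callers usually retry with a bigger buffer. A hedged sketch (helper name and sizing invented; an empty filter matches every rule):

#include <sys/types.h>
#include <sys/rctl.h>
#include <err.h>
#include <errno.h>
#include <stdlib.h>

/* Hypothetical helper: grow the buffer until the rule list fits. */
static char *
get_all_rules(void)
{
	char *buf;
	size_t len;

	for (len = 1024;; len *= 2) {
		buf = malloc(len);
		if (buf == NULL)
			err(1, "malloc");
		if (rctl_get_rules("", 1, buf, len) == 0)
			return (buf);		/* "rule,rule,..." */
		if (errno != ERANGE)
			err(1, "rctl_get_rules");
		free(buf);			/* too small; retry */
	}
}

Note that the kernel also bounds outbuflen by rctl_maxbufsize and returns E2BIG past it, so the doubling cannot grow without limit.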
*/ if (sbuf_len(sb) > 0) sbuf_setpos(sb, sbuf_len(sb) - 1); error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); free(buf, M_RCTL); return (error); } int sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) { struct rctl_rule *rule; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_ADD_RULE); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &rule); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } /* * The 'per' part of a rule is optional. */ if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED && rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) rule->rr_per = rule->rr_subject_type; if (!rctl_rule_fully_specified(rule)) { error = EINVAL; goto out; } error = rctl_rule_add(rule); out: rctl_rule_release(rule); sx_sunlock(&allproc_lock); return (error); } int sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) { struct rctl_rule *filter; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_REMOVE_RULE); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } error = rctl_rule_remove(filter); rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (error); } /* * Update RCTL rule list after credential change. */ void rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred) { LIST_HEAD(, rctl_rule_link) newrules; struct rctl_rule_link *link, *newlink; struct uidinfo *newuip; struct loginclass *newlc; struct prison_racct *newprr; int rulecnt, i; ASSERT_RACCT_ENABLED(); newuip = newcred->cr_ruidinfo; newlc = newcred->cr_loginclass; newprr = newcred->cr_prison->pr_prison_racct; LIST_INIT(&newrules); again: /* * First, count the rules that apply to the process with new * credentials. */ rulecnt = 0; RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) rulecnt++; } LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) rulecnt++; LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) rulecnt++; LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) rulecnt++; RACCT_UNLOCK(); /* * Create temporary list. We've dropped the rctl_lock in order * to use M_WAITOK. */ for (i = 0; i < rulecnt; i++) { newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK); newlink->rrl_rule = NULL; newlink->rrl_exceeded = 0; LIST_INSERT_HEAD(&newrules, newlink, rrl_next); } newlink = LIST_FIRST(&newrules); /* * Assign rules to the newly allocated list entries. 
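A hedged userland counterpart to sys_rctl_add_rule() above (rule string invented; the output buffer pair is unused by this call, matching the kernel side):

#include <sys/types.h>
#include <sys/rctl.h>
#include <err.h>
#include <string.h>

int
main(void)
{
	/* Hypothetical rule: cap user 1001 at 1 GB of virtual memory. */
	const char *rule = "user:1001:vmemoryuse:deny=1073741824";

	if (rctl_add_rule(rule, strlen(rule) + 1, NULL, 0) != 0)
		err(1, "rctl_add_rule");
	return (0);
}

As the kernel code shows, omitting the /per part makes rr_per default to the subject type.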
*/ RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } } LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } if (rulecnt == 0) { /* * Free the old rule list. */ while (!LIST_EMPTY(&p->p_racct->r_rule_links)) { link = LIST_FIRST(&p->p_racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } /* * Replace lists and we're done. * * XXX: Is there any way to switch list heads instead * of iterating here? */ while (!LIST_EMPTY(&newrules)) { newlink = LIST_FIRST(&newrules); LIST_REMOVE(newlink, rrl_next); LIST_INSERT_HEAD(&p->p_racct->r_rule_links, newlink, rrl_next); } RACCT_UNLOCK(); return; } goaround: RACCT_UNLOCK(); /* * Rule list changed while we were not holding the rctl_lock. * Free the new list and try again. */ while (!LIST_EMPTY(&newrules)) { newlink = LIST_FIRST(&newrules); LIST_REMOVE(newlink, rrl_next); if (newlink->rrl_rule != NULL) rctl_rule_release(newlink->rrl_rule); uma_zfree(rctl_rule_link_zone, newlink); } goto again; } /* * Assign RCTL rules to the newly created process. */ int rctl_proc_fork(struct proc *parent, struct proc *child) { struct rctl_rule *rule; struct rctl_rule_link *link; int error; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent)); LIST_INIT(&child->p_racct->r_rule_links); /* * Go through limits applicable to the parent and assign them * to the child. Rules with 'process' subject have to be duplicated * in order to make their rr_subject point to the new process. */ LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) { rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT); if (rule == NULL) goto fail; KASSERT(rule->rr_subject.rs_proc == parent, ("rule->rr_subject.rs_proc != parent")); rule->rr_subject.rs_proc = child; error = rctl_racct_add_rule_locked(child->p_racct, rule); rctl_rule_release(rule); if (error != 0) goto fail; } else { error = rctl_racct_add_rule_locked(child->p_racct, link->rrl_rule); if (error != 0) goto fail; } } return (0); fail: while (!LIST_EMPTY(&child->p_racct->r_rule_links)) { link = LIST_FIRST(&child->p_racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } return (EAGAIN); } /* * Release rules attached to the racct. 
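rctl_proc_ucred_changed() above is an instance of a standard kernel idiom: count under the lock, allocate while unlocked (M_WAITOK may sleep), then retake the lock and verify nothing changed. A hedged distilled outline:

/*
 * Illustrative outline, not the actual function:
 *
 *  again:
 *	lock; count the rules that will apply; unlock;
 *	allocate that many link structures with M_WAITOK (may sleep);
 *	lock; walk the lists, filling the preallocated links;
 *	if the count no longer matches (a rule came or went while we
 *	slept): unlock, free the new links, goto again;
 *	otherwise free the old list, install the new one, unlock.
 */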
*/ void rctl_racct_release(struct racct *racct) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); while (!LIST_EMPTY(&racct->r_rule_links)) { link = LIST_FIRST(&racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } } static void rctl_init(void) { if (!racct_enable) return; rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); rctl_rule_link_zone = uma_zcreate("rctl_rule_link", sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Set default values, making sure not to overwrite the ones * fetched from tunables. Most of those could be set at the * declaration, except for the rctl_throttle_max - we cannot * set it there due to hz not being compile time constant. */ if (rctl_throttle_min < 1) rctl_throttle_min = 1; if (rctl_throttle_max < rctl_throttle_min) rctl_throttle_max = 2 * hz; if (rctl_throttle_pct < 0) rctl_throttle_pct = 100; if (rctl_throttle_pct2 < 0) rctl_throttle_pct2 = 100; } #else /* !RCTL */ int sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap) { return (ENOSYS); } int sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) { return (ENOSYS); } int sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) { return (ENOSYS); } int sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) { return (ENOSYS); } int sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) { return (ENOSYS); } #endif /* !RCTL */ Index: head/sys/kern/kern_rwlock.c =================================================================== --- head/sys/kern/kern_rwlock.c (revision 326270) +++ head/sys/kern/kern_rwlock.c (revision 326271) @@ -1,1464 +1,1466 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2006 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Machine independent bits of reader/writer lock implementation. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_no_adaptive_rwlocks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) && !defined(NO_ADAPTIVE_RWLOCKS) #define ADAPTIVE_RWLOCKS #endif #ifdef HWPMC_HOOKS #include PMC_SOFT_DECLARE( , , lock, failed); #endif /* * Return the rwlock address when the lock cookie address is provided. * This functionality assumes that struct rwlock* have a member named rw_lock. */ #define rwlock2rw(c) (__containerof(c, struct rwlock, rw_lock)) #ifdef DDB #include static void db_show_rwlock(const struct lock_object *lock); #endif static void assert_rw(const struct lock_object *lock, int what); static void lock_rw(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_rw(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_rw(struct lock_object *lock); struct lock_class lock_class_rw = { .lc_name = "rw", .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_UPGRADABLE, .lc_assert = assert_rw, #ifdef DDB .lc_ddb_show = db_show_rwlock, #endif .lc_lock = lock_rw, .lc_unlock = unlock_rw, #ifdef KDTRACE_HOOKS .lc_owner = owner_rw, #endif }; #ifdef ADAPTIVE_RWLOCKS static int __read_frequently rowner_retries = 10; static int __read_frequently rowner_loops = 10000; static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL, "rwlock debugging"); SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, ""); SYSCTL_INT(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, ""); static struct lock_delay_config __read_frequently rw_delay; SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_base, CTLFLAG_RW, &rw_delay.base, 0, ""); SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_max, CTLFLAG_RW, &rw_delay.max, 0, ""); LOCK_DELAY_SYSINIT_DEFAULT(rw_delay); #endif /* * Return a pointer to the owning thread if the lock is write-locked or * NULL if the lock is unlocked or read-locked. */ #define lv_rw_wowner(v) \ ((v) & RW_LOCK_READ ? NULL : \ (struct thread *)RW_OWNER((v))) #define rw_wowner(rw) lv_rw_wowner(RW_READ_VALUE(rw)) /* * Returns if a write owner is recursed. Write ownership is not assured * here and should be previously checked. */ #define rw_recursed(rw) ((rw)->rw_recurse != 0) /* * Return true if curthread helds the lock. */ #define rw_wlocked(rw) (rw_wowner((rw)) == curthread) /* * Return a pointer to the owning thread for this lock who should receive * any priority lent by threads that block on this lock. Currently this * is identical to rw_wowner(). */ #define rw_owner(rw) rw_wowner(rw) #ifndef INVARIANTS #define __rw_assert(c, what, file, line) #endif void assert_rw(const struct lock_object *lock, int what) { rw_assert((const struct rwlock *)lock, what); } void lock_rw(struct lock_object *lock, uintptr_t how) { struct rwlock *rw; rw = (struct rwlock *)lock; if (how) rw_rlock(rw); else rw_wlock(rw); } uintptr_t unlock_rw(struct lock_object *lock) { struct rwlock *rw; rw = (struct rwlock *)lock; rw_assert(rw, RA_LOCKED | LA_NOTRECURSED); if (rw->rw_lock & RW_LOCK_READ) { rw_runlock(rw); return (1); } else { rw_wunlock(rw); return (0); } } #ifdef KDTRACE_HOOKS int owner_rw(const struct lock_object *lock, struct thread **owner) { const struct rwlock *rw = (const struct rwlock *)lock; uintptr_t x = rw->rw_lock; *owner = rw_wowner(rw); return ((x & RW_LOCK_READ) != 0 ? 
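Before the implementation details below, a hedged sketch of the consumer-facing rwlock(9) API that this file implements (the lock and data names are invented):

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/rwlock.h>

static struct rwlock data_lock;		/* hypothetical lock */
static int data_value;			/* hypothetical shared state */

static void
data_init(void)
{
	rw_init(&data_lock, "data lock");
}

static int
data_read(void)
{
	int v;

	rw_rlock(&data_lock);		/* shared: readers run concurrently */
	v = data_value;
	rw_runlock(&data_lock);
	return (v);
}

static void
data_write(int v)
{
	rw_wlock(&data_lock);		/* exclusive: a single writer */
	data_value = v;
	rw_wunlock(&data_lock);
}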
(RW_READERS(x) != 0) : (*owner != NULL)); } #endif void _rw_init_flags(volatile uintptr_t *c, const char *name, int opts) { struct rwlock *rw; int flags; rw = rwlock2rw(c); MPASS((opts & ~(RW_DUPOK | RW_NOPROFILE | RW_NOWITNESS | RW_QUIET | RW_RECURSE | RW_NEW)) == 0); ASSERT_ATOMIC_LOAD_PTR(rw->rw_lock, ("%s: rw_lock not aligned for %s: %p", __func__, name, &rw->rw_lock)); flags = LO_UPGRADABLE; if (opts & RW_DUPOK) flags |= LO_DUPOK; if (opts & RW_NOPROFILE) flags |= LO_NOPROFILE; if (!(opts & RW_NOWITNESS)) flags |= LO_WITNESS; if (opts & RW_RECURSE) flags |= LO_RECURSABLE; if (opts & RW_QUIET) flags |= LO_QUIET; if (opts & RW_NEW) flags |= LO_NEW; lock_init(&rw->lock_object, &lock_class_rw, name, NULL, flags); rw->rw_lock = RW_UNLOCKED; rw->rw_recurse = 0; } void _rw_destroy(volatile uintptr_t *c) { struct rwlock *rw; rw = rwlock2rw(c); KASSERT(rw->rw_lock == RW_UNLOCKED, ("rw lock %p not unlocked", rw)); KASSERT(rw->rw_recurse == 0, ("rw lock %p still recursed", rw)); rw->rw_lock = RW_DESTROYED; lock_destroy(&rw->lock_object); } void rw_sysinit(void *arg) { struct rw_args *args; args = arg; rw_init_flags((struct rwlock *)args->ra_rw, args->ra_desc, args->ra_flags); } int _rw_wowned(const volatile uintptr_t *c) { return (rw_wowner(rwlock2rw(c)) == curthread); } void _rw_wlock_cookie(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; uintptr_t tid, v; rw = rwlock2rw(c); KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() || !TD_IS_IDLETHREAD(curthread), ("rw_wlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_wlock() of destroyed rwlock @ %s:%d", file, line)); WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); tid = (uintptr_t)curthread; v = RW_UNLOCKED; if (!_rw_write_lock_fetch(rw, &v, tid)) _rw_wlock_hard(rw, v, file, line); else LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, 0, 0, file, line, LOCKSTAT_WRITER); LOCK_LOG_LOCK("WLOCK", &rw->lock_object, 0, rw->rw_recurse, file, line); WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_INC(curthread); } int __rw_try_wlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF) { struct thread *td; uintptr_t tid, v; int rval; bool recursed; td = curthread; tid = (uintptr_t)td; if (SCHEDULER_STOPPED_TD(td)) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(td), ("rw_try_wlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_try_wlock() of destroyed rwlock @ %s:%d", file, line)); rval = 1; recursed = false; v = RW_UNLOCKED; for (;;) { if (atomic_fcmpset_acq_ptr(&rw->rw_lock, &v, tid)) break; if (v == RW_UNLOCKED) continue; if (v == tid && (rw->lock_object.lo_flags & LO_RECURSABLE)) { rw->rw_recurse++; atomic_set_ptr(&rw->rw_lock, RW_LOCK_WRITER_RECURSED); break; } rval = 0; break; } LOCK_LOG_TRY("WLOCK", &rw->lock_object, 0, rval, file, line); if (rval) { WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); if (!recursed) LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, 0, 0, file, line, LOCKSTAT_WRITER); TD_LOCKS_INC(curthread); } return (rval); } int __rw_try_wlock(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); return (__rw_try_wlock_int(rw LOCK_FILE_LINE_ARG)); } void _rw_wunlock_cookie(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); KASSERT(rw->rw_lock != RW_DESTROYED, 
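As __rw_try_wlock_int() above shows, write recursion is honored only when the lock carries LO_RECURSABLE, i.e. was initialized with RW_RECURSE. A hedged sketch (names invented):

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/rwlock.h>

static struct rwlock rec_lock;		/* hypothetical lock */

static void
recursion_demo(void)
{
	rw_init_flags(&rec_lock, "rec lock", RW_RECURSE);
	rw_wlock(&rec_lock);
	if (rw_try_wlock(&rec_lock) != 0) {
		/* Succeeds only because of RW_RECURSE. */
		rw_wunlock(&rec_lock);	/* drops the recursed level */
	}
	rw_wunlock(&rec_lock);
	rw_destroy(&rec_lock);
}

Without RW_RECURSE, rw_try_wlock() simply fails here, and a recursive rw_wlock() would trip the recursion KASSERT in the hard path instead.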
("rw_wunlock() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(c, RA_WLOCKED, file, line); WITNESS_UNLOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("WUNLOCK", &rw->lock_object, 0, rw->rw_recurse, file, line); #ifdef LOCK_PROFILING _rw_wunlock_hard(rw, (uintptr_t)curthread, file, line); #else __rw_wunlock(rw, curthread, file, line); #endif TD_LOCKS_DEC(curthread); } /* * Determines whether a new reader can acquire a lock. Succeeds if the * reader already owns a read lock and the lock is locked for read to * prevent deadlock from reader recursion. Also succeeds if the lock * is unlocked and has no writer waiters or spinners. Failing otherwise * prioritizes writers before readers. */ static bool __always_inline __rw_can_read(struct thread *td, uintptr_t v, bool fp) { if ((v & (RW_LOCK_READ | RW_LOCK_WRITE_WAITERS | RW_LOCK_WRITE_SPINNER)) == RW_LOCK_READ) return (true); if (!fp && td->td_rw_rlocks && (v & RW_LOCK_READ)) return (true); return (false); } static bool __always_inline __rw_rlock_try(struct rwlock *rw, struct thread *td, uintptr_t *vp, bool fp LOCK_FILE_LINE_ARG_DEF) { /* * Handle the easy case. If no other thread has a write * lock, then try to bump up the count of read locks. Note * that we have to preserve the current state of the * RW_LOCK_WRITE_WAITERS flag. If we fail to acquire a * read lock, then rw_lock must have changed, so restart * the loop. Note that this handles the case of a * completely unlocked rwlock since such a lock is encoded * as a read lock with no waiters. */ while (__rw_can_read(td, *vp, fp)) { if (atomic_fcmpset_acq_ptr(&rw->rw_lock, vp, *vp + RW_ONE_READER)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeed %p -> %p", __func__, rw, (void *)*vp, (void *)(*vp + RW_ONE_READER)); td->td_rw_rlocks++; return (true); } } return (false); } static void __noinline __rw_rlock_hard(struct rwlock *rw, struct thread *td, uintptr_t v LOCK_FILE_LINE_ARG_DEF) { struct turnstile *ts; struct thread *owner; #ifdef ADAPTIVE_RWLOCKS int spintries = 0; int i, n; #endif #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif #if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) uintptr_t state; int doing_lockprof; #endif if (SCHEDULER_STOPPED()) return; #if defined(ADAPTIVE_RWLOCKS) lock_delay_arg_init(&lda, &rw_delay); #elif defined(KDTRACE_HOOKS) lock_delay_arg_init(&lda, NULL); #endif #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&rw->lock_object, &contested, &waittime); #ifdef LOCK_PROFILING doing_lockprof = 1; state = v; #elif defined(KDTRACE_HOOKS) doing_lockprof = lockstat_enabled; if (__predict_false(doing_lockprof)) { all_time -= lockstat_nsecs(&rw->lock_object); state = v; } #endif for (;;) { if (__rw_rlock_try(rw, td, &v, false LOCK_FILE_LINE_ARG)) break; #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef ADAPTIVE_RWLOCKS /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. 
*/ if ((v & RW_LOCK_READ) == 0) { owner = (struct thread *)RW_OWNER(v); if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, rw, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); do { lock_delay(&lda); v = RW_READ_VALUE(rw); owner = lv_rw_wowner(v); } while (owner != NULL && TD_IS_RUNNING(owner)); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } } else if (spintries < rowner_retries) { spintries++; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); for (i = 0; i < rowner_loops; i += n) { n = RW_READERS(v); lock_delay_spin(n); v = RW_READ_VALUE(rw); if ((v & RW_LOCK_READ) == 0 || __rw_can_read(td, v, false)) break; } #ifdef KDTRACE_HOOKS lda.spin_cnt += rowner_loops - i; #endif KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); if (i != rowner_loops) continue; } #endif /* * Okay, now it's the hard case. Some other thread already * has a write lock or there are write waiters present, * acquire the turnstile lock so we can begin the process * of blocking. */ ts = turnstile_trywait(&rw->lock_object); /* * The lock might have been released while we spun, so * recheck its state and restart the loop if needed. */ v = RW_READ_VALUE(rw); retry_ts: if (__rw_can_read(td, v, false)) { turnstile_cancel(ts); continue; } owner = lv_rw_wowner(v); #ifdef ADAPTIVE_RWLOCKS /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the turnstile * chain lock. If so, drop the turnstile lock and try * again. */ if (owner != NULL) { if (TD_IS_RUNNING(owner)) { turnstile_cancel(ts); continue; } } #endif /* * The lock is held in write mode or it already has waiters. */ MPASS(!__rw_can_read(td, v, false)); /* * If the RW_LOCK_READ_WAITERS flag is already set, then * we can go ahead and block. If it is not set then try * to set it. If we fail to set it drop the turnstile * lock and restart the loop. */ if (!(v & RW_LOCK_READ_WAITERS)) { if (!atomic_fcmpset_ptr(&rw->rw_lock, &v, v | RW_LOCK_READ_WAITERS)) goto retry_ts; if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set read waiters flag", __func__, rw); } /* * We were unable to acquire the lock and the read waiters * flag is set, so we must block on the turnstile. */ if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__, rw); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&rw->lock_object); #endif MPASS(owner == rw_owner(rw)); turnstile_wait(ts, owner, TS_SHARED_QUEUE); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&rw->lock_object); sleep_cnt++; #endif if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from turnstile", __func__, rw); v = RW_READ_VALUE(rw); } #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!doing_lockprof)) return; #endif #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&rw->lock_object); if (sleep_time) LOCKSTAT_RECORD4(rw__block, rw, sleep_time, LOCKSTAT_READER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state)); /* Record only the loops spinning and not sleeping. */ if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(rw__spin, rw, all_time - sleep_time, LOCKSTAT_READER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ? 
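All of the slow paths in this file share one turnstile(9) handshake, first seen above; a hedged distilled outline:

/*
 * Distilled shape of the blocking protocol (illustrative):
 *
 *	1. ts = turnstile_trywait(&lock_object): lock the chain;
 *	2. reread the lock word; if it became acquirable, do
 *	   turnstile_cancel(ts) and retry the fast path;
 *	3. fcmpset the relevant *_WAITERS bit; on failure go to 2;
 *	4. turnstile_wait(ts, owner, queue): block, releasing the
 *	   chain lock;
 *	5. on wakeup, reread the lock word and start over.
 */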
0 : RW_READERS(state)); #endif /* * TODO: acquire "owner of record" here. Here be turnstile dragons * however. turnstiles don't like owners changing between calls to * turnstile_wait() currently. */ LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, contested, waittime, file, line, LOCKSTAT_READER); } void __rw_rlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF) { struct thread *td; uintptr_t v; td = curthread; KASSERT(kdb_active != 0 || SCHEDULER_STOPPED_TD(td) || !TD_IS_IDLETHREAD(td), ("rw_rlock() by idle thread %p on rwlock %s @ %s:%d", td, rw->lock_object.lo_name, file, line)); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_rlock() of destroyed rwlock @ %s:%d", file, line)); KASSERT(rw_wowner(rw) != td, ("rw_rlock: wlock already held for %s @ %s:%d", rw->lock_object.lo_name, file, line)); WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER, file, line, NULL); v = RW_READ_VALUE(rw); if (__predict_false(LOCKSTAT_OOL_PROFILE_ENABLED(rw__acquire) || !__rw_rlock_try(rw, td, &v, true LOCK_FILE_LINE_ARG))) __rw_rlock_hard(rw, td, v LOCK_FILE_LINE_ARG); LOCK_LOG_LOCK("RLOCK", &rw->lock_object, 0, 0, file, line); WITNESS_LOCK(&rw->lock_object, 0, file, line); TD_LOCKS_INC(curthread); } void __rw_rlock(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); __rw_rlock_int(rw LOCK_FILE_LINE_ARG); } int __rw_try_rlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF) { uintptr_t x; if (SCHEDULER_STOPPED()) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rw_try_rlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); x = rw->rw_lock; for (;;) { KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_try_rlock() of destroyed rwlock @ %s:%d", file, line)); if (!(x & RW_LOCK_READ)) break; if (atomic_fcmpset_acq_ptr(&rw->rw_lock, &x, x + RW_ONE_READER)) { LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 1, file, line); WITNESS_LOCK(&rw->lock_object, LOP_TRYLOCK, file, line); LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, 0, 0, file, line, LOCKSTAT_READER); TD_LOCKS_INC(curthread); curthread->td_rw_rlocks++; return (1); } } LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 0, file, line); return (0); } int __rw_try_rlock(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); return (__rw_try_rlock_int(rw LOCK_FILE_LINE_ARG)); } static bool __always_inline __rw_runlock_try(struct rwlock *rw, struct thread *td, uintptr_t *vp) { for (;;) { /* * See if there is more than one read lock held. If so, * just drop one and return. */ if (RW_READERS(*vp) > 1) { if (atomic_fcmpset_rel_ptr(&rw->rw_lock, vp, *vp - RW_ONE_READER)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeeded %p -> %p", __func__, rw, (void *)*vp, (void *)(*vp - RW_ONE_READER)); td->td_rw_rlocks--; return (true); } continue; } /* * If there aren't any waiters for a write lock, then try * to drop it quickly. 
*/ if (!(*vp & RW_LOCK_WAITERS)) { MPASS((*vp & ~RW_LOCK_WRITE_SPINNER) == RW_READERS_LOCK(1)); if (atomic_fcmpset_rel_ptr(&rw->rw_lock, vp, RW_UNLOCKED)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p last succeeded", __func__, rw); td->td_rw_rlocks--; return (true); } continue; } break; } return (false); } static void __noinline __rw_runlock_hard(struct rwlock *rw, struct thread *td, uintptr_t v LOCK_FILE_LINE_ARG_DEF) { struct turnstile *ts; uintptr_t x, queue; if (SCHEDULER_STOPPED()) return; for (;;) { if (__rw_runlock_try(rw, td, &v)) break; /* * Ok, we know we have waiters and we think we are the * last reader, so grab the turnstile lock. */ turnstile_chain_lock(&rw->lock_object); v = RW_READ_VALUE(rw); retry_ts: if (__predict_false(RW_READERS(v) > 1)) { turnstile_chain_unlock(&rw->lock_object); continue; } v &= (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER); MPASS(v & RW_LOCK_WAITERS); /* * Try to drop our lock leaving the lock in a unlocked * state. * * If you wanted to do explicit lock handoff you'd have to * do it here. You'd also want to use turnstile_signal() * and you'd have to handle the race where a higher * priority thread blocks on the write lock before the * thread you wakeup actually runs and have the new thread * "steal" the lock. For now it's a lot simpler to just * wakeup all of the waiters. * * As above, if we fail, then another thread might have * acquired a read lock, so drop the turnstile lock and * restart. */ x = RW_UNLOCKED; if (v & RW_LOCK_WRITE_WAITERS) { queue = TS_EXCLUSIVE_QUEUE; x |= (v & RW_LOCK_READ_WAITERS); } else queue = TS_SHARED_QUEUE; v |= RW_READERS_LOCK(1); if (!atomic_fcmpset_rel_ptr(&rw->rw_lock, &v, x)) goto retry_ts; if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p last succeeded with waiters", __func__, rw); /* * Ok. The lock is released and all that's left is to * wake up the waiters. Note that the lock might not be * free anymore, but in that case the writers will just * block again if they run before the new lock holder(s) * release the lock. */ ts = turnstile_lookup(&rw->lock_object); MPASS(ts != NULL); turnstile_broadcast(ts, queue); turnstile_unpend(ts, TS_SHARED_LOCK); turnstile_chain_unlock(&rw->lock_object); td->td_rw_rlocks--; break; } LOCKSTAT_PROFILE_RELEASE_RWLOCK(rw__release, rw, LOCKSTAT_READER); } void _rw_runlock_cookie_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF) { struct thread *td; uintptr_t v; KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_runlock() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(&rw->rw_lock, RA_RLOCKED, file, line); WITNESS_UNLOCK(&rw->lock_object, 0, file, line); LOCK_LOG_LOCK("RUNLOCK", &rw->lock_object, 0, 0, file, line); td = curthread; v = RW_READ_VALUE(rw); if (__predict_false(LOCKSTAT_OOL_PROFILE_ENABLED(rw__release) || !__rw_runlock_try(rw, td, &v))) __rw_runlock_hard(rw, td, v LOCK_FILE_LINE_ARG); TD_LOCKS_DEC(curthread); } void _rw_runlock_cookie(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); _rw_runlock_cookie_int(rw LOCK_FILE_LINE_ARG); } /* * This function is called when we are unable to obtain a write lock on the * first try. This means that at least one other thread holds either a * read or write lock. 
*/ void __rw_wlock_hard(volatile uintptr_t *c, uintptr_t v LOCK_FILE_LINE_ARG_DEF) { uintptr_t tid; struct rwlock *rw; struct turnstile *ts; struct thread *owner; #ifdef ADAPTIVE_RWLOCKS int spintries = 0; int i, n; #endif uintptr_t x; #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif #if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) uintptr_t state; int doing_lockprof; #endif tid = (uintptr_t)curthread; if (SCHEDULER_STOPPED()) return; #if defined(ADAPTIVE_RWLOCKS) lock_delay_arg_init(&lda, &rw_delay); #elif defined(KDTRACE_HOOKS) lock_delay_arg_init(&lda, NULL); #endif rw = rwlock2rw(c); if (__predict_false(v == RW_UNLOCKED)) v = RW_READ_VALUE(rw); if (__predict_false(lv_rw_wowner(v) == (struct thread *)tid)) { KASSERT(rw->lock_object.lo_flags & LO_RECURSABLE, ("%s: recursing but non-recursive rw %s @ %s:%d\n", __func__, rw->lock_object.lo_name, file, line)); rw->rw_recurse++; atomic_set_ptr(&rw->rw_lock, RW_LOCK_WRITER_RECURSED); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p recursing", __func__, rw); return; } if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__, rw->lock_object.lo_name, (void *)rw->rw_lock, file, line); #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&rw->lock_object, &contested, &waittime); #ifdef LOCK_PROFILING doing_lockprof = 1; state = v; #elif defined(KDTRACE_HOOKS) doing_lockprof = lockstat_enabled; if (__predict_false(doing_lockprof)) { all_time -= lockstat_nsecs(&rw->lock_object); state = v; } #endif for (;;) { if (v == RW_UNLOCKED) { if (_rw_write_lock_fetch(rw, &v, tid)) break; continue; } #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef ADAPTIVE_RWLOCKS /* * If the lock is write locked and the owner is * running on another CPU, spin until the owner stops * running or the state of the lock changes. */ owner = lv_rw_wowner(v); if (!(v & RW_LOCK_READ) && TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, rw, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); do { lock_delay(&lda); v = RW_READ_VALUE(rw); owner = lv_rw_wowner(v); } while (owner != NULL && TD_IS_RUNNING(owner)); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } if ((v & RW_LOCK_READ) && RW_READERS(v) && spintries < rowner_retries) { if (!(v & RW_LOCK_WRITE_SPINNER)) { if (!atomic_fcmpset_ptr(&rw->rw_lock, &v, v | RW_LOCK_WRITE_SPINNER)) { continue; } } spintries++; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); for (i = 0; i < rowner_loops; i += n) { n = RW_READERS(v); lock_delay_spin(n); v = RW_READ_VALUE(rw); if ((v & RW_LOCK_WRITE_SPINNER) == 0) break; } KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); #ifdef KDTRACE_HOOKS lda.spin_cnt += rowner_loops - i; #endif if (i != rowner_loops) continue; } #endif ts = turnstile_trywait(&rw->lock_object); v = RW_READ_VALUE(rw); retry_ts: owner = lv_rw_wowner(v); #ifdef ADAPTIVE_RWLOCKS /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the turnstile * chain lock. 
If so, drop the turnstile lock and try * again. */ if (owner != NULL) { if (TD_IS_RUNNING(owner)) { turnstile_cancel(ts); continue; } } #endif /* * Check for the waiters flags about this rwlock. * If the lock was released, without maintain any pending * waiters queue, simply try to acquire it. * If a pending waiters queue is present, claim the lock * ownership and maintain the pending queue. */ x = v & (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER); if ((v & ~x) == RW_UNLOCKED) { x &= ~RW_LOCK_WRITE_SPINNER; if (atomic_fcmpset_acq_ptr(&rw->rw_lock, &v, tid | x)) { if (x) turnstile_claim(ts); else turnstile_cancel(ts); break; } goto retry_ts; } /* * If the RW_LOCK_WRITE_WAITERS flag isn't set, then try to * set it. If we fail to set it, then loop back and try * again. */ if (!(v & RW_LOCK_WRITE_WAITERS)) { if (!atomic_fcmpset_ptr(&rw->rw_lock, &v, v | RW_LOCK_WRITE_WAITERS)) goto retry_ts; if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set write waiters flag", __func__, rw); } /* * We were unable to acquire the lock and the write waiters * flag is set, so we must block on the turnstile. */ if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__, rw); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&rw->lock_object); #endif MPASS(owner == rw_owner(rw)); turnstile_wait(ts, owner, TS_EXCLUSIVE_QUEUE); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&rw->lock_object); sleep_cnt++; #endif if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from turnstile", __func__, rw); #ifdef ADAPTIVE_RWLOCKS spintries = 0; #endif v = RW_READ_VALUE(rw); } #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!doing_lockprof)) return; #endif #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&rw->lock_object); if (sleep_time) LOCKSTAT_RECORD4(rw__block, rw, sleep_time, LOCKSTAT_WRITER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state)); /* Record only the loops spinning and not sleeping. */ if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(rw__spin, rw, all_time - sleep_time, LOCKSTAT_WRITER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state)); #endif LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, contested, waittime, file, line, LOCKSTAT_WRITER); } /* * This function is called if lockstat is active or the first try at releasing * a write lock failed. The latter means that the lock is recursed or one of * the 2 waiter bits must be set indicating that at least one thread is waiting * on this lock. */ void __rw_wunlock_hard(volatile uintptr_t *c, uintptr_t v LOCK_FILE_LINE_ARG_DEF) { struct rwlock *rw; struct turnstile *ts; uintptr_t tid, setv; int queue; tid = (uintptr_t)curthread; if (SCHEDULER_STOPPED()) return; rw = rwlock2rw(c); if (__predict_false(v == tid)) v = RW_READ_VALUE(rw); if (v & RW_LOCK_WRITER_RECURSED) { if (--(rw->rw_recurse) == 0) atomic_clear_ptr(&rw->rw_lock, RW_LOCK_WRITER_RECURSED); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, rw); return; } LOCKSTAT_PROFILE_RELEASE_RWLOCK(rw__release, rw, LOCKSTAT_WRITER); if (v == tid && _rw_write_unlock(rw, tid)) return; KASSERT(rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS), ("%s: neither of the waiter flags are set", __func__)); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p contested", __func__, rw); turnstile_chain_lock(&rw->lock_object); /* * Use the same algo as sx locks for now. 
Prefer waking up shared * waiters if we have any over writers. This is probably not ideal. * * 'v' is the value we are going to write back to rw_lock. If we * have waiters on both queues, we need to preserve the state of * the waiter flag for the queue we don't wake up. For now this is * hardcoded for the algorithm mentioned above. * * In the case of both readers and writers waiting we wakeup the * readers but leave the RW_LOCK_WRITE_WAITERS flag set. If a * new writer comes in before a reader it will claim the lock up * above. There is probably a potential priority inversion in * there that could be worked around either by waking both queues * of waiters or doing some complicated lock handoff gymnastics. */ setv = RW_UNLOCKED; v = RW_READ_VALUE(rw); queue = TS_SHARED_QUEUE; if (v & RW_LOCK_WRITE_WAITERS) { queue = TS_EXCLUSIVE_QUEUE; setv |= (v & RW_LOCK_READ_WAITERS); } atomic_store_rel_ptr(&rw->rw_lock, setv); /* Wake up all waiters for the specific queue. */ if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR3(KTR_LOCK, "%s: %p waking up %s waiters", __func__, rw, queue == TS_SHARED_QUEUE ? "read" : "write"); ts = turnstile_lookup(&rw->lock_object); MPASS(ts != NULL); turnstile_broadcast(ts, queue); turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); turnstile_chain_unlock(&rw->lock_object); } /* * Attempt to do a non-blocking upgrade from a read lock to a write * lock. This will only succeed if this thread holds a single read * lock. Returns true if the upgrade succeeded and false otherwise. */ int __rw_try_upgrade_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF) { uintptr_t v, x, tid; struct turnstile *ts; int success; if (SCHEDULER_STOPPED()) return (1); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_try_upgrade() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(&rw->rw_lock, RA_RLOCKED, file, line); /* * Attempt to switch from one reader to a writer. If there * are any write waiters, then we will have to lock the * turnstile first to prevent races with another writer * calling turnstile_wait() before we have claimed this * turnstile. So, do the simple case of no waiters first. */ tid = (uintptr_t)curthread; success = 0; for (;;) { v = rw->rw_lock; if (RW_READERS(v) > 1) break; if (!(v & RW_LOCK_WAITERS)) { success = atomic_cmpset_acq_ptr(&rw->rw_lock, v, tid); if (!success) continue; break; } /* * Ok, we think we have waiters, so lock the turnstile. */ ts = turnstile_trywait(&rw->lock_object); v = rw->rw_lock; if (RW_READERS(v) > 1) { turnstile_cancel(ts); break; } /* * Try to switch from one reader to a writer again. This time * we honor the current state of the waiters flags. * If we obtain the lock with the flags set, then claim * ownership of the turnstile. */ x = rw->rw_lock & RW_LOCK_WAITERS; success = atomic_cmpset_ptr(&rw->rw_lock, v, tid | x); if (success) { if (x) turnstile_claim(ts); else turnstile_cancel(ts); break; } turnstile_cancel(ts); } LOCK_LOG_TRY("WUPGRADE", &rw->lock_object, 0, success, file, line); if (success) { curthread->td_rw_rlocks--; WITNESS_UPGRADE(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); LOCKSTAT_RECORD0(rw__upgrade, rw); } return (success); } int __rw_try_upgrade(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); return (__rw_try_upgrade_int(rw LOCK_FILE_LINE_ARG)); } /* * Downgrade a write lock into a single read lock. 
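 *
 * A minimal usage sketch (editor's illustration, not part of the original
 * source; "cache_lock" and the cache helpers are hypothetical):
 *
 *	rw_wlock(&cache_lock);
 *	cache_insert(entry);		exclusive access while modifying
 *	rw_downgrade(&cache_lock);	atomically become a reader
 *	cache_lookup(key);		other readers may run concurrently
 *	rw_runlock(&cache_lock);
 *
 * The downgrade is atomic, so no writer can slip in between the
 * exclusive and shared phases.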
*/ void __rw_downgrade_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF) { struct turnstile *ts; uintptr_t tid, v; int rwait, wwait; if (SCHEDULER_STOPPED()) return; KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_downgrade() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(&rw->rw_lock, RA_WLOCKED | RA_NOTRECURSED, file, line); #ifndef INVARIANTS if (rw_recursed(rw)) panic("downgrade of a recursed lock"); #endif WITNESS_DOWNGRADE(&rw->lock_object, 0, file, line); /* * Convert from a writer to a single reader. First we handle * the easy case with no waiters. If there are any waiters, we * lock the turnstile and "disown" the lock. */ tid = (uintptr_t)curthread; if (atomic_cmpset_rel_ptr(&rw->rw_lock, tid, RW_READERS_LOCK(1))) goto out; /* * Ok, we think we have waiters, so lock the turnstile so we can * read the waiter flags without any races. */ turnstile_chain_lock(&rw->lock_object); v = rw->rw_lock & RW_LOCK_WAITERS; rwait = v & RW_LOCK_READ_WAITERS; wwait = v & RW_LOCK_WRITE_WAITERS; MPASS(rwait | wwait); /* * Downgrade from a write lock while preserving waiters flag * and give up ownership of the turnstile. */ ts = turnstile_lookup(&rw->lock_object); MPASS(ts != NULL); if (!wwait) v &= ~RW_LOCK_READ_WAITERS; atomic_store_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v); /* * Wake other readers if there are no writers pending. Otherwise they * won't be able to acquire the lock anyway. */ if (rwait && !wwait) { turnstile_broadcast(ts, TS_SHARED_QUEUE); turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); } else turnstile_disown(ts); turnstile_chain_unlock(&rw->lock_object); out: curthread->td_rw_rlocks++; LOCK_LOG_LOCK("WDOWNGRADE", &rw->lock_object, 0, 0, file, line); LOCKSTAT_RECORD0(rw__downgrade, rw); } void __rw_downgrade(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); __rw_downgrade_int(rw LOCK_FILE_LINE_ARG); } #ifdef INVARIANT_SUPPORT #ifndef INVARIANTS #undef __rw_assert #endif /* * In the non-WITNESS case, rw_assert() can only detect that at least * *some* thread owns an rlock, but it cannot guarantee that *this* * thread owns an rlock. */ void __rw_assert(const volatile uintptr_t *c, int what, const char *file, int line) { const struct rwlock *rw; if (panicstr != NULL) return; rw = rwlock2rw(c); switch (what) { case RA_LOCKED: case RA_LOCKED | RA_RECURSED: case RA_LOCKED | RA_NOTRECURSED: case RA_RLOCKED: case RA_RLOCKED | RA_RECURSED: case RA_RLOCKED | RA_NOTRECURSED: #ifdef WITNESS witness_assert(&rw->lock_object, what, file, line); #else /* * If some other thread has a write lock or we have one * and are asserting a read lock, fail. Also, if no one * has a lock at all, fail. */ if (rw->rw_lock == RW_UNLOCKED || (!(rw->rw_lock & RW_LOCK_READ) && (what & RA_RLOCKED || rw_wowner(rw) != curthread))) panic("Lock %s not %slocked @ %s:%d\n", rw->lock_object.lo_name, (what & RA_RLOCKED) ? 
"read " : "", file, line); if (!(rw->rw_lock & RW_LOCK_READ) && !(what & RA_RLOCKED)) { if (rw_recursed(rw)) { if (what & RA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); } else if (what & RA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); } #endif break; case RA_WLOCKED: case RA_WLOCKED | RA_RECURSED: case RA_WLOCKED | RA_NOTRECURSED: if (rw_wowner(rw) != curthread) panic("Lock %s not exclusively locked @ %s:%d\n", rw->lock_object.lo_name, file, line); if (rw_recursed(rw)) { if (what & RA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); } else if (what & RA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); break; case RA_UNLOCKED: #ifdef WITNESS witness_assert(&rw->lock_object, what, file, line); #else /* * If we hold a write lock fail. We can't reliably check * to see if we hold a read lock or not. */ if (rw_wowner(rw) == curthread) panic("Lock %s exclusively locked @ %s:%d\n", rw->lock_object.lo_name, file, line); #endif break; default: panic("Unknown rw lock assertion: %d @ %s:%d", what, file, line); } } #endif /* INVARIANT_SUPPORT */ #ifdef DDB void db_show_rwlock(const struct lock_object *lock) { const struct rwlock *rw; struct thread *td; rw = (const struct rwlock *)lock; db_printf(" state: "); if (rw->rw_lock == RW_UNLOCKED) db_printf("UNLOCKED\n"); else if (rw->rw_lock == RW_DESTROYED) { db_printf("DESTROYED\n"); return; } else if (rw->rw_lock & RW_LOCK_READ) db_printf("RLOCK: %ju locks\n", (uintmax_t)(RW_READERS(rw->rw_lock))); else { td = rw_wowner(rw); db_printf("WLOCK: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); if (rw_recursed(rw)) db_printf(" recursed: %u\n", rw->rw_recurse); } db_printf(" waiters: "); switch (rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS)) { case RW_LOCK_READ_WAITERS: db_printf("readers\n"); break; case RW_LOCK_WRITE_WAITERS: db_printf("writers\n"); break; case RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS: db_printf("readers and writers\n"); break; default: db_printf("none\n"); break; } } #endif Index: head/sys/kern/kern_sdt.c =================================================================== --- head/sys/kern/kern_sdt.c (revision 326270) +++ head/sys/kern/kern_sdt.c (revision 326271) @@ -1,54 +1,56 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright 2006-2008 John Birrell * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include SDT_PROVIDER_DEFINE(sdt); /* * Hook for the DTrace probe function. The SDT provider will set this to * dtrace_probe() when it loads. */ sdt_probe_func_t sdt_probe_func = sdt_probe_stub; volatile bool __read_frequently sdt_probes_enabled; /* * This is a stub for probe calls in case kernel DTrace support isn't * enabled. It should never get called because there is no DTrace support * to enable it. */ void sdt_probe_stub(uint32_t id, uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4) { printf("sdt_probe_stub: unexpectedly called\n"); kdb_backtrace(); } Index: head/sys/kern/kern_sema.c =================================================================== --- head/sys/kern/kern_sema.c (revision 326270) +++ head/sys/kern/kern_sema.c (revision 326271) @@ -1,176 +1,178 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (C) 2001 Jason Evans . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ /* * Counting semaphores. * * Priority propagation will not generally raise the priority of semaphore * "owners" (a misnomer in the context of semaphores), so should not be relied * upon in combination with semaphores. 
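 *
 * A minimal producer/consumer sketch (editor's illustration; the work
 * queue and its helpers are hypothetical):
 *
 *	static struct sema work_sema;
 *	sema_init(&work_sema, 0, "pending work");
 *
 *	producer:  enqueue(item); sema_post(&work_sema);
 *	consumer:  sema_wait(&work_sema); item = dequeue();
 *
 * The semaphore counts queued items, so a consumer sleeps only while
 * the queue is empty.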
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include void sema_init(struct sema *sema, int value, const char *description) { KASSERT((value >= 0), ("%s(): negative value\n", __func__)); bzero(sema, sizeof(*sema)); mtx_init(&sema->sema_mtx, description, "sema backing lock", MTX_DEF | MTX_NOWITNESS | MTX_QUIET); cv_init(&sema->sema_cv, description); sema->sema_value = value; CTR4(KTR_LOCK, "%s(%p, %d, \"%s\")", __func__, sema, value, description); } void sema_destroy(struct sema *sema) { CTR3(KTR_LOCK, "%s(%p) \"%s\"", __func__, sema, cv_wmesg(&sema->sema_cv)); KASSERT((sema->sema_waiters == 0), ("%s(): waiters\n", __func__)); mtx_destroy(&sema->sema_mtx); cv_destroy(&sema->sema_cv); } void _sema_post(struct sema *sema, const char *file, int line) { mtx_lock(&sema->sema_mtx); sema->sema_value++; if (sema->sema_waiters && sema->sema_value > 0) cv_signal(&sema->sema_cv); CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema, cv_wmesg(&sema->sema_cv), sema->sema_value, file, line); mtx_unlock(&sema->sema_mtx); } void _sema_wait(struct sema *sema, const char *file, int line) { mtx_lock(&sema->sema_mtx); while (sema->sema_value == 0) { sema->sema_waiters++; cv_wait(&sema->sema_cv, &sema->sema_mtx); sema->sema_waiters--; } sema->sema_value--; CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema, cv_wmesg(&sema->sema_cv), sema->sema_value, file, line); mtx_unlock(&sema->sema_mtx); } int _sema_timedwait(struct sema *sema, int timo, const char *file, int line) { int error; mtx_lock(&sema->sema_mtx); /* * A spurious wakeup will cause the timeout interval to start over. * This isn't a big deal as long as spurious wakeups don't occur * continuously, since the timeout period is merely a lower bound on how * long to wait. */ for (error = 0; sema->sema_value == 0 && error == 0;) { sema->sema_waiters++; error = cv_timedwait(&sema->sema_cv, &sema->sema_mtx, timo); sema->sema_waiters--; } if (sema->sema_value > 0) { /* Success. */ sema->sema_value--; error = 0; CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema, cv_wmesg(&sema->sema_cv), sema->sema_value, file, line); } else { CTR5(KTR_LOCK, "%s(%p) \"%s\" fail at %s:%d", __func__, sema, cv_wmesg(&sema->sema_cv), file, line); } mtx_unlock(&sema->sema_mtx); return (error); } int _sema_trywait(struct sema *sema, const char *file, int line) { int ret; mtx_lock(&sema->sema_mtx); if (sema->sema_value > 0) { /* Success. */ sema->sema_value--; ret = 1; CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema, cv_wmesg(&sema->sema_cv), sema->sema_value, file, line); } else { ret = 0; CTR5(KTR_LOCK, "%s(%p) \"%s\" fail at %s:%d", __func__, sema, cv_wmesg(&sema->sema_cv), file, line); } mtx_unlock(&sema->sema_mtx); return (ret); } int sema_value(struct sema *sema) { int ret; mtx_lock(&sema->sema_mtx); ret = sema->sema_value; mtx_unlock(&sema->sema_mtx); return (ret); } Index: head/sys/kern/kern_sharedpage.c =================================================================== --- head/sys/kern/kern_sharedpage.c (revision 326270) +++ head/sys/kern/kern_sharedpage.c (revision 326271) @@ -1,288 +1,290 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010, 2012 Konstantin Belousov * Copyright (c) 2015 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct sx shared_page_alloc_sx; static vm_object_t shared_page_obj; static int shared_page_free; char *shared_page_mapping; void shared_page_write(int base, int size, const void *data) { bcopy(data, shared_page_mapping + base, size); } static int shared_page_alloc_locked(int size, int align) { int res; res = roundup(shared_page_free, align); if (res + size >= IDX_TO_OFF(shared_page_obj->size)) res = -1; else shared_page_free = res + size; return (res); } int shared_page_alloc(int size, int align) { int res; sx_xlock(&shared_page_alloc_sx); res = shared_page_alloc_locked(size, align); sx_xunlock(&shared_page_alloc_sx); return (res); } int shared_page_fill(int size, int align, const void *data) { int res; sx_xlock(&shared_page_alloc_sx); res = shared_page_alloc_locked(size, align); if (res != -1) shared_page_write(res, size, data); sx_xunlock(&shared_page_alloc_sx); return (res); } static void shared_page_init(void *dummy __unused) { vm_page_t m; vm_offset_t addr; sx_init(&shared_page_alloc_sx, "shpsx"); shared_page_obj = vm_pager_allocate(OBJT_PHYS, 0, PAGE_SIZE, VM_PROT_DEFAULT, 0, NULL); VM_OBJECT_WLOCK(shared_page_obj); m = vm_page_grab(shared_page_obj, 0, VM_ALLOC_NOBUSY | VM_ALLOC_ZERO); m->valid = VM_PAGE_BITS_ALL; VM_OBJECT_WUNLOCK(shared_page_obj); addr = kva_alloc(PAGE_SIZE); pmap_qenter(addr, &m, 1); shared_page_mapping = (char *)addr; } SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)shared_page_init, NULL); /* * Push the timehands update to the shared page. * * The lockless update scheme is similar to the one used to update the * in-kernel timehands, see sys/kern/kern_tc.c:tc_windup() (which * calls us after the timehands are updated). 
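 *
 * For reference, a consumer of the shared page pairs with this update
 * roughly as follows (editor's sketch of the generation protocol, not
 * the literal userspace code):
 *
 *	do {
 *		curr = tk->tk_current;
 *		gen = tk->tk_th[curr].th_gen;	acquire load
 *		copy tk->tk_th[curr] to a local buffer;
 *	} while (gen == 0 || gen != tk->tk_th[curr].th_gen);
 *
 * A generation of zero marks an update in progress, so a reader retries
 * until it sees the same nonzero generation before and after the copy.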
*/ static void timehands_update(struct vdso_sv_tk *svtk) { struct vdso_timehands th; struct vdso_timekeep *tk; uint32_t enabled, idx; enabled = tc_fill_vdso_timehands(&th); th.th_gen = 0; idx = svtk->sv_timekeep_curr; if (++idx >= VDSO_TH_NUM) idx = 0; svtk->sv_timekeep_curr = idx; if (++svtk->sv_timekeep_gen == 0) svtk->sv_timekeep_gen = 1; tk = (struct vdso_timekeep *)(shared_page_mapping + svtk->sv_timekeep_off); tk->tk_th[idx].th_gen = 0; atomic_thread_fence_rel(); if (enabled) tk->tk_th[idx] = th; atomic_store_rel_32(&tk->tk_th[idx].th_gen, svtk->sv_timekeep_gen); atomic_store_rel_32(&tk->tk_current, idx); /* * The ordering of the assignment to tk_enabled relative to * the update of the vdso_timehands is not important. */ tk->tk_enabled = enabled; } #ifdef COMPAT_FREEBSD32 static void timehands_update32(struct vdso_sv_tk *svtk) { struct vdso_timehands32 th; struct vdso_timekeep32 *tk; uint32_t enabled, idx; enabled = tc_fill_vdso_timehands32(&th); th.th_gen = 0; idx = svtk->sv_timekeep_curr; if (++idx >= VDSO_TH_NUM) idx = 0; svtk->sv_timekeep_curr = idx; if (++svtk->sv_timekeep_gen == 0) svtk->sv_timekeep_gen = 1; tk = (struct vdso_timekeep32 *)(shared_page_mapping + svtk->sv_timekeep_off); tk->tk_th[idx].th_gen = 0; atomic_thread_fence_rel(); if (enabled) tk->tk_th[idx] = th; atomic_store_rel_32(&tk->tk_th[idx].th_gen, svtk->sv_timekeep_gen); atomic_store_rel_32(&tk->tk_current, idx); tk->tk_enabled = enabled; } #endif /* * This is hackish, but it is the easiest way to avoid creating list * structures that need to be iterated over from the hardclock * interrupt context. */ static struct vdso_sv_tk *host_svtk; #ifdef COMPAT_FREEBSD32 static struct vdso_sv_tk *compat32_svtk; #endif void timekeep_push_vdso(void) { if (host_svtk != NULL) timehands_update(host_svtk); #ifdef COMPAT_FREEBSD32 if (compat32_svtk != NULL) timehands_update32(compat32_svtk); #endif } struct vdso_sv_tk * alloc_sv_tk(void) { struct vdso_sv_tk *svtk; int tk_base; uint32_t tk_ver; tk_ver = VDSO_TK_VER_CURR; svtk = malloc(sizeof(struct vdso_sv_tk), M_TEMP, M_WAITOK | M_ZERO); tk_base = shared_page_alloc(sizeof(struct vdso_timekeep) + sizeof(struct vdso_timehands) * VDSO_TH_NUM, 16); KASSERT(tk_base != -1, ("tk_base -1 for native")); shared_page_write(tk_base + offsetof(struct vdso_timekeep, tk_ver), sizeof(uint32_t), &tk_ver); svtk->sv_timekeep_off = tk_base; timekeep_push_vdso(); return (svtk); } #ifdef COMPAT_FREEBSD32 struct vdso_sv_tk * alloc_sv_tk_compat32(void) { struct vdso_sv_tk *svtk; int tk_base; uint32_t tk_ver; svtk = malloc(sizeof(struct vdso_sv_tk), M_TEMP, M_WAITOK | M_ZERO); tk_ver = VDSO_TK_VER_CURR; tk_base = shared_page_alloc(sizeof(struct vdso_timekeep32) + sizeof(struct vdso_timehands32) * VDSO_TH_NUM, 16); KASSERT(tk_base != -1, ("tk_base -1 for 32bit")); shared_page_write(tk_base + offsetof(struct vdso_timekeep32, tk_ver), sizeof(uint32_t), &tk_ver); svtk->sv_timekeep_off = tk_base; timekeep_push_vdso(); return (svtk); } #endif void exec_sysvec_init(void *param) { struct sysentvec *sv; sv = (struct sysentvec *)param; if ((sv->sv_flags & SV_SHP) == 0) return; sv->sv_shared_page_obj = shared_page_obj; sv->sv_sigcode_base = sv->sv_shared_page_base + shared_page_fill(*(sv->sv_szsigcode), 16, sv->sv_sigcode); if ((sv->sv_flags & SV_ABI_MASK) != SV_ABI_FREEBSD) return; if ((sv->sv_flags & SV_TIMEKEEP) != 0) { #ifdef COMPAT_FREEBSD32 if ((sv->sv_flags & SV_ILP32) != 0) { KASSERT(compat32_svtk == NULL, ("Compat32 already registered")); compat32_svtk = alloc_sv_tk_compat32(); sv->sv_timekeep_base =
sv->sv_shared_page_base + compat32_svtk->sv_timekeep_off; } else { #endif KASSERT(host_svtk == NULL, ("Host already registered")); host_svtk = alloc_sv_tk(); sv->sv_timekeep_base = sv->sv_shared_page_base + host_svtk->sv_timekeep_off; #ifdef COMPAT_FREEBSD32 } #endif } } Index: head/sys/kern/kern_switch.c =================================================================== --- head/sys/kern/kern_switch.c (revision 326270) +++ head/sys/kern/kern_switch.c (revision 326271) @@ -1,541 +1,543 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2001 Jake Burkholder * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include /* Uncomment this to enable logging of critical_enter/exit. */ #if 0 #define KTR_CRITICAL KTR_SCHED #else #define KTR_CRITICAL 0 #endif #ifdef FULL_PREEMPTION #ifndef PREEMPTION #error "The FULL_PREEMPTION option requires the PREEMPTION option" #endif #endif CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS); /* * kern.sched.preemption allows user space to determine if preemption support * is compiled in or not. It is not currently a boot or runtime flag that * can be changed. */ #ifdef PREEMPTION static int kern_sched_preemption = 1; #else static int kern_sched_preemption = 0; #endif SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD, &kern_sched_preemption, 0, "Kernel preemption enabled"); /* * Support for scheduler stats exported via kern.sched.stats. All stats may * be reset with kern.sched.stats.reset = 1. Stats may be defined elsewhere * with SCHED_STAT_DEFINE(). */ #ifdef SCHED_STATS SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats"); /* Switch reasons from mi_switch(). 
*/ DPCPU_DEFINE(long, sched_switch_stats[SWT_COUNT]); SCHED_STAT_DEFINE_VAR(uncategorized, &DPCPU_NAME(sched_switch_stats[SWT_NONE]), ""); SCHED_STAT_DEFINE_VAR(preempt, &DPCPU_NAME(sched_switch_stats[SWT_PREEMPT]), ""); SCHED_STAT_DEFINE_VAR(owepreempt, &DPCPU_NAME(sched_switch_stats[SWT_OWEPREEMPT]), ""); SCHED_STAT_DEFINE_VAR(turnstile, &DPCPU_NAME(sched_switch_stats[SWT_TURNSTILE]), ""); SCHED_STAT_DEFINE_VAR(sleepq, &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQ]), ""); SCHED_STAT_DEFINE_VAR(sleepqtimo, &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQTIMO]), ""); SCHED_STAT_DEFINE_VAR(relinquish, &DPCPU_NAME(sched_switch_stats[SWT_RELINQUISH]), ""); SCHED_STAT_DEFINE_VAR(needresched, &DPCPU_NAME(sched_switch_stats[SWT_NEEDRESCHED]), ""); SCHED_STAT_DEFINE_VAR(idle, &DPCPU_NAME(sched_switch_stats[SWT_IDLE]), ""); SCHED_STAT_DEFINE_VAR(iwait, &DPCPU_NAME(sched_switch_stats[SWT_IWAIT]), ""); SCHED_STAT_DEFINE_VAR(suspend, &DPCPU_NAME(sched_switch_stats[SWT_SUSPEND]), ""); SCHED_STAT_DEFINE_VAR(remotepreempt, &DPCPU_NAME(sched_switch_stats[SWT_REMOTEPREEMPT]), ""); SCHED_STAT_DEFINE_VAR(remotewakeidle, &DPCPU_NAME(sched_switch_stats[SWT_REMOTEWAKEIDLE]), ""); static int sysctl_stats_reset(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *p; uintptr_t counter; int error; int val; int i; val = 0; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == 0) return (0); /* * Traverse the list of children of _kern_sched_stats and reset each * to 0. Skip the reset entry. */ SLIST_FOREACH(p, oidp->oid_parent, oid_link) { if (p == oidp || p->oid_arg1 == NULL) continue; counter = (uintptr_t)p->oid_arg1; CPU_FOREACH(i) { *(long *)(dpcpu_off[i] + counter) = 0; } } return (0); } SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_WR, NULL, 0, sysctl_stats_reset, "I", "Reset scheduler statistics"); #endif /************************************************************************ * Functions that manipulate runnability from a thread perspective. * ************************************************************************/ /* * Select the thread that will be run next. */ static __noinline struct thread * choosethread_panic(struct thread *td) { /* * If we are in panic, only allow system threads, * plus the one we are running in, to be run. */ retry: if (((td->td_proc->p_flag & P_SYSTEM) == 0 && (td->td_flags & TDF_INPANIC) == 0)) { /* note that it is no longer on the run queue */ TD_SET_CAN_RUN(td); td = sched_choose(); goto retry; } TD_SET_RUNNING(td); return (td); } struct thread * choosethread(void) { struct thread *td; td = sched_choose(); if (__predict_false(panicstr != NULL)) return (choosethread_panic(td)); TD_SET_RUNNING(td); return (td); } /* * Kernel thread preemption implementation. Critical sections mark * regions of code in which preemptions are not allowed. * * It might seem a good idea to inline critical_enter() but, in order * to prevent instructions reordering by the compiler, a __compiler_membar() * would have to be used here (the same as sched_pin()). The performance * penalty imposed by the membar could, then, produce slower code than * the function call itself, for most cases. 
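 *
 * Typical usage (editor's illustration; the surrounding per-CPU logic
 * is hypothetical):
 *
 *	critical_enter();
 *	cpu = PCPU_GET(cpuid);	no preemption, so "cpu" stays accurate
 *	... touch per-CPU state belonging to "cpu" ...
 *	critical_exit();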
*/ void critical_enter(void) { struct thread *td; td = curthread; td->td_critnest++; CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td, (long)td->td_proc->p_pid, td->td_name, td->td_critnest); } void critical_exit(void) { struct thread *td; int flags; td = curthread; KASSERT(td->td_critnest != 0, ("critical_exit: td_critnest == 0")); if (td->td_critnest == 1) { td->td_critnest = 0; /* * Interrupt handlers execute critical_exit() on * leave, and td_owepreempt may be left set by an * interrupt handler only when td_critnest > 0. If we * are decrementing td_critnest from 1 to 0, read * td_owepreempt after decrementing, to not miss the * preempt. Disallow compiler to reorder operations. */ __compiler_membar(); if (td->td_owepreempt && !kdb_active) { /* * Microoptimization: we committed to switch, * disable preemption in interrupt handlers * while spinning for the thread lock. */ td->td_critnest = 1; thread_lock(td); td->td_critnest--; flags = SW_INVOL | SW_PREEMPT; if (TD_IS_IDLETHREAD(td)) flags |= SWT_IDLE; else flags |= SWT_OWEPREEMPT; mi_switch(flags, NULL); thread_unlock(td); } } else td->td_critnest--; CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td, (long)td->td_proc->p_pid, td->td_name, td->td_critnest); } /************************************************************************ * SYSTEM RUN QUEUE manipulations and tests * ************************************************************************/ /* * Initialize a run structure. */ void runq_init(struct runq *rq) { int i; bzero(rq, sizeof *rq); for (i = 0; i < RQ_NQS; i++) TAILQ_INIT(&rq->rq_queues[i]); } /* * Clear the status bit of the queue corresponding to priority level pri, * indicating that it is empty. */ static __inline void runq_clrbit(struct runq *rq, int pri) { struct rqbits *rqb; rqb = &rq->rq_status; CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d", rqb->rqb_bits[RQB_WORD(pri)], rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri), RQB_BIT(pri), RQB_WORD(pri)); rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri); } /* * Find the index of the first non-empty run queue. This is done by * scanning the status bits, a set bit indicates a non-empty queue. */ static __inline int runq_findbit(struct runq *rq) { struct rqbits *rqb; int pri; int i; rqb = &rq->rq_status; for (i = 0; i < RQB_LEN; i++) if (rqb->rqb_bits[i]) { pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW); CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d", rqb->rqb_bits[i], i, pri); return (pri); } return (-1); } static __inline int runq_findbit_from(struct runq *rq, u_char pri) { struct rqbits *rqb; rqb_word_t mask; int i; /* * Set the mask for the first word so we ignore priorities before 'pri'. */ mask = (rqb_word_t)-1 << (pri & (RQB_BPW - 1)); rqb = &rq->rq_status; again: for (i = RQB_WORD(pri); i < RQB_LEN; mask = -1, i++) { mask = rqb->rqb_bits[i] & mask; if (mask == 0) continue; pri = RQB_FFS(mask) + (i << RQB_L2BPW); CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d", mask, i, pri); return (pri); } if (pri == 0) return (-1); /* * Wrap back around to the beginning of the list just once so we * scan the whole thing. */ pri = 0; goto again; } /* * Set the status bit of the queue corresponding to priority level pri, * indicating that it is non-empty. 
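 *
 * Worked example (editor's note): assuming RQB_BPW == 32, priority
 * level pri == 37 lands in status word RQB_WORD(37) == 37 >> 5 == 1,
 * at bit RQB_BIT(37) == 1 << (37 & 31) == 1 << 5. runq_setbit() ORs
 * that bit in; runq_clrbit() clears it once the queue drains.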
*/ static __inline void runq_setbit(struct runq *rq, int pri) { struct rqbits *rqb; rqb = &rq->rq_status; CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d", rqb->rqb_bits[RQB_WORD(pri)], rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri), RQB_BIT(pri), RQB_WORD(pri)); rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri); } /* * Add the thread to the queue specified by its priority, and set the * corresponding status bit. */ void runq_add(struct runq *rq, struct thread *td, int flags) { struct rqhead *rqh; int pri; pri = td->td_priority / RQ_PPQ; td->td_rqindex = pri; runq_setbit(rq, pri); rqh = &rq->rq_queues[pri]; CTR4(KTR_RUNQ, "runq_add: td=%p pri=%d %d rqh=%p", td, td->td_priority, pri, rqh); if (flags & SRQ_PREEMPTED) { TAILQ_INSERT_HEAD(rqh, td, td_runq); } else { TAILQ_INSERT_TAIL(rqh, td, td_runq); } } void runq_add_pri(struct runq *rq, struct thread *td, u_char pri, int flags) { struct rqhead *rqh; KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri)); td->td_rqindex = pri; runq_setbit(rq, pri); rqh = &rq->rq_queues[pri]; CTR4(KTR_RUNQ, "runq_add_pri: td=%p pri=%d idx=%d rqh=%p", td, td->td_priority, pri, rqh); if (flags & SRQ_PREEMPTED) { TAILQ_INSERT_HEAD(rqh, td, td_runq); } else { TAILQ_INSERT_TAIL(rqh, td, td_runq); } } /* * Return true if there are runnable processes of any priority on the run * queue, false otherwise. Has no side effects, does not modify the run * queue structure. */ int runq_check(struct runq *rq) { struct rqbits *rqb; int i; rqb = &rq->rq_status; for (i = 0; i < RQB_LEN; i++) if (rqb->rqb_bits[i]) { CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d", rqb->rqb_bits[i], i); return (1); } CTR0(KTR_RUNQ, "runq_check: empty"); return (0); } /* * Find the highest priority process on the run queue. */ struct thread * runq_choose_fuzz(struct runq *rq, int fuzz) { struct rqhead *rqh; struct thread *td; int pri; while ((pri = runq_findbit(rq)) != -1) { rqh = &rq->rq_queues[pri]; /* fuzz == 1 is normal.. 0 or less are ignored */ if (fuzz > 1) { /* * In the first couple of entries, check if * there is one for our CPU as a preference. */ int count = fuzz; int cpu = PCPU_GET(cpuid); struct thread *td2; td2 = td = TAILQ_FIRST(rqh); while (count-- && td2) { if (td2->td_lastcpu == cpu) { td = td2; break; } td2 = TAILQ_NEXT(td2, td_runq); } } else td = TAILQ_FIRST(rqh); KASSERT(td != NULL, ("runq_choose_fuzz: no proc on busy queue")); CTR3(KTR_RUNQ, "runq_choose_fuzz: pri=%d thread=%p rqh=%p", pri, td, rqh); return (td); } CTR1(KTR_RUNQ, "runq_choose_fuzz: idleproc pri=%d", pri); return (NULL); } /* * Find the highest priority process on the run queue. 
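 *
 * Editor's note: "highest priority" means the lowest numeric index,
 * since runq_findbit() returns the first set status bit. Assuming
 * RQ_PPQ == 4, for example, a thread queued by runq_add() with
 * td_priority == 100 sits on queue 100 / 4 == 25 and is chosen ahead
 * of anything queued at index 26 or higher.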
*/ struct thread * runq_choose(struct runq *rq) { struct rqhead *rqh; struct thread *td; int pri; while ((pri = runq_findbit(rq)) != -1) { rqh = &rq->rq_queues[pri]; td = TAILQ_FIRST(rqh); KASSERT(td != NULL, ("runq_choose: no thread on busy queue")); CTR3(KTR_RUNQ, "runq_choose: pri=%d thread=%p rqh=%p", pri, td, rqh); return (td); } CTR1(KTR_RUNQ, "runq_choose: idlethread pri=%d", pri); return (NULL); } struct thread * runq_choose_from(struct runq *rq, u_char idx) { struct rqhead *rqh; struct thread *td; int pri; if ((pri = runq_findbit_from(rq, idx)) != -1) { rqh = &rq->rq_queues[pri]; td = TAILQ_FIRST(rqh); KASSERT(td != NULL, ("runq_choose: no thread on busy queue")); CTR4(KTR_RUNQ, "runq_choose_from: pri=%d thread=%p idx=%d rqh=%p", pri, td, td->td_rqindex, rqh); return (td); } CTR1(KTR_RUNQ, "runq_choose_from: idlethread pri=%d", pri); return (NULL); } /* * Remove the thread from the queue specified by its priority, and clear the * corresponding status bit if the queue becomes empty. * Caller must set state afterwards. */ void runq_remove(struct runq *rq, struct thread *td) { runq_remove_idx(rq, td, NULL); } void runq_remove_idx(struct runq *rq, struct thread *td, u_char *idx) { struct rqhead *rqh; u_char pri; KASSERT(td->td_flags & TDF_INMEM, ("runq_remove_idx: thread swapped out")); pri = td->td_rqindex; KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri)); rqh = &rq->rq_queues[pri]; CTR4(KTR_RUNQ, "runq_remove_idx: td=%p, pri=%d %d rqh=%p", td, td->td_priority, pri, rqh); TAILQ_REMOVE(rqh, td, td_runq); if (TAILQ_EMPTY(rqh)) { CTR0(KTR_RUNQ, "runq_remove_idx: empty"); runq_clrbit(rq, pri); if (idx != NULL && *idx == pri) *idx = (pri + 1) % RQ_NQS; } } Index: head/sys/kern/kern_sx.c =================================================================== --- head/sys/kern/kern_sx.c (revision 326270) +++ head/sys/kern/kern_sx.c (revision 326271) @@ -1,1404 +1,1406 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2007 Attilio Rao * Copyright (c) 2001 Jason Evans * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ /* * Shared/exclusive locks. 
This implementation attempts to ensure * deterministic lock granting behavior, so that slocks and xlocks are * interleaved. * * Priority propagation will not generally raise the priority of lock holders, * so should not be relied upon in combination with sx locks. */ #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_no_adaptive_sx.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) && !defined(NO_ADAPTIVE_SX) #include #endif #ifdef DDB #include #endif #if defined(SMP) && !defined(NO_ADAPTIVE_SX) #define ADAPTIVE_SX #endif CTASSERT((SX_NOADAPTIVE & LO_CLASSFLAGS) == SX_NOADAPTIVE); #ifdef HWPMC_HOOKS #include PMC_SOFT_DECLARE( , , lock, failed); #endif /* Handy macros for sleep queues. */ #define SQ_EXCLUSIVE_QUEUE 0 #define SQ_SHARED_QUEUE 1 /* * Variations on DROP_GIANT()/PICKUP_GIANT() for use in this file. We * drop Giant anytime we have to sleep or if we adaptively spin. */ #define GIANT_DECLARE \ int _giantcnt = 0; \ WITNESS_SAVE_DECL(Giant) \ #define GIANT_SAVE(work) do { \ if (mtx_owned(&Giant)) { \ work++; \ WITNESS_SAVE(&Giant.lock_object, Giant); \ while (mtx_owned(&Giant)) { \ _giantcnt++; \ mtx_unlock(&Giant); \ } \ } \ } while (0) #define GIANT_RESTORE() do { \ if (_giantcnt > 0) { \ mtx_assert(&Giant, MA_NOTOWNED); \ while (_giantcnt--) \ mtx_lock(&Giant); \ WITNESS_RESTORE(&Giant.lock_object, Giant); \ } \ } while (0) /* * Returns true if an exclusive lock is recursed. It assumes * curthread currently has an exclusive lock. */ #define sx_recursed(sx) ((sx)->sx_recurse != 0) static void assert_sx(const struct lock_object *lock, int what); #ifdef DDB static void db_show_sx(const struct lock_object *lock); #endif static void lock_sx(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_sx(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_sx(struct lock_object *lock); struct lock_class lock_class_sx = { .lc_name = "sx", .lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE, .lc_assert = assert_sx, #ifdef DDB .lc_ddb_show = db_show_sx, #endif .lc_lock = lock_sx, .lc_unlock = unlock_sx, #ifdef KDTRACE_HOOKS .lc_owner = owner_sx, #endif }; #ifndef INVARIANTS #define _sx_assert(sx, what, file, line) #endif #ifdef ADAPTIVE_SX static __read_frequently u_int asx_retries = 10; static __read_frequently u_int asx_loops = 10000; static SYSCTL_NODE(_debug, OID_AUTO, sx, CTLFLAG_RD, NULL, "sxlock debugging"); SYSCTL_UINT(_debug_sx, OID_AUTO, retries, CTLFLAG_RW, &asx_retries, 0, ""); SYSCTL_UINT(_debug_sx, OID_AUTO, loops, CTLFLAG_RW, &asx_loops, 0, ""); static struct lock_delay_config __read_frequently sx_delay; SYSCTL_INT(_debug_sx, OID_AUTO, delay_base, CTLFLAG_RW, &sx_delay.base, 0, ""); SYSCTL_INT(_debug_sx, OID_AUTO, delay_max, CTLFLAG_RW, &sx_delay.max, 0, ""); LOCK_DELAY_SYSINIT_DEFAULT(sx_delay); #endif void assert_sx(const struct lock_object *lock, int what) { sx_assert((const struct sx *)lock, what); } void lock_sx(struct lock_object *lock, uintptr_t how) { struct sx *sx; sx = (struct sx *)lock; if (how) sx_slock(sx); else sx_xlock(sx); } uintptr_t unlock_sx(struct lock_object *lock) { struct sx *sx; sx = (struct sx *)lock; sx_assert(sx, SA_LOCKED | SA_NOTRECURSED); if (sx_xlocked(sx)) { sx_xunlock(sx); return (0); } else { sx_sunlock(sx); return (1); } } #ifdef KDTRACE_HOOKS int owner_sx(const struct lock_object *lock, struct thread **owner) { const struct sx *sx; 
uintptr_t x; sx = (const struct sx *)lock; x = sx->sx_lock; *owner = NULL; return ((x & SX_LOCK_SHARED) != 0 ? (SX_SHARERS(x) != 0) : ((*owner = (struct thread *)SX_OWNER(x)) != NULL)); } #endif void sx_sysinit(void *arg) { struct sx_args *sargs = arg; sx_init_flags(sargs->sa_sx, sargs->sa_desc, sargs->sa_flags); } void sx_init_flags(struct sx *sx, const char *description, int opts) { int flags; MPASS((opts & ~(SX_QUIET | SX_RECURSE | SX_NOWITNESS | SX_DUPOK | SX_NOPROFILE | SX_NOADAPTIVE | SX_NEW)) == 0); ASSERT_ATOMIC_LOAD_PTR(sx->sx_lock, ("%s: sx_lock not aligned for %s: %p", __func__, description, &sx->sx_lock)); flags = LO_SLEEPABLE | LO_UPGRADABLE; if (opts & SX_DUPOK) flags |= LO_DUPOK; if (opts & SX_NOPROFILE) flags |= LO_NOPROFILE; if (!(opts & SX_NOWITNESS)) flags |= LO_WITNESS; if (opts & SX_RECURSE) flags |= LO_RECURSABLE; if (opts & SX_QUIET) flags |= LO_QUIET; if (opts & SX_NEW) flags |= LO_NEW; flags |= opts & SX_NOADAPTIVE; lock_init(&sx->lock_object, &lock_class_sx, description, NULL, flags); sx->sx_lock = SX_LOCK_UNLOCKED; sx->sx_recurse = 0; } void sx_destroy(struct sx *sx) { KASSERT(sx->sx_lock == SX_LOCK_UNLOCKED, ("sx lock still held")); KASSERT(sx->sx_recurse == 0, ("sx lock still recursed")); sx->sx_lock = SX_LOCK_DESTROYED; lock_destroy(&sx->lock_object); } int sx_try_slock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF) { uintptr_t x; if (SCHEDULER_STOPPED()) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("sx_try_slock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); x = sx->sx_lock; for (;;) { KASSERT(x != SX_LOCK_DESTROYED, ("sx_try_slock() of destroyed sx @ %s:%d", file, line)); if (!(x & SX_LOCK_SHARED)) break; if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, x + SX_ONE_SHARER)) { LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 1, file, line); WITNESS_LOCK(&sx->lock_object, LOP_TRYLOCK, file, line); LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, 0, 0, file, line, LOCKSTAT_READER); TD_LOCKS_INC(curthread); return (1); } } LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 0, file, line); return (0); } int sx_try_slock_(struct sx *sx, const char *file, int line) { return (sx_try_slock_int(sx LOCK_FILE_LINE_ARG)); } int _sx_xlock(struct sx *sx, int opts, const char *file, int line) { uintptr_t tid, x; int error = 0; KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() || !TD_IS_IDLETHREAD(curthread), ("sx_xlock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_xlock() of destroyed sx @ %s:%d", file, line)); WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); tid = (uintptr_t)curthread; x = SX_LOCK_UNLOCKED; if (!atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid)) error = _sx_xlock_hard(sx, x, opts LOCK_FILE_LINE_ARG); else LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, 0, 0, file, line, LOCKSTAT_WRITER); if (!error) { LOCK_LOG_LOCK("XLOCK", &sx->lock_object, 0, sx->sx_recurse, file, line); WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_INC(curthread); } return (error); } int sx_try_xlock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF) { struct thread *td; uintptr_t tid, x; int rval; bool recursed; td = curthread; tid = (uintptr_t)td; if (SCHEDULER_STOPPED_TD(td)) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(td), ("sx_try_xlock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_try_xlock() 
of destroyed sx @ %s:%d", file, line)); rval = 1; recursed = false; x = SX_LOCK_UNLOCKED; for (;;) { if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid)) break; if (x == SX_LOCK_UNLOCKED) continue; if (x == tid && (sx->lock_object.lo_flags & LO_RECURSABLE)) { sx->sx_recurse++; atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED); break; } rval = 0; break; } LOCK_LOG_TRY("XLOCK", &sx->lock_object, 0, rval, file, line); if (rval) { WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); if (!recursed) LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, 0, 0, file, line, LOCKSTAT_WRITER); TD_LOCKS_INC(curthread); } return (rval); } int sx_try_xlock_(struct sx *sx, const char *file, int line) { return (sx_try_xlock_int(sx LOCK_FILE_LINE_ARG)); } void _sx_xunlock(struct sx *sx, const char *file, int line) { KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_xunlock() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_XLOCKED, file, line); WITNESS_UNLOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("XUNLOCK", &sx->lock_object, 0, sx->sx_recurse, file, line); #if LOCK_DEBUG > 0 _sx_xunlock_hard(sx, (uintptr_t)curthread, file, line); #else __sx_xunlock(sx, curthread, file, line); #endif TD_LOCKS_DEC(curthread); } /* * Try to do a non-blocking upgrade from a shared lock to an exclusive lock. * This will only succeed if this thread holds a single shared lock. * Return 1 if the upgrade succeeded, 0 otherwise. */ int sx_try_upgrade_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF) { uintptr_t x; int success; if (SCHEDULER_STOPPED()) return (1); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_try_upgrade() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_SLOCKED, file, line); /* * Try to switch from one shared lock to an exclusive lock. We need * to maintain the SX_LOCK_EXCLUSIVE_WAITERS flag if set so that * we will wake up the exclusive waiters when we drop the lock. */ x = sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS; success = atomic_cmpset_acq_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | x, (uintptr_t)curthread | x); LOCK_LOG_TRY("XUPGRADE", &sx->lock_object, 0, success, file, line); if (success) { WITNESS_UPGRADE(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); LOCKSTAT_RECORD0(sx__upgrade, sx); } return (success); } int sx_try_upgrade_(struct sx *sx, const char *file, int line) { return (sx_try_upgrade_int(sx LOCK_FILE_LINE_ARG)); } /* * Downgrade an unrecursed exclusive lock into a single shared lock. */ void sx_downgrade_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF) { uintptr_t x; int wakeup_swapper; if (SCHEDULER_STOPPED()) return; KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_downgrade() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_XLOCKED | SA_NOTRECURSED, file, line); #ifndef INVARIANTS if (sx_recursed(sx)) panic("downgrade of a recursed lock"); #endif WITNESS_DOWNGRADE(&sx->lock_object, 0, file, line); /* * Try to switch from an exclusive lock with no shared waiters * to one sharer with no shared waiters. If there are * exclusive waiters, we don't need to lock the sleep queue so * long as we preserve the flag. We do one quick try and if * that fails we grab the sleepq lock to keep the flags from * changing and do it the slow way. * * We have to lock the sleep queue if there are shared waiters * so we can wake them up.
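 *
 * In terms of the lock word, the quick try below attempts one atomic
 * transition (editor's sketch, shown with an exclusive waiter
 * present):
 *
 *	tid | SX_LOCK_EXCLUSIVE_WAITERS
 *	    --> SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS
 *
 * i.e. the owner pointer is replaced by a share count of one while the
 * exclusive-waiters bit rides along unchanged.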
*/ x = sx->sx_lock; if (!(x & SX_LOCK_SHARED_WAITERS) && atomic_cmpset_rel_ptr(&sx->sx_lock, x, SX_SHARERS_LOCK(1) | (x & SX_LOCK_EXCLUSIVE_WAITERS))) goto out; /* * Lock the sleep queue so we can read the waiters bits * without any races and wakeup any shared waiters. */ sleepq_lock(&sx->lock_object); /* * Preserve SX_LOCK_EXCLUSIVE_WAITERS while downgraded to a single * shared lock. If there are any shared waiters, wake them up. */ wakeup_swapper = 0; x = sx->sx_lock; atomic_store_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | (x & SX_LOCK_EXCLUSIVE_WAITERS)); if (x & SX_LOCK_SHARED_WAITERS) wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, SQ_SHARED_QUEUE); sleepq_release(&sx->lock_object); if (wakeup_swapper) kick_proc0(); out: LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line); LOCKSTAT_RECORD0(sx__downgrade, sx); } void sx_downgrade_(struct sx *sx, const char *file, int line) { sx_downgrade_int(sx LOCK_FILE_LINE_ARG); } /* * This function represents the so-called 'hard case' for sx_xlock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h. */ int _sx_xlock_hard(struct sx *sx, uintptr_t x, int opts LOCK_FILE_LINE_ARG_DEF) { GIANT_DECLARE; uintptr_t tid; #ifdef ADAPTIVE_SX volatile struct thread *owner; u_int i, n, spintries = 0; #endif #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif int error = 0; #if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) uintptr_t state; #endif int extra_work = 0; tid = (uintptr_t)curthread; if (SCHEDULER_STOPPED()) return (0); #if defined(ADAPTIVE_SX) lock_delay_arg_init(&lda, &sx_delay); #elif defined(KDTRACE_HOOKS) lock_delay_arg_init(&lda, NULL); #endif if (__predict_false(x == SX_LOCK_UNLOCKED)) x = SX_READ_VALUE(sx); /* If we already hold an exclusive lock, then recurse. */ if (__predict_false(lv_sx_owner(x) == (struct thread *)tid)) { KASSERT((sx->lock_object.lo_flags & LO_RECURSABLE) != 0, ("_sx_xlock_hard: recursed on non-recursive sx %s @ %s:%d\n", sx->lock_object.lo_name, file, line)); sx->sx_recurse++; atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED); if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p recursing", __func__, sx); return (0); } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__, sx->lock_object.lo_name, (void *)sx->sx_lock, file, line); #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&sx->lock_object, &contested, &waittime); #ifdef LOCK_PROFILING extra_work = 1; state = x; #elif defined(KDTRACE_HOOKS) extra_work = lockstat_enabled; if (__predict_false(extra_work)) { all_time -= lockstat_nsecs(&sx->lock_object); state = x; } #endif for (;;) { if (x == SX_LOCK_UNLOCKED) { if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid)) break; continue; } #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef ADAPTIVE_SX /* * If the lock is write locked and the owner is * running on another CPU, spin until the owner stops * running or the state of the lock changes. 
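 *
 * Editor's note: lock_delay() supplies the backoff between polls of
 * the lock word; its parameters come from sx_delay and can be
 * inspected or tuned at runtime, e.g.:
 *
 *	# sysctl debug.sx.delay_base
 *	# sysctl debug.sx.delay_max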
*/ if ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { if ((x & SX_LOCK_SHARED) == 0) { owner = lv_sx_owner(x); if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, sx, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", sx->lock_object.lo_name); GIANT_SAVE(extra_work); do { lock_delay(&lda); x = SX_READ_VALUE(sx); owner = lv_sx_owner(x); } while (owner != NULL && TD_IS_RUNNING(owner)); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } } else if (SX_SHARERS(x) && spintries < asx_retries) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", sx->lock_object.lo_name); GIANT_SAVE(extra_work); spintries++; for (i = 0; i < asx_loops; i += n) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR4(KTR_LOCK, "%s: shared spinning on %p with %u and %u", __func__, sx, spintries, i); n = SX_SHARERS(x); lock_delay_spin(n); x = SX_READ_VALUE(sx); if ((x & SX_LOCK_SHARED) == 0 || SX_SHARERS(x) == 0) break; } #ifdef KDTRACE_HOOKS lda.spin_cnt += i; #endif KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); if (i != asx_loops) continue; } } #endif sleepq_lock(&sx->lock_object); x = SX_READ_VALUE(sx); retry_sleepq: /* * If the lock was released while spinning on the * sleep queue chain lock, try again. */ if (x == SX_LOCK_UNLOCKED) { sleepq_release(&sx->lock_object); continue; } #ifdef ADAPTIVE_SX /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the sleep queue * chain lock. If so, drop the sleep queue lock and try * again. */ if (!(x & SX_LOCK_SHARED) && (sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { owner = (struct thread *)SX_OWNER(x); if (TD_IS_RUNNING(owner)) { sleepq_release(&sx->lock_object); continue; } } #endif /* * If an exclusive lock was released with both shared * and exclusive waiters and a shared waiter hasn't * woken up and acquired the lock yet, sx_lock will be * set to SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS. * If we see that value, try to acquire it once. Note * that we have to preserve SX_LOCK_EXCLUSIVE_WAITERS, * since there may still be other exclusive waiters. If we * fail, restart the loop. */ if (x == (SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS)) { if (!atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid | SX_LOCK_EXCLUSIVE_WAITERS)) goto retry_sleepq; sleepq_release(&sx->lock_object); CTR2(KTR_LOCK, "%s: %p claimed by new writer", __func__, sx); break; } /* * Try to set the SX_LOCK_EXCLUSIVE_WAITERS flag. If we fail, * then loop back and retry. */ if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) { if (!atomic_fcmpset_ptr(&sx->sx_lock, &x, x | SX_LOCK_EXCLUSIVE_WAITERS)) { goto retry_sleepq; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set excl waiters flag", __func__, sx); } /* * Since we have been unable to acquire the exclusive * lock and the exclusive waiters flag is set, we have * to sleep. */ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on sleep queue", __func__, sx); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&sx->lock_object); #endif GIANT_SAVE(extra_work); sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name, SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ?
SLEEPQ_INTERRUPTIBLE : 0), SQ_EXCLUSIVE_QUEUE); if (!(opts & SX_INTERRUPTIBLE)) sleepq_wait(&sx->lock_object, 0); else error = sleepq_wait_sig(&sx->lock_object, 0); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&sx->lock_object); sleep_cnt++; #endif if (error) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: interruptible sleep by %p suspended by signal", __func__, sx); break; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from sleep queue", __func__, sx); x = SX_READ_VALUE(sx); } #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!extra_work)) return (error); #endif #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&sx->lock_object); if (sleep_time) LOCKSTAT_RECORD4(sx__block, sx, sleep_time, LOCKSTAT_WRITER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(sx__spin, sx, all_time - sleep_time, LOCKSTAT_WRITER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); #endif if (!error) LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, contested, waittime, file, line, LOCKSTAT_WRITER); GIANT_RESTORE(); return (error); } /* * This function represents the so-called 'hard case' for sx_xunlock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h. */ void _sx_xunlock_hard(struct sx *sx, uintptr_t x LOCK_FILE_LINE_ARG_DEF) { uintptr_t tid, setx; int queue, wakeup_swapper; if (SCHEDULER_STOPPED()) return; tid = (uintptr_t)curthread; if (__predict_false(x == tid)) x = SX_READ_VALUE(sx); MPASS(!(x & SX_LOCK_SHARED)); if (__predict_false(x & SX_LOCK_RECURSED)) { /* The lock is recursed, unrecurse one level. */ if ((--sx->sx_recurse) == 0) atomic_clear_ptr(&sx->sx_lock, SX_LOCK_RECURSED); if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, sx); return; } LOCKSTAT_PROFILE_RELEASE_RWLOCK(sx__release, sx, LOCKSTAT_WRITER); if (x == tid && atomic_cmpset_rel_ptr(&sx->sx_lock, tid, SX_LOCK_UNLOCKED)) return; if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p contested", __func__, sx); sleepq_lock(&sx->lock_object); x = SX_READ_VALUE(sx); MPASS(x & (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)); /* * The wake up algorithm here is quite simple and probably not * ideal. It gives precedence to shared waiters if they are * present. For this condition, we have to preserve the * state of the exclusive waiters flag. * If interruptible sleeps left the shared queue empty, avoid * starvation of the threads sleeping on the exclusive queue by * giving them precedence and clearing the shared waiters bit anyway. */ setx = SX_LOCK_UNLOCKED; queue = SQ_EXCLUSIVE_QUEUE; if ((x & SX_LOCK_SHARED_WAITERS) != 0 && sleepq_sleepcnt(&sx->lock_object, SQ_SHARED_QUEUE) != 0) { queue = SQ_SHARED_QUEUE; setx |= (x & SX_LOCK_EXCLUSIVE_WAITERS); } atomic_store_rel_ptr(&sx->sx_lock, setx); /* Wake up all the waiters for the specific queue. */ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: %p waking up all threads on %s queue", __func__, sx, queue == SQ_SHARED_QUEUE ?
"shared" : "exclusive"); wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, queue); sleepq_release(&sx->lock_object); if (wakeup_swapper) kick_proc0(); } static bool __always_inline __sx_slock_try(struct sx *sx, uintptr_t *xp LOCK_FILE_LINE_ARG_DEF) { /* * If no other thread has an exclusive lock then try to bump up * the count of sharers. Since we have to preserve the state * of SX_LOCK_EXCLUSIVE_WAITERS, if we fail to acquire the * shared lock loop back and retry. */ while (*xp & SX_LOCK_SHARED) { MPASS(!(*xp & SX_LOCK_SHARED_WAITERS)); if (atomic_fcmpset_acq_ptr(&sx->sx_lock, xp, *xp + SX_ONE_SHARER)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeed %p -> %p", __func__, sx, (void *)*xp, (void *)(*xp + SX_ONE_SHARER)); return (true); } } return (false); } static int __noinline _sx_slock_hard(struct sx *sx, int opts, uintptr_t x LOCK_FILE_LINE_ARG_DEF) { GIANT_DECLARE; #ifdef ADAPTIVE_SX volatile struct thread *owner; #endif #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif int error = 0; #if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) uintptr_t state; #endif int extra_work = 0; if (SCHEDULER_STOPPED()) return (0); #if defined(ADAPTIVE_SX) lock_delay_arg_init(&lda, &sx_delay); #elif defined(KDTRACE_HOOKS) lock_delay_arg_init(&lda, NULL); #endif #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&sx->lock_object, &contested, &waittime); #ifdef LOCK_PROFILING extra_work = 1; state = x; #elif defined(KDTRACE_HOOKS) extra_work = lockstat_enabled; if (__predict_false(extra_work)) { all_time -= lockstat_nsecs(&sx->lock_object); state = x; } #endif /* * As with rwlocks, we don't make any attempt to try to block * shared locks once there is an exclusive waiter. */ for (;;) { if (__sx_slock_try(sx, &x LOCK_FILE_LINE_ARG)) break; #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef ADAPTIVE_SX /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. */ if ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { owner = lv_sx_owner(x); if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, sx, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", sx->lock_object.lo_name); GIANT_SAVE(extra_work); do { lock_delay(&lda); x = SX_READ_VALUE(sx); owner = lv_sx_owner(x); } while (owner != NULL && TD_IS_RUNNING(owner)); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } } #endif /* * Some other thread already has an exclusive lock, so * start the process of blocking. */ sleepq_lock(&sx->lock_object); x = SX_READ_VALUE(sx); retry_sleepq: /* * The lock could have been released while we spun. * In this case loop back and retry. */ if (x & SX_LOCK_SHARED) { sleepq_release(&sx->lock_object); continue; } #ifdef ADAPTIVE_SX /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. */ if (!(x & SX_LOCK_SHARED) && (sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { owner = (struct thread *)SX_OWNER(x); if (TD_IS_RUNNING(owner)) { sleepq_release(&sx->lock_object); x = SX_READ_VALUE(sx); continue; } } #endif /* * Try to set the SX_LOCK_SHARED_WAITERS flag. 
If we * fail to set it, drop the sleep queue lock and loop * back. */ if (!(x & SX_LOCK_SHARED_WAITERS)) { if (!atomic_fcmpset_ptr(&sx->sx_lock, &x, x | SX_LOCK_SHARED_WAITERS)) goto retry_sleepq; if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set shared waiters flag", __func__, sx); } /* * Since we have been unable to acquire the shared lock, * we have to sleep. */ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on sleep queue", __func__, sx); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&sx->lock_object); #endif GIANT_SAVE(extra_work); sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name, SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ? SLEEPQ_INTERRUPTIBLE : 0), SQ_SHARED_QUEUE); if (!(opts & SX_INTERRUPTIBLE)) sleepq_wait(&sx->lock_object, 0); else error = sleepq_wait_sig(&sx->lock_object, 0); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&sx->lock_object); sleep_cnt++; #endif if (error) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: interruptible sleep by %p suspended by signal", __func__, sx); break; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from sleep queue", __func__, sx); x = SX_READ_VALUE(sx); } #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!extra_work)) return (error); #endif #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&sx->lock_object); if (sleep_time) LOCKSTAT_RECORD4(sx__block, sx, sleep_time, LOCKSTAT_READER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(sx__spin, sx, all_time - sleep_time, LOCKSTAT_READER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); #endif if (error == 0) { LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, contested, waittime, file, line, LOCKSTAT_READER); } GIANT_RESTORE(); return (error); } int _sx_slock_int(struct sx *sx, int opts LOCK_FILE_LINE_ARG_DEF) { uintptr_t x; int error; KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() || !TD_IS_IDLETHREAD(curthread), ("sx_slock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_slock() of destroyed sx @ %s:%d", file, line)); WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER, file, line, NULL); error = 0; x = SX_READ_VALUE(sx); if (__predict_false(LOCKSTAT_OOL_PROFILE_ENABLED(sx__acquire) || !__sx_slock_try(sx, &x LOCK_FILE_LINE_ARG))) error = _sx_slock_hard(sx, opts, x LOCK_FILE_LINE_ARG); if (error == 0) { LOCK_LOG_LOCK("SLOCK", &sx->lock_object, 0, 0, file, line); WITNESS_LOCK(&sx->lock_object, 0, file, line); TD_LOCKS_INC(curthread); } return (error); } int _sx_slock(struct sx *sx, int opts, const char *file, int line) { return (_sx_slock_int(sx, opts LOCK_FILE_LINE_ARG)); } static bool __always_inline _sx_sunlock_try(struct sx *sx, uintptr_t *xp) { for (;;) { /* * We should never have waiting sharers (threads queued for a * shared lock) while at least one thread already holds a shared lock. */ KASSERT(!(*xp & SX_LOCK_SHARED_WAITERS), ("%s: waiting sharers", __func__)); /* * See if there is more than one shared lock held. If * so, just drop one and return. */ if (SX_SHARERS(*xp) > 1) { if (atomic_fcmpset_rel_ptr(&sx->sx_lock, xp, *xp - SX_ONE_SHARER)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeeded %p -> %p", __func__, sx, (void *)*xp, (void *)(*xp - SX_ONE_SHARER)); return (true); } continue; } /* * If there aren't any waiters for an exclusive lock, * then try to drop it quickly.
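The release side, _sx_sunlock_try(), is the mirror image. Continuing the hedged C11 sketch above (same invented rl_* layout), the last sharer either resets the word or bails out to a wakeup slow path when a writer is queued:

/*
 * Mirror of _sx_sunlock_try(): with more than one sharer, just CAS the
 * count down by one.  The last sharer resets the word to "shared mode,
 * no holders"; if RL_EXCL_WAITERS is set it instead returns false so
 * the caller can run a slow path that wakes the queued writers, as
 * _sx_sunlock_hard() does with sleepq_broadcast().
 */
static bool
rl_try_runlock(_Atomic uintptr_t *lockp)
{
        uintptr_t x = atomic_load_explicit(lockp, memory_order_relaxed);

        for (;;) {
                if (x / RL_ONE_SHARER > 1) {    /* more than one sharer */
                        if (atomic_compare_exchange_weak_explicit(lockp, &x,
                            x - RL_ONE_SHARER, memory_order_release,
                            memory_order_relaxed))
                                return (true);
                        continue;               /* x reloaded; retry */
                }
                if (x & RL_EXCL_WAITERS)
                        return (false);         /* caller must wake writers */
                if (atomic_compare_exchange_weak_explicit(lockp, &x,
                    RL_SHARED, memory_order_release, memory_order_relaxed))
                        return (true);
        }
}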
*/ if (!(*xp & SX_LOCK_EXCLUSIVE_WAITERS)) { MPASS(*xp == SX_SHARERS_LOCK(1)); *xp = SX_SHARERS_LOCK(1); if (atomic_fcmpset_rel_ptr(&sx->sx_lock, xp, SX_LOCK_UNLOCKED)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p last succeeded", __func__, sx); return (true); } continue; } break; } return (false); } static void __noinline _sx_sunlock_hard(struct sx *sx, uintptr_t x LOCK_FILE_LINE_ARG_DEF) { int wakeup_swapper; uintptr_t setx; if (SCHEDULER_STOPPED()) return; if (_sx_sunlock_try(sx, &x)) goto out_lockstat; /* * At this point, there should just be one sharer with * exclusive waiters. */ MPASS(x == (SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS)); sleepq_lock(&sx->lock_object); x = SX_READ_VALUE(sx); for (;;) { MPASS(x & SX_LOCK_EXCLUSIVE_WAITERS); MPASS(!(x & SX_LOCK_SHARED_WAITERS)); /* * The wakeup semantics here are quite simple: * just wake up all the exclusive waiters. * Note that the state of the lock could have changed, * so if it fails, loop back and retry. */ setx = x - SX_ONE_SHARER; setx &= ~SX_LOCK_EXCLUSIVE_WAITERS; if (!atomic_fcmpset_rel_ptr(&sx->sx_lock, &x, setx)) continue; if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p waking up all threads on exclusive queue", __func__, sx); wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, SQ_EXCLUSIVE_QUEUE); break; } sleepq_release(&sx->lock_object); if (wakeup_swapper) kick_proc0(); out_lockstat: LOCKSTAT_PROFILE_RELEASE_RWLOCK(sx__release, sx, LOCKSTAT_READER); } void _sx_sunlock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF) { uintptr_t x; KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_sunlock() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_SLOCKED, file, line); WITNESS_UNLOCK(&sx->lock_object, 0, file, line); LOCK_LOG_LOCK("SUNLOCK", &sx->lock_object, 0, 0, file, line); x = SX_READ_VALUE(sx); if (__predict_false(LOCKSTAT_OOL_PROFILE_ENABLED(sx__release) || !_sx_sunlock_try(sx, &x))) _sx_sunlock_hard(sx, x LOCK_FILE_LINE_ARG); TD_LOCKS_DEC(curthread); } void _sx_sunlock(struct sx *sx, const char *file, int line) { _sx_sunlock_int(sx LOCK_FILE_LINE_ARG); } #ifdef INVARIANT_SUPPORT #ifndef INVARIANTS #undef _sx_assert #endif /* * In the non-WITNESS case, sx_assert() can only detect that at least * *some* thread owns an slock, but it cannot guarantee that *this* * thread owns an slock. */ void _sx_assert(const struct sx *sx, int what, const char *file, int line) { #ifndef WITNESS int slocked = 0; #endif if (panicstr != NULL) return; switch (what) { case SA_SLOCKED: case SA_SLOCKED | SA_NOTRECURSED: case SA_SLOCKED | SA_RECURSED: #ifndef WITNESS slocked = 1; /* FALLTHROUGH */ #endif case SA_LOCKED: case SA_LOCKED | SA_NOTRECURSED: case SA_LOCKED | SA_RECURSED: #ifdef WITNESS witness_assert(&sx->lock_object, what, file, line); #else /* * If some other thread has an exclusive lock or we * have one and are asserting a shared lock, fail. * Also, if no one has a lock at all, fail. */ if (sx->sx_lock == SX_LOCK_UNLOCKED || (!(sx->sx_lock & SX_LOCK_SHARED) && (slocked || sx_xholder(sx) != curthread))) panic("Lock %s not %slocked @ %s:%d\n", sx->lock_object.lo_name, slocked ?
"share " : "", file, line); if (!(sx->sx_lock & SX_LOCK_SHARED)) { if (sx_recursed(sx)) { if (what & SA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); } else if (what & SA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); } #endif break; case SA_XLOCKED: case SA_XLOCKED | SA_NOTRECURSED: case SA_XLOCKED | SA_RECURSED: if (sx_xholder(sx) != curthread) panic("Lock %s not exclusively locked @ %s:%d\n", sx->lock_object.lo_name, file, line); if (sx_recursed(sx)) { if (what & SA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); } else if (what & SA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); break; case SA_UNLOCKED: #ifdef WITNESS witness_assert(&sx->lock_object, what, file, line); #else /* * If we hold an exclusve lock fail. We can't * reliably check to see if we hold a shared lock or * not. */ if (sx_xholder(sx) == curthread) panic("Lock %s exclusively locked @ %s:%d\n", sx->lock_object.lo_name, file, line); #endif break; default: panic("Unknown sx lock assertion: %d @ %s:%d", what, file, line); } } #endif /* INVARIANT_SUPPORT */ #ifdef DDB static void db_show_sx(const struct lock_object *lock) { struct thread *td; const struct sx *sx; sx = (const struct sx *)lock; db_printf(" state: "); if (sx->sx_lock == SX_LOCK_UNLOCKED) db_printf("UNLOCKED\n"); else if (sx->sx_lock == SX_LOCK_DESTROYED) { db_printf("DESTROYED\n"); return; } else if (sx->sx_lock & SX_LOCK_SHARED) db_printf("SLOCK: %ju\n", (uintmax_t)SX_SHARERS(sx->sx_lock)); else { td = sx_xholder(sx); db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); if (sx_recursed(sx)) db_printf(" recursed: %d\n", sx->sx_recurse); } db_printf(" waiters: "); switch(sx->sx_lock & (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)) { case SX_LOCK_SHARED_WAITERS: db_printf("shared\n"); break; case SX_LOCK_EXCLUSIVE_WAITERS: db_printf("exclusive\n"); break; case SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS: db_printf("exclusive and shared\n"); break; default: db_printf("none\n"); } } /* * Check to see if a thread that is blocked on a sleep queue is actually * blocked on an sx lock. If so, output some details and return true. * If the lock has an exclusive owner, return that in *ownerp. */ int sx_chain(struct thread *td, struct thread **ownerp) { struct sx *sx; /* * Check to see if this thread is blocked on an sx lock. * First, we check the lock class. If that is ok, then we * compare the lock name against the wait message. */ sx = td->td_wchan; if (LOCK_CLASS(&sx->lock_object) != &lock_class_sx || sx->lock_object.lo_name != td->td_wmesg) return (0); /* We think we have an sx lock, so output some details. */ db_printf("blocked on sx \"%s\" ", td->td_wmesg); *ownerp = sx_xholder(sx); if (sx->sx_lock & SX_LOCK_SHARED) db_printf("SLOCK (count %ju)\n", (uintmax_t)SX_SHARERS(sx->sx_lock)); else db_printf("XLOCK\n"); return (1); } #endif Index: head/sys/kern/kern_syscalls.c =================================================================== --- head/sys/kern/kern_syscalls.c (revision 326270) +++ head/sys/kern/kern_syscalls.c (revision 326271) @@ -1,231 +1,233 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1999 Assar Westerlund * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Acts like "nosys" but can be identified in sysent for dynamic call * number assignment for a limited number of calls. * * Place holder for system call slots reserved for loadable modules. */ int lkmnosys(struct thread *td, struct nosys_args *args) { return (nosys(td, args)); } int lkmressys(struct thread *td, struct nosys_args *args) { return (nosys(td, args)); } static void syscall_thread_drain(struct sysent *se) { u_int32_t cnt, oldcnt; do { oldcnt = se->sy_thrcnt; KASSERT((oldcnt & SY_THR_STATIC) == 0, ("drain on static syscall")); cnt = oldcnt | SY_THR_DRAINING; } while (atomic_cmpset_acq_32(&se->sy_thrcnt, oldcnt, cnt) == 0); while (atomic_cmpset_32(&se->sy_thrcnt, SY_THR_DRAINING, SY_THR_ABSENT) == 0) pause("scdrn", hz/2); } int syscall_thread_enter(struct thread *td, struct sysent *se) { u_int32_t cnt, oldcnt; do { oldcnt = se->sy_thrcnt; if ((oldcnt & SY_THR_STATIC) != 0) return (0); if ((oldcnt & (SY_THR_DRAINING | SY_THR_ABSENT)) != 0) return (ENOSYS); cnt = oldcnt + SY_THR_INCR; } while (atomic_cmpset_acq_32(&se->sy_thrcnt, oldcnt, cnt) == 0); return (0); } void syscall_thread_exit(struct thread *td, struct sysent *se) { u_int32_t cnt, oldcnt; do { oldcnt = se->sy_thrcnt; if ((oldcnt & SY_THR_STATIC) != 0) return; cnt = oldcnt - SY_THR_INCR; } while (atomic_cmpset_rel_32(&se->sy_thrcnt, oldcnt, cnt) == 0); } int syscall_register(int *offset, struct sysent *new_sysent, struct sysent *old_sysent, int flags) { int i; if ((flags & ~SY_THR_STATIC) != 0) return (EINVAL); if (*offset == NO_SYSCALL) { for (i = 1; i < SYS_MAXSYSCALL; ++i) if (sysent[i].sy_call == (sy_call_t *)lkmnosys) break; if (i == SYS_MAXSYSCALL) return (ENFILE); *offset = i; } else if (*offset < 0 || *offset >= SYS_MAXSYSCALL) return (EINVAL); else if (sysent[*offset].sy_call != (sy_call_t *)lkmnosys && sysent[*offset].sy_call != (sy_call_t *)lkmressys) return (EEXIST); KASSERT(sysent[*offset].sy_thrcnt == SY_THR_ABSENT, ("dynamic syscall is not protected")); *old_sysent = sysent[*offset]; new_sysent->sy_thrcnt = SY_THR_ABSENT; sysent[*offset] = *new_sysent; atomic_store_rel_32(&sysent[*offset].sy_thrcnt, flags); return (0); } int syscall_deregister(int *offset, struct sysent *old_sysent) { struct sysent *se; if (*offset == 0) return (0); /* XXX? 
*/ se = &sysent[*offset]; if ((se->sy_thrcnt & SY_THR_STATIC) != 0) return (EINVAL); syscall_thread_drain(se); sysent[*offset] = *old_sysent; return (0); } int syscall_module_handler(struct module *mod, int what, void *arg) { struct syscall_module_data *data = arg; modspecific_t ms; int error; switch (what) { case MOD_LOAD: error = syscall_register(data->offset, data->new_sysent, &data->old_sysent, data->flags); if (error) { /* Leave a mark so we know to safely unload below. */ data->offset = NULL; return (error); } ms.intval = *data->offset; MOD_XLOCK; module_setspecific(mod, &ms); MOD_XUNLOCK; if (data->chainevh) error = data->chainevh(mod, what, data->chainarg); return (error); case MOD_UNLOAD: /* * MOD_LOAD failed, so just return without calling the * chained handler since we didn't pass along the MOD_LOAD * event. */ if (data->offset == NULL) return (0); if (data->chainevh) { error = data->chainevh(mod, what, data->chainarg); if (error) return error; } error = syscall_deregister(data->offset, &data->old_sysent); return (error); default: if (data->chainevh) return (data->chainevh(mod, what, data->chainarg)); return (EOPNOTSUPP); } /* NOTREACHED */ } int syscall_helper_register(struct syscall_helper_data *sd, int flags) { struct syscall_helper_data *sd1; int error; for (sd1 = sd; sd1->syscall_no != NO_SYSCALL; sd1++) { error = syscall_register(&sd1->syscall_no, &sd1->new_sysent, &sd1->old_sysent, flags); if (error != 0) { syscall_helper_unregister(sd); return (error); } sd1->registered = 1; } return (0); } int syscall_helper_unregister(struct syscall_helper_data *sd) { struct syscall_helper_data *sd1; for (sd1 = sd; sd1->registered != 0; sd1++) { syscall_deregister(&sd1->syscall_no, &sd1->old_sysent); sd1->registered = 0; } return (0); } Index: head/sys/kern/kern_thr.c =================================================================== --- head/sys/kern/kern_thr.c (revision 326270) +++ head/sys/kern/kern_thr.c (revision 326271) @@ -1,616 +1,618 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
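syscall_register() and the module handler above are what the SYSCALL_MODULE(9) convenience macro ties together. As a hedged illustration (the hello names are invented; the macro and struct sysent fields are the documented ones), a minimal loadable module that lets syscall_register() pick a free lkmnosys slot via NO_SYSCALL could look like:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/module.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>

static int
hello_syscall(struct thread *td, void *arg)
{
        printf("hello from a dynamically registered syscall\n");
        return (0);
}

static struct sysent hello_sysent = {
        .sy_narg = 0,
        .sy_call = (sy_call_t *)hello_syscall,
};

/* NO_SYSCALL asks syscall_register() to scan for a free lkmnosys slot. */
static int hello_offset = NO_SYSCALL;

/* Chained handler; syscall_module_handler() invokes it after MOD_LOAD. */
static int
hello_mod_event(struct module *m, int what, void *arg)
{
        switch (what) {
        case MOD_LOAD:
                uprintf("hello syscall registered at slot %d\n", hello_offset);
                return (0);
        case MOD_UNLOAD:
                return (0);
        default:
                return (EOPNOTSUPP);
        }
}

SYSCALL_MODULE(hello, &hello_offset, &hello_sysent, hello_mod_event, NULL);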
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_posix.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation"); static int max_threads_per_proc = 1500; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW, &max_threads_per_proc, 0, "Limit on threads per proc"); static int max_threads_hits; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD, &max_threads_hits, 0, "kern.threads.max_threads_per_proc hit count"); #ifdef COMPAT_FREEBSD32 static inline int suword_lwpid(void *addr, lwpid_t lwpid) { int error; if (SV_CURPROC_FLAG(SV_LP64)) error = suword(addr, lwpid); else error = suword32(addr, lwpid); return (error); } #else #define suword_lwpid suword #endif /* * System call interface. */ struct thr_create_initthr_args { ucontext_t ctx; long *tid; }; static int thr_create_initthr(struct thread *td, void *thunk) { struct thr_create_initthr_args *args; /* Copy out the child tid. */ args = thunk; if (args->tid != NULL && suword_lwpid(args->tid, td->td_tid)) return (EFAULT); return (set_mcontext(td, &args->ctx.uc_mcontext)); } int sys_thr_create(struct thread *td, struct thr_create_args *uap) /* ucontext_t *ctx, long *id, int flags */ { struct thr_create_initthr_args args; int error; if ((error = copyin(uap->ctx, &args.ctx, sizeof(args.ctx)))) return (error); args.tid = uap->id; return (thread_create(td, NULL, thr_create_initthr, &args)); } int sys_thr_new(struct thread *td, struct thr_new_args *uap) /* struct thr_param * */ { struct thr_param param; int error; if (uap->param_size < 0 || uap->param_size > sizeof(param)) return (EINVAL); bzero(&param, sizeof(param)); if ((error = copyin(uap->param, &param, uap->param_size))) return (error); return (kern_thr_new(td, &param)); } static int thr_new_initthr(struct thread *td, void *thunk) { stack_t stack; struct thr_param *param; /* * Here we copy out the tid to two places, one for the child and one * for the parent, because pthread can create a detached thread; * if the parent wants to safely access the child's tid, it has to * provide its own storage, because the child thread may exit quickly * and the memory may be freed before the parent thread can access it. */ param = thunk; if ((param->child_tid != NULL && suword_lwpid(param->child_tid, td->td_tid)) || (param->parent_tid != NULL && suword_lwpid(param->parent_tid, td->td_tid))) return (EFAULT); /* Set up our machine context. */ stack.ss_sp = param->stack_base; stack.ss_size = param->stack_size; /* Set upcall address to user thread entry function. */ cpu_set_upcall(td, param->start_func, param->arg, &stack); /* Set up the user TLS address and TLS pointer register.
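For context, the thr_param fields consumed by thr_new_initthr() above are filled in from userland roughly as follows. This is a deliberately bare, hedged sketch of thr_new(2): no TLS is set up, so the child must stay out of libc, and the polling sync is crude; real programs should use pthread_create(3), which handles all of this.

#include <sys/types.h>
#include <sys/thr.h>
#include <string.h>
#include <unistd.h>

static long child_tid;          /* thr_new_initthr() copies the tid out here */
static volatile int child_ran;

static void
entry(void *arg)
{
        child_ran = 1;
        thr_exit(NULL);         /* a raw thread must never simply return */
}

int
main(void)
{
        struct thr_param p;
        static char stk[64 * 1024];

        memset(&p, 0, sizeof(p));
        p.start_func = entry;
        p.arg = NULL;
        p.stack_base = stk;
        p.stack_size = sizeof(stk);
        p.child_tid = &child_tid;       /* and/or parent_tid, as noted above */
        if (thr_new(&p, sizeof(p)) != 0)
                return (1);
        while (!child_ran)              /* crude polling; real code uses umtx */
                usleep(1000);
        return (0);
}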
*/ return (cpu_set_user_tls(td, param->tls_base)); } int kern_thr_new(struct thread *td, struct thr_param *param) { struct rtprio rtp, *rtpp; int error; rtpp = NULL; if (param->rtp != 0) { error = copyin(param->rtp, &rtp, sizeof(struct rtprio)); if (error) return (error); rtpp = &rtp; } return (thread_create(td, rtpp, thr_new_initthr, param)); } int thread_create(struct thread *td, struct rtprio *rtp, int (*initialize_thread)(struct thread *, void *), void *thunk) { struct thread *newtd; struct proc *p; int error; p = td->td_proc; if (rtp != NULL) { switch(rtp->type) { case RTP_PRIO_REALTIME: case RTP_PRIO_FIFO: /* Only root can set scheduler policy */ if (priv_check(td, PRIV_SCHED_SETPOLICY) != 0) return (EPERM); if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); break; case RTP_PRIO_NORMAL: rtp->prio = 0; break; default: return (EINVAL); } } #ifdef RACCT if (racct_enable) { PROC_LOCK(p); error = racct_add(p, RACCT_NTHR, 1); PROC_UNLOCK(p); if (error != 0) return (EPROCLIM); } #endif /* Initialize our td */ error = kern_thr_alloc(p, 0, &newtd); if (error) goto fail; cpu_copy_thread(newtd, td); bzero(&newtd->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &newtd->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); newtd->td_proc = td->td_proc; newtd->td_rb_list = newtd->td_rbp_list = newtd->td_rb_inact = 0; thread_cow_get(newtd, td); error = initialize_thread(newtd, thunk); if (error != 0) { thread_cow_free(newtd); thread_free(newtd); goto fail; } PROC_LOCK(p); p->p_flag |= P_HADTHREADS; thread_link(newtd, p); bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name)); thread_lock(td); /* let the scheduler know about these things. */ sched_fork_thread(td, newtd); thread_unlock(td); if (P_SHOULDSTOP(p)) newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if (p->p_ptevents & PTRACE_LWP) newtd->td_dbgflags |= TDB_BORN; /* * Copy the existing thread VM policy into the new thread. */ vm_domain_policy_localcopy(&newtd->td_vm_dom_policy, &td->td_vm_dom_policy); PROC_UNLOCK(p); tidhash_add(newtd); thread_lock(newtd); if (rtp != NULL) { if (!(td->td_pri_class == PRI_TIMESHARE && rtp->type == RTP_PRIO_NORMAL)) { rtp_to_pri(rtp, newtd); sched_prio(newtd, newtd->td_user_pri); } /* ignore timesharing class */ } TD_SET_CAN_RUN(newtd); sched_add(newtd, SRQ_BORING); thread_unlock(newtd); return (0); fail: #ifdef RACCT if (racct_enable) { PROC_LOCK(p); racct_sub(p, RACCT_NTHR, 1); PROC_UNLOCK(p); } #endif return (error); } int sys_thr_self(struct thread *td, struct thr_self_args *uap) /* long *id */ { int error; error = suword_lwpid(uap->id, (unsigned)td->td_tid); if (error == -1) return (EFAULT); return (0); } int sys_thr_exit(struct thread *td, struct thr_exit_args *uap) /* long *state */ { umtx_thread_exit(td); /* Signal userland that it can free the stack. */ if ((void *)uap->state != NULL) { suword_lwpid(uap->state, 1); kern_umtx_wake(td, uap->state, INT_MAX, 0); } return (kern_thr_exit(td)); } int kern_thr_exit(struct thread *td) { struct proc *p; p = td->td_proc; /* * If all of the threads in a process call this routine to * exit (e.g. all threads call pthread_exit()), exactly one * thread should return to the caller to terminate the process * instead of the thread. * * Checking p_numthreads alone is not sufficient since threads * might be committed to terminating while the PROC_LOCK is * dropped in either ptracestop() or while removing this thread * from the tidhash. 
Instead, the p_pendingexits field holds * the count of threads in either of those states and a thread * is considered the "last" thread if all of the other threads * in a process are already terminating. */ PROC_LOCK(p); if (p->p_numthreads == p->p_pendingexits + 1) { /* * Ignore attempts to shut down last thread in the * proc. This will actually call _exit(2) in the * usermode trampoline when it returns. */ PROC_UNLOCK(p); return (0); } p->p_pendingexits++; td->td_dbgflags |= TDB_EXIT; if (p->p_ptevents & PTRACE_LWP) ptracestop(td, SIGTRAP, NULL); PROC_UNLOCK(p); tidhash_remove(td); PROC_LOCK(p); p->p_pendingexits--; /* * The check above should prevent all other threads from this * process from exiting while the PROC_LOCK is dropped, so * there must be at least one other thread other than the * current thread. */ KASSERT(p->p_numthreads > 1, ("too few threads")); racct_sub(p, RACCT_NTHR, 1); tdsigcleanup(td); PROC_SLOCK(p); thread_stopped(p); thread_exit(); /* NOTREACHED */ } int sys_thr_kill(struct thread *td, struct thr_kill_args *uap) /* long id, int sig */ { ksiginfo_t ksi; struct thread *ttd; struct proc *p; int error; p = td->td_proc; ksiginfo_init(&ksi); ksi.ksi_signo = uap->sig; ksi.ksi_code = SI_LWP; ksi.ksi_pid = p->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; if (uap->id == -1) { if (uap->sig != 0 && !_SIG_VALID(uap->sig)) { error = EINVAL; } else { error = ESRCH; PROC_LOCK(p); FOREACH_THREAD_IN_PROC(p, ttd) { if (ttd != td) { error = 0; if (uap->sig == 0) break; tdksignal(ttd, uap->sig, &ksi); } } PROC_UNLOCK(p); } } else { error = 0; ttd = tdfind((lwpid_t)uap->id, p->p_pid); if (ttd == NULL) return (ESRCH); if (uap->sig == 0) ; else if (!_SIG_VALID(uap->sig)) error = EINVAL; else tdksignal(ttd, uap->sig, &ksi); PROC_UNLOCK(ttd->td_proc); } return (error); } int sys_thr_kill2(struct thread *td, struct thr_kill2_args *uap) /* pid_t pid, long id, int sig */ { ksiginfo_t ksi; struct thread *ttd; struct proc *p; int error; AUDIT_ARG_SIGNUM(uap->sig); ksiginfo_init(&ksi); ksi.ksi_signo = uap->sig; ksi.ksi_code = SI_LWP; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; if (uap->id == -1) { if ((p = pfind(uap->pid)) == NULL) return (ESRCH); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->sig); if (error) { PROC_UNLOCK(p); return (error); } if (uap->sig != 0 && !_SIG_VALID(uap->sig)) { error = EINVAL; } else { error = ESRCH; FOREACH_THREAD_IN_PROC(p, ttd) { if (ttd != td) { error = 0; if (uap->sig == 0) break; tdksignal(ttd, uap->sig, &ksi); } } } PROC_UNLOCK(p); } else { ttd = tdfind((lwpid_t)uap->id, uap->pid); if (ttd == NULL) return (ESRCH); p = ttd->td_proc; AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->sig); if (uap->sig == 0) ; else if (!_SIG_VALID(uap->sig)) error = EINVAL; else tdksignal(ttd, uap->sig, &ksi); PROC_UNLOCK(p); } return (error); } int sys_thr_suspend(struct thread *td, struct thr_suspend_args *uap) /* const struct timespec *timeout */ { struct timespec ts, *tsp; int error; tsp = NULL; if (uap->timeout != NULL) { error = umtx_copyin_timeout(uap->timeout, &ts); if (error != 0) return (error); tsp = &ts; } return (kern_thr_suspend(td, tsp)); } int kern_thr_suspend(struct thread *td, struct timespec *tsp) { struct proc *p = td->td_proc; struct timeval tv; int error = 0; int timo = 0; if (td->td_pflags & TDP_WAKEUP) { td->td_pflags &= ~TDP_WAKEUP; return (0); } if (tsp != NULL) { if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) error = EWOULDBLOCK; else { TIMESPEC_TO_TIMEVAL(&tv, tsp); timo = tvtohz(&tv); } } PROC_LOCK(p); if (error == 0 
&& (td->td_flags & TDF_THRWAKEUP) == 0) error = msleep((void *)td, &p->p_mtx, PCATCH, "lthr", timo); if (td->td_flags & TDF_THRWAKEUP) { thread_lock(td); td->td_flags &= ~TDF_THRWAKEUP; thread_unlock(td); PROC_UNLOCK(p); return (0); } PROC_UNLOCK(p); if (error == EWOULDBLOCK) error = ETIMEDOUT; else if (error == ERESTART) { if (timo != 0) error = EINTR; } return (error); } int sys_thr_wake(struct thread *td, struct thr_wake_args *uap) /* long id */ { struct proc *p; struct thread *ttd; if (uap->id == td->td_tid) { td->td_pflags |= TDP_WAKEUP; return (0); } p = td->td_proc; ttd = tdfind((lwpid_t)uap->id, p->p_pid); if (ttd == NULL) return (ESRCH); thread_lock(ttd); ttd->td_flags |= TDF_THRWAKEUP; thread_unlock(ttd); wakeup((void *)ttd); PROC_UNLOCK(p); return (0); } int sys_thr_set_name(struct thread *td, struct thr_set_name_args *uap) { struct proc *p; char name[MAXCOMLEN + 1]; struct thread *ttd; int error; error = 0; name[0] = '\0'; if (uap->name != NULL) { error = copyinstr(uap->name, name, sizeof(name), NULL); if (error == ENAMETOOLONG) { error = copyin(uap->name, name, sizeof(name) - 1); name[sizeof(name) - 1] = '\0'; } if (error) return (error); } p = td->td_proc; ttd = tdfind((lwpid_t)uap->id, p->p_pid); if (ttd == NULL) return (ESRCH); strcpy(ttd->td_name, name); #ifdef KTR sched_clear_tdname(ttd); #endif PROC_UNLOCK(p); return (error); } int kern_thr_alloc(struct proc *p, int pages, struct thread **ntd) { /* Have race condition but it is cheap. */ if (p->p_numthreads >= max_threads_per_proc) { ++max_threads_hits; return (EPROCLIM); } *ntd = thread_alloc(pages); if (*ntd == NULL) return (ENOMEM); return (0); } Index: head/sys/kern/kern_thread.c =================================================================== --- head/sys/kern/kern_thread.c (revision 326270) +++ head/sys/kern/kern_thread.c (revision 326271) @@ -1,1265 +1,1267 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. 
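The kern_thr_exit() rule above, that exactly one thread survives to really terminate the process, is observable from userland: if main() calls pthread_exit() (which reaches thr_exit(2)) instead of returning, the process lives on until its last thread finishes. A small demonstration:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static void *
worker(void *arg)
{
        sleep(1);
        printf("worker: last thread out; only now does the process exit\n");
        return (NULL);
}

int
main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, worker, NULL);
        /*
         * pthread_exit() in main maps onto thr_exit(2); kern_thr_exit()
         * keeps the process alive until a single thread remains, and
         * that final thread's exit terminates the process.
         */
        pthread_exit(NULL);
}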
*/ #include "opt_witness.h" #include "opt_hwpmc_hooks.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include /* * Asserts below verify the stability of struct thread and struct proc * layout, as exposed by KBI to modules. On head, the KBI is allowed * to drift, change to the structures must be accompanied by the * assert update. * * On the stable branches after KBI freeze, conditions must not be * violated. Typically new fields are moved to the end of the * structures. */ #ifdef __amd64__ _Static_assert(offsetof(struct thread, td_flags) == 0xf4, "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0xfc, "struct thread KBI td_pflags"); _Static_assert(offsetof(struct thread, td_frame) == 0x460, "struct thread KBI td_frame"); _Static_assert(offsetof(struct thread, td_emuldata) == 0x508, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0xb0, "struct proc KBI p_flag"); _Static_assert(offsetof(struct proc, p_pid) == 0xbc, "struct proc KBI p_pid"); _Static_assert(offsetof(struct proc, p_filemon) == 0x3d0, "struct proc KBI p_filemon"); _Static_assert(offsetof(struct proc, p_comm) == 0x3e0, "struct proc KBI p_comm"); _Static_assert(offsetof(struct proc, p_emuldata) == 0x4b8, "struct proc KBI p_emuldata"); #endif #ifdef __i386__ _Static_assert(offsetof(struct thread, td_flags) == 0x9c, "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0xa4, "struct thread KBI td_pflags"); _Static_assert(offsetof(struct thread, td_frame) == 0x2ec, "struct thread KBI td_frame"); _Static_assert(offsetof(struct thread, td_emuldata) == 0x338, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0x68, "struct proc KBI p_flag"); _Static_assert(offsetof(struct proc, p_pid) == 0x74, "struct proc KBI p_pid"); _Static_assert(offsetof(struct proc, p_filemon) == 0x27c, "struct proc KBI p_filemon"); _Static_assert(offsetof(struct proc, p_comm) == 0x288, "struct proc KBI p_comm"); _Static_assert(offsetof(struct proc, p_emuldata) == 0x314, "struct proc KBI p_emuldata"); #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE(proc, , , lwp__exit); /* * thread related storage. 
*/ static uma_zone_t thread_zone; TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); static struct mtx zombie_lock; MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN); static void thread_zombie(struct thread *); static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary); #define TID_BUFFER_SIZE 1024 struct mtx tid_lock; static struct unrhdr *tid_unrhdr; static lwpid_t tid_buffer[TID_BUFFER_SIZE]; static int tid_head, tid_tail; static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash"); struct tidhashhead *tidhashtbl; u_long tidhash; struct rwlock tidhash_lock; EVENTHANDLER_LIST_DEFINE(thread_ctor); EVENTHANDLER_LIST_DEFINE(thread_dtor); EVENTHANDLER_LIST_DEFINE(thread_init); EVENTHANDLER_LIST_DEFINE(thread_fini); static lwpid_t tid_alloc(void) { lwpid_t tid; tid = alloc_unr(tid_unrhdr); if (tid != -1) return (tid); mtx_lock(&tid_lock); if (tid_head == tid_tail) { mtx_unlock(&tid_lock); return (-1); } tid = tid_buffer[tid_head]; tid_head = (tid_head + 1) % TID_BUFFER_SIZE; mtx_unlock(&tid_lock); return (tid); } static void tid_free(lwpid_t tid) { lwpid_t tmp_tid = -1; mtx_lock(&tid_lock); if ((tid_tail + 1) % TID_BUFFER_SIZE == tid_head) { tmp_tid = tid_buffer[tid_head]; tid_head = (tid_head + 1) % TID_BUFFER_SIZE; } tid_buffer[tid_tail] = tid; tid_tail = (tid_tail + 1) % TID_BUFFER_SIZE; mtx_unlock(&tid_lock); if (tmp_tid != -1) free_unr(tid_unrhdr, tmp_tid); } /* * Prepare a thread for use. */ static int thread_ctor(void *mem, int size, void *arg, int flags) { struct thread *td; td = (struct thread *)mem; td->td_state = TDS_INACTIVE; td->td_oncpu = NOCPU; td->td_tid = tid_alloc(); /* * Note that td_critnest begins life as 1 because the thread is not * running and is thereby implicitly waiting to be on the receiving * end of a context switch. */ td->td_critnest = 1; td->td_lend_user_pri = PRI_MAX; EVENTHANDLER_DIRECT_INVOKE(thread_ctor, td); #ifdef AUDIT audit_thread_alloc(td); #endif umtx_thread_alloc(td); return (0); } /* * Reclaim a thread after use. */ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (td->td_state) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. */ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif #ifdef AUDIT audit_thread_free(td); #endif /* Free all OSD associated to this thread. */ osd_thread_exit(td); td_softdep_cleanup(td); MPASS(td->td_su == NULL); EVENTHANDLER_DIRECT_INVOKE(thread_dtor, td); tid_free(td->td_tid); } /* * Initialize type-stable parts of a thread (when newly created). */ static int thread_init(void *mem, int size, int flags) { struct thread *td; td = (struct thread *)mem; td->td_sleepqueue = sleepq_alloc(); td->td_turnstile = turnstile_alloc(); td->td_rlqe = NULL; EVENTHANDLER_DIRECT_INVOKE(thread_init, td); umtx_thread_init(td); td->td_kstack = 0; td->td_sel = NULL; return (0); } /* * Tear down type-stable parts of a thread (just before being discarded). 
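The tid_alloc()/tid_free() pair above layers a small deferred-free ring over the unit-number allocator so recently freed TIDs are not immediately reused. The ring logic reads cleanly on its own; a hedged userspace rendition (invented names, the tid_lock mutex elided):

#define CACHE_SIZE 1024

static int cache_buf[CACHE_SIZE];
static int cache_head, cache_tail;      /* head == tail means empty */

/* Mirror of tid_alloc()'s fallback: pop the oldest cached id, or -1. */
static int
cache_get(void)
{
        int id;

        if (cache_head == cache_tail)
                return (-1);
        id = cache_buf[cache_head];
        cache_head = (cache_head + 1) % CACHE_SIZE;
        return (id);
}

/*
 * Mirror of tid_free(): always stash the newly freed id; if the ring
 * is full, evict the oldest entry and hand it back to the caller so it
 * can be returned to the real allocator (free_unr() in the kernel).
 */
static int
cache_put(int id)
{
        int evicted = -1;

        if ((cache_tail + 1) % CACHE_SIZE == cache_head) {
                evicted = cache_buf[cache_head];
                cache_head = (cache_head + 1) % CACHE_SIZE;
        }
        cache_buf[cache_tail] = id;
        cache_tail = (cache_tail + 1) % CACHE_SIZE;
        return (evicted);
}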
*/ static void thread_fini(void *mem, int size) { struct thread *td; td = (struct thread *)mem; EVENTHANDLER_DIRECT_INVOKE(thread_fini, td); rlqentry_free(td->td_rlqe); turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); seltdfini(td); } /* * For a newly created process, * link up all the structures and its initial threads etc. * called from: * {arch}/{arch}/machdep.c {arch}_init(), init386() etc. * proc_dtor() (should go away) * proc_init() */ void proc_linkup0(struct proc *p, struct thread *td) { TAILQ_INIT(&p->p_threads); /* all threads in proc */ proc_linkup(p, td); } void proc_linkup(struct proc *p, struct thread *td) { sigqueue_init(&p->p_sigqueue, p); p->p_ksi = ksiginfo_alloc(1); if (p->p_ksi != NULL) { /* XXX p_ksi may be null if ksiginfo zone is not ready */ p->p_ksi->ksi_flags = KSI_EXT | KSI_INS; } LIST_INIT(&p->p_mqnotifier); p->p_numthreads = 0; thread_link(td, p); } /* * Initialize global thread allocation resources. */ void threadinit(void) { mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF); /* * pid_max cannot be greater than PID_MAX. * leave one number for thread0. */ tid_unrhdr = new_unrhdr(PID_MAX + 2, INT_MAX, &tid_lock); thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(), thread_ctor, thread_dtor, thread_init, thread_fini, 32 - 1, UMA_ZONE_NOFREE); tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash); rw_init(&tidhash_lock, "tidhash"); } /* * Place an unused thread on the zombie list. * Use the slpq as that must be unused by now. */ void thread_zombie(struct thread *td) { mtx_lock_spin(&zombie_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq); mtx_unlock_spin(&zombie_lock); } /* * Release a thread that has exited after cpu_throw(). */ void thread_stash(struct thread *td) { atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1); thread_zombie(td); } /* * Reap zombie resources. */ void thread_reap(void) { struct thread *td_first, *td_next; /* * Don't even bother to lock if none at this instant, * we really don't care about the next instant. */ if (!TAILQ_EMPTY(&zombie_threads)) { mtx_lock_spin(&zombie_lock); td_first = TAILQ_FIRST(&zombie_threads); if (td_first) TAILQ_INIT(&zombie_threads); mtx_unlock_spin(&zombie_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_slpq); thread_cow_free(td_first); thread_free(td_first); td_first = td_next; } } } /* * Allocate a thread. */ struct thread * thread_alloc(int pages) { struct thread *td; thread_reap(); /* check if any zombies to get */ td = (struct thread *)uma_zalloc(thread_zone, M_WAITOK); KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack")); if (!vm_thread_new(td, pages)) { uma_zfree(thread_zone, td); return (NULL); } cpu_thread_alloc(td); vm_domain_policy_init(&td->td_vm_dom_policy); return (td); } int thread_alloc_stack(struct thread *td, int pages) { KASSERT(td->td_kstack == 0, ("thread_alloc_stack called on a thread with kstack")); if (!vm_thread_new(td, pages)) return (0); cpu_thread_alloc(td); return (1); } /* * Deallocate a thread. 
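The THREAD zone above uses UMA's two-level lifecycle: ctor/dtor run on every allocation and free, while init/fini run only when items enter or leave the zone, so expensive type-stable state (sleep queues, turnstiles) survives alloc/free cycles. A hedged sketch of the same pattern with an invented type:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <vm/uma.h>

struct widget {
        void    *w_buf;         /* type-stable: survives alloc/free cycles */
        int      w_busy;
};

static uma_zone_t widget_zone;

static int
widget_ctor(void *mem, int size, void *arg, int flags)
{
        struct widget *w = mem;

        w->w_busy = 1;          /* cheap per-allocation state */
        return (0);
}

static void
widget_dtor(void *mem, int size, void *arg)
{
        struct widget *w = mem;

        w->w_busy = 0;
}

static int
widget_init(void *mem, int size, int flags)
{
        struct widget *w = mem;

        /* Expensive setup done once per item, like sleepq_alloc() above. */
        w->w_buf = malloc(128, M_TEMP, M_WAITOK);
        return (0);
}

static void
widget_fini(void *mem, int size)
{
        struct widget *w = mem;

        free(w->w_buf, M_TEMP);
}

/* Mirrors threadinit(): wire the four hooks into one zone. */
static void
widget_zone_init(void)
{
        widget_zone = uma_zcreate("WIDGET", sizeof(struct widget),
            widget_ctor, widget_dtor, widget_init, widget_fini,
            UMA_ALIGN_CACHE, 0);
}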
*/ void thread_free(struct thread *td) { lock_profile_thread_exit(td); if (td->td_cpuset) cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_free(td); if (td->td_kstack != 0) vm_thread_dispose(td); vm_domain_policy_cleanup(&td->td_vm_dom_policy); callout_drain(&td->td_slpcallout); uma_zfree(thread_zone, td); } void thread_cow_get_proc(struct thread *newtd, struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); newtd->td_ucred = crhold(p->p_ucred); newtd->td_limit = lim_hold(p->p_limit); newtd->td_cowgen = p->p_cowgen; } void thread_cow_get(struct thread *newtd, struct thread *td) { newtd->td_ucred = crhold(td->td_ucred); newtd->td_limit = lim_hold(td->td_limit); newtd->td_cowgen = td->td_cowgen; } void thread_cow_free(struct thread *td) { if (td->td_ucred != NULL) crfree(td->td_ucred); if (td->td_limit != NULL) lim_free(td->td_limit); } void thread_cow_update(struct thread *td) { struct proc *p; struct ucred *oldcred; struct plimit *oldlimit; p = td->td_proc; oldcred = NULL; oldlimit = NULL; PROC_LOCK(p); if (td->td_ucred != p->p_ucred) { oldcred = td->td_ucred; td->td_ucred = crhold(p->p_ucred); } if (td->td_limit != p->p_limit) { oldlimit = td->td_limit; td->td_limit = lim_hold(p->p_limit); } td->td_cowgen = p->p_cowgen; PROC_UNLOCK(p); if (oldcred != NULL) crfree(oldcred); if (oldlimit != NULL) lim_free(oldlimit); } /* * Discard the current thread and exit from its context. * Always called with scheduler locked. * * Because we can't free a thread while we're operating under its context, * push the current thread into our CPU's deadthread holder. This means * we needn't worry about someone else grabbing our context before we * do a cpu_throw(). */ void thread_exit(void) { uint64_t runtime, new_switchtime; struct thread *td; struct thread *td2; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td, (long)p->p_pid, td->td_name); SDT_PROBE0(proc, , , lwp__exit); KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending")); #ifdef AUDIT AUDIT_SYSCALL_EXIT(0, td); #endif /* * drop FPU & debug register state storage, or any other * architecture specific resources that * would not be on a new untouched process. */ cpu_thread_exit(td); /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff if we never had threads. * EXIT clears all sign of other threads when * it goes to single threading, so the last thread always * takes the short path. */ if (p->p_flag & P_HADTHREADS) { if (p->p_numthreads > 1) { atomic_add_int(&td->td_proc->p_exitthreads, 1); thread_unlink(td); td2 = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td2, td); /* * The test below is NOT true if we are the * sole exiting thread. P_STOPPED_SINGLE is unset * in exit1() after it is the only survivor. */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); if (wakeup_swapper) kick_proc0(); } } PCPU_SET(deadthread, td); } else { /* * The last thread is exiting.. 
but not through exit() */ panic ("thread_exit: Last thread exiting on its own"); } } #ifdef HWPMC_HOOKS /* * If this thread is part of a process that is being tracked by hwpmc(4), * inform the module of the thread's impending exit. */ if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif PROC_UNLOCK(p); PROC_STATLOCK(p); thread_lock(td); PROC_SUNLOCK(p); /* Do the same timestamp bookkeeping that mi_switch() would do. */ new_switchtime = cpu_ticks(); runtime = new_switchtime - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); VM_CNT_INC(v_swtch); /* Save our resource usage in our process. */ td->td_ru.ru_nvcsw++; ruxagg(p, td); rucollect(&p->p_ru, &td->td_ru); PROC_STATUNLOCK(p); td->td_state = TDS_INACTIVE; #ifdef WITNESS witness_thread_exit(td); #endif CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td); sched_throw(td); panic("I'm a teapot!"); /* NOTREACHED */ } /* * Do any thread specific cleanups that may be needed in wait() * called with Giant, proc and schedlock not held. */ void thread_wait(struct proc *p) { struct thread *td; mtx_assert(&Giant, MA_NOTOWNED); KASSERT(p->p_numthreads == 1, ("multiple threads in thread_wait()")); KASSERT(p->p_exitthreads == 0, ("p_exitthreads leaking")); td = FIRST_THREAD_IN_PROC(p); /* Lock the last thread so we spin until it exits cpu_throw(). */ thread_lock(td); thread_unlock(td); lock_profile_thread_exit(td); cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_clean(td); thread_cow_free(td); callout_drain(&td->td_slpcallout); thread_reap(); /* check for zombie threads etc. */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. */ void thread_link(struct thread *td, struct proc *p) { /* * XXX This can't be enabled because it's called for proc0 before * its lock has been created. * PROC_LOCK_ASSERT(p, MA_OWNED); */ td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_flags = TDF_INMEM; LIST_INIT(&td->td_contested); LIST_INIT(&td->td_lprof[0]); LIST_INIT(&td->td_lprof[1]); sigqueue_init(&td->td_sigqueue, p); callout_init(&td->td_slpcallout, 1); TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist); p->p_numthreads++; } /* * Called from: * thread_exit() */ void thread_unlink(struct thread *td) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; /* could clear a few other things here */ /* Must NOT clear links to proc! */ } static int calc_remaining(struct proc *p, int mode) { int remaining; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) remaining = p->p_numthreads - p->p_boundary_count; else if (mode == SINGLE_NO_EXIT || mode == SINGLE_ALLPROC) remaining = p->p_numthreads - p->p_suspcount; else panic("calc_remaining: wrong mode %d", mode); return (remaining); } static int remain_for_mode(int mode) { return (mode == SINGLE_ALLPROC ? 
0 : 1); } static int weed_inhib(int mode, struct thread *td2, struct proc *p) { int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td2, MA_OWNED); wakeup_swapper = 0; switch (mode) { case SINGLE_EXIT: if (TD_IS_SUSPENDED(td2)) wakeup_swapper |= thread_unsuspend_one(td2, p, true); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, EINTR); break; case SINGLE_BOUNDARY: case SINGLE_NO_EXIT: if (TD_IS_SUSPENDED(td2) && (td2->td_flags & TDF_BOUNDARY) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, ERESTART); break; case SINGLE_ALLPROC: /* * ALLPROC suspend tries to avoid spurious EINTR for * threads sleeping interruptibly, by suspending the * thread directly, similarly to sig_suspend_threads(). * Since such sleep is not performed at the user * boundary, TDF_BOUNDARY flag is not set, and TDF_ALLPROCSUSP * is used to avoid immediate un-suspend. */ if (TD_IS_SUSPENDED(td2) && (td2->td_flags & (TDF_BOUNDARY | TDF_ALLPROCSUSP)) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) { if ((td2->td_flags & TDF_SBDRY) == 0) { thread_suspend_one(td2); td2->td_flags |= TDF_ALLPROCSUSP; } else { wakeup_swapper |= sleepq_abort(td2, ERESTART); } } break; } return (wakeup_swapper); } /* * Enforce single-threading. * * Returns 1 if the caller must abort (another thread is waiting to * exit the process or similar). Process is locked! * Returns 0 when you are successfully the only thread running. * A process has successfully single threaded in the suspend mode when * there are no threads in user mode. Threads in the kernel must be * allowed to continue until they get to the user boundary. They may even * copy out their return values and data before suspending. They may however be * accelerated in reaching the user boundary as we will wake up * any sleeping threads that are interruptible (PCATCH). */ int thread_single(struct proc *p, int mode) { struct thread *td; struct thread *td2; int remaining, wakeup_swapper; td = curthread; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); /* * If allowing non-ALLPROC singlethreading for non-curproc * callers, calc_remaining() and remain_for_mode() should be * adjusted to also account for td->td_proc != p. For now * this is not implemented because it is not used. */ KASSERT((mode == SINGLE_ALLPROC && td->td_proc != p) || (mode != SINGLE_ALLPROC && td->td_proc == p), ("mode %d proc %p curproc %p", mode, p, td->td_proc)); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_HADTHREADS) == 0 && mode != SINGLE_ALLPROC) return (0); /* Is someone already single threading?
*/ if (p->p_singlethread != NULL && p->p_singlethread != td) return (1); if (mode == SINGLE_EXIT) { p->p_flag |= P_SINGLE_EXIT; p->p_flag &= ~P_SINGLE_BOUNDARY; } else { p->p_flag &= ~P_SINGLE_EXIT; if (mode == SINGLE_BOUNDARY) p->p_flag |= P_SINGLE_BOUNDARY; else p->p_flag &= ~P_SINGLE_BOUNDARY; } if (mode == SINGLE_ALLPROC) p->p_flag |= P_TOTAL_STOP; p->p_flag |= P_STOPPED_SINGLE; PROC_SLOCK(p); p->p_singlethread = td; remaining = calc_remaining(p, mode); while (remaining != remain_for_mode(mode)) { if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE) goto stopme; wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if (TD_IS_INHIBITED(td2)) { wakeup_swapper |= weed_inhib(mode, td2, p); #ifdef SMP } else if (TD_IS_RUNNING(td2) && td != td2) { forward_signal(td2); #endif } thread_unlock(td2); } if (wakeup_swapper) kick_proc0(); remaining = calc_remaining(p, mode); /* * Maybe we suspended some threads.. was it enough? */ if (remaining == remain_for_mode(mode)) break; stopme: /* * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ thread_suspend_switch(td, p); remaining = calc_remaining(p, mode); } if (mode == SINGLE_EXIT) { /* * Convert the process to an unthreaded process. The * SINGLE_EXIT is called by exit1() or execve(), in * both cases other threads must be retired. */ KASSERT(p->p_numthreads == 1, ("Unthreading with >1 threads")); p->p_singlethread = NULL; p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_HADTHREADS); /* * Wait for any remaining threads to exit cpu_throw(). */ while (p->p_exitthreads != 0) { PROC_SUNLOCK(p); PROC_UNLOCK(p); sched_relinquish(td); PROC_LOCK(p); PROC_SLOCK(p); } } else if (mode == SINGLE_BOUNDARY) { /* * Wait until all suspended threads are removed from * the processors. The thread_suspend_check() * increments p_boundary_count while it is still * running, which makes it possible for the execve() * to destroy vmspace while our other threads are * still using the address space. * * We lock the thread, which is only allowed to * succeed after context switch code finished using * the address space. */ FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); KASSERT((td2->td_flags & TDF_BOUNDARY) != 0, ("td %p not on boundary", td2)); KASSERT(TD_IS_SUSPENDED(td2), ("td %p is not suspended", td2)); thread_unlock(td2); } } PROC_SUNLOCK(p); return (0); } bool thread_suspend_check_needed(void) { struct proc *p; struct thread *td; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); return (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) != 0 && (td->td_dbgflags & TDB_SUSPEND) != 0)); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). * If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. * * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. 
* * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediately *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediately * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); while (thread_suspend_check_needed()) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * It is safe to access p->p_singlethread unlocked * because it can only be set to our address by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if ((p->p_flag & P_SINGLE_EXIT) && return_instead) return (EINTR); /* Should we goto user boundary if we didn't come from there? */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && (p->p_flag & P_SINGLE_BOUNDARY) && return_instead) return (ERESTART); /* * Ignore suspend requests if they are deferred. */ if ((td->td_flags & TDF_SBDRY) != 0) { KASSERT(return_instead, ("TDF_SBDRY set for unsafe thread_suspend_check")); KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) != (TDF_SEINTR | TDF_SERESTART), ("both TDF_SEINTR and TDF_SERESTART")); return (TD_SBDRY_INTR(td) ? TD_SBDRY_ERRNO(td) : 0); } /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { PROC_UNLOCK(p); /* * Allow Linux emulation layer to do some work * before thread suicide. */ if (__predict_false(p->p_sysent->sv_thread_detach != NULL)) (p->p_sysent->sv_thread_detach)(td); umtx_thread_exit(td); kern_thr_exit(td); panic("stopped thread did not exit"); } PROC_SLOCK(p); thread_stopped(p); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount + 1) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); if (wakeup_swapper) kick_proc0(); } } PROC_UNLOCK(p); thread_lock(td); /* * When a thread suspends, it just * gets taken off all queues. */ thread_suspend_one(td); if (return_instead == 0) { p->p_boundary_count++; td->td_flags |= TDF_BOUNDARY; } PROC_SUNLOCK(p); mi_switch(SW_INVOL | SWT_SUSPEND, NULL); thread_unlock(td); PROC_LOCK(p); } return (0); } void thread_suspend_switch(struct thread *td, struct proc *p) { KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); /* * We implement thread_suspend_one in stages here to avoid * dropping the proc lock while the thread lock is owned. 
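thread_suspend_check() is, at heart, a cooperative park-at-safe-points protocol. A hedged userspace analogue with pthreads shows the shape of the handshake between the suspending threads and the single-threading coordinator (all names invented; the kernel's sleepqueue and scheduler machinery is reduced to one mutex and two condvars):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t s_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t s_parked = PTHREAD_COND_INITIALIZER;   /* -> coordinator */
static pthread_cond_t s_release = PTHREAD_COND_INITIALIZER;  /* -> workers */
static bool s_stop;
static int s_nworkers, s_suspended;

/*
 * Workers call this at safe points, the way userret() calls
 * thread_suspend_check(); they park until the coordinator is done.
 */
static void
suspend_check(void)
{
        pthread_mutex_lock(&s_mtx);
        while (s_stop) {
                if (++s_suspended == s_nworkers)
                        pthread_cond_signal(&s_parked);
                pthread_cond_wait(&s_release, &s_mtx);
                s_suspended--;
        }
        pthread_mutex_unlock(&s_mtx);
}

/*
 * Coordinator: the moral equivalent of thread_single() followed by
 * thread_single_end(): request the stop, wait for every worker to
 * park, run the critical work single-threaded, then release them.
 */
static void
single_thread_section(void (*work)(void))
{
        pthread_mutex_lock(&s_mtx);
        s_stop = true;
        while (s_suspended < s_nworkers)
                pthread_cond_wait(&s_parked, &s_mtx);
        work();                         /* all workers parked here */
        s_stop = false;
        pthread_cond_broadcast(&s_release);
        pthread_mutex_unlock(&s_mtx);
}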
*/ if (p == td->td_proc) { thread_stopped(p); p->p_suspcount++; } PROC_UNLOCK(p); thread_lock(td); td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); PROC_SUNLOCK(p); DROP_GIANT(); mi_switch(SW_VOL | SWT_SUSPEND, NULL); thread_unlock(td); PICKUP_GIANT(); PROC_LOCK(p); PROC_SLOCK(p); } void thread_suspend_one(struct thread *td) { struct proc *p; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); } static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended")); TD_CLR_SUSPENDED(td); td->td_flags &= ~TDF_ALLPROCSUSP; if (td->td_proc == p) { PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_suspcount--; if (boundary && (td->td_flags & TDF_BOUNDARY) != 0) { td->td_flags &= ~TDF_BOUNDARY; p->p_boundary_count--; } } return (setrunnable(td)); } /* * Allow all threads blocked by single threading to continue running. */ void thread_unsuspend(struct proc *p) { struct thread *td; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); wakeup_swapper = 0; if (!P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, true); } thread_unlock(td); } } else if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && p->p_numthreads == p->p_suspcount) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ if (p->p_singlethread->td_proc == p) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); } } if (wakeup_swapper) kick_proc0(); } /* * End the single threading mode.. */ void thread_single_end(struct proc *p, int mode) { struct thread *td; int wakeup_swapper; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((mode == SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) != 0) || (mode != SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) == 0), ("mode %d does not match P_TOTAL_STOP", mode)); KASSERT(mode == SINGLE_ALLPROC || p->p_singlethread == curthread, ("thread_single_end from other thread %p %p", curthread, p->p_singlethread)); KASSERT(mode != SINGLE_BOUNDARY || (p->p_flag & P_SINGLE_BOUNDARY) != 0, ("mis-matched SINGLE_BOUNDARY flags %x", p->p_flag)); p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY | P_TOTAL_STOP); PROC_SLOCK(p); p->p_singlethread = NULL; wakeup_swapper = 0; /* * If there are other threads they may now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. 
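 *
 * (Illustrative pairing, not from this file: users such as exit or
 * exec bracket their single-threaded region roughly as follows.)
 *
 *	PROC_LOCK(p);
 *	if (thread_single(p, SINGLE_BOUNDARY) != 0)
 *		... fail, process is still multi-threaded ...
 *	... single-threaded work ...
 *	thread_single_end(p, SINGLE_BOUNDARY);
 *	PROC_UNLOCK(p);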
*/ if (p->p_numthreads != remain_for_mode(mode) && !P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, mode == SINGLE_BOUNDARY); } thread_unlock(td); } } KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0, ("inconsistent boundary count %d", p->p_boundary_count)); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); } struct thread * thread_find(struct proc *p, lwpid_t tid) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td) { if (td->td_tid == tid) break; } return (td); } /* Locate a thread by number; return with proc lock held. */ struct thread * tdfind(lwpid_t tid, pid_t pid) { #define RUN_THRESH 16 struct thread *td; int run = 0; rw_rlock(&tidhash_lock); LIST_FOREACH(td, TIDHASH(tid), td_hash) { if (td->td_tid == tid) { if (pid != -1 && td->td_proc->p_pid != pid) { td = NULL; break; } PROC_LOCK(td->td_proc); if (td->td_proc->p_state == PRS_NEW) { PROC_UNLOCK(td->td_proc); td = NULL; break; } if (run > RUN_THRESH) { if (rw_try_upgrade(&tidhash_lock)) { LIST_REMOVE(td, td_hash); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(&tidhash_lock); return (td); } } break; } run++; } rw_runlock(&tidhash_lock); return (td); } void tidhash_add(struct thread *td) { rw_wlock(&tidhash_lock); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(&tidhash_lock); } void tidhash_remove(struct thread *td) { rw_wlock(&tidhash_lock); LIST_REMOVE(td, td_hash); rw_wunlock(&tidhash_lock); } Index: head/sys/kern/kern_umtx.c =================================================================== --- head/sys/kern/kern_umtx.c (revision 326270) +++ head/sys/kern/kern_umtx.c (revision 326271) @@ -1,4574 +1,4576 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015, 2016 The FreeBSD Foundation * Copyright (c) 2004, David Xu * Copyright (c) 2002, Jeffrey Roberson * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_umtx_profiling.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #endif #define _UMUTEX_TRY 1 #define _UMUTEX_WAIT 2 #ifdef UMTX_PROFILING #define UPROF_PERC_BIGGER(w, f, sw, sf) \ (((w) > (sw)) || ((w) == (sw) && (f) > (sf))) #endif /* Priority inheritance mutex info. */ struct umtx_pi { /* Owner thread */ struct thread *pi_owner; /* Reference count */ int pi_refcount; /* List entry to link umtx holding by thread */ TAILQ_ENTRY(umtx_pi) pi_link; /* List entry in hash */ TAILQ_ENTRY(umtx_pi) pi_hashlink; /* List for waiters */ TAILQ_HEAD(,umtx_q) pi_blocked; /* Identify a userland lock object */ struct umtx_key pi_key; }; /* A userland synchronous object user. */ struct umtx_q { /* Linked list for the hash. */ TAILQ_ENTRY(umtx_q) uq_link; /* Umtx key. */ struct umtx_key uq_key; /* Umtx flags. */ int uq_flags; #define UQF_UMTXQ 0x0001 /* The thread waits on. */ struct thread *uq_thread; /* * Blocked on PI mutex. read can use chain lock * or umtx_lock, write must have both chain lock and * umtx_lock being hold. */ struct umtx_pi *uq_pi_blocked; /* On blocked list */ TAILQ_ENTRY(umtx_q) uq_lockq; /* Thread contending with us */ TAILQ_HEAD(,umtx_pi) uq_pi_contested; /* Inherited priority from PP mutex */ u_char uq_inherited_pri; /* Spare queue ready to be reused */ struct umtxq_queue *uq_spare_queue; /* The queue we on */ struct umtxq_queue *uq_cur_queue; }; TAILQ_HEAD(umtxq_head, umtx_q); /* Per-key wait-queue */ struct umtxq_queue { struct umtxq_head head; struct umtx_key key; LIST_ENTRY(umtxq_queue) link; int length; }; LIST_HEAD(umtxq_list, umtxq_queue); /* Userland lock object's wait-queue chain */ struct umtxq_chain { /* Lock for this chain. */ struct mtx uc_lock; /* List of sleep queues. */ struct umtxq_list uc_queue[2]; #define UMTX_SHARED_QUEUE 0 #define UMTX_EXCLUSIVE_QUEUE 1 LIST_HEAD(, umtxq_queue) uc_spare_queue; /* Busy flag */ char uc_busy; /* Chain lock waiters */ int uc_waiters; /* All PI in the list */ TAILQ_HEAD(,umtx_pi) uc_pi_list; #ifdef UMTX_PROFILING u_int length; u_int max_length; #endif }; #define UMTXQ_LOCKED_ASSERT(uc) mtx_assert(&(uc)->uc_lock, MA_OWNED) /* * Don't propagate time-sharing priority, there is a security reason, * a user can simply introduce PI-mutex, let thread A lock the mutex, * and let another thread B block on the mutex, because B is * sleeping, its priority will be boosted, this causes A's priority to * be boosted via priority propagating too and will never be lowered even * if it is using 100%CPU, this is unfair to other processes. */ #define UPRI(td) (((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\ (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\ PRI_MAX_TIMESHARE : (td)->td_user_pri) #define GOLDEN_RATIO_PRIME 2654404609U #ifndef UMTX_CHAINS #define UMTX_CHAINS 512 #endif #define UMTX_SHIFTS (__WORD_BIT - 9) #define GET_SHARE(flags) \ (((flags) & USYNC_PROCESS_SHARED) == 0 ? 
THREAD_SHARE : PROCESS_SHARE) #define BUSY_SPINS 200 struct abs_timeout { int clockid; bool is_abs_real; /* TIMER_ABSTIME && CLOCK_REALTIME* */ struct timespec cur; struct timespec end; }; #ifdef COMPAT_FREEBSD32 struct umutex32 { volatile __lwpid_t m_owner; /* Owner of the mutex */ __uint32_t m_flags; /* Flags of the mutex */ __uint32_t m_ceilings[2]; /* Priority protect ceiling */ __uint32_t m_rb_lnk; /* Robust linkage */ __uint32_t m_pad; __uint32_t m_spare[2]; }; _Static_assert(sizeof(struct umutex) == sizeof(struct umutex32), "umutex32"); _Static_assert(__offsetof(struct umutex, m_spare[0]) == __offsetof(struct umutex32, m_spare[0]), "m_spare32"); #endif int umtx_shm_vnobj_persistent = 0; SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_vnode_persistent, CTLFLAG_RWTUN, &umtx_shm_vnobj_persistent, 0, "False forces destruction of umtx attached to file, on last close"); static int umtx_max_rb = 1000; SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_max_robust, CTLFLAG_RWTUN, &umtx_max_rb, 0, ""); static uma_zone_t umtx_pi_zone; static struct umtxq_chain umtxq_chains[2][UMTX_CHAINS]; static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory"); static int umtx_pi_allocated; static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug"); SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD, &umtx_pi_allocated, 0, "Allocated umtx_pi"); static int umtx_verbose_rb = 1; SYSCTL_INT(_debug_umtx, OID_AUTO, robust_faults_verbose, CTLFLAG_RWTUN, &umtx_verbose_rb, 0, ""); #ifdef UMTX_PROFILING static long max_length; SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length"); static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats"); #endif static void abs_timeout_update(struct abs_timeout *timo); static void umtx_shm_init(void); static void umtxq_sysinit(void *); static void umtxq_hash(struct umtx_key *key); static struct umtxq_chain *umtxq_getchain(struct umtx_key *key); static void umtxq_lock(struct umtx_key *key); static void umtxq_unlock(struct umtx_key *key); static void umtxq_busy(struct umtx_key *key); static void umtxq_unbusy(struct umtx_key *key); static void umtxq_insert_queue(struct umtx_q *uq, int q); static void umtxq_remove_queue(struct umtx_q *uq, int q); static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *); static int umtxq_count(struct umtx_key *key); static struct umtx_pi *umtx_pi_alloc(int); static void umtx_pi_free(struct umtx_pi *pi); static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb); static void umtx_thread_cleanup(struct thread *td); static void umtx_exec_hook(void *arg __unused, struct proc *p __unused, struct image_params *imgp __unused); SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL); #define umtxq_signal(key, nwake) umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE) #define umtxq_insert(uq) umtxq_insert_queue((uq), UMTX_SHARED_QUEUE) #define umtxq_remove(uq) umtxq_remove_queue((uq), UMTX_SHARED_QUEUE) static struct mtx umtx_lock; #ifdef UMTX_PROFILING static void umtx_init_profiling(void) { struct sysctl_oid *chain_oid; char chain_name[10]; int i; for (i = 0; i < UMTX_CHAINS; ++i) { snprintf(chain_name, sizeof(chain_name), "%d", i); chain_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO, chain_name, CTLFLAG_RD, NULL, "umtx hash stats"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL); SYSCTL_ADD_INT(NULL, 
SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL); } } static int sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS) { char buf[512]; struct sbuf sb; struct umtxq_chain *uc; u_int fract, i, j, tot, whole; u_int sf0, sf1, sf2, sf3, sf4; u_int si0, si1, si2, si3, si4; u_int sw0, sw1, sw2, sw3, sw4; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); for (i = 0; i < 2; i++) { tot = 0; for (j = 0; j < UMTX_CHAINS; ++j) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); tot += uc->max_length; mtx_unlock(&uc->uc_lock); } if (tot == 0) sbuf_printf(&sb, "%u) Empty ", i); else { sf0 = sf1 = sf2 = sf3 = sf4 = 0; si0 = si1 = si2 = si3 = si4 = 0; sw0 = sw1 = sw2 = sw3 = sw4 = 0; for (j = 0; j < UMTX_CHAINS; j++) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); whole = uc->max_length * 100; mtx_unlock(&uc->uc_lock); fract = (whole % tot) * 100; if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) { sf0 = fract; si0 = j; sw0 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw1, sf1)) { sf1 = fract; si1 = j; sw1 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw2, sf2)) { sf2 = fract; si2 = j; sw2 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw3, sf3)) { sf3 = fract; si3 = j; sw3 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw4, sf4)) { sf4 = fract; si4 = j; sw4 = whole; } } sbuf_printf(&sb, "queue %u:\n", i); sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot, sf0 / tot, si0); sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot, sf1 / tot, si1); sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot, sf2 / tot, si2); sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot, sf3 / tot, si3); sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot, sf4 / tot, si4); } } sbuf_trim(&sb); sbuf_finish(&sb); sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (0); } static int sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS) { struct umtxq_chain *uc; u_int i, j; int clear, error; clear = 0; error = sysctl_handle_int(oidp, &clear, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (clear != 0) { for (i = 0; i < 2; ++i) { for (j = 0; j < UMTX_CHAINS; ++j) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); uc->length = 0; uc->max_length = 0; mtx_unlock(&uc->uc_lock); } } } return (0); } SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics"); SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length"); #endif static void umtxq_sysinit(void *arg __unused) { int i, j; umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); for (i = 0; i < 2; ++i) { for (j = 0; j < UMTX_CHAINS; ++j) { mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL, MTX_DEF | MTX_DUPOK); LIST_INIT(&umtxq_chains[i][j].uc_queue[0]); LIST_INIT(&umtxq_chains[i][j].uc_queue[1]); LIST_INIT(&umtxq_chains[i][j].uc_spare_queue); TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list); umtxq_chains[i][j].uc_busy = 0; umtxq_chains[i][j].uc_waiters = 0; #ifdef UMTX_PROFILING umtxq_chains[i][j].length = 0; umtxq_chains[i][j].max_length = 0; #endif } } #ifdef UMTX_PROFILING umtx_init_profiling(); #endif mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF); EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL, EVENTHANDLER_PRI_ANY); umtx_shm_init(); } struct umtx_q * 
umtxq_alloc(void) { struct umtx_q *uq; uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO); uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO); TAILQ_INIT(&uq->uq_spare_queue->head); TAILQ_INIT(&uq->uq_pi_contested); uq->uq_inherited_pri = PRI_MAX; return (uq); } void umtxq_free(struct umtx_q *uq) { MPASS(uq->uq_spare_queue != NULL); free(uq->uq_spare_queue, M_UMTX); free(uq, M_UMTX); } static inline void umtxq_hash(struct umtx_key *key) { unsigned n; n = (uintptr_t)key->info.both.a + key->info.both.b; key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS; } static inline struct umtxq_chain * umtxq_getchain(struct umtx_key *key) { if (key->type <= TYPE_SEM) return (&umtxq_chains[1][key->hash]); return (&umtxq_chains[0][key->hash]); } /* * Lock a chain. */ static inline void umtxq_lock(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_lock(&uc->uc_lock); } /* * Unlock a chain. */ static inline void umtxq_unlock(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_unlock(&uc->uc_lock); } /* * Set chain to busy state when following operation * may be blocked (kernel mutex can not be used). */ static inline void umtxq_busy(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_assert(&uc->uc_lock, MA_OWNED); if (uc->uc_busy) { #ifdef SMP if (smp_cpus > 1) { int count = BUSY_SPINS; if (count > 0) { umtxq_unlock(key); while (uc->uc_busy && --count > 0) cpu_spinwait(); umtxq_lock(key); } } #endif while (uc->uc_busy) { uc->uc_waiters++; msleep(uc, &uc->uc_lock, 0, "umtxqb", 0); uc->uc_waiters--; } } uc->uc_busy = 1; } /* * Unbusy a chain. */ static inline void umtxq_unbusy(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_assert(&uc->uc_lock, MA_OWNED); KASSERT(uc->uc_busy != 0, ("not busy")); uc->uc_busy = 0; if (uc->uc_waiters) wakeup_one(uc); } static inline void umtxq_unbusy_unlocked(struct umtx_key *key) { umtxq_lock(key); umtxq_unbusy(key); umtxq_unlock(key); } static struct umtxq_queue * umtxq_queue_lookup(struct umtx_key *key, int q) { struct umtxq_queue *uh; struct umtxq_chain *uc; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); LIST_FOREACH(uh, &uc->uc_queue[q], link) { if (umtx_key_match(&uh->key, key)) return (uh); } return (NULL); } static inline void umtxq_insert_queue(struct umtx_q *uq, int q) { struct umtxq_queue *uh; struct umtxq_chain *uc; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue")); uh = umtxq_queue_lookup(&uq->uq_key, q); if (uh != NULL) { LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link); } else { uh = uq->uq_spare_queue; uh->key = uq->uq_key; LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link); #ifdef UMTX_PROFILING uc->length++; if (uc->length > uc->max_length) { uc->max_length = uc->length; if (uc->max_length > max_length) max_length = uc->max_length; } #endif } uq->uq_spare_queue = NULL; TAILQ_INSERT_TAIL(&uh->head, uq, uq_link); uh->length++; uq->uq_flags |= UQF_UMTXQ; uq->uq_cur_queue = uh; return; } static inline void umtxq_remove_queue(struct umtx_q *uq, int q) { struct umtxq_chain *uc; struct umtxq_queue *uh; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); if (uq->uq_flags & UQF_UMTXQ) { uh = uq->uq_cur_queue; TAILQ_REMOVE(&uh->head, uq, uq_link); uh->length--; uq->uq_flags &= ~UQF_UMTXQ; if (TAILQ_EMPTY(&uh->head)) { KASSERT(uh->length == 0, ("inconsistent umtxq_queue length")); #ifdef UMTX_PROFILING 
uc->length--; #endif LIST_REMOVE(uh, link); } else { uh = LIST_FIRST(&uc->uc_spare_queue); KASSERT(uh != NULL, ("uc_spare_queue is empty")); LIST_REMOVE(uh, link); } uq->uq_spare_queue = uh; uq->uq_cur_queue = NULL; } } /* * Check if there are multiple waiters */ static int umtxq_count(struct umtx_key *key) { struct umtxq_chain *uc; struct umtxq_queue *uh; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); if (uh != NULL) return (uh->length); return (0); } /* * Check if there are multiple PI waiters and returns first * waiter. */ static int umtxq_count_pi(struct umtx_key *key, struct umtx_q **first) { struct umtxq_chain *uc; struct umtxq_queue *uh; *first = NULL; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); if (uh != NULL) { *first = TAILQ_FIRST(&uh->head); return (uh->length); } return (0); } static int umtxq_check_susp(struct thread *td) { struct proc *p; int error; /* * The check for TDF_NEEDSUSPCHK is racy, but it is enough to * eventually break the lockstep loop. */ if ((td->td_flags & TDF_NEEDSUSPCHK) == 0) return (0); error = 0; p = td->td_proc; PROC_LOCK(p); if (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) { if (p->p_flag & P_SINGLE_EXIT) error = EINTR; else error = ERESTART; } PROC_UNLOCK(p); return (error); } /* * Wake up threads waiting on an userland object. */ static int umtxq_signal_queue(struct umtx_key *key, int n_wake, int q) { struct umtxq_chain *uc; struct umtxq_queue *uh; struct umtx_q *uq; int ret; ret = 0; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); uh = umtxq_queue_lookup(key, q); if (uh != NULL) { while ((uq = TAILQ_FIRST(&uh->head)) != NULL) { umtxq_remove_queue(uq, q); wakeup(uq); if (++ret >= n_wake) return (ret); } } return (ret); } /* * Wake up specified thread. */ static inline void umtxq_signal_thread(struct umtx_q *uq) { struct umtxq_chain *uc; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); umtxq_remove(uq); wakeup(uq); } static inline int tstohz(const struct timespec *tsp) { struct timeval tv; TIMESPEC_TO_TIMEVAL(&tv, tsp); return tvtohz(&tv); } static void abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute, const struct timespec *timeout) { timo->clockid = clockid; if (!absolute) { timo->is_abs_real = false; abs_timeout_update(timo); timo->end = timo->cur; timespecadd(&timo->end, timeout); } else { timo->end = *timeout; timo->is_abs_real = clockid == CLOCK_REALTIME || clockid == CLOCK_REALTIME_FAST || clockid == CLOCK_REALTIME_PRECISE; /* * If is_abs_real, umtxq_sleep will read the clock * after setting td_rtcgen; otherwise, read it here. 
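 *
 * (Illustrative userland view, not from this file: an absolute
 * deadline arrives as a struct _umtx_time with UMTX_ABSTIME set in
 * _flags, e.g.)
 *
 *	struct _umtx_time to;
 *	to._clockid = CLOCK_MONOTONIC;
 *	to._flags = UMTX_ABSTIME;
 *	clock_gettime(CLOCK_MONOTONIC, &to._timeout);
 *	to._timeout.tv_sec += 5;	-- deadline: now + 5 seconds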
*/ if (!timo->is_abs_real) { abs_timeout_update(timo); } } } static void abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime) { abs_timeout_init(timo, umtxtime->_clockid, (umtxtime->_flags & UMTX_ABSTIME) != 0, &umtxtime->_timeout); } static inline void abs_timeout_update(struct abs_timeout *timo) { kern_clock_gettime(curthread, timo->clockid, &timo->cur); } static int abs_timeout_gethz(struct abs_timeout *timo) { struct timespec tts; if (timespeccmp(&timo->end, &timo->cur, <=)) return (-1); tts = timo->end; timespecsub(&tts, &timo->cur); return (tstohz(&tts)); } static uint32_t umtx_unlock_val(uint32_t flags, bool rb) { if (rb) return (UMUTEX_RB_OWNERDEAD); else if ((flags & UMUTEX_NONCONSISTENT) != 0) return (UMUTEX_RB_NOTRECOV); else return (UMUTEX_UNOWNED); } /* * Put thread into sleep state, before sleeping, check if * thread was removed from umtx queue. */ static inline int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime) { struct umtxq_chain *uc; int error, timo; if (abstime != NULL && abstime->is_abs_real) { curthread->td_rtcgen = atomic_load_acq_int(&rtc_generation); abs_timeout_update(abstime); } uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); for (;;) { if (!(uq->uq_flags & UQF_UMTXQ)) { error = 0; break; } if (abstime != NULL) { timo = abs_timeout_gethz(abstime); if (timo < 0) { error = ETIMEDOUT; break; } } else timo = 0; error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo); if (error == EINTR || error == ERESTART) { umtxq_lock(&uq->uq_key); break; } if (abstime != NULL) { if (abstime->is_abs_real) curthread->td_rtcgen = atomic_load_acq_int(&rtc_generation); abs_timeout_update(abstime); } umtxq_lock(&uq->uq_key); } curthread->td_rtcgen = 0; return (error); } /* * Convert userspace address into unique logical address. */ int umtx_key_get(const void *addr, int type, int share, struct umtx_key *key) { struct thread *td = curthread; vm_map_t map; vm_map_entry_t entry; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; key->type = type; if (share == THREAD_SHARE) { key->shared = 0; key->info.private.vs = td->td_proc->p_vmspace; key->info.private.addr = (uintptr_t)addr; } else { MPASS(share == PROCESS_SHARE || share == AUTO_SHARE); map = &td->td_proc->p_vmspace->vm_map; if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE, &entry, &key->info.shared.object, &pindex, &prot, &wired) != KERN_SUCCESS) { return (EFAULT); } if ((share == PROCESS_SHARE) || (share == AUTO_SHARE && VM_INHERIT_SHARE == entry->inheritance)) { key->shared = 1; key->info.shared.offset = (vm_offset_t)addr - entry->start + entry->offset; vm_object_reference(key->info.shared.object); } else { key->shared = 0; key->info.private.vs = td->td_proc->p_vmspace; key->info.private.addr = (uintptr_t)addr; } vm_map_lookup_done(map, entry); } umtxq_hash(key); return (0); } /* * Release key. */ void umtx_key_release(struct umtx_key *key) { if (key->shared) vm_object_deallocate(key->info.shared.object); } /* * Fetch and compare value, sleep on the address if value is not changed. */ static int do_wait(struct thread *td, void *addr, u_long id, struct _umtx_time *timeout, int compat32, int is_private) { struct abs_timeout timo; struct umtx_q *uq; u_long tmp; uint32_t tmp32; int error = 0; uq = td->td_umtxq; if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, is_private ? 
THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); if (compat32 == 0) { error = fueword(addr, &tmp); if (error != 0) error = EFAULT; } else { error = fueword32(addr, &tmp32); if (error == 0) tmp = tmp32; else error = EFAULT; } umtxq_lock(&uq->uq_key); if (error == 0) { if (tmp == id) error = umtxq_sleep(uq, "uwait", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else umtxq_remove(uq); } else if ((uq->uq_flags & UQF_UMTXQ) != 0) { umtxq_remove(uq); } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } /* * Wake up threads sleeping on the specified address. */ int kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private) { struct umtx_key key; int ret; if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0) return (ret); umtxq_lock(&key); umtxq_signal(&key, n_wake); umtxq_unlock(&key); umtx_key_release(&key); return (0); } /* * Lock PTHREAD_PRIO_NONE protocol POSIX mutex. */ static int do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int mode) { struct abs_timeout timo; struct umtx_q *uq; uint32_t owner, old, id; int error, rv; id = td->td_tid; uq = td->td_umtxq; error = 0; if (timeout != NULL) abs_timeout_init2(&timo, timeout); /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { rv = fueword32(&m->m_owner, &owner); if (rv == -1) return (EFAULT); if (mode == _UMUTEX_WAIT) { if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV) return (0); } else { /* * Robust mutex terminated. Kernel duty is to * return EOWNERDEAD to the userspace. The * umutex.m_flags UMUTEX_NONCONSISTENT is set * by the common userspace code. */ if (owner == UMUTEX_RB_OWNERDEAD) { rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD, &owner, id | UMUTEX_CONTESTED); if (rv == -1) return (EFAULT); if (owner == UMUTEX_RB_OWNERDEAD) return (EOWNERDEAD); /* success */ rv = umtxq_check_susp(td); if (rv != 0) return (rv); continue; } if (owner == UMUTEX_RB_NOTRECOV) return (ENOTRECOVERABLE); /* * Try the uncontested case. This should be * done in userland. */ rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id); /* The address was invalid. */ if (rv == -1) return (EFAULT); /* The acquire succeeded. */ if (owner == UMUTEX_UNOWNED) return (0); /* * If no one owns it but it is contested try * to acquire it. */ if (owner == UMUTEX_CONTESTED) { rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) return (EFAULT); if (owner == UMUTEX_CONTESTED) return (0); rv = umtxq_check_susp(td); if (rv != 0) return (rv); /* * If this failed the lock has * changed, restart. */ continue; } } if (mode == _UMUTEX_TRY) return (EBUSY); /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) return (error); if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. 
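 *
 * (Illustrative userland fast path, not from this file: the
 * uncontested acquire is a single compare-and-swap on m_owner; only
 * the UMUTEX_CONTESTED bit forces the unlocker into the kernel.)
 *
 *	if (atomic_cmpset_acq_32(&m->m_owner, UMUTEX_UNOWNED, tid))
 *		return (0);	-- locked, no syscall needed
 *	(void)_umtx_op(m, UMTX_OP_MUTEX_LOCK, 0, NULL, NULL);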
*/ rv = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_remove(uq); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); if (old == owner) error = umtxq_sleep(uq, "umtxn", timeout == NULL ? NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0) error = umtxq_check_susp(td); } return (0); } /* * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex. */ static int do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags, bool rb) { struct umtx_key key; uint32_t owner, old, id, newlock; int error, count; id = td->td_tid; /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); newlock = umtx_unlock_val(flags, rb); if ((owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, newlock); if (error == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ if (count > 1) newlock |= UMUTEX_CONTESTED; error = casueword32(&m->m_owner, owner, &old, newlock); umtxq_lock(&key); umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); if (error == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } /* * Check if the mutex is available and wake up a waiter, * only for simple mutex. */ static int do_wake_umutex(struct thread *td, struct umutex *m) { struct umtx_key key; uint32_t owner; uint32_t flags; int error; int count; error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != 0 && owner != UMUTEX_RB_OWNERDEAD && owner != UMUTEX_RB_NOTRECOV) return (0); error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); if (count <= 1 && owner != UMUTEX_RB_OWNERDEAD && owner != UMUTEX_RB_NOTRECOV) { error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, UMUTEX_UNOWNED); if (error == -1) error = EFAULT; } umtxq_lock(&key); if (error == 0 && count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 || owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV)) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } /* * Check if the mutex has waiters and tries to fix contention bit. 
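 *
 * (Added note: while waiters remain queued this re-asserts
 * UMUTEX_CONTESTED in m_owner, and it wakes one waiter when the word
 * shows the mutex free or robust-terminated, so a stale owner word
 * cannot strand sleepers; userland reaches this via the
 * UMTX_OP_MUTEX_WAKE2 request.)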
*/ static int do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags) { struct umtx_key key; uint32_t owner, old; int type; int error; int count; switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST)) { case 0: case UMUTEX_ROBUST: type = TYPE_NORMAL_UMUTEX; break; case UMUTEX_PRIO_INHERIT: type = TYPE_PI_UMUTEX; break; case (UMUTEX_PRIO_INHERIT | UMUTEX_ROBUST): type = TYPE_PI_ROBUST_UMUTEX; break; case UMUTEX_PRIO_PROTECT: type = TYPE_PP_UMUTEX; break; case (UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST): type = TYPE_PP_ROBUST_UMUTEX; break; default: return (EINVAL); } if ((error = umtx_key_get(m, type, GET_SHARE(flags), &key)) != 0) return (error); owner = 0; umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * Only repair contention bit if there is a waiter, this means the mutex * is still being referenced by userland code, otherwise don't update * any memory. */ if (count > 1) { error = fueword32(&m->m_owner, &owner); if (error == -1) error = EFAULT; while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); if (error == -1) { error = EFAULT; break; } if (old == owner) break; owner = old; error = umtxq_check_susp(td); if (error != 0) break; } } else if (count == 1) { error = fueword32(&m->m_owner, &owner); if (error == -1) error = EFAULT; while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 && (owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); if (error == -1) { error = EFAULT; break; } if (old == owner) break; owner = old; error = umtxq_check_susp(td); if (error != 0) break; } } umtxq_lock(&key); if (error == EFAULT) { umtxq_signal(&key, INT_MAX); } else if (count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 || owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV)) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } static inline struct umtx_pi * umtx_pi_alloc(int flags) { struct umtx_pi *pi; pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags); TAILQ_INIT(&pi->pi_blocked); atomic_add_int(&umtx_pi_allocated, 1); return (pi); } static inline void umtx_pi_free(struct umtx_pi *pi) { uma_zfree(umtx_pi_zone, pi); atomic_add_int(&umtx_pi_allocated, -1); } /* * Adjust the thread's position on a pi_state after its priority has been * changed. */ static int umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td) { struct umtx_q *uq, *uq1, *uq2; struct thread *td1; mtx_assert(&umtx_lock, MA_OWNED); if (pi == NULL) return (0); uq = td->td_umtxq; /* * Check if the thread needs to be moved on the blocked chain. * It needs to be moved if either its priority is lower than * the previous thread or higher than the next thread. */ uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq); uq2 = TAILQ_NEXT(uq, uq_lockq); if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) || (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) { /* * Remove thread from blocked chain and determine where * it should be moved to. 
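 *
 * (Added note: pi_blocked is kept sorted by UPRI(), best priority
 * first, so the loop below re-inserts the thread in front of the
 * first waiter whose priority value is numerically larger, i.e.
 * worse.)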
*/ TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq); TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) { td1 = uq1->uq_thread; MPASS(td1->td_proc->p_magic == P_MAGIC); if (UPRI(td1) > UPRI(td)) break; } if (uq1 == NULL) TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq); else TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq); } return (1); } static struct umtx_pi * umtx_pi_next(struct umtx_pi *pi) { struct umtx_q *uq_owner; if (pi->pi_owner == NULL) return (NULL); uq_owner = pi->pi_owner->td_umtxq; if (uq_owner == NULL) return (NULL); return (uq_owner->uq_pi_blocked); } /* * Floyd's Cycle-Finding Algorithm. */ static bool umtx_pi_check_loop(struct umtx_pi *pi) { struct umtx_pi *pi1; /* fast iterator */ mtx_assert(&umtx_lock, MA_OWNED); if (pi == NULL) return (false); pi1 = pi; for (;;) { pi = umtx_pi_next(pi); if (pi == NULL) break; pi1 = umtx_pi_next(pi1); if (pi1 == NULL) break; pi1 = umtx_pi_next(pi1); if (pi1 == NULL) break; if (pi == pi1) return (true); } return (false); } /* * Propagate priority when a thread is blocked on POSIX * PI mutex. */ static void umtx_propagate_priority(struct thread *td) { struct umtx_q *uq; struct umtx_pi *pi; int pri; mtx_assert(&umtx_lock, MA_OWNED); pri = UPRI(td); uq = td->td_umtxq; pi = uq->uq_pi_blocked; if (pi == NULL) return; if (umtx_pi_check_loop(pi)) return; for (;;) { td = pi->pi_owner; if (td == NULL || td == curthread) return; MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); thread_lock(td); if (td->td_lend_user_pri > pri) sched_lend_user_prio(td, pri); else { thread_unlock(td); break; } thread_unlock(td); /* * Pick up the lock that td is blocked on. */ uq = td->td_umtxq; pi = uq->uq_pi_blocked; if (pi == NULL) break; /* Resort td on the list if needed. */ umtx_pi_adjust_thread(pi, td); } } /* * Unpropagate priority for a PI mutex when a thread blocked on * it is interrupted by signal or resumed by others. */ static void umtx_repropagate_priority(struct umtx_pi *pi) { struct umtx_q *uq, *uq_owner; struct umtx_pi *pi2; int pri; mtx_assert(&umtx_lock, MA_OWNED); if (umtx_pi_check_loop(pi)) return; while (pi != NULL && pi->pi_owner != NULL) { pri = PRI_MAX; uq_owner = pi->pi_owner->td_umtxq; TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) { uq = TAILQ_FIRST(&pi2->pi_blocked); if (uq != NULL) { if (pri > UPRI(uq->uq_thread)) pri = UPRI(uq->uq_thread); } } if (pri > uq_owner->uq_inherited_pri) pri = uq_owner->uq_inherited_pri; thread_lock(pi->pi_owner); sched_lend_user_prio(pi->pi_owner, pri); thread_unlock(pi->pi_owner); if ((pi = uq_owner->uq_pi_blocked) != NULL) umtx_pi_adjust_thread(pi, uq_owner->uq_thread); } } /* * Insert a PI mutex into owned list. */ static void umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner) { struct umtx_q *uq_owner; uq_owner = owner->td_umtxq; mtx_assert(&umtx_lock, MA_OWNED); MPASS(pi->pi_owner == NULL); pi->pi_owner = owner; TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link); } /* * Disown a PI mutex, and remove it from the owned list. */ static void umtx_pi_disown(struct umtx_pi *pi) { mtx_assert(&umtx_lock, MA_OWNED); TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link); pi->pi_owner = NULL; } /* * Claim ownership of a PI mutex. */ static int umtx_pi_claim(struct umtx_pi *pi, struct thread *owner) { struct umtx_q *uq; int pri; mtx_lock(&umtx_lock); if (pi->pi_owner == owner) { mtx_unlock(&umtx_lock); return (0); } if (pi->pi_owner != NULL) { /* * userland may have already messed the mutex, sigh. 
*/ mtx_unlock(&umtx_lock); return (EPERM); } umtx_pi_setowner(pi, owner); uq = TAILQ_FIRST(&pi->pi_blocked); if (uq != NULL) { pri = UPRI(uq->uq_thread); thread_lock(owner); if (pri < UPRI(owner)) sched_lend_user_prio(owner, pri); thread_unlock(owner); } mtx_unlock(&umtx_lock); return (0); } /* * Adjust a thread's order position in its blocked PI mutex, * this may result new priority propagating process. */ void umtx_pi_adjust(struct thread *td, u_char oldpri) { struct umtx_q *uq; struct umtx_pi *pi; uq = td->td_umtxq; mtx_lock(&umtx_lock); /* * Pick up the lock that td is blocked on. */ pi = uq->uq_pi_blocked; if (pi != NULL) { umtx_pi_adjust_thread(pi, td); umtx_repropagate_priority(pi); } mtx_unlock(&umtx_lock); } /* * Sleep on a PI mutex. */ static int umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner, const char *wmesg, struct abs_timeout *timo, bool shared) { struct umtxq_chain *uc; struct thread *td, *td1; struct umtx_q *uq1; int error, pri; error = 0; td = uq->uq_thread; KASSERT(td == curthread, ("inconsistent uq_thread")); uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); KASSERT(uc->uc_busy != 0, ("umtx chain is not busy")); umtxq_insert(uq); mtx_lock(&umtx_lock); if (pi->pi_owner == NULL) { mtx_unlock(&umtx_lock); td1 = tdfind(owner, shared ? -1 : td->td_proc->p_pid); mtx_lock(&umtx_lock); if (td1 != NULL) { if (pi->pi_owner == NULL) umtx_pi_setowner(pi, td1); PROC_UNLOCK(td1->td_proc); } } TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) { pri = UPRI(uq1->uq_thread); if (pri > UPRI(td)) break; } if (uq1 != NULL) TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq); else TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq); uq->uq_pi_blocked = pi; thread_lock(td); td->td_flags |= TDF_UPIBLOCKED; thread_unlock(td); umtx_propagate_priority(td); mtx_unlock(&umtx_lock); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, wmesg, timo); umtxq_remove(uq); mtx_lock(&umtx_lock); uq->uq_pi_blocked = NULL; thread_lock(td); td->td_flags &= ~TDF_UPIBLOCKED; thread_unlock(td); TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq); umtx_repropagate_priority(pi); mtx_unlock(&umtx_lock); umtxq_unlock(&uq->uq_key); return (error); } /* * Add reference count for a PI mutex. */ static void umtx_pi_ref(struct umtx_pi *pi) { struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); pi->pi_refcount++; } /* * Decrease reference count for a PI mutex, if the counter * is decreased to zero, its memory space is freed. */ static void umtx_pi_unref(struct umtx_pi *pi) { struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); KASSERT(pi->pi_refcount > 0, ("invalid reference count")); if (--pi->pi_refcount == 0) { mtx_lock(&umtx_lock); if (pi->pi_owner != NULL) umtx_pi_disown(pi); KASSERT(TAILQ_EMPTY(&pi->pi_blocked), ("blocked queue not empty")); mtx_unlock(&umtx_lock); TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink); umtx_pi_free(pi); } } /* * Find a PI mutex in hash table. */ static struct umtx_pi * umtx_pi_lookup(struct umtx_key *key) { struct umtxq_chain *uc; struct umtx_pi *pi; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) { if (umtx_key_match(&pi->pi_key, key)) { return (pi); } } return (NULL); } /* * Insert a PI mutex into hash table. */ static inline void umtx_pi_insert(struct umtx_pi *pi) { struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink); } /* * Lock a PI mutex. 
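 *
 * (Illustrative userland setup, not from this file: a PI umutex is an
 * ordinary umutex whose m_flags carry UMUTEX_PRIO_INHERIT, as arranged
 * by pthread_mutexattr_setprotocol(..., PTHREAD_PRIO_INHERIT).)
 *
 *	struct umutex m = DEFAULT_UMUTEX;
 *	m.m_flags = UMUTEX_PRIO_INHERIT;
 *	(void)_umtx_op(&m, UMTX_OP_MUTEX_TRYLOCK, 0, NULL, NULL);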
*/ static int do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int try) { struct abs_timeout timo; struct umtx_q *uq; struct umtx_pi *pi, *new_pi; uint32_t id, old_owner, owner, old; int error, rv; id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); pi = umtx_pi_lookup(&uq->uq_key); if (pi == NULL) { new_pi = umtx_pi_alloc(M_NOWAIT); if (new_pi == NULL) { umtxq_unlock(&uq->uq_key); new_pi = umtx_pi_alloc(M_WAITOK); umtxq_lock(&uq->uq_key); pi = umtx_pi_lookup(&uq->uq_key); if (pi != NULL) { umtx_pi_free(new_pi); new_pi = NULL; } } if (new_pi != NULL) { new_pi->pi_key = uq->uq_key; umtx_pi_insert(new_pi); pi = new_pi; } } umtx_pi_ref(pi); umtxq_unlock(&uq->uq_key); /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { /* * Try the uncontested case. This should be done in userland. */ rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } /* The acquire succeeded. */ if (owner == UMUTEX_UNOWNED) { error = 0; break; } if (owner == UMUTEX_RB_NOTRECOV) { error = ENOTRECOVERABLE; break; } /* If no one owns it but it is contested try to acquire it. */ if (owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD) { old_owner = owner; rv = casueword32(&m->m_owner, owner, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } if (owner == old_owner) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); error = umtx_pi_claim(pi, td); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); if (error != 0) { /* * Since we're going to return an * error, restore the m_owner to its * previous, unowned state to avoid * compounding the problem. */ (void)casuword32(&m->m_owner, id | UMUTEX_CONTESTED, old_owner); } if (error == 0 && old_owner == UMUTEX_RB_OWNERDEAD) error = EOWNERDEAD; break; } error = umtxq_check_susp(td); if (error != 0) break; /* If this failed the lock has changed, restart. */ continue; } if ((owner & ~UMUTEX_CONTESTED) == id) { error = EDEADLK; break; } if (try != 0) { error = EBUSY; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ rv = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } umtxq_lock(&uq->uq_key); /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. Note that the UMUTEX_RB_OWNERDEAD * value for owner is impossible there. */ if (old == owner) { error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED, "umtxpi", timeout == NULL ? 
NULL : &timo, (flags & USYNC_PROCESS_SHARED) != 0); if (error != 0) continue; } else { umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } error = umtxq_check_susp(td); if (error != 0) break; } umtxq_lock(&uq->uq_key); umtx_pi_unref(pi); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Unlock a PI mutex. */ static int do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags, bool rb) { struct umtx_key key; struct umtx_q *uq_first, *uq_first2, *uq_me; struct umtx_pi *pi, *pi2; uint32_t id, new_owner, old, owner; int count, error, pri; id = td->td_tid; /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); new_owner = umtx_unlock_val(flags, rb); /* This should be done in userland */ if ((owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, new_owner); if (error == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count_pi(&key, &uq_first); if (uq_first != NULL) { mtx_lock(&umtx_lock); pi = uq_first->uq_pi_blocked; KASSERT(pi != NULL, ("pi == NULL?")); if (pi->pi_owner != td && !(rb && pi->pi_owner == NULL)) { mtx_unlock(&umtx_lock); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); /* userland messed the mutex */ return (EPERM); } uq_me = td->td_umtxq; if (pi->pi_owner == td) umtx_pi_disown(pi); /* get highest priority thread which is still sleeping. */ uq_first = TAILQ_FIRST(&pi->pi_blocked); while (uq_first != NULL && (uq_first->uq_flags & UQF_UMTXQ) == 0) { uq_first = TAILQ_NEXT(uq_first, uq_lockq); } pri = PRI_MAX; TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) { uq_first2 = TAILQ_FIRST(&pi2->pi_blocked); if (uq_first2 != NULL) { if (pri > UPRI(uq_first2->uq_thread)) pri = UPRI(uq_first2->uq_thread); } } thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); if (uq_first) umtxq_signal_thread(uq_first); } else { pi = umtx_pi_lookup(&key); /* * A umtx_pi can exist if a signal or timeout removed the * last waiter from the umtxq, but there is still * a thread in do_lock_pi() holding the umtx_pi. */ if (pi != NULL) { /* * The umtx_pi can be unowned, such as when a thread * has just entered do_lock_pi(), allocated the * umtx_pi, and unlocked the umtxq. * If the current thread owns it, it must disown it. */ mtx_lock(&umtx_lock); if (pi->pi_owner == td) umtx_pi_disown(pi); mtx_unlock(&umtx_lock); } } umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ if (count > 1) new_owner |= UMUTEX_CONTESTED; error = casueword32(&m->m_owner, owner, &old, new_owner); umtxq_unbusy_unlocked(&key); umtx_key_release(&key); if (error == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } /* * Lock a PP mutex. */ static int do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int try) { struct abs_timeout timo; struct umtx_q *uq, *uq2; struct umtx_pi *pi; uint32_t ceiling; uint32_t owner, id; int error, pri, old_inherited_pri, su, rv; id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? 
TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0); for (;;) { old_inherited_pri = uq->uq_inherited_pri; umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); rv = fueword32(&m->m_ceilings[0], &ceiling); if (rv == -1) { error = EFAULT; goto out; } ceiling = RTP_PRIO_MAX - ceiling; if (ceiling > RTP_PRIO_MAX) { error = EINVAL; goto out; } mtx_lock(&umtx_lock); if (UPRI(td) < PRI_MIN_REALTIME + ceiling) { mtx_unlock(&umtx_lock); error = EINVAL; goto out; } if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) { uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling; thread_lock(td); if (uq->uq_inherited_pri < UPRI(td)) sched_lend_user_prio(td, uq->uq_inherited_pri); thread_unlock(td); } mtx_unlock(&umtx_lock); rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } if (owner == UMUTEX_CONTESTED) { error = 0; break; } else if (owner == UMUTEX_RB_OWNERDEAD) { rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD, &owner, id | UMUTEX_CONTESTED); if (rv == -1) { error = EFAULT; break; } if (owner == UMUTEX_RB_OWNERDEAD) { error = EOWNERDEAD; /* success */ break; } error = 0; } else if (owner == UMUTEX_RB_NOTRECOV) { error = ENOTRECOVERABLE; break; } if (try != 0) { error = EBUSY; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "umtxpp", timeout == NULL ? NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); mtx_lock(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } if (error != 0 && error != EOWNERDEAD) { mtx_lock(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } out: umtxq_unbusy_unlocked(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Unlock a PP mutex. */ static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb) { struct umtx_key key; struct umtx_q *uq, *uq2; struct umtx_pi *pi; uint32_t id, owner, rceiling; int error, pri, new_inherited_pri, su; id = td->td_tid; uq = td->td_umtxq; su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0); /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t)); if (error != 0) return (error); if (rceiling == -1) new_inherited_pri = PRI_MAX; else { rceiling = RTP_PRIO_MAX - rceiling; if (rceiling > RTP_PRIO_MAX) return (EINVAL); new_inherited_pri = PRI_MIN_REALTIME + rceiling; } if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? 
TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); umtxq_unlock(&key); /* * For priority protected mutex, always set unlocked state * to UMUTEX_CONTESTED, so that userland always enters kernel * to lock the mutex, it is necessary because thread priority * has to be adjusted for such mutex. */ error = suword32(&m->m_owner, umtx_unlock_val(flags, rb) | UMUTEX_CONTESTED); umtxq_lock(&key); if (error == 0) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); if (error == -1) error = EFAULT; else { mtx_lock(&umtx_lock); if (su != 0) uq->uq_inherited_pri = new_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } umtx_key_release(&key); return (error); } static int do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling, uint32_t *old_ceiling) { struct umtx_q *uq; uint32_t flags, id, owner, save_ceiling; int error, rv, rv1; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); if ((flags & UMUTEX_PRIO_PROTECT) == 0) return (EINVAL); if (ceiling > RTP_PRIO_MAX) return (EINVAL); id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); for (;;) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); rv = fueword32(&m->m_ceilings[0], &save_ceiling); if (rv == -1) { error = EFAULT; break; } rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); if (rv == -1) { error = EFAULT; break; } if (owner == UMUTEX_CONTESTED) { rv = suword32(&m->m_ceilings[0], ceiling); rv1 = suword32(&m->m_owner, UMUTEX_CONTESTED); error = (rv == 0 && rv1 == 0) ? 0: EFAULT; break; } if ((owner & ~UMUTEX_CONTESTED) == id) { rv = suword32(&m->m_ceilings[0], ceiling); error = rv == 0 ? 0 : EFAULT; break; } if (owner == UMUTEX_RB_OWNERDEAD) { error = EOWNERDEAD; break; } else if (owner == UMUTEX_RB_NOTRECOV) { error = ENOTRECOVERABLE; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "umtxpp", NULL); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); } umtxq_lock(&uq->uq_key); if (error == 0) umtxq_signal(&uq->uq_key, INT_MAX); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0 && old_ceiling != NULL) { rv = suword32(old_ceiling, save_ceiling); error = rv == 0 ? 0 : EFAULT; } return (error); } /* * Lock a userland POSIX mutex. 
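 *
 * (Added note: the request is dispatched on m_flags -- 0 selects the
 * normal protocol, UMUTEX_PRIO_INHERIT the PI protocol and
 * UMUTEX_PRIO_PROTECT the PP protocol -- and the errno policy below
 * keeps untimed lock attempts restartable, turning EINTR into
 * ERESTART except for _UMUTEX_WAIT, while timed attempts are not
 * restarted and fail with EINTR.)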
*/ static int do_lock_umutex(struct thread *td, struct umutex *m, struct _umtx_time *timeout, int mode) { uint32_t flags; int error; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) { case 0: error = do_lock_normal(td, m, flags, timeout, mode); break; case UMUTEX_PRIO_INHERIT: error = do_lock_pi(td, m, flags, timeout, mode); break; case UMUTEX_PRIO_PROTECT: error = do_lock_pp(td, m, flags, timeout, mode); break; default: return (EINVAL); } if (timeout == NULL) { if (error == EINTR && mode != _UMUTEX_WAIT) error = ERESTART; } else { /* Timed-locking is not restarted. */ if (error == ERESTART) error = EINTR; } return (error); } /* * Unlock a userland POSIX mutex. */ static int do_unlock_umutex(struct thread *td, struct umutex *m, bool rb) { uint32_t flags; int error; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) { case 0: return (do_unlock_normal(td, m, flags, rb)); case UMUTEX_PRIO_INHERIT: return (do_unlock_pi(td, m, flags, rb)); case UMUTEX_PRIO_PROTECT: return (do_unlock_pp(td, m, flags, rb)); } return (EINVAL); } static int do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m, struct timespec *timeout, u_long wflags) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags, clockid, hasw; int error; uq = td->td_umtxq; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if ((wflags & CVWAIT_CLOCKID) != 0) { error = fueword32(&cv->c_clockid, &clockid); if (error == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (clockid < CLOCK_REALTIME || clockid >= CLOCK_THREAD_CPUTIME_ID) { /* hmm, only HW clock id will work. */ umtx_key_release(&uq->uq_key); return (EINVAL); } } else { clockid = CLOCK_REALTIME; } umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); /* * Set c_has_waiters to 1 before releasing user mutex, also * don't modify cache line when unnecessary. */ error = fueword32(&cv->c_has_waiters, &hasw); if (error == 0 && hasw == 0) suword32(&cv->c_has_waiters, 1); umtxq_unbusy_unlocked(&uq->uq_key); error = do_unlock_umutex(td, m, false); if (timeout != NULL) abs_timeout_init(&timo, clockid, (wflags & CVWAIT_ABSTIME) != 0, timeout); umtxq_lock(&uq->uq_key); if (error == 0) { error = umtxq_sleep(uq, "ucond", timeout == NULL ? NULL : &timo); } if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { /* * This must be timeout,interrupted by signal or * surprious wakeup, clear c_has_waiter flag when * necessary. */ umtxq_busy(&uq->uq_key); if ((uq->uq_flags & UQF_UMTXQ) != 0) { int oldlen = uq->uq_cur_queue->length; umtxq_remove(uq); if (oldlen == 1) { umtxq_unlock(&uq->uq_key); suword32(&cv->c_has_waiters, 0); umtxq_lock(&uq->uq_key); } } umtxq_unbusy(&uq->uq_key); if (error == ERESTART) error = EINTR; } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland condition variable. 
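 *
 * (Illustrative handshake, not from this file: do_cv_wait() sets
 * c_has_waiters before sleeping, so a userland signaller can skip the
 * syscall entirely when the word is clear.)
 *
 *	if (atomic_load_acq_32(&cv->c_has_waiters) != 0)
 *		(void)_umtx_op(cv, UMTX_OP_CV_SIGNAL, 0, NULL, NULL);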
*/ static int do_cv_signal(struct thread *td, struct ucond *cv) { struct umtx_key key; int error, cnt, nwake; uint32_t flags; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); nwake = umtxq_signal(&key, 1); if (cnt <= nwake) { umtxq_unlock(&key); error = suword32(&cv->c_has_waiters, 0); if (error == -1) error = EFAULT; umtxq_lock(&key); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } static int do_cv_broadcast(struct thread *td, struct ucond *cv) { struct umtx_key key; int error; uint32_t flags; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); umtxq_signal(&key, INT_MAX); umtxq_unlock(&key); error = suword32(&cv->c_has_waiters, 0); if (error == -1) error = EFAULT; umtxq_unbusy_unlocked(&key); umtx_key_release(&key); return (error); } static int do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags, wrflags; int32_t state, oldstate; int32_t blocked_readers; int error, error1, rv; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); wrflags = URWLOCK_WRITE_OWNER; if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER)) wrflags |= URWLOCK_WRITE_WAITERS; for (;;) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } /* try to lock it */ while (!(state & wrflags)) { if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) { umtx_key_release(&uq->uq_key); return (EAGAIN); } rv = casueword32(&rwlock->rw_state, state, &oldstate, state + 1); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (oldstate == state) { umtx_key_release(&uq->uq_key); return (0); } error = umtxq_check_susp(td); if (error != 0) break; state = oldstate; } if (error) break; /* grab monitor lock */ umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * re-read the state, in case it changed between the try-lock above * and the check below */ rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) error = EFAULT; /* set read contention bit */ while (error == 0 && (state & wrflags) && !(state & URWLOCK_READ_WAITERS)) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_READ_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) goto sleep; state = oldstate; error = umtxq_check_susp(td); if (error != 0) break; } if (error != 0) { umtxq_unbusy_unlocked(&uq->uq_key); break; } /* state is changed while setting flags, restart */ if (!(state & wrflags)) { umtxq_unbusy_unlocked(&uq->uq_key); error = umtxq_check_susp(td); if (error != 0) break; continue; } sleep: /* contention bit is set, before sleeping, increase read waiter count */ rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_readers, blocked_readers+1); while (state & wrflags) { umtxq_lock(&uq->uq_key); umtxq_insert(uq); 
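/*
 * Editor's note (added comment): the waiter is already on the shared
 * queue here; dropping the busy bit below lets an unlocking thread
 * find and wake it, so no wakeup can be lost between the rw_state
 * check above and the sleep.
 */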
umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "urdlck", timeout == NULL ? NULL : &timo); umtxq_busy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); if (error) break; rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { error = EFAULT; break; } } /* decrease read waiter count, and may clear read contention bit */ rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_readers, blocked_readers-1); if (blocked_readers == 1) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_READ_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) break; state = oldstate; error1 = umtxq_check_susp(td); if (error1 != 0) { if (error == 0) error = error1; break; } } } umtxq_unbusy_unlocked(&uq->uq_key); if (error != 0) break; } umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } static int do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags; int32_t state, oldstate; int32_t blocked_writers; int32_t blocked_readers; int error, error1, rv; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); blocked_readers = 0; for (;;) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_WRITE_OWNER); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (oldstate == state) { umtx_key_release(&uq->uq_key); return (0); } state = oldstate; error = umtxq_check_susp(td); if (error != 0) break; } if (error) { if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) && blocked_readers != 0) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } break; } /* grab monitor lock */ umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * re-read the state, in case it changed between the try-lock above * and the check below */ rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) error = EFAULT; while (error == 0 && ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) && (state & URWLOCK_WRITE_WAITERS) == 0) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_WRITE_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) goto sleep; state = oldstate; error = umtxq_check_susp(td); if (error != 0) break; } if (error != 0) { umtxq_unbusy_unlocked(&uq->uq_key); break; } if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) { umtxq_unbusy_unlocked(&uq->uq_key); error = umtxq_check_susp(td); if (error != 0) break; continue; } sleep: rv = fueword32(&rwlock->rw_blocked_writers, &blocked_writers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_writers, blocked_writers+1); while ((state & URWLOCK_WRITE_OWNER) || 
URWLOCK_READER_COUNT(state) != 0) { umtxq_lock(&uq->uq_key); umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "uwrlck", timeout == NULL ? NULL : &timo); umtxq_busy(&uq->uq_key); umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE); umtxq_unlock(&uq->uq_key); if (error) break; rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { error = EFAULT; break; } } rv = fueword32(&rwlock->rw_blocked_writers, &blocked_writers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_writers, blocked_writers-1); if (blocked_writers == 1) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_WRITE_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) break; state = oldstate; error1 = umtxq_check_susp(td); /* * We are leaving the URWLOCK_WRITE_WAITERS * behind, but this should not harm the * correctness. */ if (error1 != 0) { if (error == 0) error = error1; break; } } rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } } else blocked_readers = 0; umtxq_unbusy_unlocked(&uq->uq_key); } umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } static int do_rw_unlock(struct thread *td, struct urwlock *rwlock) { struct umtx_q *uq; uint32_t flags; int32_t state, oldstate; int error, rv, q, count; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); error = fueword32(&rwlock->rw_state, &state); if (error == -1) { error = EFAULT; goto out; } if (state & URWLOCK_WRITE_OWNER) { for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_WRITE_OWNER); if (rv == -1) { error = EFAULT; goto out; } if (oldstate != state) { state = oldstate; if (!(oldstate & URWLOCK_WRITE_OWNER)) { error = EPERM; goto out; } error = umtxq_check_susp(td); if (error != 0) goto out; } else break; } } else if (URWLOCK_READER_COUNT(state) != 0) { for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state - 1); if (rv == -1) { error = EFAULT; goto out; } if (oldstate != state) { state = oldstate; if (URWLOCK_READER_COUNT(oldstate) == 0) { error = EPERM; goto out; } error = umtxq_check_susp(td); if (error != 0) goto out; } else break; } } else { error = EPERM; goto out; } count = 0; if (!(flags & URWLOCK_PREFER_READER)) { if (state & URWLOCK_WRITE_WAITERS) { count = 1; q = UMTX_EXCLUSIVE_QUEUE; } else if (state & URWLOCK_READ_WAITERS) { count = INT_MAX; q = UMTX_SHARED_QUEUE; } } else { if (state & URWLOCK_READ_WAITERS) { count = INT_MAX; q = UMTX_SHARED_QUEUE; } else if (state & URWLOCK_WRITE_WAITERS) { count = 1; q = UMTX_EXCLUSIVE_QUEUE; } } if (count) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_signal_queue(&uq->uq_key, count, q); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } out: umtx_key_release(&uq->uq_key); return (error); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags, count, count1; int error, rv; uq = td->td_umtxq; error = fueword32(&sem->_flags, &flags); if (error == -1) return (EFAULT); 
error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); rv = casueword32(&sem->_has_waiters, 0, &count1, 1); if (rv == 0) rv = fueword32(&sem->_count, &count); if (rv == -1 || count != 0) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (rv == -1 ? EFAULT : 0); } umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { umtxq_remove(uq); /* A relative timeout cannot be restarted. */ if (error == ERESTART && timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) error = EINTR; } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland semaphore. */ static int do_sem_wake(struct thread *td, struct _usem *sem) { struct umtx_key key; int error, cnt; uint32_t flags; error = fueword32(&sem->_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); if (cnt > 0) { /* * The count is greater than 0, which means the memory is * still being referenced by user code, so we can safely * update the _has_waiters flag. */ if (cnt == 1) { umtxq_unlock(&key); error = suword32(&sem->_has_waiters, 0); umtxq_lock(&key); if (error == -1) error = EFAULT; } umtxq_signal(&key, 1); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } #endif static int do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t count, flags; int error, rv; uq = td->td_umtxq; flags = fuword32(&sem->_flags); error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); rv = fueword32(&sem->_count, &count); if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } for (;;) { if (USEM_COUNT(count) != 0) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (0); } if (count == USEM_HAS_WAITERS) break; rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS); if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } if (count == 0) break; } umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { umtxq_remove(uq); if (timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) { /* A relative timeout cannot be restarted. */ if (error == ERESTART) error = EINTR; if (error == EINTR) { abs_timeout_update(&timo); timeout->_timeout = timo.end; timespecsub(&timeout->_timeout, &timo.cur); } } } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); }
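/*
 * Editor's sketch (not part of the original source): the userland half
 * of the _usem2 protocol implemented by do_sem2_wait() above and
 * do_sem2_wake() below, loosely following what libc's POSIX semaphores
 * do.  _umtx_op(2) and the USEM_* macros are real interfaces from
 * <sys/umtx.h>; error handling and timeouts are elided.
 */
#include <sys/types.h>
#include <sys/umtx.h>
#include <machine/atomic.h>

static struct _usem2 sem2 = { ._count = 0, ._flags = 0 };

static void
sem2_post(void)
{
	uint32_t c;

	/* Make a count available; enter the kernel only if someone sleeps. */
	c = atomic_fetchadd_32(&sem2._count, 1);
	if ((c & USEM_HAS_WAITERS) != 0)
		(void)_umtx_op(&sem2, UMTX_OP_SEM2_WAKE, 0, NULL, NULL);
}

static void
sem2_wait(void)
{
	uint32_t c;

	for (;;) {
		/* Fast path: grab a count without a syscall. */
		c = sem2._count;
		if (USEM_COUNT(c) > 0 &&
		    atomic_cmpset_32(&sem2._count, c, c - 1))
			return;
		/* Kernel sets USEM_HAS_WAITERS and sleeps until posted. */
		(void)_umtx_op(&sem2, UMTX_OP_SEM2_WAIT, 0, NULL, NULL);
	}
}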
/* * Signal a userland semaphore. */ static int do_sem2_wake(struct thread *td, struct _usem2 *sem) { struct umtx_key key; int error, cnt, rv; uint32_t count, flags; rv = fueword32(&sem->_flags, &flags); if (rv == -1) return (EFAULT); if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); if (cnt > 0) { /* * If this was the last sleeping thread, clear the waiters * flag in _count. */ if (cnt == 1) { umtxq_unlock(&key); rv = fueword32(&sem->_count, &count); while (rv != -1 && count & USEM_HAS_WAITERS) rv = casueword32(&sem->_count, count, &count, count & ~USEM_HAS_WAITERS); if (rv == -1) error = EFAULT; umtxq_lock(&key); } umtxq_signal(&key, 1); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } inline int umtx_copyin_timeout(const void *addr, struct timespec *tsp) { int error; error = copyin(addr, tsp, sizeof(struct timespec)); if (error == 0) { if (tsp->tv_sec < 0 || tsp->tv_nsec >= 1000000000 || tsp->tv_nsec < 0) error = EINVAL; } return (error); } static inline int umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp) { int error; if (size <= sizeof(struct timespec)) { tp->_clockid = CLOCK_REALTIME; tp->_flags = 0; error = copyin(addr, &tp->_timeout, sizeof(struct timespec)); } else error = copyin(addr, tp, sizeof(struct _umtx_time)); if (error != 0) return (error); if (tp->_timeout.tv_sec < 0 || tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0) return (EINVAL); return (0); } static int __umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap) { return (EOPNOTSUPP); } static int __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout, *tm_p; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, 0, 0)); } static int __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout, *tm_p; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0)); } static int __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1)); } static int __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap) { return (kern_umtx_wake(td, uap->obj, uap->val, 0)); } #define BATCH_SIZE 128 static int __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap) { char *uaddrs[BATCH_SIZE], **upp; int count, error, i, pos, tocopy; upp = (char **)uap->obj; error = 0; for (count = uap->val, pos = 0; count > 0; count -= tocopy, pos += tocopy) { tocopy = MIN(count, BATCH_SIZE); error = copyin(upp + pos, uaddrs, tocopy * sizeof(char *)); if (error != 0) break; for (i = 0; i < tocopy; ++i) kern_umtx_wake(td, uaddrs[i], INT_MAX, 1); maybe_yield(); } return (error); } static int __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap) { return (kern_umtx_wake(td, uap->obj, uap->val, 1)); } static int __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap) { struct
_umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_lock_umutex(td, uap->obj, tm_p, 0)); } static int __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap) { return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY)); } static int __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT)); } static int __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap) { return (do_wake_umutex(td, uap->obj)); } static int __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap) { return (do_unlock_umutex(td, uap->obj, false)); } static int __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap) { return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1)); } static int __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = umtx_copyin_timeout(uap->uaddr2, &timeout); if (error != 0) return (error); ts = &timeout; } return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val)); } static int __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap) { return (do_cv_signal(td, uap->obj)); } static int __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap) { return (do_cv_broadcast(td, uap->obj)); } static int __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_rdlock(td, uap->obj, uap->val, 0); } else { error = umtx_copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_rdlock(td, uap->obj, uap->val, &timeout); } return (error); } static int __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_wrlock(td, uap->obj, 0); } else { error = umtx_copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_wrlock(td, uap->obj, &timeout); } return (error); } static int __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap) { return (do_rw_unlock(td, uap->obj)); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). 
*/ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_sem_wait(td, uap->obj, tm_p)); } static int __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap) { return (do_sem_wake(td, uap->obj)); } #endif static int __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap) { return (do_wake2_umutex(td, uap->obj, uap->val)); } static int __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; size_t uasize; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { uasize = 0; tm_p = NULL; } else { uasize = (size_t)uap->uaddr1; error = umtx_copyin_umtx_time(uap->uaddr2, uasize, &timeout); if (error != 0) return (error); tm_p = &timeout; } error = do_sem2_wait(td, uap->obj, tm_p); if (error == EINTR && uap->uaddr2 != NULL && (timeout._flags & UMTX_ABSTIME) == 0 && uasize >= sizeof(struct _umtx_time) + sizeof(struct timespec)) { error = copyout(&timeout._timeout, (struct _umtx_time *)uap->uaddr2 + 1, sizeof(struct timespec)); if (error == 0) { error = EINTR; } } return (error); } static int __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap) { return (do_sem2_wake(td, uap->obj)); } #define USHM_OBJ_UMTX(o) \ ((struct umtx_shm_obj_list *)(&(o)->umtx_data)) #define USHMF_REG_LINKED 0x0001 #define USHMF_OBJ_LINKED 0x0002 struct umtx_shm_reg { TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link; LIST_ENTRY(umtx_shm_reg) ushm_obj_link; struct umtx_key ushm_key; struct ucred *ushm_cred; struct shmfd *ushm_obj; u_int ushm_refcnt; u_int ushm_flags; }; LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg); TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg); static uma_zone_t umtx_shm_reg_zone; static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS]; static struct mtx umtx_shm_lock; static struct umtx_shm_reg_head umtx_shm_reg_delfree = TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree); static void umtx_shm_free_reg(struct umtx_shm_reg *reg); static void umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused) { struct umtx_shm_reg_head d; struct umtx_shm_reg *reg, *reg1; TAILQ_INIT(&d); mtx_lock(&umtx_shm_lock); TAILQ_CONCAT(&d, &umtx_shm_reg_delfree, ushm_reg_link); mtx_unlock(&umtx_shm_lock); TAILQ_FOREACH_SAFE(reg, &d, ushm_reg_link, reg1) { TAILQ_REMOVE(&d, reg, ushm_reg_link); umtx_shm_free_reg(reg); } } static struct task umtx_shm_reg_delfree_task = TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL); static struct umtx_shm_reg * umtx_shm_find_reg_locked(const struct umtx_key *key) { struct umtx_shm_reg *reg; struct umtx_shm_reg_head *reg_head; KASSERT(key->shared, ("umtx_p_find_rg: private key")); mtx_assert(&umtx_shm_lock, MA_OWNED); reg_head = &umtx_shm_registry[key->hash]; TAILQ_FOREACH(reg, reg_head, ushm_reg_link) { KASSERT(reg->ushm_key.shared, ("non-shared key on reg %p %d", reg, reg->ushm_key.shared)); if (reg->ushm_key.info.shared.object == key->info.shared.object && reg->ushm_key.info.shared.offset == key->info.shared.offset) { KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM")); KASSERT(reg->ushm_refcnt > 0, ("reg %p refcnt 0 onlist", reg)); KASSERT((reg->ushm_flags & USHMF_REG_LINKED) != 0, ("reg %p not linked", reg)); reg->ushm_refcnt++; return (reg); } } return (NULL); } static struct umtx_shm_reg * umtx_shm_find_reg(const struct umtx_key *key) { struct umtx_shm_reg *reg; mtx_lock(&umtx_shm_lock); reg = umtx_shm_find_reg_locked(key); mtx_unlock(&umtx_shm_lock); 
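/*
 * Editor's note (added comment): if a registration was found, its
 * reference count was bumped by umtx_shm_find_reg_locked() while
 * umtx_shm_lock was held, so the returned pointer stays valid after
 * the unlock.
 */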
return (reg); } static void umtx_shm_free_reg(struct umtx_shm_reg *reg) { chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0); crfree(reg->ushm_cred); shm_drop(reg->ushm_obj); uma_zfree(umtx_shm_reg_zone, reg); } static bool umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool force) { bool res; mtx_assert(&umtx_shm_lock, MA_OWNED); KASSERT(reg->ushm_refcnt > 0, ("ushm_reg %p refcnt 0", reg)); reg->ushm_refcnt--; res = reg->ushm_refcnt == 0; if (res || force) { if ((reg->ushm_flags & USHMF_REG_LINKED) != 0) { TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash], reg, ushm_reg_link); reg->ushm_flags &= ~USHMF_REG_LINKED; } if ((reg->ushm_flags & USHMF_OBJ_LINKED) != 0) { LIST_REMOVE(reg, ushm_obj_link); reg->ushm_flags &= ~USHMF_OBJ_LINKED; } } return (res); } static void umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool force) { vm_object_t object; bool dofree; if (force) { object = reg->ushm_obj->shm_object; VM_OBJECT_WLOCK(object); object->flags |= OBJ_UMTXDEAD; VM_OBJECT_WUNLOCK(object); } mtx_lock(&umtx_shm_lock); dofree = umtx_shm_unref_reg_locked(reg, force); mtx_unlock(&umtx_shm_lock); if (dofree) umtx_shm_free_reg(reg); } void umtx_shm_object_init(vm_object_t object) { LIST_INIT(USHM_OBJ_UMTX(object)); } void umtx_shm_object_terminated(vm_object_t object) { struct umtx_shm_reg *reg, *reg1; bool dofree; dofree = false; mtx_lock(&umtx_shm_lock); LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, reg1) { if (umtx_shm_unref_reg_locked(reg, true)) { TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg, ushm_reg_link); dofree = true; } } mtx_unlock(&umtx_shm_lock); if (dofree) taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task); } static int umtx_shm_create_reg(struct thread *td, const struct umtx_key *key, struct umtx_shm_reg **res) { struct umtx_shm_reg *reg, *reg1; struct ucred *cred; int error; reg = umtx_shm_find_reg(key); if (reg != NULL) { *res = reg; return (0); } cred = td->td_ucred; if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP))) return (ENOMEM); reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO); reg->ushm_refcnt = 1; bcopy(key, &reg->ushm_key, sizeof(*key)); reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR); reg->ushm_cred = crhold(cred); error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE); if (error != 0) { umtx_shm_free_reg(reg); return (error); } mtx_lock(&umtx_shm_lock); reg1 = umtx_shm_find_reg_locked(key); if (reg1 != NULL) { mtx_unlock(&umtx_shm_lock); umtx_shm_free_reg(reg); *res = reg1; return (0); } reg->ushm_refcnt++; TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link); LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg, ushm_obj_link); reg->ushm_flags = USHMF_REG_LINKED | USHMF_OBJ_LINKED; mtx_unlock(&umtx_shm_lock); *res = reg; return (0); } static int umtx_shm_alive(struct thread *td, void *addr) { vm_map_t map; vm_map_entry_t entry; vm_object_t object; vm_pindex_t pindex; vm_prot_t prot; int res, ret; boolean_t wired; map = &td->td_proc->p_vmspace->vm_map; res = vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry, &object, &pindex, &prot, &wired); if (res != KERN_SUCCESS) return (EFAULT); if (object == NULL) ret = EINVAL; else ret = (object->flags & OBJ_UMTXDEAD) != 0 ? ENOTTY : 0; vm_map_lookup_done(map, entry); return (ret); }
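/*
 * Editor's sketch (not in the original file): one plausible way for
 * userland to drive the UMTX_OP_SHM interface implemented by
 * umtx_shm() below.  A word of memory serves as the lookup key; on
 * success the kernel returns a descriptor for a one-page POSIX shm
 * object that can then be mmap(2)ed.  Error handling is abbreviated.
 */
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/umtx.h>
#include <unistd.h>

static int shm_key;		/* any mappable address works as a key */

static void *
map_umtx_shm_page(void)
{
	int fd;
	void *p;

	fd = _umtx_op(NULL, UMTX_OP_SHM, UMTX_SHM_CREAT, &shm_key, NULL);
	if (fd == -1)
		return (NULL);
	p = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE, MAP_SHARED,
	    fd, 0);
	close(fd);		/* the mapping keeps the object alive */
	return (p == MAP_FAILED ? NULL : p);
}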
static void umtx_shm_init(void) { int i; umtx_shm_reg_zone = uma_zcreate("umtx_shm", sizeof(struct umtx_shm_reg), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF); for (i = 0; i < nitems(umtx_shm_registry); i++) TAILQ_INIT(&umtx_shm_registry[i]); } static int umtx_shm(struct thread *td, void *addr, u_int flags) { struct umtx_key key; struct umtx_shm_reg *reg; struct file *fp; int error, fd; if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP | UMTX_SHM_DESTROY | UMTX_SHM_ALIVE)) != 1) return (EINVAL); if ((flags & UMTX_SHM_ALIVE) != 0) return (umtx_shm_alive(td, addr)); error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key); if (error != 0) return (error); KASSERT(key.shared == 1, ("non-shared key")); if ((flags & UMTX_SHM_CREAT) != 0) { error = umtx_shm_create_reg(td, &key, &reg); } else { reg = umtx_shm_find_reg(&key); if (reg == NULL) error = ESRCH; } umtx_key_release(&key); if (error != 0) return (error); KASSERT(reg != NULL, ("no reg")); if ((flags & UMTX_SHM_DESTROY) != 0) { umtx_shm_unref_reg(reg, true); } else { #if 0 #ifdef MAC error = mac_posixshm_check_open(td->td_ucred, reg->ushm_obj, FFLAGS(O_RDWR)); if (error == 0) #endif error = shm_access(reg->ushm_obj, td->td_ucred, FFLAGS(O_RDWR)); if (error == 0) #endif error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL); if (error == 0) { shm_hold(reg->ushm_obj); finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj, &shm_ops); td->td_retval[0] = fd; fdrop(fp, td); } } umtx_shm_unref_reg(reg, false); return (error); } static int __umtx_op_shm(struct thread *td, struct _umtx_op_args *uap) { return (umtx_shm(td, uap->uaddr1, uap->val)); } static int umtx_robust_lists(struct thread *td, struct umtx_robust_lists_params *rbp) { td->td_rb_list = rbp->robust_list_offset; td->td_rbp_list = rbp->robust_priv_list_offset; td->td_rb_inact = rbp->robust_inact_offset; return (0); } static int __umtx_op_robust_lists(struct thread *td, struct _umtx_op_args *uap) { struct umtx_robust_lists_params rb; int error; if (uap->val > sizeof(rb)) return (EINVAL); bzero(&rb, sizeof(rb)); error = copyin(uap->uaddr1, &rb, uap->val); if (error != 0) return (error); return (umtx_robust_lists(td, &rb)); } typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap); static const _umtx_op_func op_table[] = { [UMTX_OP_RESERVED0] = __umtx_op_unimpl, [UMTX_OP_RESERVED1] = __umtx_op_unimpl, [UMTX_OP_WAIT] = __umtx_op_wait, [UMTX_OP_WAKE] = __umtx_op_wake, [UMTX_OP_MUTEX_TRYLOCK] = __umtx_op_trylock_umutex, [UMTX_OP_MUTEX_LOCK] = __umtx_op_lock_umutex, [UMTX_OP_MUTEX_UNLOCK] = __umtx_op_unlock_umutex, [UMTX_OP_SET_CEILING] = __umtx_op_set_ceiling, [UMTX_OP_CV_WAIT] = __umtx_op_cv_wait, [UMTX_OP_CV_SIGNAL] = __umtx_op_cv_signal, [UMTX_OP_CV_BROADCAST] = __umtx_op_cv_broadcast, [UMTX_OP_WAIT_UINT] = __umtx_op_wait_uint, [UMTX_OP_RW_RDLOCK] = __umtx_op_rw_rdlock, [UMTX_OP_RW_WRLOCK] = __umtx_op_rw_wrlock, [UMTX_OP_RW_UNLOCK] = __umtx_op_rw_unlock, [UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private, [UMTX_OP_WAKE_PRIVATE] = __umtx_op_wake_private, [UMTX_OP_MUTEX_WAIT] = __umtx_op_wait_umutex, [UMTX_OP_MUTEX_WAKE] = __umtx_op_wake_umutex, #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) [UMTX_OP_SEM_WAIT] = __umtx_op_sem_wait, [UMTX_OP_SEM_WAKE] = __umtx_op_sem_wake, #else [UMTX_OP_SEM_WAIT] = __umtx_op_unimpl, [UMTX_OP_SEM_WAKE] = __umtx_op_unimpl, #endif [UMTX_OP_NWAKE_PRIVATE] = __umtx_op_nwake_private, [UMTX_OP_MUTEX_WAKE2] =
__umtx_op_wake2_umutex, [UMTX_OP_SEM2_WAIT] = __umtx_op_sem2_wait, [UMTX_OP_SEM2_WAKE] = __umtx_op_sem2_wake, [UMTX_OP_SHM] = __umtx_op_shm, [UMTX_OP_ROBUST_LISTS] = __umtx_op_robust_lists, }; int sys__umtx_op(struct thread *td, struct _umtx_op_args *uap) { if ((unsigned)uap->op < nitems(op_table)) return (*op_table[uap->op])(td, uap); return (EINVAL); } #ifdef COMPAT_FREEBSD32 struct timespec32 { int32_t tv_sec; int32_t tv_nsec; }; struct umtx_time32 { struct timespec32 timeout; uint32_t flags; uint32_t clockid; }; static inline int umtx_copyin_timeout32(void *addr, struct timespec *tsp) { struct timespec32 ts32; int error; error = copyin(addr, &ts32, sizeof(struct timespec32)); if (error == 0) { if (ts32.tv_sec < 0 || ts32.tv_nsec >= 1000000000 || ts32.tv_nsec < 0) error = EINVAL; else { tsp->tv_sec = ts32.tv_sec; tsp->tv_nsec = ts32.tv_nsec; } } return (error); } static inline int umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp) { struct umtx_time32 t32; int error; t32.clockid = CLOCK_REALTIME; t32.flags = 0; if (size <= sizeof(struct timespec32)) error = copyin(addr, &t32.timeout, sizeof(struct timespec32)); else error = copyin(addr, &t32, sizeof(struct umtx_time32)); if (error != 0) return (error); if (t32.timeout.tv_sec < 0 || t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0) return (EINVAL); tp->_timeout.tv_sec = t32.timeout.tv_sec; tp->_timeout.tv_nsec = t32.timeout.tv_nsec; tp->_flags = t32.flags; tp->_clockid = t32.clockid; return (0); } static int __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0)); } static int __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_lock_umutex(td, uap->obj, tm_p, 0)); } static int __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT)); } static int __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = umtx_copyin_timeout32(uap->uaddr2, &timeout); if (error != 0) return (error); ts = &timeout; } return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val)); } static int __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). 
*/ if (uap->uaddr2 == NULL) { error = do_rw_rdlock(td, uap->obj, uap->val, 0); } else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_rdlock(td, uap->obj, uap->val, &timeout); } return (error); } static int __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_wrlock(td, uap->obj, 0); } else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_wrlock(td, uap->obj, &timeout); } return (error); } static int __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32( uap->uaddr2, (size_t)uap->uaddr1,&timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1)); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_sem_wait(td, uap->obj, tm_p)); } #endif static int __umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; size_t uasize; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { uasize = 0; tm_p = NULL; } else { uasize = (size_t)uap->uaddr1; error = umtx_copyin_umtx_time32(uap->uaddr2, uasize, &timeout); if (error != 0) return (error); tm_p = &timeout; } error = do_sem2_wait(td, uap->obj, tm_p); if (error == EINTR && uap->uaddr2 != NULL && (timeout._flags & UMTX_ABSTIME) == 0 && uasize >= sizeof(struct umtx_time32) + sizeof(struct timespec32)) { struct timespec32 remain32 = { .tv_sec = timeout._timeout.tv_sec, .tv_nsec = timeout._timeout.tv_nsec }; error = copyout(&remain32, (struct umtx_time32 *)uap->uaddr2 + 1, sizeof(struct timespec32)); if (error == 0) { error = EINTR; } } return (error); } static int __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap) { uint32_t uaddrs[BATCH_SIZE], **upp; int count, error, i, pos, tocopy; upp = (uint32_t **)uap->obj; error = 0; for (count = uap->val, pos = 0; count > 0; count -= tocopy, pos += tocopy) { tocopy = MIN(count, BATCH_SIZE); error = copyin(upp + pos, uaddrs, tocopy * sizeof(uint32_t)); if (error != 0) break; for (i = 0; i < tocopy; ++i) kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i], INT_MAX, 1); maybe_yield(); } return (error); } struct umtx_robust_lists_params_compat32 { uint32_t robust_list_offset; uint32_t robust_priv_list_offset; uint32_t robust_inact_offset; }; static int __umtx_op_robust_lists_compat32(struct thread *td, struct _umtx_op_args *uap) { struct umtx_robust_lists_params rb; struct umtx_robust_lists_params_compat32 rb32; int error; if (uap->val > sizeof(rb32)) return (EINVAL); bzero(&rb, sizeof(rb)); bzero(&rb32, sizeof(rb32)); error = copyin(uap->uaddr1, &rb32, uap->val); if (error != 0) return (error); rb.robust_list_offset = rb32.robust_list_offset; rb.robust_priv_list_offset = rb32.robust_priv_list_offset; rb.robust_inact_offset = rb32.robust_inact_offset; return 
(umtx_robust_lists(td, &rb)); } static const _umtx_op_func op_table_compat32[] = { [UMTX_OP_RESERVED0] = __umtx_op_unimpl, [UMTX_OP_RESERVED1] = __umtx_op_unimpl, [UMTX_OP_WAIT] = __umtx_op_wait_compat32, [UMTX_OP_WAKE] = __umtx_op_wake, [UMTX_OP_MUTEX_TRYLOCK] = __umtx_op_trylock_umutex, [UMTX_OP_MUTEX_LOCK] = __umtx_op_lock_umutex_compat32, [UMTX_OP_MUTEX_UNLOCK] = __umtx_op_unlock_umutex, [UMTX_OP_SET_CEILING] = __umtx_op_set_ceiling, [UMTX_OP_CV_WAIT] = __umtx_op_cv_wait_compat32, [UMTX_OP_CV_SIGNAL] = __umtx_op_cv_signal, [UMTX_OP_CV_BROADCAST] = __umtx_op_cv_broadcast, [UMTX_OP_WAIT_UINT] = __umtx_op_wait_compat32, [UMTX_OP_RW_RDLOCK] = __umtx_op_rw_rdlock_compat32, [UMTX_OP_RW_WRLOCK] = __umtx_op_rw_wrlock_compat32, [UMTX_OP_RW_UNLOCK] = __umtx_op_rw_unlock, [UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private_compat32, [UMTX_OP_WAKE_PRIVATE] = __umtx_op_wake_private, [UMTX_OP_MUTEX_WAIT] = __umtx_op_wait_umutex_compat32, [UMTX_OP_MUTEX_WAKE] = __umtx_op_wake_umutex, #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) [UMTX_OP_SEM_WAIT] = __umtx_op_sem_wait_compat32, [UMTX_OP_SEM_WAKE] = __umtx_op_sem_wake, #else [UMTX_OP_SEM_WAIT] = __umtx_op_unimpl, [UMTX_OP_SEM_WAKE] = __umtx_op_unimpl, #endif [UMTX_OP_NWAKE_PRIVATE] = __umtx_op_nwake_private32, [UMTX_OP_MUTEX_WAKE2] = __umtx_op_wake2_umutex, [UMTX_OP_SEM2_WAIT] = __umtx_op_sem2_wait_compat32, [UMTX_OP_SEM2_WAKE] = __umtx_op_sem2_wake, [UMTX_OP_SHM] = __umtx_op_shm, [UMTX_OP_ROBUST_LISTS] = __umtx_op_robust_lists_compat32, }; int freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap) { if ((unsigned)uap->op < nitems(op_table_compat32)) { return (*op_table_compat32[uap->op])(td, (struct _umtx_op_args *)uap); } return (EINVAL); } #endif void umtx_thread_init(struct thread *td) { td->td_umtxq = umtxq_alloc(); td->td_umtxq->uq_thread = td; } void umtx_thread_fini(struct thread *td) { umtxq_free(td->td_umtxq); } /* * It will be called when new thread is created, e.g fork(). */ void umtx_thread_alloc(struct thread *td) { struct umtx_q *uq; uq = td->td_umtxq; uq->uq_inherited_pri = PRI_MAX; KASSERT(uq->uq_flags == 0, ("uq_flags != 0")); KASSERT(uq->uq_thread == td, ("uq_thread != td")); KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL")); KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty")); } /* * exec() hook. * * Clear robust lists for all process' threads, not delaying the * cleanup to thread_exit hook, since the relevant address space is * destroyed right now. */ static void umtx_exec_hook(void *arg __unused, struct proc *p, struct image_params *imgp __unused) { struct thread *td; KASSERT(p == curproc, ("need curproc")); PROC_LOCK(p); KASSERT((p->p_flag & P_HADTHREADS) == 0 || (p->p_flag & P_STOPPED_SINGLE) != 0, ("curproc must be single-threaded")); FOREACH_THREAD_IN_PROC(p, td) { KASSERT(td == curthread || ((td->td_flags & TDF_BOUNDARY) != 0 && TD_IS_SUSPENDED(td)), ("running thread %p %p", p, td)); PROC_UNLOCK(p); umtx_thread_cleanup(td); PROC_LOCK(p); td->td_rb_list = td->td_rbp_list = td->td_rb_inact = 0; } PROC_UNLOCK(p); } /* * thread_exit() hook. 
*/ void umtx_thread_exit(struct thread *td) { umtx_thread_cleanup(td); } static int umtx_read_uptr(struct thread *td, uintptr_t ptr, uintptr_t *res) { u_long res1; #ifdef COMPAT_FREEBSD32 uint32_t res32; #endif int error; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { error = fueword32((void *)ptr, &res32); if (error == 0) res1 = res32; } else #endif { error = fueword((void *)ptr, &res1); } if (error == 0) *res = res1; else error = EFAULT; return (error); } static void umtx_read_rb_list(struct thread *td, struct umutex *m, uintptr_t *rb_list) { #ifdef COMPAT_FREEBSD32 struct umutex32 m32; if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { memcpy(&m32, m, sizeof(m32)); *rb_list = m32.m_rb_lnk; } else #endif *rb_list = m->m_rb_lnk; } static int umtx_handle_rb(struct thread *td, uintptr_t rbp, uintptr_t *rb_list, bool inact) { struct umutex m; int error; KASSERT(td->td_proc == curproc, ("need current vmspace")); error = copyin((void *)rbp, &m, sizeof(m)); if (error != 0) return (error); if (rb_list != NULL) umtx_read_rb_list(td, &m, rb_list); if ((m.m_flags & UMUTEX_ROBUST) == 0) return (EINVAL); if ((m.m_owner & ~UMUTEX_CONTESTED) != td->td_tid) /* inact is cleared after unlock, allow the inconsistency */ return (inact ? 0 : EINVAL); return (do_unlock_umutex(td, (struct umutex *)rbp, true)); } static void umtx_cleanup_rb_list(struct thread *td, uintptr_t rb_list, uintptr_t *rb_inact, const char *name) { int error, i; uintptr_t rbp; bool inact; if (rb_list == 0) return; error = umtx_read_uptr(td, rb_list, &rbp); for (i = 0; error == 0 && rbp != 0 && i < umtx_max_rb; i++) { if (rbp == *rb_inact) { inact = true; *rb_inact = 0; } else inact = false; error = umtx_handle_rb(td, rbp, &rbp, inact); } if (i == umtx_max_rb && umtx_verbose_rb) { uprintf("comm %s pid %d: reached umtx %smax rb %d\n", td->td_proc->p_comm, td->td_proc->p_pid, name, umtx_max_rb); } if (error != 0 && umtx_verbose_rb) { uprintf("comm %s pid %d: handling %srb error %d\n", td->td_proc->p_comm, td->td_proc->p_pid, name, error); } } /* * Clean up umtx data. */ static void umtx_thread_cleanup(struct thread *td) { struct umtx_q *uq; struct umtx_pi *pi; uintptr_t rb_inact; /* * Disown pi mutexes. */ uq = td->td_umtxq; if (uq != NULL) { mtx_lock(&umtx_lock); uq->uq_inherited_pri = PRI_MAX; while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) { pi->pi_owner = NULL; TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link); } mtx_unlock(&umtx_lock); thread_lock(td); sched_lend_user_prio(td, PRI_MAX); thread_unlock(td); } /* * Handle terminated robust mutexes. Must be done after * robust pi disown, otherwise unlock could see unowned * entries. */ rb_inact = td->td_rb_inact; if (rb_inact != 0) (void)umtx_read_uptr(td, rb_inact, &rb_inact); umtx_cleanup_rb_list(td, td->td_rb_list, &rb_inact, ""); umtx_cleanup_rb_list(td, td->td_rbp_list, &rb_inact, "priv "); if (rb_inact != 0) (void)umtx_handle_rb(td, rb_inact, NULL, true); } Index: head/sys/kern/kern_uuid.c =================================================================== --- head/sys/kern/kern_uuid.c (revision 326270) +++ head/sys/kern/kern_uuid.c (revision 326271) @@ -1,433 +1,435 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * See also: * http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt * http://www.opengroup.org/onlinepubs/009629399/apdxa.htm * * Note that the generator state is itself a UUID, but the time and clock * sequence fields are written in the native byte order. */ CTASSERT(sizeof(struct uuid) == 16); /* We use an alternative, more convenient representation in the generator. */ struct uuid_private { union { uint64_t ll; /* internal, for uuid_last only */ struct { uint32_t low; uint16_t mid; uint16_t hi; } x; } time; uint16_t seq; /* Big-endian. */ uint16_t node[UUID_NODE_LEN>>1]; }; CTASSERT(sizeof(struct uuid_private) == 16); struct uuid_macaddr { uint16_t state; #define UUID_ETHER_EMPTY 0 #define UUID_ETHER_RANDOM 1 #define UUID_ETHER_UNIQUE 2 uint16_t node[UUID_NODE_LEN>>1]; }; static struct uuid_private uuid_last; #define UUID_NETHER 4 static struct uuid_macaddr uuid_ether[UUID_NETHER]; static struct mtx uuid_mutex; MTX_SYSINIT(uuid_lock, &uuid_mutex, "UUID generator mutex lock", MTX_DEF); /* * Return the first MAC address added in the array. If it's empty, then * construct a sufficiently random multicast MAC address first. Any * addresses added later will bump the random MAC address up to the next * index. */ static void uuid_node(uint16_t *node) { int i; if (uuid_ether[0].state == UUID_ETHER_EMPTY) { for (i = 0; i < (UUID_NODE_LEN>>1); i++) uuid_ether[0].node[i] = (uint16_t)arc4random(); *((uint8_t*)uuid_ether[0].node) |= 0x01; uuid_ether[0].state = UUID_ETHER_RANDOM; } for (i = 0; i < (UUID_NODE_LEN>>1); i++) node[i] = uuid_ether[0].node[i]; } /* * Get the current time as a 60 bit count of 100-nanosecond intervals * since 00:00:00.00, October 15, 1582. We apply a magic offset to convert * the Unix time since 00:00:00.00, January 1, 1970 to the date of the * Gregorian reform to the Christian calendar. That offset, * 0x01B21DD213814000, is the 141427 days between those two dates * expressed in 100-ns units (141427 * 86400 * 10^7).
*/ static uint64_t uuid_time(void) { struct bintime bt; uint64_t time = 0x01B21DD213814000LL; bintime(&bt); time += (uint64_t)bt.sec * 10000000LL; time += (10000000LL * (uint32_t)(bt.frac >> 32)) >> 32; return (time & ((1LL << 60) - 1LL)); } struct uuid * kern_uuidgen(struct uuid *store, size_t count) { struct uuid_private uuid; uint64_t time; size_t n; mtx_lock(&uuid_mutex); uuid_node(uuid.node); time = uuid_time(); if (uuid_last.time.ll == 0LL || uuid_last.node[0] != uuid.node[0] || uuid_last.node[1] != uuid.node[1] || uuid_last.node[2] != uuid.node[2]) uuid.seq = (uint16_t)arc4random() & 0x3fff; else if (uuid_last.time.ll >= time) uuid.seq = (uuid_last.seq + 1) & 0x3fff; else uuid.seq = uuid_last.seq; uuid_last = uuid; uuid_last.time.ll = (time + count - 1) & ((1LL << 60) - 1LL); mtx_unlock(&uuid_mutex); /* Set sequence and variant and deal with byte order. */ uuid.seq = htobe16(uuid.seq | 0x8000); for (n = 0; n < count; n++) { /* Set time and version (=1). */ uuid.time.x.low = (uint32_t)time; uuid.time.x.mid = (uint16_t)(time >> 32); uuid.time.x.hi = ((uint16_t)(time >> 48) & 0xfff) | (1 << 12); store[n] = *(struct uuid *)&uuid; time++; } return (store); } #ifndef _SYS_SYSPROTO_H_ struct uuidgen_args { struct uuid *store; int count; }; #endif int sys_uuidgen(struct thread *td, struct uuidgen_args *uap) { struct uuid *store; size_t count; int error; /* * Limit the number of UUIDs that can be created at the same time * to some arbitrary number. This isn't really necessary, but I * like to have some sort of upper-bound that's less than 2G :-) * XXX probably needs to be tunable. */ if (uap->count < 1 || uap->count > 2048) return (EINVAL); count = uap->count; store = malloc(count * sizeof(struct uuid), M_TEMP, M_WAITOK); kern_uuidgen(store, count); error = copyout(store, uap->store, count * sizeof(struct uuid)); free(store, M_TEMP); return (error); } int uuid_ether_add(const uint8_t *addr) { int i, sum; /* * Validate input. No multicast (flag 0x1), no locally administered * (flag 0x2) and no 'all-zeroes' addresses. */ if (addr[0] & 0x03) return (EINVAL); sum = 0; for (i = 0; i < UUID_NODE_LEN; i++) sum += addr[i]; if (sum == 0) return (EINVAL); mtx_lock(&uuid_mutex); /* Make sure the MAC isn't known already and that there's space. */ i = 0; while (i < UUID_NETHER && uuid_ether[i].state == UUID_ETHER_UNIQUE) { if (!bcmp(addr, uuid_ether[i].node, UUID_NODE_LEN)) { mtx_unlock(&uuid_mutex); return (EEXIST); } i++; } if (i == UUID_NETHER) { mtx_unlock(&uuid_mutex); return (ENOSPC); } /* Insert MAC at index, moving the non-empty entry if possible. */ if (uuid_ether[i].state == UUID_ETHER_RANDOM && i < UUID_NETHER - 1) uuid_ether[i + 1] = uuid_ether[i]; uuid_ether[i].state = UUID_ETHER_UNIQUE; bcopy(addr, uuid_ether[i].node, UUID_NODE_LEN); mtx_unlock(&uuid_mutex); return (0); } int uuid_ether_del(const uint8_t *addr) { int i; mtx_lock(&uuid_mutex); i = 0; while (i < UUID_NETHER && uuid_ether[i].state == UUID_ETHER_UNIQUE && bcmp(addr, uuid_ether[i].node, UUID_NODE_LEN)) i++; if (i == UUID_NETHER || uuid_ether[i].state != UUID_ETHER_UNIQUE) { mtx_unlock(&uuid_mutex); return (ENOENT); } /* Remove it by shifting higher index entries down. 
*/ while (i < UUID_NETHER - 1 && uuid_ether[i].state != UUID_ETHER_EMPTY) { uuid_ether[i] = uuid_ether[i + 1]; i++; } if (uuid_ether[i].state != UUID_ETHER_EMPTY) { uuid_ether[i].state = UUID_ETHER_EMPTY; bzero(uuid_ether[i].node, UUID_NODE_LEN); } mtx_unlock(&uuid_mutex); return (0); } int snprintf_uuid(char *buf, size_t sz, struct uuid *uuid) { struct uuid_private *id; int cnt; id = (struct uuid_private *)uuid; cnt = snprintf(buf, sz, "%08x-%04x-%04x-%04x-%04x%04x%04x", id->time.x.low, id->time.x.mid, id->time.x.hi, be16toh(id->seq), be16toh(id->node[0]), be16toh(id->node[1]), be16toh(id->node[2])); return (cnt); } int printf_uuid(struct uuid *uuid) { char buf[38]; snprintf_uuid(buf, sizeof(buf), uuid); return (printf("%s", buf)); } int sbuf_printf_uuid(struct sbuf *sb, struct uuid *uuid) { char buf[38]; snprintf_uuid(buf, sizeof(buf), uuid); return (sbuf_printf(sb, "%s", buf)); } /* * Encode/Decode UUID into byte-stream. * http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | time_low | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | time_mid | time_hi_and_version | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |clk_seq_hi_res | clk_seq_low | node (0-1) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | node (2-5) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ void le_uuid_enc(void *buf, struct uuid const *uuid) { u_char *p; int i; p = buf; le32enc(p, uuid->time_low); le16enc(p + 4, uuid->time_mid); le16enc(p + 6, uuid->time_hi_and_version); p[8] = uuid->clock_seq_hi_and_reserved; p[9] = uuid->clock_seq_low; for (i = 0; i < _UUID_NODE_LEN; i++) p[10 + i] = uuid->node[i]; } void le_uuid_dec(void const *buf, struct uuid *uuid) { u_char const *p; int i; p = buf; uuid->time_low = le32dec(p); uuid->time_mid = le16dec(p + 4); uuid->time_hi_and_version = le16dec(p + 6); uuid->clock_seq_hi_and_reserved = p[8]; uuid->clock_seq_low = p[9]; for (i = 0; i < _UUID_NODE_LEN; i++) uuid->node[i] = p[10 + i]; } void be_uuid_enc(void *buf, struct uuid const *uuid) { u_char *p; int i; p = buf; be32enc(p, uuid->time_low); be16enc(p + 4, uuid->time_mid); be16enc(p + 6, uuid->time_hi_and_version); p[8] = uuid->clock_seq_hi_and_reserved; p[9] = uuid->clock_seq_low; for (i = 0; i < _UUID_NODE_LEN; i++) p[10 + i] = uuid->node[i]; } void be_uuid_dec(void const *buf, struct uuid *uuid) { u_char const *p; int i; p = buf; uuid->time_low = be32dec(p); uuid->time_mid = be16dec(p + 4); uuid->time_hi_and_version = be16dec(p + 6); uuid->clock_seq_hi_and_reserved = p[8]; uuid->clock_seq_low = p[9]; for (i = 0; i < _UUID_NODE_LEN; i++) uuid->node[i] = p[10 + i]; } int parse_uuid(const char *str, struct uuid *uuid) { u_int c[11]; int n; /* An empty string represents a nil UUID. */ if (*str == '\0') { bzero(uuid, sizeof(*uuid)); return (0); } /* The UUID string representation has a fixed length. */ if (strlen(str) != 36) return (EINVAL); /* * We only work with "new" UUIDs. New UUIDs have the form: * 01234567-89ab-cdef-0123-456789abcdef * The so called "old" UUIDs, which we don't support, have the form: * 0123456789ab.cd.ef.01.23.45.67.89.ab */ if (str[8] != '-') return (EINVAL); n = sscanf(str, "%8x-%4x-%4x-%2x%2x-%2x%2x%2x%2x%2x%2x", c + 0, c + 1, c + 2, c + 3, c + 4, c + 5, c + 6, c + 7, c + 8, c + 9, c + 10); /* Make sure we have all conversions. 
*/ if (n != 11) return (EINVAL); /* Successful scan. Build the UUID. */ uuid->time_low = c[0]; uuid->time_mid = c[1]; uuid->time_hi_and_version = c[2]; uuid->clock_seq_hi_and_reserved = c[3]; uuid->clock_seq_low = c[4]; for (n = 0; n < 6; n++) uuid->node[n] = c[n + 5]; /* Check semantics... */ return (((c[3] & 0x80) != 0x00 && /* variant 0? */ (c[3] & 0xc0) != 0x80 && /* variant 1? */ (c[3] & 0xe0) != 0xc0) ? EINVAL : 0); /* variant 2? */ } int uuidcmp(const struct uuid *uuid1, const struct uuid *uuid2) { return (memcmp(uuid1, uuid2, sizeof(struct uuid))); } Index: head/sys/kern/link_elf.c =================================================================== --- head/sys/kern/link_elf.c (revision 326270) +++ head/sys/kern/link_elf.c (revision 326271) @@ -1,1658 +1,1660 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1998-2000 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_gdb.h" #include #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SPARSE_MAPPING #include #include #include #endif #include #include #include #ifdef DDB_CTF #include #endif #include "linker_if.h" #define MAXSEGS 4 typedef struct elf_file { struct linker_file lf; /* Common fields */ int preloaded; /* Was file pre-loaded */ caddr_t address; /* Relocation address */ #ifdef SPARSE_MAPPING vm_object_t object; /* VM object to hold file pages */ #endif Elf_Dyn *dynamic; /* Symbol table etc. 
*/ Elf_Hashelt nbuckets; /* DT_HASH info */ Elf_Hashelt nchains; const Elf_Hashelt *buckets; const Elf_Hashelt *chains; caddr_t hash; caddr_t strtab; /* DT_STRTAB */ int strsz; /* DT_STRSZ */ const Elf_Sym *symtab; /* DT_SYMTAB */ Elf_Addr *got; /* DT_PLTGOT */ const Elf_Rel *pltrel; /* DT_JMPREL */ int pltrelsize; /* DT_PLTRELSZ */ const Elf_Rela *pltrela; /* DT_JMPREL */ int pltrelasize; /* DT_PLTRELSZ */ const Elf_Rel *rel; /* DT_REL */ int relsize; /* DT_RELSZ */ const Elf_Rela *rela; /* DT_RELA */ int relasize; /* DT_RELASZ */ caddr_t modptr; const Elf_Sym *ddbsymtab; /* The symbol table we are using */ long ddbsymcnt; /* Number of symbols */ caddr_t ddbstrtab; /* String table */ long ddbstrcnt; /* number of bytes in string table */ caddr_t symbase; /* malloc'ed symbol base */ caddr_t strbase; /* malloc'ed string base */ caddr_t ctftab; /* CTF table */ long ctfcnt; /* number of bytes in CTF table */ caddr_t ctfoff; /* CTF offset table */ caddr_t typoff; /* Type offset table */ long typlen; /* Number of type entries. */ Elf_Addr pcpu_start; /* Pre-relocation pcpu set start. */ Elf_Addr pcpu_stop; /* Pre-relocation pcpu set stop. */ Elf_Addr pcpu_base; /* Relocated pcpu set address. */ #ifdef VIMAGE Elf_Addr vnet_start; /* Pre-relocation vnet set start. */ Elf_Addr vnet_stop; /* Pre-relocation vnet set stop. */ Elf_Addr vnet_base; /* Relocated vnet set address. */ #endif #ifdef GDB struct link_map gdb; /* hooks for gdb */ #endif } *elf_file_t; struct elf_set { Elf_Addr es_start; Elf_Addr es_stop; Elf_Addr es_base; TAILQ_ENTRY(elf_set) es_link; }; TAILQ_HEAD(elf_set_head, elf_set); #include static int link_elf_link_common_finish(linker_file_t); static int link_elf_link_preload(linker_class_t cls, const char *, linker_file_t *); static int link_elf_link_preload_finish(linker_file_t); static int link_elf_load_file(linker_class_t, const char *, linker_file_t *); static int link_elf_lookup_symbol(linker_file_t, const char *, c_linker_sym_t *); static int link_elf_symbol_values(linker_file_t, c_linker_sym_t, linker_symval_t *); static int link_elf_search_symbol(linker_file_t, caddr_t, c_linker_sym_t *, long *); static void link_elf_unload_file(linker_file_t); static void link_elf_unload_preload(linker_file_t); static int link_elf_lookup_set(linker_file_t, const char *, void ***, void ***, int *); static int link_elf_each_function_name(linker_file_t, int (*)(const char *, void *), void *); static int link_elf_each_function_nameval(linker_file_t, linker_function_nameval_callback_t, void *); static void link_elf_reloc_local(linker_file_t); static long link_elf_symtab_get(linker_file_t, const Elf_Sym **); static long link_elf_strtab_get(linker_file_t, caddr_t *); static int elf_lookup(linker_file_t, Elf_Size, int, Elf_Addr *); static kobj_method_t link_elf_methods[] = { KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol), KOBJMETHOD(linker_symbol_values, link_elf_symbol_values), KOBJMETHOD(linker_search_symbol, link_elf_search_symbol), KOBJMETHOD(linker_unload, link_elf_unload_file), KOBJMETHOD(linker_load_file, link_elf_load_file), KOBJMETHOD(linker_link_preload, link_elf_link_preload), KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish), KOBJMETHOD(linker_lookup_set, link_elf_lookup_set), KOBJMETHOD(linker_each_function_name, link_elf_each_function_name), KOBJMETHOD(linker_each_function_nameval, link_elf_each_function_nameval), KOBJMETHOD(linker_ctf_get, link_elf_ctf_get), KOBJMETHOD(linker_symtab_get, link_elf_symtab_get), KOBJMETHOD(linker_strtab_get,
link_elf_strtab_get), { 0, 0 } }; static struct linker_class link_elf_class = { #if ELF_TARG_CLASS == ELFCLASS32 "elf32", #else "elf64", #endif link_elf_methods, sizeof(struct elf_file) }; static int parse_dynamic(elf_file_t); static int relocate_file(elf_file_t); static int link_elf_preload_parse_symbols(elf_file_t); static struct elf_set_head set_pcpu_list; #ifdef VIMAGE static struct elf_set_head set_vnet_list; #endif static void elf_set_add(struct elf_set_head *list, Elf_Addr start, Elf_Addr stop, Elf_Addr base) { struct elf_set *set, *iter; set = malloc(sizeof(*set), M_LINKER, M_WAITOK); set->es_start = start; set->es_stop = stop; set->es_base = base; TAILQ_FOREACH(iter, list, es_link) { KASSERT((set->es_start < iter->es_start && set->es_stop < iter->es_stop) || (set->es_start > iter->es_start && set->es_stop > iter->es_stop), ("linker sets intersection: to insert: 0x%jx-0x%jx; inserted: 0x%jx-0x%jx", (uintmax_t)set->es_start, (uintmax_t)set->es_stop, (uintmax_t)iter->es_start, (uintmax_t)iter->es_stop)); if (iter->es_start > set->es_start) { TAILQ_INSERT_BEFORE(iter, set, es_link); break; } } if (iter == NULL) TAILQ_INSERT_TAIL(list, set, es_link); } static int elf_set_find(struct elf_set_head *list, Elf_Addr addr, Elf_Addr *start, Elf_Addr *base) { struct elf_set *set; TAILQ_FOREACH(set, list, es_link) { if (addr < set->es_start) return (0); if (addr < set->es_stop) { *start = set->es_start; *base = set->es_base; return (1); } } return (0); } static void elf_set_delete(struct elf_set_head *list, Elf_Addr start) { struct elf_set *set; TAILQ_FOREACH(set, list, es_link) { if (start < set->es_start) break; if (start == set->es_start) { TAILQ_REMOVE(list, set, es_link); free(set, M_LINKER); return; } } KASSERT(0, ("deleting unknown linker set (start = 0x%jx)", (uintmax_t)start)); } #ifdef GDB static void r_debug_state(struct r_debug *, struct link_map *); /* * A list of loaded modules for GDB to use for loading symbols. */ struct r_debug r_debug; #define GDB_STATE(s) do { \ r_debug.r_state = s; r_debug_state(NULL, NULL); \ } while (0) /* * Function for the debugger to set a breakpoint on to gain control. */ static void r_debug_state(struct r_debug *dummy_one __unused, struct link_map *dummy_two __unused) { } static void link_elf_add_gdb(struct link_map *l) { struct link_map *prev; l->l_next = NULL; if (r_debug.r_map == NULL) { /* Add first. */ l->l_prev = NULL; r_debug.r_map = l; } else { /* Append to list. */ for (prev = r_debug.r_map; prev->l_next != NULL; prev = prev->l_next) ; l->l_prev = prev; prev->l_next = l; } } static void link_elf_delete_gdb(struct link_map *l) { if (l->l_prev == NULL) { /* Remove first. */ if ((r_debug.r_map = l->l_next) != NULL) l->l_next->l_prev = NULL; } else { /* Remove any but first. */ if ((l->l_prev->l_next = l->l_next) != NULL) l->l_next->l_prev = l->l_prev; } } #endif /* GDB */ /* * The kernel symbol table starts here. */ extern struct _dynamic _DYNAMIC; static void link_elf_error(const char *filename, const char *s) { if (filename == NULL) printf("kldload: %s\n", s); else printf("kldload: %s: %s\n", filename, s); } static void link_elf_invoke_ctors(caddr_t addr, size_t size) { void (**ctor)(void); size_t i, cnt; if (addr == NULL || size == 0) return; cnt = size / sizeof(*ctor); ctor = (void *)addr; for (i = 0; i < cnt; i++) { if (ctor[i] != NULL) (*ctor[i])(); } } /* * Actions performed after linking/loading both the preloaded kernel and any * modules, whether preloaded or dynamically loaded.
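 */
/*
 * link_elf_invoke_ctors() above is just a walk over an array of
 * function pointers found in the module image.  A userland sketch of
 * the same convention; fake_ctors stands in for a .ctors section and
 * the names are made up.
 */
#include <stddef.h>
#include <stdio.h>

static void hello(void) { printf("ctor: hello\n"); }
static void world(void) { printf("ctor: world\n"); }

static void (*fake_ctors[])(void) = { hello, NULL, world };

static void
sketch_invoke_ctors(void (**ctor)(void), size_t cnt)
{
	size_t i;

	/* NULL slots are skipped, exactly as in the kernel routine. */
	for (i = 0; i < cnt; i++)
		if (ctor[i] != NULL)
			(*ctor[i])();
}

int
main(void)
{
	sketch_invoke_ctors(fake_ctors,
	    sizeof(fake_ctors) / sizeof(fake_ctors[0]));
	return (0);
}
/*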
*/ static int link_elf_link_common_finish(linker_file_t lf) { #ifdef GDB elf_file_t ef = (elf_file_t)lf; char *newfilename; #endif int error; /* Notify MD code that a module is being loaded. */ error = elf_cpu_load_file(lf); if (error != 0) return (error); #ifdef GDB GDB_STATE(RT_ADD); ef->gdb.l_addr = lf->address; newfilename = malloc(strlen(lf->filename) + 1, M_LINKER, M_WAITOK); strcpy(newfilename, lf->filename); ef->gdb.l_name = newfilename; ef->gdb.l_ld = ef->dynamic; link_elf_add_gdb(&ef->gdb); GDB_STATE(RT_CONSISTENT); #endif /* Invoke .ctors */ link_elf_invoke_ctors(lf->ctors_addr, lf->ctors_size); return (0); } extern vm_offset_t __startkernel; static void link_elf_init(void* arg) { Elf_Dyn *dp; Elf_Addr *ctors_addrp; Elf_Size *ctors_sizep; caddr_t modptr, baseptr, sizeptr; elf_file_t ef; char *modname; linker_add_class(&link_elf_class); dp = (Elf_Dyn *)&_DYNAMIC; modname = NULL; modptr = preload_search_by_type("elf" __XSTRING(__ELF_WORD_SIZE) " kernel"); if (modptr == NULL) modptr = preload_search_by_type("elf kernel"); modname = (char *)preload_search_info(modptr, MODINFO_NAME); if (modname == NULL) modname = "kernel"; linker_kernel_file = linker_make_file(modname, &link_elf_class); if (linker_kernel_file == NULL) panic("%s: Can't create linker structures for kernel", __func__); ef = (elf_file_t) linker_kernel_file; ef->preloaded = 1; #ifdef __powerpc__ ef->address = (caddr_t) (__startkernel - KERNBASE); #else ef->address = 0; #endif #ifdef SPARSE_MAPPING ef->object = 0; #endif ef->dynamic = dp; if (dp != NULL) parse_dynamic(ef); linker_kernel_file->address += KERNBASE; linker_kernel_file->size = -(intptr_t)linker_kernel_file->address; if (modptr != NULL) { ef->modptr = modptr; baseptr = preload_search_info(modptr, MODINFO_ADDR); if (baseptr != NULL) linker_kernel_file->address = *(caddr_t *)baseptr; sizeptr = preload_search_info(modptr, MODINFO_SIZE); if (sizeptr != NULL) linker_kernel_file->size = *(size_t *)sizeptr; ctors_addrp = (Elf_Addr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_ADDR); ctors_sizep = (Elf_Size *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_SIZE); if (ctors_addrp != NULL && ctors_sizep != NULL) { linker_kernel_file->ctors_addr = ef->address + *ctors_addrp; linker_kernel_file->ctors_size = *ctors_sizep; } } (void)link_elf_preload_parse_symbols(ef); #ifdef GDB r_debug.r_map = NULL; r_debug.r_brk = r_debug_state; r_debug.r_state = RT_CONSISTENT; #endif (void)link_elf_link_common_finish(linker_kernel_file); linker_kernel_file->flags |= LINKER_FILE_LINKED; TAILQ_INIT(&set_pcpu_list); #ifdef VIMAGE TAILQ_INIT(&set_vnet_list); #endif } SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_THIRD, link_elf_init, 0); static int link_elf_preload_parse_symbols(elf_file_t ef) { caddr_t pointer; caddr_t ssym, esym, base; caddr_t strtab; int strcnt; Elf_Sym *symtab; int symcnt; if (ef->modptr == NULL) return (0); pointer = preload_search_info(ef->modptr, MODINFO_METADATA | MODINFOMD_SSYM); if (pointer == NULL) return (0); ssym = *(caddr_t *)pointer; pointer = preload_search_info(ef->modptr, MODINFO_METADATA | MODINFOMD_ESYM); if (pointer == NULL) return (0); esym = *(caddr_t *)pointer; base = ssym; symcnt = *(long *)base; base += sizeof(long); symtab = (Elf_Sym *)base; base += roundup(symcnt, sizeof(long)); if (base > esym || base < ssym) { printf("Symbols are corrupt!\n"); return (EINVAL); } strcnt = *(long *)base; base += sizeof(long); strtab = base; base += roundup(strcnt, sizeof(long)); if (base > esym || base < ssym) { printf("Symbols are 
corrupt!\n"); return (EINVAL); } ef->ddbsymtab = symtab; ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); ef->ddbstrtab = strtab; ef->ddbstrcnt = strcnt; return (0); } static int parse_dynamic(elf_file_t ef) { Elf_Dyn *dp; int plttype = DT_REL; for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { switch (dp->d_tag) { case DT_HASH: { /* From src/libexec/rtld-elf/rtld.c */ const Elf_Hashelt *hashtab = (const Elf_Hashelt *) (ef->address + dp->d_un.d_ptr); ef->nbuckets = hashtab[0]; ef->nchains = hashtab[1]; ef->buckets = hashtab + 2; ef->chains = ef->buckets + ef->nbuckets; break; } case DT_STRTAB: ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr); break; case DT_STRSZ: ef->strsz = dp->d_un.d_val; break; case DT_SYMTAB: ef->symtab = (Elf_Sym*) (ef->address + dp->d_un.d_ptr); break; case DT_SYMENT: if (dp->d_un.d_val != sizeof(Elf_Sym)) return (ENOEXEC); break; case DT_PLTGOT: ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr); break; case DT_REL: ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); break; case DT_RELSZ: ef->relsize = dp->d_un.d_val; break; case DT_RELENT: if (dp->d_un.d_val != sizeof(Elf_Rel)) return (ENOEXEC); break; case DT_JMPREL: ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); break; case DT_PLTRELSZ: ef->pltrelsize = dp->d_un.d_val; break; case DT_RELA: ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr); break; case DT_RELASZ: ef->relasize = dp->d_un.d_val; break; case DT_RELAENT: if (dp->d_un.d_val != sizeof(Elf_Rela)) return (ENOEXEC); break; case DT_PLTREL: plttype = dp->d_un.d_val; if (plttype != DT_REL && plttype != DT_RELA) return (ENOEXEC); break; #ifdef GDB case DT_DEBUG: dp->d_un.d_ptr = (Elf_Addr)&r_debug; break; #endif } } if (plttype == DT_RELA) { ef->pltrela = (const Elf_Rela *)ef->pltrel; ef->pltrel = NULL; ef->pltrelasize = ef->pltrelsize; ef->pltrelsize = 0; } ef->ddbsymtab = ef->symtab; ef->ddbsymcnt = ef->nchains; ef->ddbstrtab = ef->strtab; ef->ddbstrcnt = ef->strsz; return (0); } static int parse_dpcpu(elf_file_t ef) { int count; int error; ef->pcpu_start = 0; ef->pcpu_stop = 0; error = link_elf_lookup_set(&ef->lf, "pcpu", (void ***)&ef->pcpu_start, (void ***)&ef->pcpu_stop, &count); /* Error just means there is no pcpu set to relocate. */ if (error != 0) return (0); count *= sizeof(void *); /* * Allocate space in the primary pcpu area. Copy in our * initialization from the data section and then initialize * all per-cpu storage from that. */ ef->pcpu_base = (Elf_Addr)(uintptr_t)dpcpu_alloc(count); if (ef->pcpu_base == 0) return (ENOSPC); memcpy((void *)ef->pcpu_base, (void *)ef->pcpu_start, count); dpcpu_copy((void *)ef->pcpu_base, count); elf_set_add(&set_pcpu_list, ef->pcpu_start, ef->pcpu_stop, ef->pcpu_base); return (0); } #ifdef VIMAGE static int parse_vnet(elf_file_t ef) { int count; int error; ef->vnet_start = 0; ef->vnet_stop = 0; error = link_elf_lookup_set(&ef->lf, "vnet", (void ***)&ef->vnet_start, (void ***)&ef->vnet_stop, &count); /* Error just means there is no vnet data set to relocate. */ if (error != 0) return (0); count *= sizeof(void *); /* * Allocate space in the primary vnet area. Copy in our * initialization from the data section and then initialize * all per-vnet storage from that. 
*/ ef->vnet_base = (Elf_Addr)(uintptr_t)vnet_data_alloc(count); if (ef->vnet_base == 0) return (ENOSPC); memcpy((void *)ef->vnet_base, (void *)ef->vnet_start, count); vnet_data_copy((void *)ef->vnet_base, count); elf_set_add(&set_vnet_list, ef->vnet_start, ef->vnet_stop, ef->vnet_base); return (0); } #endif static int link_elf_link_preload(linker_class_t cls, const char* filename, linker_file_t *result) { Elf_Addr *ctors_addrp; Elf_Size *ctors_sizep; caddr_t modptr, baseptr, sizeptr, dynptr; char *type; elf_file_t ef; linker_file_t lf; int error; vm_offset_t dp; /* Look to see if we have the file preloaded */ modptr = preload_search_by_name(filename); if (modptr == NULL) return (ENOENT); type = (char *)preload_search_info(modptr, MODINFO_TYPE); baseptr = preload_search_info(modptr, MODINFO_ADDR); sizeptr = preload_search_info(modptr, MODINFO_SIZE); dynptr = preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_DYNAMIC); if (type == NULL || (strcmp(type, "elf" __XSTRING(__ELF_WORD_SIZE) " module") != 0 && strcmp(type, "elf module") != 0)) return (EFTYPE); if (baseptr == NULL || sizeptr == NULL || dynptr == NULL) return (EINVAL); lf = linker_make_file(filename, &link_elf_class); if (lf == NULL) return (ENOMEM); ef = (elf_file_t) lf; ef->preloaded = 1; ef->modptr = modptr; ef->address = *(caddr_t *)baseptr; #ifdef SPARSE_MAPPING ef->object = 0; #endif dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr; ef->dynamic = (Elf_Dyn *)dp; lf->address = ef->address; lf->size = *(size_t *)sizeptr; ctors_addrp = (Elf_Addr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_ADDR); ctors_sizep = (Elf_Size *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_SIZE); if (ctors_addrp != NULL && ctors_sizep != NULL) { lf->ctors_addr = ef->address + *ctors_addrp; lf->ctors_size = *ctors_sizep; } error = parse_dynamic(ef); if (error == 0) error = parse_dpcpu(ef); #ifdef VIMAGE if (error == 0) error = parse_vnet(ef); #endif if (error != 0) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (error); } link_elf_reloc_local(lf); *result = lf; return (0); } static int link_elf_link_preload_finish(linker_file_t lf) { elf_file_t ef; int error; ef = (elf_file_t) lf; error = relocate_file(ef); if (error != 0) return (error); (void)link_elf_preload_parse_symbols(ef); return (link_elf_link_common_finish(lf)); } static int link_elf_load_file(linker_class_t cls, const char* filename, linker_file_t* result) { struct nameidata nd; struct thread* td = curthread; /* XXX */ Elf_Ehdr *hdr; caddr_t firstpage; int nbytes, i; Elf_Phdr *phdr; Elf_Phdr *phlimit; Elf_Phdr *segs[MAXSEGS]; int nsegs; Elf_Phdr *phdyn; Elf_Phdr *phphdr; caddr_t mapbase; size_t mapsize; Elf_Off base_offset; Elf_Addr base_vaddr; Elf_Addr base_vlimit; int error = 0; ssize_t resid; int flags; elf_file_t ef; linker_file_t lf; Elf_Shdr *shdr; int symtabindex; int symstrindex; int shstrindex; int symcnt; int strcnt; char *shstrs; shdr = NULL; lf = NULL; shstrs = NULL; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp->v_type != VREG) { error = ENOEXEC; firstpage = NULL; goto out; } #ifdef MAC error = mac_kld_check_load(curthread->td_ucred, nd.ni_vp); if (error != 0) { firstpage = NULL; goto out; } #endif /* * Read the elf header from the file. 
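 */
/*
 * The header validation that follows is standard ELF consumer
 * boilerplate.  A userland sketch of the same e_ident checks,
 * assuming a host <elf.h> that provides ELFMAG and friends (true on
 * the BSDs and glibc); 64-bit little-endian is hard-coded purely for
 * illustration.
 */
#include <elf.h>
#include <string.h>

static int
sketch_check_ident(const unsigned char *ident)
{
	if (memcmp(ident, ELFMAG, SELFMAG) != 0)
		return (-1);		/* not ELF at all */
	if (ident[EI_CLASS] != ELFCLASS64 || ident[EI_DATA] != ELFDATA2LSB)
		return (-1);		/* wrong layout for this host */
	if (ident[EI_VERSION] != EV_CURRENT)
		return (-1);		/* unsupported version */
	return (0);
}

int
main(void)
{
	unsigned char ident[EI_NIDENT] = { 0x7f, 'E', 'L', 'F',
	    ELFCLASS64, ELFDATA2LSB, EV_CURRENT };

	return (sketch_check_ident(ident) == 0 ? 0 : 1);
}
/*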
*/ firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK); hdr = (Elf_Ehdr *)firstpage; error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); nbytes = PAGE_SIZE - resid; if (error != 0) goto out; if (!IS_ELF(*hdr)) { error = ENOEXEC; goto out; } if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { link_elf_error(filename, "Unsupported file layout"); error = ENOEXEC; goto out; } if (hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT) { link_elf_error(filename, "Unsupported file version"); error = ENOEXEC; goto out; } if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) { error = ENOSYS; goto out; } if (hdr->e_machine != ELF_TARG_MACH) { link_elf_error(filename, "Unsupported machine"); error = ENOEXEC; goto out; } /* * We rely on the program header being in the first page. * This is not strictly required by the ABI specification, but * it seems to always be true in practice. And, it simplifies * things considerably. */ if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) && (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) && (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes))) link_elf_error(filename, "Unreadable program headers"); /* * Scan the program header entries, and save key information. * * We rely on there being exactly two load segments, text and data, * in that order. */ phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff); phlimit = phdr + hdr->e_phnum; nsegs = 0; phdyn = NULL; phphdr = NULL; while (phdr < phlimit) { switch (phdr->p_type) { case PT_LOAD: if (nsegs == MAXSEGS) { link_elf_error(filename, "Too many sections"); error = ENOEXEC; goto out; } /* * XXX: We just trust they come in the right order ?? */ segs[nsegs] = phdr; ++nsegs; break; case PT_PHDR: phphdr = phdr; break; case PT_DYNAMIC: phdyn = phdr; break; case PT_INTERP: error = ENOSYS; goto out; } ++phdr; } if (phdyn == NULL) { link_elf_error(filename, "Object is not dynamically-linked"); error = ENOEXEC; goto out; } if (nsegs == 0) { link_elf_error(filename, "No sections"); error = ENOEXEC; goto out; } /* * Allocate the entire address space of the object, to stake * out our contiguous region, and to establish the base * address for relocation. */ base_offset = trunc_page(segs[0]->p_offset); base_vaddr = trunc_page(segs[0]->p_vaddr); base_vlimit = round_page(segs[nsegs - 1]->p_vaddr + segs[nsegs - 1]->p_memsz); mapsize = base_vlimit - base_vaddr; lf = linker_make_file(filename, &link_elf_class); if (lf == NULL) { error = ENOMEM; goto out; } ef = (elf_file_t) lf; #ifdef SPARSE_MAPPING ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT); if (ef->object == NULL) { error = ENOMEM; goto out; } ef->address = (caddr_t) vm_map_min(kernel_map); error = vm_map_find(kernel_map, ef->object, 0, (vm_offset_t *) &ef->address, mapsize, 0, VMFS_OPTIMAL_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error != 0) { vm_object_deallocate(ef->object); ef->object = 0; goto out; } #else ef->address = malloc(mapsize, M_LINKER, M_WAITOK); #endif mapbase = ef->address; /* * Read the text and data sections and zero the bss.
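 */
/*
 * A worked sketch of the mapping-size arithmetic just computed: the
 * object spans from the first load segment's page-truncated start to
 * the last segment's page-rounded end.  The page size and segment
 * values are fabricated.
 */
#include <assert.h>
#include <stdint.h>

#define	PAGE_SIZE_SK		4096UL
#define	trunc_page_sk(x)	((x) & ~(PAGE_SIZE_SK - 1))
#define	round_page_sk(x)	(((x) + PAGE_SIZE_SK - 1) & ~(PAGE_SIZE_SK - 1))

int
main(void)
{
	/* Two fabricated PT_LOAD segments: text, then data+bss. */
	uintptr_t text_vaddr = 0x1000, data_vaddr = 0x5100, data_memsz = 0x2300;
	uintptr_t base_vaddr, base_vlimit, mapsize;

	base_vaddr = trunc_page_sk(text_vaddr);			/* 0x1000 */
	base_vlimit = round_page_sk(data_vaddr + data_memsz);	/* 0x8000 */
	mapsize = base_vlimit - base_vaddr;			/* 0x7000 */
	assert(mapsize == 0x7000);
	return (0);
}
/*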
*/ for (i = 0; i < nsegs; i++) { caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr; error = vn_rdwr(UIO_READ, nd.ni_vp, segbase, segs[i]->p_filesz, segs[i]->p_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; bzero(segbase + segs[i]->p_filesz, segs[i]->p_memsz - segs[i]->p_filesz); #ifdef SPARSE_MAPPING /* * Wire down the pages */ error = vm_map_wire(kernel_map, (vm_offset_t) segbase, (vm_offset_t) segbase + segs[i]->p_memsz, VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { error = ENOMEM; goto out; } #endif } #ifdef GPROF /* Update profiling information with the new text segment. */ mtx_lock(&Giant); kmupetext((uintfptr_t)(mapbase + segs[0]->p_vaddr - base_vaddr + segs[0]->p_memsz)); mtx_unlock(&Giant); #endif ef->dynamic = (Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr); lf->address = ef->address; lf->size = mapsize; error = parse_dynamic(ef); if (error != 0) goto out; error = parse_dpcpu(ef); if (error != 0) goto out; #ifdef VIMAGE error = parse_vnet(ef); if (error != 0) goto out; #endif link_elf_reloc_local(lf); VOP_UNLOCK(nd.ni_vp, 0); error = linker_load_dependencies(lf); vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) goto out; error = relocate_file(ef); if (error != 0) goto out; /* * Try and load the symbol table if it's present. (you can * strip it!) */ nbytes = hdr->e_shnum * hdr->e_shentsize; if (nbytes == 0 || hdr->e_shoff == 0) goto nosyms; shdr = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO); error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shdr, nbytes, hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; /* Read section string table */ shstrindex = hdr->e_shstrndx; if (shstrindex != 0 && shdr[shstrindex].sh_type == SHT_STRTAB && shdr[shstrindex].sh_size != 0) { nbytes = shdr[shstrindex].sh_size; shstrs = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO); error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shstrs, nbytes, shdr[shstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; } symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_type == SHT_SYMTAB) { symtabindex = i; symstrindex = shdr[i].sh_link; } else if (shstrs != NULL && shdr[i].sh_name != 0 && strcmp(shstrs + shdr[i].sh_name, ".ctors") == 0) { /* Record relocated address and size of .ctors. 
*/ lf->ctors_addr = mapbase + shdr[i].sh_addr - base_vaddr; lf->ctors_size = shdr[i].sh_size; } } if (symtabindex < 0 || symstrindex < 0) goto nosyms; symcnt = shdr[symtabindex].sh_size; ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK); strcnt = shdr[symstrindex].sh_size; ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, ef->symbase, symcnt, shdr[symtabindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; error = vn_rdwr(UIO_READ, nd.ni_vp, ef->strbase, strcnt, shdr[symstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); ef->ddbsymtab = (const Elf_Sym *)ef->symbase; ef->ddbstrcnt = strcnt; ef->ddbstrtab = ef->strbase; nosyms: error = link_elf_link_common_finish(lf); if (error != 0) goto out; *result = lf; out: VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); if (error != 0 && lf != NULL) linker_file_unload(lf, LINKER_UNLOAD_FORCE); free(shdr, M_LINKER); free(firstpage, M_LINKER); free(shstrs, M_LINKER); return (error); } Elf_Addr elf_relocaddr(linker_file_t lf, Elf_Addr x) { elf_file_t ef; ef = (elf_file_t)lf; if (x >= ef->pcpu_start && x < ef->pcpu_stop) return ((x - ef->pcpu_start) + ef->pcpu_base); #ifdef VIMAGE if (x >= ef->vnet_start && x < ef->vnet_stop) return ((x - ef->vnet_start) + ef->vnet_base); #endif return (x); } static void link_elf_unload_file(linker_file_t file) { elf_file_t ef = (elf_file_t) file; if (ef->pcpu_base != 0) { dpcpu_free((void *)ef->pcpu_base, ef->pcpu_stop - ef->pcpu_start); elf_set_delete(&set_pcpu_list, ef->pcpu_start); } #ifdef VIMAGE if (ef->vnet_base != 0) { vnet_data_free((void *)ef->vnet_base, ef->vnet_stop - ef->vnet_start); elf_set_delete(&set_vnet_list, ef->vnet_start); } #endif #ifdef GDB if (ef->gdb.l_ld != NULL) { GDB_STATE(RT_DELETE); free((void *)(uintptr_t)ef->gdb.l_name, M_LINKER); link_elf_delete_gdb(&ef->gdb); GDB_STATE(RT_CONSISTENT); } #endif /* Notify MD code that a module is being unloaded. 
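 */
/*
 * elf_relocaddr() above is a window translation: an address that
 * falls inside a pre-relocation [start, stop) range is rebased into
 * the runtime allocation, anything else passes through unchanged.  A
 * sketch with fabricated addresses.
 */
#include <assert.h>
#include <stdint.h>

static uintptr_t
sketch_relocaddr(uintptr_t x, uintptr_t start, uintptr_t stop,
    uintptr_t base)
{
	if (x >= start && x < stop)
		return ((x - start) + base);
	return (x);		/* outside the set: unchanged */
}

int
main(void)
{
	/* A pcpu-style set at 0x4000-0x5000 relocated to 0x9000. */
	assert(sketch_relocaddr(0x4010, 0x4000, 0x5000, 0x9000) == 0x9010);
	assert(sketch_relocaddr(0x6000, 0x4000, 0x5000, 0x9000) == 0x6000);
	return (0);
}
/*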
*/ elf_cpu_unload_file(file); if (ef->preloaded) { link_elf_unload_preload(file); return; } #ifdef SPARSE_MAPPING if (ef->object != NULL) { vm_map_remove(kernel_map, (vm_offset_t) ef->address, (vm_offset_t) ef->address + (ef->object->size << PAGE_SHIFT)); } #else free(ef->address, M_LINKER); #endif free(ef->symbase, M_LINKER); free(ef->strbase, M_LINKER); free(ef->ctftab, M_LINKER); free(ef->ctfoff, M_LINKER); free(ef->typoff, M_LINKER); } static void link_elf_unload_preload(linker_file_t file) { if (file->filename != NULL) preload_delete_name(file->filename); } static const char * symbol_name(elf_file_t ef, Elf_Size r_info) { const Elf_Sym *ref; if (ELF_R_SYM(r_info)) { ref = ef->symtab + ELF_R_SYM(r_info); return (ef->strtab + ref->st_name); } return (NULL); } static int relocate_file(elf_file_t ef) { const Elf_Rel *rellim; const Elf_Rel *rel; const Elf_Rela *relalim; const Elf_Rela *rela; const char *symname; /* Perform relocations without addend if there are any: */ rel = ef->rel; if (rel != NULL) { rellim = (const Elf_Rel *) ((const char *)ef->rel + ef->relsize); while (rel < rellim) { if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rel, ELF_RELOC_REL, elf_lookup)) { symname = symbol_name(ef, rel->r_info); printf("link_elf: symbol %s undefined\n", symname); return (ENOENT); } rel++; } } /* Perform relocations with addend if there are any: */ rela = ef->rela; if (rela != NULL) { relalim = (const Elf_Rela *) ((const char *)ef->rela + ef->relasize); while (rela < relalim) { if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rela, ELF_RELOC_RELA, elf_lookup)) { symname = symbol_name(ef, rela->r_info); printf("link_elf: symbol %s undefined\n", symname); return (ENOENT); } rela++; } } /* Perform PLT relocations without addend if there are any: */ rel = ef->pltrel; if (rel != NULL) { rellim = (const Elf_Rel *) ((const char *)ef->pltrel + ef->pltrelsize); while (rel < rellim) { if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rel, ELF_RELOC_REL, elf_lookup)) { symname = symbol_name(ef, rel->r_info); printf("link_elf: symbol %s undefined\n", symname); return (ENOENT); } rel++; } } /* Perform relocations with addend if there are any: */ rela = ef->pltrela; if (rela != NULL) { relalim = (const Elf_Rela *) ((const char *)ef->pltrela + ef->pltrelasize); while (rela < relalim) { if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rela, ELF_RELOC_RELA, elf_lookup)) { symname = symbol_name(ef, rela->r_info); printf("link_elf: symbol %s undefined\n", symname); return (ENOENT); } rela++; } } return (0); } /* * Hash function for symbol table lookup. Don't even think about changing * this. It is specified by the System V ABI. */ static unsigned long elf_hash(const char *name) { const unsigned char *p = (const unsigned char *) name; unsigned long h = 0; unsigned long g; while (*p != '\0') { h = (h << 4) + *p++; if ((g = h & 0xf0000000) != 0) h ^= g >> 24; h &= ~g; } return (h); } static int link_elf_lookup_symbol(linker_file_t lf, const char* name, c_linker_sym_t* sym) { elf_file_t ef = (elf_file_t) lf; unsigned long symnum; const Elf_Sym* symp; const char *strp; unsigned long hash; int i; /* If we don't have a hash, bail. 
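 */
/*
 * A self-contained userland sketch of the DT_HASH lookup performed
 * below: the SysV hash section is two words (nbucket, nchain)
 * followed by the bucket and chain arrays, and a name is found by
 * hashing it, picking a bucket, and walking that chain until the
 * names match or STN_UNDEF (0) ends it.  The table and names are
 * fabricated.
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>

static unsigned long
sketch_elf_hash(const char *name)	/* same algorithm as above */
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long h = 0, g;

	while (*p != '\0') {
		h = (h << 4) + *p++;
		if ((g = h & 0xf0000000) != 0)
			h ^= g >> 24;
		h &= ~g;
	}
	return (h);
}

int
main(void)
{
	/* nbucket = 1, nchain = 3: every name lands in bucket 0. */
	static const uint32_t hashtab[] = { 1, 3, 1, 0, 2, 0 };
	static const char *names[] = { "", "foo", "bar" };
	const uint32_t *buckets = hashtab + 2;
	const uint32_t *chains = buckets + hashtab[0];
	uint32_t symnum;

	for (symnum = buckets[sketch_elf_hash("bar") % hashtab[0]];
	    symnum != 0; symnum = chains[symnum])
		if (strcmp(names[symnum], "bar") == 0)
			break;
	assert(symnum == 2);
	return (0);
}
/*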
*/ if (ef->buckets == NULL || ef->nbuckets == 0) { printf("link_elf_lookup_symbol: missing symbol hash table\n"); return (ENOENT); } /* First, search hashed global symbols */ hash = elf_hash(name); symnum = ef->buckets[hash % ef->nbuckets]; while (symnum != STN_UNDEF) { if (symnum >= ef->nchains) { printf("%s: corrupt symbol table\n", __func__); return (ENOENT); } symp = ef->symtab + symnum; if (symp->st_name == 0) { printf("%s: corrupt symbol table\n", __func__); return (ENOENT); } strp = ef->strtab + symp->st_name; if (strcmp(name, strp) == 0) { if (symp->st_shndx != SHN_UNDEF || (symp->st_value != 0 && ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { *sym = (c_linker_sym_t) symp; return (0); } return (ENOENT); } symnum = ef->chains[symnum]; } /* If we have not found it, look at the full table (if loaded) */ if (ef->symtab == ef->ddbsymtab) return (ENOENT); /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { strp = ef->ddbstrtab + symp->st_name; if (strcmp(name, strp) == 0) { if (symp->st_shndx != SHN_UNDEF || (symp->st_value != 0 && ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { *sym = (c_linker_sym_t) symp; return (0); } return (ENOENT); } } return (ENOENT); } static int link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym, linker_symval_t *symval) { elf_file_t ef = (elf_file_t) lf; const Elf_Sym* es = (const Elf_Sym*) sym; if (es >= ef->symtab && es < (ef->symtab + ef->nchains)) { symval->name = ef->strtab + es->st_name; symval->value = (caddr_t) ef->address + es->st_value; symval->size = es->st_size; return (0); } if (ef->symtab == ef->ddbsymtab) return (ENOENT); if (es >= ef->ddbsymtab && es < (ef->ddbsymtab + ef->ddbsymcnt)) { symval->name = ef->ddbstrtab + es->st_name; symval->value = (caddr_t) ef->address + es->st_value; symval->size = es->st_size; return (0); } return (ENOENT); } static int link_elf_search_symbol(linker_file_t lf, caddr_t value, c_linker_sym_t *sym, long *diffp) { elf_file_t ef = (elf_file_t) lf; u_long off = (uintptr_t) (void *) value; u_long diff = off; u_long st_value; const Elf_Sym* es; const Elf_Sym* best = NULL; int i; for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) { if (es->st_name == 0) continue; st_value = es->st_value + (uintptr_t) (void *) ef->address; if (off >= st_value) { if (off - st_value < diff) { diff = off - st_value; best = es; if (diff == 0) break; } else if (off - st_value == diff) { best = es; } } } if (best == NULL) *diffp = off; else *diffp = diff; *sym = (c_linker_sym_t) best; return (0); } /* * Look up a linker set on an ELF system. 
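 */
/*
 * link_elf_lookup_set() below leans on a toolchain feature: for any
 * section whose name is a valid C identifier, the static linker
 * defines __start_<name> and __stop_<name> symbols bracketing it.  A
 * userland demonstration for GCC/Clang on ELF; "set_demo" is a
 * made-up set name.
 */
#include <stdio.h>

static int a = 1, b = 2, c = 3;

static int *item_a __attribute__((section("set_demo"), used)) = &a;
static int *item_b __attribute__((section("set_demo"), used)) = &b;
static int *item_c __attribute__((section("set_demo"), used)) = &c;

extern int *__start_set_demo[], *__stop_set_demo[];

int
main(void)
{
	int **p;

	/* Walk the set much as the kernel's SET_FOREACH() does. */
	for (p = __start_set_demo; p < __stop_set_demo; p++)
		printf("set entry: %d\n", **p);
	return (0);
}
/*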
*/ static int link_elf_lookup_set(linker_file_t lf, const char *name, void ***startp, void ***stopp, int *countp) { c_linker_sym_t sym; linker_symval_t symval; char *setsym; void **start, **stop; int len, error = 0, count; len = strlen(name) + sizeof("__start_set_"); /* sizeof includes \0 */ setsym = malloc(len, M_LINKER, M_WAITOK); /* get address of first entry */ snprintf(setsym, len, "%s%s", "__start_set_", name); error = link_elf_lookup_symbol(lf, setsym, &sym); if (error != 0) goto out; link_elf_symbol_values(lf, sym, &symval); if (symval.value == 0) { error = ESRCH; goto out; } start = (void **)symval.value; /* get address of last entry */ snprintf(setsym, len, "%s%s", "__stop_set_", name); error = link_elf_lookup_symbol(lf, setsym, &sym); if (error != 0) goto out; link_elf_symbol_values(lf, sym, &symval); if (symval.value == 0) { error = ESRCH; goto out; } stop = (void **)symval.value; /* and the number of entries */ count = stop - start; /* and copy out */ if (startp != NULL) *startp = start; if (stopp != NULL) *stopp = stop; if (countp != NULL) *countp = count; out: free(setsym, M_LINKER); return (error); } static int link_elf_each_function_name(linker_file_t file, int (*callback)(const char *, void *), void *opaque) { elf_file_t ef = (elf_file_t)file; const Elf_Sym *symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && ELF_ST_TYPE(symp->st_info) == STT_FUNC) { error = callback(ef->ddbstrtab + symp->st_name, opaque); if (error != 0) return (error); } } return (0); } static int link_elf_each_function_nameval(linker_file_t file, linker_function_nameval_callback_t callback, void *opaque) { linker_symval_t symval; elf_file_t ef = (elf_file_t)file; const Elf_Sym* symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && ELF_ST_TYPE(symp->st_info) == STT_FUNC) { error = link_elf_symbol_values(file, (c_linker_sym_t) symp, &symval); if (error != 0) return (error); error = callback(file, i, &symval, opaque); if (error != 0) return (error); } } return (0); } const Elf_Sym * elf_get_sym(linker_file_t lf, Elf_Size symidx) { elf_file_t ef = (elf_file_t)lf; if (symidx >= ef->nchains) return (NULL); return (ef->symtab + symidx); } const char * elf_get_symname(linker_file_t lf, Elf_Size symidx) { elf_file_t ef = (elf_file_t)lf; const Elf_Sym *sym; if (symidx >= ef->nchains) return (NULL); sym = ef->symtab + symidx; return (ef->strtab + sym->st_name); } /* * Symbol lookup function that can be used when the symbol index is known (ie * in relocations). It uses the symbol index instead of doing a fully fledged * hash table based lookup when such is valid. For example for local symbols. * This is not only more efficient, it's also more correct. It's not always * the case that the symbol can be found through the hash table. */ static int elf_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *res) { elf_file_t ef = (elf_file_t)lf; const Elf_Sym *sym; const char *symbol; Elf_Addr addr, start, base; /* Don't even try to lookup the symbol if the index is bogus. */ if (symidx >= ef->nchains) { *res = 0; return (EINVAL); } sym = ef->symtab + symidx; /* * Don't do a full lookup when the symbol is local. It may even * fail because it may not be found through the hash table. */ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) { /* Force lookup failure when we have an insanity. 
*/ if (sym->st_shndx == SHN_UNDEF || sym->st_value == 0) { *res = 0; return (EINVAL); } *res = ((Elf_Addr)ef->address + sym->st_value); return (0); } /* * XXX we can avoid doing a hash table based lookup for global * symbols as well. This however is not always valid, so we'll * just do it the hard way for now. Performance tweaks can * always be added. */ symbol = ef->strtab + sym->st_name; /* Force a lookup failure if the symbol name is bogus. */ if (*symbol == 0) { *res = 0; return (EINVAL); } addr = ((Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps)); if (addr == 0 && ELF_ST_BIND(sym->st_info) != STB_WEAK) { *res = 0; return (EINVAL); } if (elf_set_find(&set_pcpu_list, addr, &start, &base)) addr = addr - start + base; #ifdef VIMAGE else if (elf_set_find(&set_vnet_list, addr, &start, &base)) addr = addr - start + base; #endif *res = addr; return (0); } static void link_elf_reloc_local(linker_file_t lf) { const Elf_Rel *rellim; const Elf_Rel *rel; const Elf_Rela *relalim; const Elf_Rela *rela; elf_file_t ef = (elf_file_t)lf; /* Perform relocations without addend if there are any: */ if ((rel = ef->rel) != NULL) { rellim = (const Elf_Rel *)((const char *)ef->rel + ef->relsize); while (rel < rellim) { elf_reloc_local(lf, (Elf_Addr)ef->address, rel, ELF_RELOC_REL, elf_lookup); rel++; } } /* Perform relocations with addend if there are any: */ if ((rela = ef->rela) != NULL) { relalim = (const Elf_Rela *) ((const char *)ef->rela + ef->relasize); while (rela < relalim) { elf_reloc_local(lf, (Elf_Addr)ef->address, rela, ELF_RELOC_RELA, elf_lookup); rela++; } } } static long link_elf_symtab_get(linker_file_t lf, const Elf_Sym **symtab) { elf_file_t ef = (elf_file_t)lf; *symtab = ef->ddbsymtab; if (*symtab == NULL) return (0); return (ef->ddbsymcnt); } static long link_elf_strtab_get(linker_file_t lf, caddr_t *strtab) { elf_file_t ef = (elf_file_t)lf; *strtab = ef->ddbstrtab; if (*strtab == NULL) return (0); return (ef->ddbstrcnt); } Index: head/sys/kern/link_elf_obj.c =================================================================== --- head/sys/kern/link_elf_obj.c (revision 326270) +++ head/sys/kern/link_elf_obj.c (revision 326271) @@ -1,1518 +1,1520 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1998-2000 Doug Rabson * Copyright (c) 2004 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB_CTF #include #endif #include "linker_if.h" typedef struct { void *addr; Elf_Off size; int flags; int sec; /* Original section */ char *name; } Elf_progent; typedef struct { Elf_Rel *rel; int nrel; int sec; } Elf_relent; typedef struct { Elf_Rela *rela; int nrela; int sec; } Elf_relaent; typedef struct elf_file { struct linker_file lf; /* Common fields */ int preloaded; caddr_t address; /* Relocation address */ vm_object_t object; /* VM object to hold file pages */ Elf_Shdr *e_shdr; Elf_progent *progtab; int nprogtab; Elf_relaent *relatab; int nrelatab; Elf_relent *reltab; int nreltab; Elf_Sym *ddbsymtab; /* The symbol table we are using */ long ddbsymcnt; /* Number of symbols */ caddr_t ddbstrtab; /* String table */ long ddbstrcnt; /* number of bytes in string table */ caddr_t shstrtab; /* Section name string table */ long shstrcnt; /* number of bytes in string table */ caddr_t ctftab; /* CTF table */ long ctfcnt; /* number of bytes in CTF table */ caddr_t ctfoff; /* CTF offset table */ caddr_t typoff; /* Type offset table */ long typlen; /* Number of type entries. 
*/ } *elf_file_t; #include static int link_elf_link_preload(linker_class_t cls, const char *, linker_file_t *); static int link_elf_link_preload_finish(linker_file_t); static int link_elf_load_file(linker_class_t, const char *, linker_file_t *); static int link_elf_lookup_symbol(linker_file_t, const char *, c_linker_sym_t *); static int link_elf_symbol_values(linker_file_t, c_linker_sym_t, linker_symval_t *); static int link_elf_search_symbol(linker_file_t, caddr_t value, c_linker_sym_t *sym, long *diffp); static void link_elf_unload_file(linker_file_t); static int link_elf_lookup_set(linker_file_t, const char *, void ***, void ***, int *); static int link_elf_each_function_name(linker_file_t, int (*)(const char *, void *), void *); static int link_elf_each_function_nameval(linker_file_t, linker_function_nameval_callback_t, void *); static int link_elf_reloc_local(linker_file_t); static long link_elf_symtab_get(linker_file_t, const Elf_Sym **); static long link_elf_strtab_get(linker_file_t, caddr_t *); static int elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *); static kobj_method_t link_elf_methods[] = { KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol), KOBJMETHOD(linker_symbol_values, link_elf_symbol_values), KOBJMETHOD(linker_search_symbol, link_elf_search_symbol), KOBJMETHOD(linker_unload, link_elf_unload_file), KOBJMETHOD(linker_load_file, link_elf_load_file), KOBJMETHOD(linker_link_preload, link_elf_link_preload), KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish), KOBJMETHOD(linker_lookup_set, link_elf_lookup_set), KOBJMETHOD(linker_each_function_name, link_elf_each_function_name), KOBJMETHOD(linker_each_function_nameval, link_elf_each_function_nameval), KOBJMETHOD(linker_ctf_get, link_elf_ctf_get), KOBJMETHOD(linker_symtab_get, link_elf_symtab_get), KOBJMETHOD(linker_strtab_get, link_elf_strtab_get), { 0, 0 } }; static struct linker_class link_elf_class = { #if ELF_TARG_CLASS == ELFCLASS32 "elf32_obj", #else "elf64_obj", #endif link_elf_methods, sizeof(struct elf_file) }; static int relocate_file(elf_file_t ef); static void elf_obj_cleanup_globals_cache(elf_file_t); static void link_elf_error(const char *filename, const char *s) { if (filename == NULL) printf("kldload: %s\n", s); else printf("kldload: %s: %s\n", filename, s); } static void link_elf_init(void *arg) { linker_add_class(&link_elf_class); } SYSINIT(link_elf_obj, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0); static int link_elf_link_preload(linker_class_t cls, const char *filename, linker_file_t *result) { Elf_Ehdr *hdr; Elf_Shdr *shdr; Elf_Sym *es; void *modptr, *baseptr, *sizeptr; char *type; elf_file_t ef; linker_file_t lf; Elf_Addr off; int error, i, j, pb, ra, rl, shstrindex, symstrindex, symtabindex; /* Look to see if we have the file preloaded */ modptr = preload_search_by_name(filename); if (modptr == NULL) return ENOENT; type = (char *)preload_search_info(modptr, MODINFO_TYPE); baseptr = preload_search_info(modptr, MODINFO_ADDR); sizeptr = preload_search_info(modptr, MODINFO_SIZE); hdr = (Elf_Ehdr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_ELFHDR); shdr = (Elf_Shdr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_SHDR); if (type == NULL || (strcmp(type, "elf" __XSTRING(__ELF_WORD_SIZE) " obj module") != 0 && strcmp(type, "elf obj module") != 0)) { return (EFTYPE); } if (baseptr == NULL || sizeptr == NULL || hdr == NULL || shdr == NULL) return (EINVAL); lf = linker_make_file(filename, &link_elf_class); if (lf == NULL) return (ENOMEM); ef 
= (elf_file_t)lf; ef->preloaded = 1; ef->address = *(caddr_t *)baseptr; lf->address = *(caddr_t *)baseptr; lf->size = *(size_t *)sizeptr; if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA || hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT || hdr->e_type != ET_REL || hdr->e_machine != ELF_TARG_MACH) { error = EFTYPE; goto out; } ef->e_shdr = shdr; /* Scan the section header for information and table sizing. */ symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif ef->nprogtab++; break; case SHT_SYMTAB: symtabindex = i; symstrindex = shdr[i].sh_link; break; case SHT_REL: ef->nreltab++; break; case SHT_RELA: ef->nrelatab++; break; } } shstrindex = hdr->e_shstrndx; if (ef->nprogtab == 0 || symstrindex < 0 || symstrindex >= hdr->e_shnum || shdr[symstrindex].sh_type != SHT_STRTAB || shstrindex == 0 || shstrindex >= hdr->e_shnum || shdr[shstrindex].sh_type != SHT_STRTAB) { printf("%s: bad/missing section headers\n", filename); error = ENOEXEC; goto out; } /* Allocate space for tracking the load chunks */ if (ef->nprogtab != 0) ef->progtab = malloc(ef->nprogtab * sizeof(*ef->progtab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nreltab != 0) ef->reltab = malloc(ef->nreltab * sizeof(*ef->reltab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nrelatab != 0) ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab), M_LINKER, M_WAITOK | M_ZERO); if ((ef->nprogtab != 0 && ef->progtab == NULL) || (ef->nreltab != 0 && ef->reltab == NULL) || (ef->nrelatab != 0 && ef->relatab == NULL)) { error = ENOMEM; goto out; } /* XXX, relocate the sh_addr fields saved by the loader. */ off = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_addr != 0 && (off == 0 || shdr[i].sh_addr < off)) off = shdr[i].sh_addr; } for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_addr != 0) shdr[i].sh_addr = shdr[i].sh_addr - off + (Elf_Addr)ef->address; } ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym); ef->ddbsymtab = (Elf_Sym *)shdr[symtabindex].sh_addr; ef->ddbstrcnt = shdr[symstrindex].sh_size; ef->ddbstrtab = (char *)shdr[symstrindex].sh_addr; ef->shstrcnt = shdr[shstrindex].sh_size; ef->shstrtab = (char *)shdr[shstrindex].sh_addr; /* Now fill out progtab and the relocation tables. 
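 */
/*
 * A sketch of the sh_addr rebasing just performed: find the lowest
 * loader-assigned section address, then shift every non-zero sh_addr
 * so the lowest one lands on the module's load address.  All values
 * are fabricated.
 */
#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uintptr_t sh_addr[3] = { 0x80002000, 0, 0x80001000 };
	uintptr_t address = 0xc0000000;	/* pretend load address */
	uintptr_t off = 0;
	int i;

	for (i = 0; i < 3; i++)
		if (sh_addr[i] != 0 && (off == 0 || sh_addr[i] < off))
			off = sh_addr[i];
	for (i = 0; i < 3; i++)
		if (sh_addr[i] != 0)
			sh_addr[i] = sh_addr[i] - off + address;

	assert(sh_addr[0] == 0xc0001000);	/* was lowest + 0x1000 */
	assert(sh_addr[1] == 0);		/* untouched */
	assert(sh_addr[2] == 0xc0000000);	/* lowest -> load address */
	return (0);
}
/*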
*/ pb = 0; rl = 0; ra = 0; for (i = 0; i < hdr->e_shnum; i++) { switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif ef->progtab[pb].addr = (void *)shdr[i].sh_addr; if (shdr[i].sh_type == SHT_PROGBITS) ef->progtab[pb].name = "<>"; #ifdef __amd64__ else if (shdr[i].sh_type == SHT_X86_64_UNWIND) ef->progtab[pb].name = "<>"; #endif else ef->progtab[pb].name = "<>"; ef->progtab[pb].size = shdr[i].sh_size; ef->progtab[pb].sec = i; if (ef->shstrtab && shdr[i].sh_name != 0) ef->progtab[pb].name = ef->shstrtab + shdr[i].sh_name; if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) { void *dpcpu; dpcpu = dpcpu_alloc(shdr[i].sh_size); if (dpcpu == NULL) { error = ENOSPC; goto out; } memcpy(dpcpu, ef->progtab[pb].addr, ef->progtab[pb].size); dpcpu_copy(dpcpu, shdr[i].sh_size); ef->progtab[pb].addr = dpcpu; #ifdef VIMAGE } else if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) { void *vnet_data; vnet_data = vnet_data_alloc(shdr[i].sh_size); if (vnet_data == NULL) { error = ENOSPC; goto out; } memcpy(vnet_data, ef->progtab[pb].addr, ef->progtab[pb].size); vnet_data_copy(vnet_data, shdr[i].sh_size); ef->progtab[pb].addr = vnet_data; #endif } else if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, ".ctors")) { lf->ctors_addr = ef->progtab[pb].addr; lf->ctors_size = shdr[i].sh_size; } /* Update all symbol values with the offset. */ for (j = 0; j < ef->ddbsymcnt; j++) { es = &ef->ddbsymtab[j]; if (es->st_shndx != i) continue; es->st_value += (Elf_Addr)ef->progtab[pb].addr; } pb++; break; case SHT_REL: ef->reltab[rl].rel = (Elf_Rel *)shdr[i].sh_addr; ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel); ef->reltab[rl].sec = shdr[i].sh_info; rl++; break; case SHT_RELA: ef->relatab[ra].rela = (Elf_Rela *)shdr[i].sh_addr; ef->relatab[ra].nrela = shdr[i].sh_size / sizeof(Elf_Rela); ef->relatab[ra].sec = shdr[i].sh_info; ra++; break; } } if (pb != ef->nprogtab) { printf("%s: lost progbits\n", filename); error = ENOEXEC; goto out; } if (rl != ef->nreltab) { printf("%s: lost reltab\n", filename); error = ENOEXEC; goto out; } if (ra != ef->nrelatab) { printf("%s: lost relatab\n", filename); error = ENOEXEC; goto out; } /* Local intra-module relocations */ error = link_elf_reloc_local(lf); if (error != 0) goto out; *result = lf; return (0); out: /* preload not done this way */ linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (error); } static void link_elf_invoke_ctors(caddr_t addr, size_t size) { void (**ctor)(void); size_t i, cnt; if (addr == NULL || size == 0) return; cnt = size / sizeof(*ctor); ctor = (void *)addr; for (i = 0; i < cnt; i++) { if (ctor[i] != NULL) (*ctor[i])(); } } static int link_elf_link_preload_finish(linker_file_t lf) { elf_file_t ef; int error; ef = (elf_file_t)lf; error = relocate_file(ef); if (error) return error; /* Notify MD code that a module is being loaded. 
*/ error = elf_cpu_load_file(lf); if (error) return (error); /* Invoke .ctors */ link_elf_invoke_ctors(lf->ctors_addr, lf->ctors_size); return (0); } static int link_elf_load_file(linker_class_t cls, const char *filename, linker_file_t *result) { struct nameidata *nd; struct thread *td = curthread; /* XXX */ Elf_Ehdr *hdr; Elf_Shdr *shdr; Elf_Sym *es; int nbytes, i, j; vm_offset_t mapbase; size_t mapsize; int error = 0; ssize_t resid; int flags; elf_file_t ef; linker_file_t lf; int symtabindex; int symstrindex; int shstrindex; int nsym; int pb, rl, ra; int alignmask; shdr = NULL; lf = NULL; mapsize = 0; hdr = NULL; nd = malloc(sizeof(struct nameidata), M_TEMP, M_WAITOK); NDINIT(nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td); flags = FREAD; error = vn_open(nd, &flags, 0, NULL); if (error) { free(nd, M_TEMP); return error; } NDFREE(nd, NDF_ONLY_PNBUF); if (nd->ni_vp->v_type != VREG) { error = ENOEXEC; goto out; } #ifdef MAC error = mac_kld_check_load(td->td_ucred, nd->ni_vp); if (error) { goto out; } #endif /* Read the elf header from the file. */ hdr = malloc(sizeof(*hdr), M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd->ni_vp, (void *)hdr, sizeof(*hdr), 0, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = ENOEXEC; goto out; } if (!IS_ELF(*hdr)) { error = ENOEXEC; goto out; } if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { link_elf_error(filename, "Unsupported file layout"); error = ENOEXEC; goto out; } if (hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT) { link_elf_error(filename, "Unsupported file version"); error = ENOEXEC; goto out; } if (hdr->e_type != ET_REL) { error = ENOSYS; goto out; } if (hdr->e_machine != ELF_TARG_MACH) { link_elf_error(filename, "Unsupported machine"); error = ENOEXEC; goto out; } lf = linker_make_file(filename, &link_elf_class); if (!lf) { error = ENOMEM; goto out; } ef = (elf_file_t) lf; ef->nprogtab = 0; ef->e_shdr = 0; ef->nreltab = 0; ef->nrelatab = 0; /* Allocate and read in the section header */ nbytes = hdr->e_shnum * hdr->e_shentsize; if (nbytes == 0 || hdr->e_shoff == 0 || hdr->e_shentsize != sizeof(Elf_Shdr)) { error = ENOEXEC; goto out; } shdr = malloc(nbytes, M_LINKER, M_WAITOK); ef->e_shdr = shdr; error = vn_rdwr(UIO_READ, nd->ni_vp, (caddr_t)shdr, nbytes, hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid) { error = ENOEXEC; goto out; } /* Scan the section header for information and table sizing. 
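 */
/*
 * The two vn_rdwr() passes above (ELF header, then section headers)
 * are the kernel-mode analogue of a plain read-and-validate loop.  A
 * userland sketch using pread(2) and a host <elf.h>; the 64-bit types
 * and the /bin/sh fallback path are assumptions for illustration.
 */
#include <elf.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	Elf64_Ehdr hdr;
	Elf64_Shdr *shdr;
	size_t nbytes;
	int fd;

	fd = open(argc > 1 ? argv[1] : "/bin/sh", O_RDONLY);
	if (fd < 0 || pread(fd, &hdr, sizeof(hdr), 0) != (ssize_t)sizeof(hdr) ||
	    memcmp(hdr.e_ident, ELFMAG, SELFMAG) != 0) {
		fprintf(stderr, "not an ELF file\n");
		return (1);
	}
	/* Section headers live at e_shoff: e_shnum entries of e_shentsize. */
	nbytes = (size_t)hdr.e_shnum * hdr.e_shentsize;
	shdr = malloc(nbytes);
	if (shdr == NULL ||
	    pread(fd, shdr, nbytes, hdr.e_shoff) != (ssize_t)nbytes) {
		fprintf(stderr, "short read on section headers\n");
		return (1);
	}
	printf("%u sections\n", (unsigned)hdr.e_shnum);
	free(shdr);
	close(fd);
	return (0);
}
/*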
*/ nsym = 0; symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif ef->nprogtab++; break; case SHT_SYMTAB: nsym++; symtabindex = i; symstrindex = shdr[i].sh_link; break; case SHT_REL: ef->nreltab++; break; case SHT_RELA: ef->nrelatab++; break; case SHT_STRTAB: break; } } if (ef->nprogtab == 0) { link_elf_error(filename, "file has no contents"); error = ENOEXEC; goto out; } if (nsym != 1) { /* Only allow one symbol table for now */ link_elf_error(filename, "file has no valid symbol table"); error = ENOEXEC; goto out; } if (symstrindex < 0 || symstrindex > hdr->e_shnum || shdr[symstrindex].sh_type != SHT_STRTAB) { link_elf_error(filename, "file has invalid symbol strings"); error = ENOEXEC; goto out; } /* Allocate space for tracking the load chunks */ if (ef->nprogtab != 0) ef->progtab = malloc(ef->nprogtab * sizeof(*ef->progtab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nreltab != 0) ef->reltab = malloc(ef->nreltab * sizeof(*ef->reltab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nrelatab != 0) ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab), M_LINKER, M_WAITOK | M_ZERO); if (symtabindex == -1) { link_elf_error(filename, "lost symbol table index"); error = ENOEXEC; goto out; } /* Allocate space for and load the symbol table */ ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym); ef->ddbsymtab = malloc(shdr[symtabindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd->ni_vp, (void *)ef->ddbsymtab, shdr[symtabindex].sh_size, shdr[symtabindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } if (symstrindex == -1) { link_elf_error(filename, "lost symbol string index"); error = ENOEXEC; goto out; } /* Allocate space for and load the symbol strings */ ef->ddbstrcnt = shdr[symstrindex].sh_size; ef->ddbstrtab = malloc(shdr[symstrindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd->ni_vp, ef->ddbstrtab, shdr[symstrindex].sh_size, shdr[symstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } /* Do we have a string table for the section names? */ shstrindex = -1; if (hdr->e_shstrndx != 0 && shdr[hdr->e_shstrndx].sh_type == SHT_STRTAB) { shstrindex = hdr->e_shstrndx; ef->shstrcnt = shdr[shstrindex].sh_size; ef->shstrtab = malloc(shdr[shstrindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd->ni_vp, ef->shstrtab, shdr[shstrindex].sh_size, shdr[shstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } } /* Size up code/data(progbits) and bss(nobits). */ alignmask = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif alignmask = shdr[i].sh_addralign - 1; mapsize += alignmask; mapsize &= ~alignmask; mapsize += shdr[i].sh_size; break; } } /* * We know how much space we need for the text/data/bss/etc. 
* This stuff needs to be in a single chunk so that profiling etc * can get the bounds and gdb can associate offsets with modules */ ef->object = vm_object_allocate(OBJT_DEFAULT, round_page(mapsize) >> PAGE_SHIFT); if (ef->object == NULL) { error = ENOMEM; goto out; } ef->address = (caddr_t) vm_map_min(kernel_map); /* * In order to satisfy amd64's architectural requirements on the * location of code and data in the kernel's address space, request a * mapping that is above the kernel. */ #ifdef __amd64__ mapbase = KERNBASE; #else mapbase = VM_MIN_KERNEL_ADDRESS; #endif error = vm_map_find(kernel_map, ef->object, 0, &mapbase, round_page(mapsize), 0, VMFS_OPTIMAL_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) { vm_object_deallocate(ef->object); ef->object = 0; goto out; } /* Wire the pages */ error = vm_map_wire(kernel_map, mapbase, mapbase + round_page(mapsize), VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { error = ENOMEM; goto out; } /* Inform the kld system about the situation */ lf->address = ef->address = (caddr_t)mapbase; lf->size = mapsize; /* * Now load code/data(progbits), zero bss(nobits), allocate space for * and load relocs */ pb = 0; rl = 0; ra = 0; alignmask = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif alignmask = shdr[i].sh_addralign - 1; mapbase += alignmask; mapbase &= ~alignmask; if (ef->shstrtab != NULL && shdr[i].sh_name != 0) { ef->progtab[pb].name = ef->shstrtab + shdr[i].sh_name; if (!strcmp(ef->progtab[pb].name, ".ctors")) { lf->ctors_addr = (caddr_t)mapbase; lf->ctors_size = shdr[i].sh_size; } } else if (shdr[i].sh_type == SHT_PROGBITS) ef->progtab[pb].name = "<>"; #ifdef __amd64__ else if (shdr[i].sh_type == SHT_X86_64_UNWIND) ef->progtab[pb].name = "<>"; #endif else ef->progtab[pb].name = "<>"; if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) ef->progtab[pb].addr = dpcpu_alloc(shdr[i].sh_size); #ifdef VIMAGE else if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) ef->progtab[pb].addr = vnet_data_alloc(shdr[i].sh_size); #endif else ef->progtab[pb].addr = (void *)(uintptr_t)mapbase; if (ef->progtab[pb].addr == NULL) { error = ENOSPC; goto out; } ef->progtab[pb].size = shdr[i].sh_size; ef->progtab[pb].sec = i; if (shdr[i].sh_type == SHT_PROGBITS #ifdef __amd64__ || shdr[i].sh_type == SHT_X86_64_UNWIND #endif ) { error = vn_rdwr(UIO_READ, nd->ni_vp, ef->progtab[pb].addr, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } /* Initialize the per-cpu or vnet area. */ if (ef->progtab[pb].addr != (void *)mapbase && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) dpcpu_copy(ef->progtab[pb].addr, shdr[i].sh_size); #ifdef VIMAGE else if (ef->progtab[pb].addr != (void *)mapbase && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) vnet_data_copy(ef->progtab[pb].addr, shdr[i].sh_size); #endif } else bzero(ef->progtab[pb].addr, shdr[i].sh_size); /* Update all symbol values with the offset. 
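 */
/*
 * The sizing pass earlier and the placement loop here use the same
 * mask trick: round the running cursor up to each section's alignment
 * before consuming its size.  A worked sketch with three fabricated
 * sections.
 */
#include <assert.h>
#include <stdint.h>

int
main(void)
{
	struct { uintptr_t addralign, size; } sec[3] = {
		{ 16, 100 }, { 32, 10 }, { 8, 24 }
	};
	uintptr_t mapsize = 0, alignmask;
	int i;

	for (i = 0; i < 3; i++) {
		alignmask = sec[i].addralign - 1;
		mapsize += alignmask;
		mapsize &= ~alignmask;	/* round up to the alignment */
		mapsize += sec[i].size;	/* then consume the section */
	}
	/* 0 -> 100, round to 128 -> 138, round to 144 -> 168. */
	assert(mapsize == 168);
	return (0);
}
/*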
*/ for (j = 0; j < ef->ddbsymcnt; j++) { es = &ef->ddbsymtab[j]; if (es->st_shndx != i) continue; es->st_value += (Elf_Addr)ef->progtab[pb].addr; } mapbase += shdr[i].sh_size; pb++; break; case SHT_REL: ef->reltab[rl].rel = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK); ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel); ef->reltab[rl].sec = shdr[i].sh_info; error = vn_rdwr(UIO_READ, nd->ni_vp, (void *)ef->reltab[rl].rel, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } rl++; break; case SHT_RELA: ef->relatab[ra].rela = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK); ef->relatab[ra].nrela = shdr[i].sh_size / sizeof(Elf_Rela); ef->relatab[ra].sec = shdr[i].sh_info; error = vn_rdwr(UIO_READ, nd->ni_vp, (void *)ef->relatab[ra].rela, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } ra++; break; } } if (pb != ef->nprogtab) { link_elf_error(filename, "lost progbits"); error = ENOEXEC; goto out; } if (rl != ef->nreltab) { link_elf_error(filename, "lost reltab"); error = ENOEXEC; goto out; } if (ra != ef->nrelatab) { link_elf_error(filename, "lost relatab"); error = ENOEXEC; goto out; } if (mapbase != (vm_offset_t)ef->address + mapsize) { printf( "%s: mapbase 0x%lx != address %p + mapsize 0x%lx (0x%lx)\n", filename != NULL ? filename : "", (u_long)mapbase, ef->address, (u_long)mapsize, (u_long)(vm_offset_t)ef->address + mapsize); error = ENOMEM; goto out; } /* Local intra-module relocations */ error = link_elf_reloc_local(lf); if (error != 0) goto out; /* Pull in dependencies */ VOP_UNLOCK(nd->ni_vp, 0); error = linker_load_dependencies(lf); vn_lock(nd->ni_vp, LK_EXCLUSIVE | LK_RETRY); if (error) goto out; /* External relocations */ error = relocate_file(ef); if (error) goto out; /* Notify MD code that a module is being loaded. */ error = elf_cpu_load_file(lf); if (error) goto out; /* Invoke .ctors */ link_elf_invoke_ctors(lf->ctors_addr, lf->ctors_size); *result = lf; out: VOP_UNLOCK(nd->ni_vp, 0); vn_close(nd->ni_vp, FREAD, td->td_ucred, td); free(nd, M_TEMP); if (error && lf) linker_file_unload(lf, LINKER_UNLOAD_FORCE); free(hdr, M_LINKER); return error; } static void link_elf_unload_file(linker_file_t file) { elf_file_t ef = (elf_file_t) file; int i; /* Notify MD code that a module is being unloaded. */ elf_cpu_unload_file(file); if (ef->progtab) { for (i = 0; i < ef->nprogtab; i++) { if (ef->progtab[i].size == 0) continue; if (ef->progtab[i].name == NULL) continue; if (!strcmp(ef->progtab[i].name, DPCPU_SETNAME)) dpcpu_free(ef->progtab[i].addr, ef->progtab[i].size); #ifdef VIMAGE else if (!strcmp(ef->progtab[i].name, VNET_SETNAME)) vnet_data_free(ef->progtab[i].addr, ef->progtab[i].size); #endif } } if (ef->preloaded) { free(ef->reltab, M_LINKER); free(ef->relatab, M_LINKER); free(ef->progtab, M_LINKER); free(ef->ctftab, M_LINKER); free(ef->ctfoff, M_LINKER); free(ef->typoff, M_LINKER); if (file->filename != NULL) preload_delete_name(file->filename); /* XXX reclaim module memory? 
*/ return; } for (i = 0; i < ef->nreltab; i++) free(ef->reltab[i].rel, M_LINKER); for (i = 0; i < ef->nrelatab; i++) free(ef->relatab[i].rela, M_LINKER); free(ef->reltab, M_LINKER); free(ef->relatab, M_LINKER); free(ef->progtab, M_LINKER); if (ef->object) { vm_map_remove(kernel_map, (vm_offset_t) ef->address, (vm_offset_t) ef->address + (ef->object->size << PAGE_SHIFT)); } free(ef->e_shdr, M_LINKER); free(ef->ddbsymtab, M_LINKER); free(ef->ddbstrtab, M_LINKER); free(ef->shstrtab, M_LINKER); free(ef->ctftab, M_LINKER); free(ef->ctfoff, M_LINKER); free(ef->typoff, M_LINKER); } static const char * symbol_name(elf_file_t ef, Elf_Size r_info) { const Elf_Sym *ref; if (ELF_R_SYM(r_info)) { ref = ef->ddbsymtab + ELF_R_SYM(r_info); return ef->ddbstrtab + ref->st_name; } else return NULL; } static Elf_Addr findbase(elf_file_t ef, int sec) { int i; Elf_Addr base = 0; for (i = 0; i < ef->nprogtab; i++) { if (sec == ef->progtab[i].sec) { base = (Elf_Addr)ef->progtab[i].addr; break; } } return base; } static int relocate_file(elf_file_t ef) { const Elf_Rel *rellim; const Elf_Rel *rel; const Elf_Rela *relalim; const Elf_Rela *rela; const char *symname; const Elf_Sym *sym; int i; Elf_Size symidx; Elf_Addr base; /* Perform relocations without addend if there are any: */ for (i = 0; i < ef->nreltab; i++) { rel = ef->reltab[i].rel; if (rel == NULL) { link_elf_error(ef->lf.filename, "lost a reltab!"); return (ENOEXEC); } rellim = rel + ef->reltab[i].nrel; base = findbase(ef, ef->reltab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for reltab"); return (ENOEXEC); } for ( ; rel < rellim; rel++) { symidx = ELF_R_SYM(rel->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Local relocs are already done */ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) continue; if (elf_reloc(&ef->lf, base, rel, ELF_RELOC_REL, elf_obj_lookup)) { symname = symbol_name(ef, rel->r_info); printf("link_elf_obj: symbol %s undefined\n", symname); return (ENOENT); } } } /* Perform relocations with addend if there are any: */ for (i = 0; i < ef->nrelatab; i++) { rela = ef->relatab[i].rela; if (rela == NULL) { link_elf_error(ef->lf.filename, "lost a relatab!"); return (ENOEXEC); } relalim = rela + ef->relatab[i].nrela; base = findbase(ef, ef->relatab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for relatab"); return (ENOEXEC); } for ( ; rela < relalim; rela++) { symidx = ELF_R_SYM(rela->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Local relocs are already done */ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) continue; if (elf_reloc(&ef->lf, base, rela, ELF_RELOC_RELA, elf_obj_lookup)) { symname = symbol_name(ef, rela->r_info); printf("link_elf_obj: symbol %s undefined\n", symname); return (ENOENT); } } } /* * Only clean SHN_FBSD_CACHED for successful return. If we * modified symbol table for the object but found an * unresolved symbol, there is no reason to roll back. 
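 * (elf_obj_lookup() below memoizes each resolved global by stashing the
 * address in the symbol and marking the entry with the SHN_FBSD_CACHED
 * sentinel, which the cleanup pass then resets.  A minimal standalone
 * sketch of that memoization pattern; the names and the
 * expensive_resolve() helper are hypothetical.)
 */
#if 0	/* Illustrative sketch only; never compiled. */
#include <stdio.h>

#define SHN_UNDEF_X	0	/* unresolved; must look up */
#define SHN_CACHED_X	1	/* resolved earlier; value is valid */

struct sym {
	int shndx;
	unsigned long value;
};

/* Stand-in for the expensive global symbol search. */
static unsigned long
expensive_resolve(void)
{
	puts("slow path taken");
	return (0xdeadbeef);
}

static unsigned long
lookup(struct sym *s)
{
	if (s->shndx != SHN_UNDEF_X)	/* fast path: cached */
		return (s->value);
	s->value = expensive_resolve();
	s->shndx = SHN_CACHED_X;
	return (s->value);
}

int
main(void)
{
	struct sym s = { SHN_UNDEF_X, 0 };

	lookup(&s);	/* slow path, prints once */
	lookup(&s);	/* cached, silent */
	/* After relocation a cleanup pass resets SHN_CACHED_X entries. */
	return (0);
}
#endif
/*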
*/ elf_obj_cleanup_globals_cache(ef); return (0); } static int link_elf_lookup_symbol(linker_file_t lf, const char *name, c_linker_sym_t *sym) { elf_file_t ef = (elf_file_t) lf; const Elf_Sym *symp; const char *strp; int i; for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { strp = ef->ddbstrtab + symp->st_name; if (symp->st_shndx != SHN_UNDEF && strcmp(name, strp) == 0) { *sym = (c_linker_sym_t) symp; return 0; } } return ENOENT; } static int link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym, linker_symval_t *symval) { elf_file_t ef = (elf_file_t) lf; const Elf_Sym *es = (const Elf_Sym*) sym; if (es >= ef->ddbsymtab && es < (ef->ddbsymtab + ef->ddbsymcnt)) { symval->name = ef->ddbstrtab + es->st_name; symval->value = (caddr_t)es->st_value; symval->size = es->st_size; return 0; } return ENOENT; } static int link_elf_search_symbol(linker_file_t lf, caddr_t value, c_linker_sym_t *sym, long *diffp) { elf_file_t ef = (elf_file_t) lf; u_long off = (uintptr_t) (void *) value; u_long diff = off; u_long st_value; const Elf_Sym *es; const Elf_Sym *best = NULL; int i; for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) { if (es->st_name == 0) continue; st_value = es->st_value; if (off >= st_value) { if (off - st_value < diff) { diff = off - st_value; best = es; if (diff == 0) break; } else if (off - st_value == diff) { best = es; } } } if (best == NULL) *diffp = off; else *diffp = diff; *sym = (c_linker_sym_t) best; return 0; } /* * Look up a linker set on an ELF system. */ static int link_elf_lookup_set(linker_file_t lf, const char *name, void ***startp, void ***stopp, int *countp) { elf_file_t ef = (elf_file_t)lf; void **start, **stop; int i, count; /* Relative to section number */ for (i = 0; i < ef->nprogtab; i++) { if ((strncmp(ef->progtab[i].name, "set_", 4) == 0) && strcmp(ef->progtab[i].name + 4, name) == 0) { start = (void **)ef->progtab[i].addr; stop = (void **)((char *)ef->progtab[i].addr + ef->progtab[i].size); count = stop - start; if (startp) *startp = start; if (stopp) *stopp = stop; if (countp) *countp = count; return (0); } } return (ESRCH); } static int link_elf_each_function_name(linker_file_t file, int (*callback)(const char *, void *), void *opaque) { elf_file_t ef = (elf_file_t)file; const Elf_Sym *symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && ELF_ST_TYPE(symp->st_info) == STT_FUNC) { error = callback(ef->ddbstrtab + symp->st_name, opaque); if (error) return (error); } } return (0); } static int link_elf_each_function_nameval(linker_file_t file, linker_function_nameval_callback_t callback, void *opaque) { linker_symval_t symval; elf_file_t ef = (elf_file_t)file; const Elf_Sym* symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && ELF_ST_TYPE(symp->st_info) == STT_FUNC) { error = link_elf_symbol_values(file, (c_linker_sym_t) symp, &symval); if (error) return (error); error = callback(file, i, &symval, opaque); if (error) return (error); } } return (0); } static void elf_obj_cleanup_globals_cache(elf_file_t ef) { Elf_Sym *sym; Elf_Size i; for (i = 0; i < ef->ddbsymcnt; i++) { sym = ef->ddbsymtab + i; if (sym->st_shndx == SHN_FBSD_CACHED) { sym->st_shndx = SHN_UNDEF; sym->st_value = 0; } } } /* * Symbol lookup function that can be used when the symbol index is known (ie * in relocations). 
It uses the symbol index instead of doing a fully fledged * hash table based lookup when such is valid. For example for local symbols. * This is not only more efficient, it's also more correct. It's not always * the case that the symbol can be found through the hash table. */ static int elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *res) { elf_file_t ef = (elf_file_t)lf; Elf_Sym *sym; const char *symbol; Elf_Addr res1; /* Don't even try to lookup the symbol if the index is bogus. */ if (symidx >= ef->ddbsymcnt) { *res = 0; return (EINVAL); } sym = ef->ddbsymtab + symidx; /* Quick answer if there is a definition included. */ if (sym->st_shndx != SHN_UNDEF) { *res = sym->st_value; return (0); } /* If we get here, then it is undefined and needs a lookup. */ switch (ELF_ST_BIND(sym->st_info)) { case STB_LOCAL: /* Local, but undefined? huh? */ *res = 0; return (EINVAL); case STB_GLOBAL: case STB_WEAK: /* Relative to Data or Function name */ symbol = ef->ddbstrtab + sym->st_name; /* Force a lookup failure if the symbol name is bogus. */ if (*symbol == 0) { *res = 0; return (EINVAL); } res1 = (Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps); /* * Cache global lookups during module relocation. The failure * case is particularly expensive for callers, who must scan * through the entire globals table doing strcmp(). Cache to * avoid doing such work repeatedly. * * After relocation is complete, undefined globals will be * restored to SHN_UNDEF in elf_obj_cleanup_globals_cache(), * above. */ if (res1 != 0) { sym->st_shndx = SHN_FBSD_CACHED; sym->st_value = res1; *res = res1; return (0); } else if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { sym->st_value = 0; *res = 0; return (0); } return (EINVAL); default: return (EINVAL); } } static void link_elf_fix_link_set(elf_file_t ef) { static const char startn[] = "__start_"; static const char stopn[] = "__stop_"; Elf_Sym *sym; const char *sym_name, *linkset_name; Elf_Addr startp, stopp; Elf_Size symidx; int start, i; startp = stopp = 0; for (symidx = 1 /* zero entry is special */; symidx < ef->ddbsymcnt; symidx++) { sym = ef->ddbsymtab + symidx; if (sym->st_shndx != SHN_UNDEF) continue; sym_name = ef->ddbstrtab + sym->st_name; if (strncmp(sym_name, startn, sizeof(startn) - 1) == 0) { start = 1; linkset_name = sym_name + sizeof(startn) - 1; } else if (strncmp(sym_name, stopn, sizeof(stopn) - 1) == 0) { start = 0; linkset_name = sym_name + sizeof(stopn) - 1; } else continue; for (i = 0; i < ef->nprogtab; i++) { if (strcmp(ef->progtab[i].name, linkset_name) == 0) { startp = (Elf_Addr)ef->progtab[i].addr; stopp = (Elf_Addr)(startp + ef->progtab[i].size); break; } } if (i == ef->nprogtab) continue; sym->st_value = start ? 
startp : stopp; sym->st_shndx = i; } } static int link_elf_reloc_local(linker_file_t lf) { elf_file_t ef = (elf_file_t)lf; const Elf_Rel *rellim; const Elf_Rel *rel; const Elf_Rela *relalim; const Elf_Rela *rela; const Elf_Sym *sym; Elf_Addr base; int i; Elf_Size symidx; link_elf_fix_link_set(ef); /* Perform relocations without addend if there are any: */ for (i = 0; i < ef->nreltab; i++) { rel = ef->reltab[i].rel; if (rel == NULL) { link_elf_error(ef->lf.filename, "lost a reltab"); return (ENOEXEC); } rellim = rel + ef->reltab[i].nrel; base = findbase(ef, ef->reltab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for reltab"); return (ENOEXEC); } for ( ; rel < rellim; rel++) { symidx = ELF_R_SYM(rel->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Only do local relocs */ if (ELF_ST_BIND(sym->st_info) != STB_LOCAL) continue; elf_reloc_local(lf, base, rel, ELF_RELOC_REL, elf_obj_lookup); } } /* Perform relocations with addend if there are any: */ for (i = 0; i < ef->nrelatab; i++) { rela = ef->relatab[i].rela; if (rela == NULL) { link_elf_error(ef->lf.filename, "lost a relatab!"); return (ENOEXEC); } relalim = rela + ef->relatab[i].nrela; base = findbase(ef, ef->relatab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for reltab"); return (ENOEXEC); } for ( ; rela < relalim; rela++) { symidx = ELF_R_SYM(rela->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Only do local relocs */ if (ELF_ST_BIND(sym->st_info) != STB_LOCAL) continue; elf_reloc_local(lf, base, rela, ELF_RELOC_RELA, elf_obj_lookup); } } return (0); } static long link_elf_symtab_get(linker_file_t lf, const Elf_Sym **symtab) { elf_file_t ef = (elf_file_t)lf; *symtab = ef->ddbsymtab; if (*symtab == NULL) return (0); return (ef->ddbsymcnt); } static long link_elf_strtab_get(linker_file_t lf, caddr_t *strtab) { elf_file_t ef = (elf_file_t)lf; *strtab = ef->ddbstrtab; if (*strtab == NULL) return (0); return (ef->ddbstrcnt); } Index: head/sys/kern/sched_ule.c =================================================================== --- head/sys/kern/sched_ule.c (revision 326270) +++ head/sys/kern/sched_ule.c (revision 326271) @@ -1,2956 +1,2958 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002-2007, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * This file implements the ULE scheduler. ULE supports independent CPU * run queues and fine grain locking. It has superior interactive * performance under load even on uni-processor systems. * * etymology: * ULE is the last three letters in schedule. It owes its name to a * generic user created for a scheduling system by Paul Mikesell at * Isilon Systems and a general lack of creativity on the part of the author. */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #ifdef KDTRACE_HOOKS #include int dtrace_vtime_active; dtrace_vtime_switch_func_t dtrace_vtime_switch_func; #endif #include #include #define KTR_ULE 0 #define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX))) #define TDQ_NAME_LEN (sizeof("sched lock ") + sizeof(__XSTRING(MAXCPU))) #define TDQ_LOADNAME_LEN (sizeof("CPU ") + sizeof(__XSTRING(MAXCPU)) - 1 + sizeof(" load")) /* * Thread scheduler specific section. All fields are protected * by the thread lock. */ struct td_sched { struct runq *ts_runq; /* Run-queue we're queued on. */ short ts_flags; /* TSF_* flags. */ int ts_cpu; /* CPU that we have affinity for. */ int ts_rltick; /* Real last tick, for affinity. */ int ts_slice; /* Ticks of slice remaining. */ u_int ts_slptime; /* Number of ticks we vol. slept */ u_int ts_runtime; /* Number of ticks we were running */ int ts_ltick; /* Last tick that we were running on */ int ts_ftick; /* First tick that we were running on */ int ts_ticks; /* Tick count */ #ifdef KTR char ts_name[TS_NAME_LEN]; #endif }; /* flags kept in ts_flags */ #define TSF_BOUND 0x0001 /* Thread can not migrate. */ #define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */ #define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) #define THREAD_CAN_SCHED(td, cpu) \ CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask) _Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <= sizeof(struct thread0_storage), "increase struct thread0_storage.t0st_sched size"); /* * Priority ranges used for interactive and non-interactive timeshare * threads. The timeshare priorities are split up into four ranges. * The first range handles interactive threads. The last three ranges * (NHALF, x, and NHALF) handle non-interactive threads with the outer * ranges supporting nice values. */ #define PRI_TIMESHARE_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) #define PRI_INTERACT_RANGE ((PRI_TIMESHARE_RANGE - SCHED_PRI_NRESV) / 2) #define PRI_BATCH_RANGE (PRI_TIMESHARE_RANGE - PRI_INTERACT_RANGE) #define PRI_MIN_INTERACT PRI_MIN_TIMESHARE #define PRI_MAX_INTERACT (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE - 1) #define PRI_MIN_BATCH (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE) #define PRI_MAX_BATCH PRI_MAX_TIMESHARE /* * Cpu percentage computation macros and defines. 
* * SCHED_TICK_SECS: Number of seconds to average the cpu usage across. * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across. * SCHED_TICK_MAX: Maximum number of ticks before scaling back. * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results. * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count. * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks. */ #define SCHED_TICK_SECS 10 #define SCHED_TICK_TARG (hz * SCHED_TICK_SECS) #define SCHED_TICK_MAX (SCHED_TICK_TARG + hz) #define SCHED_TICK_SHIFT 10 #define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT) #define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz)) /* * These macros determine priorities for non-interactive threads. They are * assigned a priority based on their recent cpu utilization as expressed * by the ratio of ticks to the tick total. NHALF priorities at the start * and end of the MIN to MAX timeshare range are only reachable with negative * or positive nice respectively. * * PRI_RANGE: Priority range for utilization dependent priorities. * PRI_NRESV: Number of nice values. * PRI_TICKS: Compute a priority in PRI_RANGE from the ticks count and total. * PRI_NICE: Determines the part of the priority inherited from nice. */ #define SCHED_PRI_NRESV (PRIO_MAX - PRIO_MIN) #define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) #define SCHED_PRI_MIN (PRI_MIN_BATCH + SCHED_PRI_NHALF) #define SCHED_PRI_MAX (PRI_MAX_BATCH - SCHED_PRI_NHALF) #define SCHED_PRI_RANGE (SCHED_PRI_MAX - SCHED_PRI_MIN + 1) #define SCHED_PRI_TICKS(ts) \ (SCHED_TICK_HZ((ts)) / \ (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE)) #define SCHED_PRI_NICE(nice) (nice) /* * These determine the interactivity of a process. Interactivity differs from * cpu utilization in that it expresses the voluntary time slept vs time ran * while cpu utilization includes all time not running. This more accurately * models the intent of the thread. * * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate * before throttling back. * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. * INTERACT_MAX: Maximum interactivity value. Smaller is better. * INTERACT_THRESH: Threshold for placement on the current runq. */ #define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT) #define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT) #define SCHED_INTERACT_MAX (100) #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) #define SCHED_INTERACT_THRESH (30) /* * These parameters determine the slice behavior for batch work. */ #define SCHED_SLICE_DEFAULT_DIVISOR 10 /* ~94 ms, 12 stathz ticks. */ #define SCHED_SLICE_MIN_DIVISOR 6 /* DEFAULT/MIN = ~16 ms. */ /* Flags kept in td_flags. */ #define TDF_SLICEEND TDF_SCHED2 /* Thread time slice is over. */ /* * tickincr: Converts a stathz tick into a hz domain scaled by * the shift factor. Without the shift the error rate * due to rounding would be unacceptably high. * realstathz: stathz is sometimes 0 and run off of hz. * sched_slice: Runtime of each thread before rescheduling. * preempt_thresh: Priority threshold for preemption and remote IPIs. */ static int sched_interact = SCHED_INTERACT_THRESH; static int tickincr = 8 << SCHED_TICK_SHIFT; static int realstathz = 127; /* reset during boot. */ static int sched_slice = 10; /* reset during boot. */ static int sched_slice_min = 1; /* reset during boot. 
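 *
 * (The tickincr scaling described above can be shown with standalone
 * arithmetic; hz = 1000 and stathz = 127 are illustrative values.)
 */
#if 0	/* Illustrative sketch only; never compiled. */
#include <stdio.h>

#define TICK_SHIFT	10	/* mirrors SCHED_TICK_SHIFT */

int
main(void)
{
	int hz = 1000, stathz = 127;

	/* Plain integer division discards the fraction entirely... */
	printf("unscaled: %d\n", hz / stathz);			/* 7 */
	/* ...while the shifted form keeps ~10 fractional bits. */
	printf("scaled: %d\n", (hz << TICK_SHIFT) / stathz);	/* 8062 */
	/* 8062 / 2^10 = 7.873..., close to the exact 7.874. */
	return (0);
}
#endif
/*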
*/ #ifdef PREEMPTION #ifdef FULL_PREEMPTION static int preempt_thresh = PRI_MAX_IDLE; #else static int preempt_thresh = PRI_MIN_KERN; #endif #else static int preempt_thresh = 0; #endif static int static_boost = PRI_MIN_BATCH; static int sched_idlespins = 10000; static int sched_idlespinthresh = -1; /* * tdq - per processor runqs and statistics. All fields are protected by the * tdq_lock. The load and lowpri may be read without the lock to avoid excess * locking in sched_pickcpu(). */ struct tdq { /* * Ordered to improve efficiency of cpu_search() and switch(). * tdq_lock is padded to avoid false sharing with tdq_load and * tdq_cpu_idle. */ struct mtx_padalign tdq_lock; /* run queue lock. */ struct cpu_group *tdq_cg; /* Pointer to cpu topology. */ volatile int tdq_load; /* Aggregate load. */ volatile int tdq_cpu_idle; /* cpu_idle() is active. */ int tdq_sysload; /* For loadavg, !ITHD load. */ int tdq_transferable; /* Transferable thread count. */ short tdq_switchcnt; /* Switches this tick. */ short tdq_oldswitchcnt; /* Switches last tick. */ u_char tdq_lowpri; /* Lowest priority thread. */ u_char tdq_ipipending; /* IPI pending. */ u_char tdq_idx; /* Current insert index. */ u_char tdq_ridx; /* Current removal index. */ struct runq tdq_realtime; /* real-time run queue. */ struct runq tdq_timeshare; /* timeshare run queue. */ struct runq tdq_idle; /* Queue of IDLE threads. */ char tdq_name[TDQ_NAME_LEN]; #ifdef KTR char tdq_loadname[TDQ_LOADNAME_LEN]; #endif } __aligned(64); /* Idle thread states and config. */ #define TDQ_RUNNING 1 #define TDQ_IDLE 2 #ifdef SMP struct cpu_group *cpu_top; /* CPU topology */ #define SCHED_AFFINITY_DEFAULT (max(1, hz / 1000)) #define SCHED_AFFINITY(ts, t) ((ts)->ts_rltick > ticks - ((t) * affinity)) /* * Run-time tunables. */ static int rebalance = 1; static int balance_interval = 128; /* Default set in sched_initticks(). */ static int affinity; static int steal_idle = 1; static int steal_thresh = 2; /* * One thread queue per processor.
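 *
 * (struct tdq above is __aligned(64) so one CPU updating its own queue
 * never invalidates a cache line that another CPU's queue occupies.  A
 * minimal standalone sketch of the padded per-CPU slot pattern;
 * MAXCPU_X and the load field are made up.)
 */
#if 0	/* Illustrative sketch only; never compiled. */
#include <stdio.h>

#define MAXCPU_X	4
#define CACHE_LINE	64

/* One slot per CPU, padded to a cache line, so CPU i writing its own
 * "load" never invalidates CPU j's line (no false sharing). */
struct percpu_q {
	int load;
} __attribute__((aligned(CACHE_LINE)));

static struct percpu_q qs[MAXCPU_X];

int
main(void)
{
	printf("slot stride: %zu bytes\n", sizeof(qs[0]));	/* 64 */
	qs[0].load++;		/* touches only CPU 0's line */
	return (0);
}
#endif
/*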
*/ static struct tdq tdq_cpu[MAXCPU]; static struct tdq *balance_tdq; static int balance_ticks; static DPCPU_DEFINE(uint32_t, randomval); #define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)]) #define TDQ_CPU(x) (&tdq_cpu[(x)]) #define TDQ_ID(x) ((int)((x) - tdq_cpu)) #else /* !SMP */ static struct tdq tdq_cpu; #define TDQ_ID(x) (0) #define TDQ_SELF() (&tdq_cpu) #define TDQ_CPU(x) (&tdq_cpu) #endif #define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type)) #define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t))) #define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f)) #define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t))) #define TDQ_LOCKPTR(t) ((struct mtx *)(&(t)->tdq_lock)) static void sched_priority(struct thread *); static void sched_thread_priority(struct thread *, u_char); static int sched_interact_score(struct thread *); static void sched_interact_update(struct thread *); static void sched_interact_fork(struct thread *); static void sched_pctcpu_update(struct td_sched *, int); /* Operations on per processor queues */ static struct thread *tdq_choose(struct tdq *); static void tdq_setup(struct tdq *); static void tdq_load_add(struct tdq *, struct thread *); static void tdq_load_rem(struct tdq *, struct thread *); static __inline void tdq_runq_add(struct tdq *, struct thread *, int); static __inline void tdq_runq_rem(struct tdq *, struct thread *); static inline int sched_shouldpreempt(int, int, int); void tdq_print(int cpu); static void runq_print(struct runq *rq); static void tdq_add(struct tdq *, struct thread *, int); #ifdef SMP static int tdq_move(struct tdq *, struct tdq *); static int tdq_idled(struct tdq *); static void tdq_notify(struct tdq *, struct thread *); static struct thread *tdq_steal(struct tdq *, int); static struct thread *runq_steal(struct runq *, int); static int sched_pickcpu(struct thread *, int); static void sched_balance(void); static int sched_balance_pair(struct tdq *, struct tdq *); static inline struct tdq *sched_setcpu(struct thread *, int, int); static inline void thread_unblock_switch(struct thread *, struct mtx *); static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int); static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS); static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg, int indent); #endif static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL); static void sched_initticks(void *dummy); SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL); SDT_PROVIDER_DEFINE(sched); SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *", "struct proc *", "uint8_t"); SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *", "struct proc *", "void *"); SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *", "struct proc *", "void *", "int"); SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *", "struct proc *", "uint8_t", "struct thread *"); SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int"); SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *", "struct proc *"); SDT_PROBE_DEFINE(sched, , , on__cpu); SDT_PROBE_DEFINE(sched, , , remain__cpu); SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *", "struct proc *"); /* * Print the threads waiting on a run-queue. 
*/ static void runq_print(struct runq *rq) { struct rqhead *rqh; struct thread *td; int pri; int j; int i; for (i = 0; i < RQB_LEN; i++) { printf("\t\trunq bits %d 0x%zx\n", i, rq->rq_status.rqb_bits[i]); for (j = 0; j < RQB_BPW; j++) if (rq->rq_status.rqb_bits[i] & (1ul << j)) { pri = j + (i << RQB_L2BPW); rqh = &rq->rq_queues[pri]; TAILQ_FOREACH(td, rqh, td_runq) { printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n", td, td->td_name, td->td_priority, td->td_rqindex, pri); } } } } /* * Print the status of a per-cpu thread queue. Should be a ddb show cmd. */ void tdq_print(int cpu) { struct tdq *tdq; tdq = TDQ_CPU(cpu); printf("tdq %d:\n", TDQ_ID(tdq)); printf("\tlock %p\n", TDQ_LOCKPTR(tdq)); printf("\tLock name: %s\n", tdq->tdq_name); printf("\tload: %d\n", tdq->tdq_load); printf("\tswitch cnt: %d\n", tdq->tdq_switchcnt); printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt); printf("\ttimeshare idx: %d\n", tdq->tdq_idx); printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); printf("\tload transferable: %d\n", tdq->tdq_transferable); printf("\tlowest priority: %d\n", tdq->tdq_lowpri); printf("\trealtime runq:\n"); runq_print(&tdq->tdq_realtime); printf("\ttimeshare runq:\n"); runq_print(&tdq->tdq_timeshare); printf("\tidle runq:\n"); runq_print(&tdq->tdq_idle); } static inline int sched_shouldpreempt(int pri, int cpri, int remote) { /* * If the new priority is not better than the current priority there is * nothing to do. */ if (pri >= cpri) return (0); /* * Always preempt idle. */ if (cpri >= PRI_MIN_IDLE) return (1); /* * If preemption is disabled don't preempt others. */ if (preempt_thresh == 0) return (0); /* * Preempt if we exceed the threshold. */ if (pri <= preempt_thresh) return (1); /* * If we're interactive or better and there is non-interactive * or worse running preempt only remote processors. */ if (remote && pri <= PRI_MAX_INTERACT && cpri > PRI_MAX_INTERACT) return (1); return (0); } /* * Add a thread to the actual run-queue. Keeps transferable counts up to * date with what is actually on the run-queue. Selects the correct * queue position for timeshare threads. */ static __inline void tdq_runq_add(struct tdq *tdq, struct thread *td, int flags) { struct td_sched *ts; u_char pri; TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); pri = td->td_priority; ts = td_get_sched(td); TD_SET_RUNQ(td); if (THREAD_CAN_MIGRATE(td)) { tdq->tdq_transferable++; ts->ts_flags |= TSF_XFERABLE; } if (pri < PRI_MIN_BATCH) { ts->ts_runq = &tdq->tdq_realtime; } else if (pri <= PRI_MAX_BATCH) { ts->ts_runq = &tdq->tdq_timeshare; KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH, ("Invalid priority %d on timeshare runq", pri)); /* * This queue contains only priorities between MIN and MAX * realtime. Use the whole queue to represent these values. */ if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) { pri = RQ_NQS * (pri - PRI_MIN_BATCH) / PRI_BATCH_RANGE; pri = (pri + tdq->tdq_idx) % RQ_NQS; /* * This effectively shortens the queue by one so we * can have a one slot difference between idx and * ridx while we wait for threads to drain. */ if (tdq->tdq_ridx != tdq->tdq_idx && pri == tdq->tdq_ridx) pri = (unsigned char)(pri - 1) % RQ_NQS; } else pri = tdq->tdq_ridx; runq_add_pri(ts->ts_runq, td, pri, flags); return; } else ts->ts_runq = &tdq->tdq_idle; runq_add(ts->ts_runq, td, flags); } /* * Remove a thread from a run-queue. This typically happens when a thread * is selected to run. Running threads are not on the queue and the * transferable count does not reflect them. 
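 *
 * (tdq_runq_add() above maps a batch priority into one of RQ_NQS
 * buckets and rotates it by tdq_idx, so the timeshare queue behaves
 * like a calendar drained from tdq_ridx.  A standalone sketch of that
 * index arithmetic; NQS, MIN_BATCH and BATCH_RANGE stand in for the
 * kernel constants and the sample priority is arbitrary.)
 */
#if 0	/* Illustrative sketch only; never compiled. */
#include <stdio.h>

#define NQS		64	/* number of buckets (RQ_NQS) */
#define MIN_BATCH	88	/* illustrative PRI_MIN_BATCH */
#define BATCH_RANGE	48	/* illustrative PRI_BATCH_RANGE */

/* Scale the priority into a bucket, then rotate by the queue's current
 * insertion index so later arrivals land further from the drain head. */
static int
bucket(int pri, int idx)
{
	int b = NQS * (pri - MIN_BATCH) / BATCH_RANGE;

	return ((b + idx) % NQS);
}

int
main(void)
{
	/* The same priority inserted as the head advances: */
	printf("%d %d %d\n", bucket(100, 0), bucket(100, 8), bucket(100, 16));
	return (0);
}
#endif
/*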
*/ static __inline void tdq_runq_rem(struct tdq *tdq, struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT(ts->ts_runq != NULL, ("tdq_runq_remove: thread %p null ts_runq", td)); if (ts->ts_flags & TSF_XFERABLE) { tdq->tdq_transferable--; ts->ts_flags &= ~TSF_XFERABLE; } if (ts->ts_runq == &tdq->tdq_timeshare) { if (tdq->tdq_idx != tdq->tdq_ridx) runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx); else runq_remove_idx(ts->ts_runq, td, NULL); } else runq_remove(ts->ts_runq, td); } /* * Load is maintained for all threads RUNNING and ON_RUNQ. Add the load * for this thread to the referenced thread queue. */ static void tdq_load_add(struct tdq *tdq, struct thread *td) { TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); tdq->tdq_load++; if ((td->td_flags & TDF_NOLOAD) == 0) tdq->tdq_sysload++; KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load); SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load); } /* * Remove the load from a thread that is transitioning to a sleep state or * exiting. */ static void tdq_load_rem(struct tdq *tdq, struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT(tdq->tdq_load != 0, ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq))); tdq->tdq_load--; if ((td->td_flags & TDF_NOLOAD) == 0) tdq->tdq_sysload--; KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load); SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load); } /* * Bound timeshare latency by decreasing slice size as load increases. We * consider the maximum latency as the sum of the threads waiting to run * aside from curthread and target no more than sched_slice latency but * no less than sched_slice_min runtime. */ static inline int tdq_slice(struct tdq *tdq) { int load; /* * It is safe to use sys_load here because this is called from * contexts where timeshare threads are running and so there * cannot be higher priority load in the system. */ load = tdq->tdq_sysload - 1; if (load >= SCHED_SLICE_MIN_DIVISOR) return (sched_slice_min); if (load <= 1) return (sched_slice); return (sched_slice / load); } /* * Set lowpri to its exact value by searching the run-queue and * evaluating curthread. curthread may be passed as an optimization. */ static void tdq_setlowpri(struct tdq *tdq, struct thread *ctd) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); if (ctd == NULL) ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread; td = tdq_choose(tdq); if (td == NULL || td->td_priority > ctd->td_priority) tdq->tdq_lowpri = ctd->td_priority; else tdq->tdq_lowpri = td->td_priority; } #ifdef SMP /* * We need some randomness. Implement a classic Linear Congruential * Generator X_{n+1}=(aX_n+c) mod m. These values are optimized for * m = 2^32, a = 69069 and c = 5. We only return the upper 16 bits * of the random state (in the low bits of our answer) to keep * the maximum randomness. */ static uint32_t sched_random(void) { uint32_t *rndptr; rndptr = DPCPU_PTR(randomval); *rndptr = *rndptr * 69069 + 5; return (*rndptr >> 16); } struct cpu_search { cpuset_t cs_mask; u_int cs_prefer; int cs_pri; /* Min priority for low. */ int cs_limit; /* Max load for low, min load for high. 
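 *
 * (sched_random() above is exactly the LCG described in its comment,
 * X_{n+1} = (69069 * X_n + 5) mod 2^32; the low-order bits of an LCG
 * have short periods, hence only the upper 16 bits are returned.  A
 * standalone sketch with a single state word instead of the per-CPU
 * variable.)
 */
#if 0	/* Illustrative sketch only; never compiled. */
#include <stdint.h>
#include <stdio.h>

static uint32_t state = 1;	/* per-CPU in the kernel; one suffices here */

static uint32_t
lcg16(void)
{
	state = state * 69069 + 5;	/* mod 2^32 is implicit */
	return (state >> 16);		/* keep only the high-quality bits */
}

int
main(void)
{
	for (int i = 0; i < 4; i++)
		printf("%u\n", lcg16());
	return (0);
}
#endif
/*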
*/ int cs_cpu; int cs_load; }; #define CPU_SEARCH_LOWEST 0x1 #define CPU_SEARCH_HIGHEST 0x2 #define CPU_SEARCH_BOTH (CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST) #define CPUSET_FOREACH(cpu, mask) \ for ((cpu) = 0; (cpu) <= mp_maxid; (cpu)++) \ if (CPU_ISSET(cpu, &mask)) static __always_inline int cpu_search(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high, const int match); int __noinline cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low); int __noinline cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high); int __noinline cpu_search_both(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high); /* * Search the tree of cpu_groups for the lowest or highest loaded cpu * according to the match argument. This routine actually compares the * load on all paths through the tree and finds the least loaded cpu on * the least loaded path, which may differ from the least loaded cpu in * the system. This balances work among caches and buses. * * This inline is instantiated in three forms below using constants for the * match argument. It is reduced to the minimum set for each case. It is * also recursive to the depth of the tree. */ static __always_inline int cpu_search(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high, const int match) { struct cpu_search lgroup; struct cpu_search hgroup; cpuset_t cpumask; struct cpu_group *child; struct tdq *tdq; int cpu, i, hload, lload, load, total, rnd; total = 0; cpumask = cg->cg_mask; if (match & CPU_SEARCH_LOWEST) { lload = INT_MAX; lgroup = *low; } if (match & CPU_SEARCH_HIGHEST) { hload = INT_MIN; hgroup = *high; } /* Iterate through the child CPU groups and then remaining CPUs. */ for (i = cg->cg_children, cpu = mp_maxid; ; ) { if (i == 0) { #ifdef HAVE_INLINE_FFSL cpu = CPU_FFS(&cpumask) - 1; #else while (cpu >= 0 && !CPU_ISSET(cpu, &cpumask)) cpu--; #endif if (cpu < 0) break; child = NULL; } else child = &cg->cg_child[i - 1]; if (match & CPU_SEARCH_LOWEST) lgroup.cs_cpu = -1; if (match & CPU_SEARCH_HIGHEST) hgroup.cs_cpu = -1; if (child) { /* Handle child CPU group. */ CPU_NAND(&cpumask, &child->cg_mask); switch (match) { case CPU_SEARCH_LOWEST: load = cpu_search_lowest(child, &lgroup); break; case CPU_SEARCH_HIGHEST: load = cpu_search_highest(child, &hgroup); break; case CPU_SEARCH_BOTH: load = cpu_search_both(child, &lgroup, &hgroup); break; } } else { /* Handle child CPU. */ CPU_CLR(cpu, &cpumask); tdq = TDQ_CPU(cpu); load = tdq->tdq_load * 256; rnd = sched_random() % 32; if (match & CPU_SEARCH_LOWEST) { if (cpu == low->cs_prefer) load -= 64; /* If that CPU is allowed and get data. */ if (tdq->tdq_lowpri > lgroup.cs_pri && tdq->tdq_load <= lgroup.cs_limit && CPU_ISSET(cpu, &lgroup.cs_mask)) { lgroup.cs_cpu = cpu; lgroup.cs_load = load - rnd; } } if (match & CPU_SEARCH_HIGHEST) if (tdq->tdq_load >= hgroup.cs_limit && tdq->tdq_transferable && CPU_ISSET(cpu, &hgroup.cs_mask)) { hgroup.cs_cpu = cpu; hgroup.cs_load = load - rnd; } } total += load; /* We have info about child item. Compare it. 
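 *
 * (As the function header says, the recursion tracks the least loaded
 * cpu on the least loaded path, which need not be the globally least
 * loaded cpu.  A toy standalone sketch of that policy over a made-up
 * two-package topology, flattened to one level.)
 */
#if 0	/* Illustrative sketch only; never compiled. */
#include <stdio.h>

struct grp {
	int load[2];	/* loads of the two CPUs in this package */
};

/* Pick the lighter CPU inside the lighter package. */
static int
best_cpu(const struct grp *g, int ngrp)
{
	int besttotal = -1, bestcpu = -1;

	for (int i = 0; i < ngrp; i++) {
		int total = g[i].load[0] + g[i].load[1];
		int leaf = g[i].load[0] <= g[i].load[1] ? 0 : 1;

		if (besttotal == -1 || total < besttotal) {
			besttotal = total;
			bestcpu = i * 2 + leaf;
		}
	}
	return (bestcpu);
}

int
main(void)
{
	/* CPU 1 (load 0) is globally idlest, but its package totals 9;
	 * the path-based search prefers CPU 2 (load 2) in the lighter
	 * package, spreading work among caches and buses. */
	struct grp g[2] = { { { 9, 0 } }, { { 2, 3 } } };

	printf("chose cpu %d\n", best_cpu(g, 2));	/* 2 */
	return (0);
}
#endif
/*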
*/ if (match & CPU_SEARCH_LOWEST) { if (lgroup.cs_cpu >= 0 && (load < lload || (load == lload && lgroup.cs_load < low->cs_load))) { lload = load; low->cs_cpu = lgroup.cs_cpu; low->cs_load = lgroup.cs_load; } } if (match & CPU_SEARCH_HIGHEST) if (hgroup.cs_cpu >= 0 && (load > hload || (load == hload && hgroup.cs_load > high->cs_load))) { hload = load; high->cs_cpu = hgroup.cs_cpu; high->cs_load = hgroup.cs_load; } if (child) { i--; if (i == 0 && CPU_EMPTY(&cpumask)) break; } #ifndef HAVE_INLINE_FFSL else cpu--; #endif } return (total); } /* * cpu_search instantiations must pass constants to maintain the inline * optimization. */ int cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low) { return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST); } int cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high) { return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST); } int cpu_search_both(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high) { return cpu_search(cg, low, high, CPU_SEARCH_BOTH); } /* * Find the cpu with the least load via the least loaded path that has a * lowpri greater than pri. A pri of -1 indicates any priority is * acceptable. */ static inline int sched_lowest(const struct cpu_group *cg, cpuset_t mask, int pri, int maxload, int prefer) { struct cpu_search low; low.cs_cpu = -1; low.cs_prefer = prefer; low.cs_mask = mask; low.cs_pri = pri; low.cs_limit = maxload; cpu_search_lowest(cg, &low); return low.cs_cpu; } /* * Find the cpu with the highest load via the highest loaded path. */ static inline int sched_highest(const struct cpu_group *cg, cpuset_t mask, int minload) { struct cpu_search high; high.cs_cpu = -1; high.cs_mask = mask; high.cs_limit = minload; cpu_search_highest(cg, &high); return high.cs_cpu; } static void sched_balance_group(struct cpu_group *cg) { cpuset_t hmask, lmask; int high, low, anylow; CPU_FILL(&hmask); for (;;) { high = sched_highest(cg, hmask, 1); /* Stop if there is no CPU left with transferable threads. */ if (high == -1) break; CPU_CLR(high, &hmask); CPU_COPY(&hmask, &lmask); /* Stop if there is no CPU left for low. */ if (CPU_EMPTY(&lmask)) break; anylow = 1; nextlow: low = sched_lowest(cg, lmask, -1, TDQ_CPU(high)->tdq_load - 1, high); /* Stop if we looked everywhere and found no less loaded CPU. */ if (anylow && low == -1) break; /* Go to next high if we found no less loaded CPU. */ if (low == -1) continue; /* Transfer thread from high to low. */ if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low))) { /* CPU that got thread can no longer be a donor. */ CPU_CLR(low, &hmask); } else { /* * If we failed, then there are no threads on high * that can run on this low. Drop low from the low * mask and look for a different one. */ CPU_CLR(low, &lmask); anylow = 0; goto nextlow; } } } static void sched_balance(void) { struct tdq *tdq; if (smp_started == 0 || rebalance == 0) return; balance_ticks = max(balance_interval / 2, 1) + (sched_random() % balance_interval); tdq = TDQ_SELF(); TDQ_UNLOCK(tdq); sched_balance_group(cpu_top); TDQ_LOCK(tdq); } /* * Lock two thread queues using their address to maintain lock order. */ static void tdq_lock_pair(struct tdq *one, struct tdq *two) { if (one < two) { TDQ_LOCK(one); TDQ_LOCK_FLAGS(two, MTX_DUPOK); } else { TDQ_LOCK(two); TDQ_LOCK_FLAGS(one, MTX_DUPOK); } } /* * Unlock two thread queues. Order is not important here.
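 *
 * (tdq_lock_pair() above takes the lower-addressed lock first, so any
 * two CPUs locking the same pair agree on the order and cannot
 * deadlock against each other.  A standalone pthread sketch of the
 * same discipline; the mutex names are hypothetical.)
 */
#if 0	/* Illustrative sketch only; never compiled. */
#include <pthread.h>

/* Always lock the lower address first: both parties to any pair pick
 * the same order, which rules out the classic AB/BA deadlock. */
static void
lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void
unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	/* Unlock order is irrelevant, as the comment below notes. */
	pthread_mutex_unlock(a);
	pthread_mutex_unlock(b);
}

static pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

int
main(void)
{
	lock_pair(&m1, &m2);
	unlock_pair(&m1, &m2);
	return (0);
}
#endif
/*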
*/ static void tdq_unlock_pair(struct tdq *one, struct tdq *two) { TDQ_UNLOCK(one); TDQ_UNLOCK(two); } /* * Transfer load between two imbalanced thread queues. */ static int sched_balance_pair(struct tdq *high, struct tdq *low) { int moved; int cpu; tdq_lock_pair(high, low); moved = 0; /* * Determine what the imbalance is and then adjust that to how many * threads we actually have to give up (transferable). */ if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load && (moved = tdq_move(high, low)) > 0) { /* * In case the target isn't the current cpu IPI it to force a * reschedule with the new workload. */ cpu = TDQ_ID(low); if (cpu != PCPU_GET(cpuid)) ipi_cpu(cpu, IPI_PREEMPT); } tdq_unlock_pair(high, low); return (moved); } /* * Move a thread from one thread queue to another. */ static int tdq_move(struct tdq *from, struct tdq *to) { struct td_sched *ts; struct thread *td; struct tdq *tdq; int cpu; TDQ_LOCK_ASSERT(from, MA_OWNED); TDQ_LOCK_ASSERT(to, MA_OWNED); tdq = from; cpu = TDQ_ID(to); td = tdq_steal(tdq, cpu); if (td == NULL) return (0); ts = td_get_sched(td); /* * Although the run queue is locked the thread may be blocked. Lock * it to clear this and acquire the run-queue lock. */ thread_lock(td); /* Drop recursive lock on from acquired via thread_lock(). */ TDQ_UNLOCK(from); sched_rem(td); ts->ts_cpu = cpu; td->td_lock = TDQ_LOCKPTR(to); tdq_add(to, td, SRQ_YIELDING); return (1); } /* * This tdq has idled. Try to steal a thread from another cpu and switch * to it. */ static int tdq_idled(struct tdq *tdq) { struct cpu_group *cg; struct tdq *steal; cpuset_t mask; int thresh; int cpu; if (smp_started == 0 || steal_idle == 0) return (1); CPU_FILL(&mask); CPU_CLR(PCPU_GET(cpuid), &mask); /* We don't want to be preempted while we're iterating. */ spinlock_enter(); for (cg = tdq->tdq_cg; cg != NULL; ) { if ((cg->cg_flags & CG_FLAG_THREAD) == 0) thresh = steal_thresh; else thresh = 1; cpu = sched_highest(cg, mask, thresh); if (cpu == -1) { cg = cg->cg_parent; continue; } steal = TDQ_CPU(cpu); CPU_CLR(cpu, &mask); tdq_lock_pair(tdq, steal); if (steal->tdq_load < thresh || steal->tdq_transferable == 0) { tdq_unlock_pair(tdq, steal); continue; } /* * If a thread was added while interrupts were disabled don't * steal one here. If we fail to acquire one due to affinity * restrictions loop again with this cpu removed from the * set. */ if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) { tdq_unlock_pair(tdq, steal); continue; } spinlock_exit(); TDQ_UNLOCK(steal); mi_switch(SW_VOL | SWT_IDLE, NULL); thread_unlock(curthread); return (0); } spinlock_exit(); return (1); } /* * Notify a remote cpu of new work. Sends an IPI if criteria are met. */ static void tdq_notify(struct tdq *tdq, struct thread *td) { struct thread *ctd; int pri; int cpu; if (tdq->tdq_ipipending) return; cpu = td_get_sched(td)->ts_cpu; pri = td->td_priority; ctd = pcpu_find(cpu)->pc_curthread; if (!sched_shouldpreempt(pri, ctd->td_priority, 1)) return; /* * Make sure that our caller's earlier update to tdq_load is * globally visible before we read tdq_cpu_idle. Idle thread * accesses both of them without locks, and the order is important. */ atomic_thread_fence_seq_cst(); if (TD_IS_IDLETHREAD(ctd)) { /* * If the MD code has an idle wakeup routine try that before * falling back to IPI. */ if (!tdq->tdq_cpu_idle || cpu_idle_wakeup(cpu)) return; } tdq->tdq_ipipending = 1; ipi_cpu(cpu, IPI_PREEMPT); } /* * Steals load from a timeshare queue. Honors the rotating queue head * index. 
*/ static struct thread * runq_steal_from(struct runq *rq, int cpu, u_char start) { struct rqbits *rqb; struct rqhead *rqh; struct thread *td, *first; int bit; int i; rqb = &rq->rq_status; bit = start & (RQB_BPW -1); first = NULL; again: for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) { if (rqb->rqb_bits[i] == 0) continue; if (bit == 0) bit = RQB_FFS(rqb->rqb_bits[i]); for (; bit < RQB_BPW; bit++) { if ((rqb->rqb_bits[i] & (1ul << bit)) == 0) continue; rqh = &rq->rq_queues[bit + (i << RQB_L2BPW)]; TAILQ_FOREACH(td, rqh, td_runq) { if (first && THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, cpu)) return (td); first = td; } } } if (start != 0) { start = 0; goto again; } if (first && THREAD_CAN_MIGRATE(first) && THREAD_CAN_SCHED(first, cpu)) return (first); return (NULL); } /* * Steals load from a standard linear queue. */ static struct thread * runq_steal(struct runq *rq, int cpu) { struct rqhead *rqh; struct rqbits *rqb; struct thread *td; int word; int bit; rqb = &rq->rq_status; for (word = 0; word < RQB_LEN; word++) { if (rqb->rqb_bits[word] == 0) continue; for (bit = 0; bit < RQB_BPW; bit++) { if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) continue; rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; TAILQ_FOREACH(td, rqh, td_runq) if (THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, cpu)) return (td); } } return (NULL); } /* * Attempt to steal a thread in priority order from a thread queue. */ static struct thread * tdq_steal(struct tdq *tdq, int cpu) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL) return (td); if ((td = runq_steal_from(&tdq->tdq_timeshare, cpu, tdq->tdq_ridx)) != NULL) return (td); return (runq_steal(&tdq->tdq_idle, cpu)); } /* * Sets the thread lock and ts_cpu to match the requested cpu. Unlocks the * current lock and returns with the assigned queue locked. */ static inline struct tdq * sched_setcpu(struct thread *td, int cpu, int flags) { struct tdq *tdq; THREAD_LOCK_ASSERT(td, MA_OWNED); tdq = TDQ_CPU(cpu); td_get_sched(td)->ts_cpu = cpu; /* * If the lock matches just return the queue. */ if (td->td_lock == TDQ_LOCKPTR(tdq)) return (tdq); #ifdef notyet /* * If the thread isn't running its lockptr is a * turnstile or a sleepqueue. We can just lock_set without * blocking. */ if (TD_CAN_RUN(td)) { TDQ_LOCK(tdq); thread_lock_set(td, TDQ_LOCKPTR(tdq)); return (tdq); } #endif /* * The hard case, migration, we need to block the thread first to * prevent order reversals with other cpus locks. */ spinlock_enter(); thread_lock_block(td); TDQ_LOCK(tdq); thread_lock_unblock(td, TDQ_LOCKPTR(tdq)); spinlock_exit(); return (tdq); } SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding"); SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity"); SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity"); SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load"); SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu"); SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration"); static int sched_pickcpu(struct thread *td, int flags) { struct cpu_group *cg, *ccg; struct td_sched *ts; struct tdq *tdq; cpuset_t mask; int cpu, pri, self; self = PCPU_GET(cpuid); ts = td_get_sched(td); KASSERT(!CPU_ABSENT(ts->ts_cpu), ("sched_pickcpu: Start scheduler on " "absent CPU %d for thread %s.", ts->ts_cpu, td->td_name)); if (smp_started == 0) return (self); /* * Don't migrate a running thread from sched_switch(). 
*/ if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td)) return (ts->ts_cpu); /* * Prefer to run interrupt threads on the processors that generate * the interrupt. */ pri = td->td_priority; if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) && curthread->td_intr_nesting_level && ts->ts_cpu != self) { SCHED_STAT_INC(pickcpu_intrbind); ts->ts_cpu = self; if (TDQ_CPU(self)->tdq_lowpri > pri) { SCHED_STAT_INC(pickcpu_affinity); return (ts->ts_cpu); } } /* * If the thread can run on the last cpu and the affinity has not * expired or it is idle run it there. */ tdq = TDQ_CPU(ts->ts_cpu); cg = tdq->tdq_cg; if (THREAD_CAN_SCHED(td, ts->ts_cpu) && tdq->tdq_lowpri >= PRI_MIN_IDLE && SCHED_AFFINITY(ts, CG_SHARE_L2)) { if (cg->cg_flags & CG_FLAG_THREAD) { CPUSET_FOREACH(cpu, cg->cg_mask) { if (TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) break; } } else cpu = INT_MAX; if (cpu > mp_maxid) { SCHED_STAT_INC(pickcpu_idle_affinity); return (ts->ts_cpu); } } /* * Search for the last level cache CPU group in the tree. * Skip caches with expired affinity time and SMT groups. * Affinity to higher level caches will be handled less aggressively. */ for (ccg = NULL; cg != NULL; cg = cg->cg_parent) { if (cg->cg_flags & CG_FLAG_THREAD) continue; if (!SCHED_AFFINITY(ts, cg->cg_level)) continue; ccg = cg; } if (ccg != NULL) cg = ccg; cpu = -1; /* Search the group for the less loaded idle CPU we can run now. */ mask = td->td_cpuset->cs_mask; if (cg != NULL && cg != cpu_top && CPU_CMP(&cg->cg_mask, &cpu_top->cg_mask) != 0) cpu = sched_lowest(cg, mask, max(pri, PRI_MAX_TIMESHARE), INT_MAX, ts->ts_cpu); /* Search globally for the less loaded CPU we can run now. */ if (cpu == -1) cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu); /* Search globally for the less loaded CPU. */ if (cpu == -1) cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu); KASSERT(cpu != -1, ("sched_pickcpu: Failed to find a cpu.")); KASSERT(!CPU_ABSENT(cpu), ("sched_pickcpu: Picked absent CPU %d.", cpu)); /* * Compare the lowest loaded cpu to current cpu. */ if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri && TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE && TDQ_CPU(self)->tdq_load <= TDQ_CPU(cpu)->tdq_load + 1) { SCHED_STAT_INC(pickcpu_local); cpu = self; } else SCHED_STAT_INC(pickcpu_lowest); if (cpu != ts->ts_cpu) SCHED_STAT_INC(pickcpu_migration); return (cpu); } #endif /* * Pick the highest priority task we have and return it. */ static struct thread * tdq_choose(struct tdq *tdq) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); td = runq_choose(&tdq->tdq_realtime); if (td != NULL) return (td); td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx); if (td != NULL) { KASSERT(td->td_priority >= PRI_MIN_BATCH, ("tdq_choose: Invalid priority on timeshare queue %d", td->td_priority)); return (td); } td = runq_choose(&tdq->tdq_idle); if (td != NULL) { KASSERT(td->td_priority >= PRI_MIN_IDLE, ("tdq_choose: Invalid priority on idle queue %d", td->td_priority)); return (td); } return (NULL); } /* * Initialize a thread queue. 
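 *
 * (tdq_choose() above consults the three queues strictly in order:
 * realtime, then timeshare at the rotating ridx, then idle.  A
 * compressed standalone sketch of that fall-through policy with the
 * queues reduced to plain counters.)
 */
#if 0	/* Illustrative sketch only; never compiled. */
#include <stdio.h>

/* Pick from the first non-empty tier; a lower tier only ever runs
 * when every higher tier is empty. */
static const char *
choose(int nrealtime, int ntimeshare, int nidle)
{
	if (nrealtime > 0)
		return ("realtime");
	if (ntimeshare > 0)
		return ("timeshare");	/* drained from the ridx bucket */
	if (nidle > 0)
		return ("idle");
	return (NULL);
}

int
main(void)
{
	printf("%s\n", choose(0, 3, 1));	/* timeshare */
	return (0);
}
#endif
/*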
*/ static void tdq_setup(struct tdq *tdq) { if (bootverbose) printf("ULE: setup cpu %d\n", TDQ_ID(tdq)); runq_init(&tdq->tdq_realtime); runq_init(&tdq->tdq_timeshare); runq_init(&tdq->tdq_idle); snprintf(tdq->tdq_name, sizeof(tdq->tdq_name), "sched lock %d", (int)TDQ_ID(tdq)); mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock", MTX_SPIN | MTX_RECURSE); #ifdef KTR snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname), "CPU %d load", (int)TDQ_ID(tdq)); #endif } #ifdef SMP static void sched_setup_smp(void) { struct tdq *tdq; int i; cpu_top = smp_topo(); CPU_FOREACH(i) { tdq = TDQ_CPU(i); tdq_setup(tdq); tdq->tdq_cg = smp_topo_find(cpu_top, i); if (tdq->tdq_cg == NULL) panic("Can't find cpu group for %d\n", i); } balance_tdq = TDQ_SELF(); sched_balance(); } #endif /* * Setup the thread queues and initialize the topology based on MD * information. */ static void sched_setup(void *dummy) { struct tdq *tdq; tdq = TDQ_SELF(); #ifdef SMP sched_setup_smp(); #else tdq_setup(tdq); #endif /* Add thread0's load since it's running. */ TDQ_LOCK(tdq); td_get_sched(&thread0)->ts_cpu = curcpu; /* Something valid to start */ thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF()); tdq_load_add(tdq, &thread0); tdq->tdq_lowpri = thread0.td_priority; TDQ_UNLOCK(tdq); } /* * This routine determines time constants after stathz and hz are setup. */ /* ARGSUSED */ static void sched_initticks(void *dummy) { int incr; realstathz = stathz ? stathz : hz; sched_slice = realstathz / SCHED_SLICE_DEFAULT_DIVISOR; sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR; hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); /* * tickincr is shifted out by 10 to avoid rounding errors due to * hz not being evenly divisible by stathz on all platforms. */ incr = (hz << SCHED_TICK_SHIFT) / realstathz; /* * This does not work for values of stathz that are more than * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen. */ if (incr == 0) incr = 1; tickincr = incr; #ifdef SMP /* * Set the default balance interval now that we know * what realstathz is. */ balance_interval = realstathz; affinity = SCHED_AFFINITY_DEFAULT; #endif if (sched_idlespinthresh < 0) sched_idlespinthresh = 2 * max(10000, 6 * hz) / realstathz; } /* * This is the core of the interactivity algorithm. Determines a score based * on past behavior. It is the ratio of sleep time to run time scaled to * a [0, 100] integer. This is the voluntary sleep time of a process, which * differs from the cpu usage because it does not account for time spent * waiting on a run-queue. Would be prettier if we had floating point. * * When a thread's sleep time is greater than its run time the * calculation is: * * scaling factor * interactivity score = --------------------- * sleep time / run time * * * When a thread's run time is greater than its sleep time the * calculation is: * * scaling factor * interactivity score = --------------------- + scaling factor * run time / sleep time */ static int sched_interact_score(struct thread *td) { struct td_sched *ts; int div; ts = td_get_sched(td); /* * The score is only needed if this is likely to be an interactive * task. Don't go through the expense of computing it if there's * no chance. 
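 *
 * (A worked standalone example of the two formulas from the function
 * header, using MAX = 100 and HALF = 50 as in the macros above; the
 * tick counts are made up.)
 */
#if 0	/* Illustrative sketch only; never compiled. */
#include <stdio.h>

#define MAX	100
#define HALF	(MAX / 2)

static int
score(unsigned run, unsigned slp)
{
	unsigned div;

	if (run > slp) {	/* CPU hog: upper half of the scale */
		div = run / HALF > 1 ? run / HALF : 1;
		return (HALF + (HALF - slp / div));
	}
	if (slp > run) {	/* sleeper: lower half of the scale */
		div = slp / HALF > 1 ? slp / HALF : 1;
		return (run / div);
	}
	return (run ? HALF : 0);
}

int
main(void)
{
	printf("%d\n", score(100, 400));	/* sleeps 4x as much: 12 */
	printf("%d\n", score(400, 100));	/* runs 4x as much: 88 */
	return (0);
}
#endif
/*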
*/ if (sched_interact <= SCHED_INTERACT_HALF && ts->ts_runtime >= ts->ts_slptime) return (SCHED_INTERACT_HALF); if (ts->ts_runtime > ts->ts_slptime) { div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF); return (SCHED_INTERACT_HALF + (SCHED_INTERACT_HALF - (ts->ts_slptime / div))); } if (ts->ts_slptime > ts->ts_runtime) { div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF); return (ts->ts_runtime / div); } /* runtime == slptime */ if (ts->ts_runtime) return (SCHED_INTERACT_HALF); /* * This can happen if slptime and runtime are 0. */ return (0); } /* * Scale the scheduling priority according to the "interactivity" of this * process. */ static void sched_priority(struct thread *td) { int score; int pri; if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE) return; /* * If the score is interactive we place the thread in the realtime * queue with a priority that is less than kernel and interrupt * priorities. These threads are not subject to nice restrictions. * * Scores greater than this are placed on the normal timeshare queue * where the priority is partially decided by the most recent cpu * utilization and the rest is decided by nice value. * * The nice value of the process has a linear effect on the calculated * score. Negative nice values make it easier for a thread to be * considered interactive. */ score = imax(0, sched_interact_score(td) + td->td_proc->p_nice); if (score < sched_interact) { pri = PRI_MIN_INTERACT; pri += ((PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) / sched_interact) * score; KASSERT(pri >= PRI_MIN_INTERACT && pri <= PRI_MAX_INTERACT, ("sched_priority: invalid interactive priority %d score %d", pri, score)); } else { pri = SCHED_PRI_MIN; if (td_get_sched(td)->ts_ticks) pri += min(SCHED_PRI_TICKS(td_get_sched(td)), SCHED_PRI_RANGE - 1); pri += SCHED_PRI_NICE(td->td_proc->p_nice); KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH, ("sched_priority: invalid priority %d: nice %d, " "ticks %d ftick %d ltick %d tick pri %d", pri, td->td_proc->p_nice, td_get_sched(td)->ts_ticks, td_get_sched(td)->ts_ftick, td_get_sched(td)->ts_ltick, SCHED_PRI_TICKS(td_get_sched(td)))); } sched_user_prio(td, pri); return; } /* * This routine enforces a maximum limit on the amount of scheduling history * kept. It is called after either the slptime or runtime is adjusted. This * function is ugly due to integer math. */ static void sched_interact_update(struct thread *td) { struct td_sched *ts; u_int sum; ts = td_get_sched(td); sum = ts->ts_runtime + ts->ts_slptime; if (sum < SCHED_SLP_RUN_MAX) return; /* * This only happens from two places: * 1) We have added an unusual amount of run time from fork_exit. * 2) We have added an unusual amount of sleep time from sched_sleep(). */ if (sum > SCHED_SLP_RUN_MAX * 2) { if (ts->ts_runtime > ts->ts_slptime) { ts->ts_runtime = SCHED_SLP_RUN_MAX; ts->ts_slptime = 1; } else { ts->ts_slptime = SCHED_SLP_RUN_MAX; ts->ts_runtime = 1; } return; } /* * If we have exceeded by more than 1/5th then the algorithm below * will not bring us back into range. Dividing by two here forces * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] */ if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { ts->ts_runtime /= 2; ts->ts_slptime /= 2; return; } ts->ts_runtime = (ts->ts_runtime / 5) * 4; ts->ts_slptime = (ts->ts_slptime / 5) * 4; } /* * Scale back the interactivity history when a child thread is created. The * history is inherited from the parent but the thread may behave totally * differently. For example, a shell spawning a compiler process. 
We want * to learn that the compiler is behaving badly very quickly. */ static void sched_interact_fork(struct thread *td) { struct td_sched *ts; int ratio; int sum; ts = td_get_sched(td); sum = ts->ts_runtime + ts->ts_slptime; if (sum > SCHED_SLP_RUN_FORK) { ratio = sum / SCHED_SLP_RUN_FORK; ts->ts_runtime /= ratio; ts->ts_slptime /= ratio; } } /* * Called from proc0_init() to setup the scheduler fields. */ void schedinit(void) { struct td_sched *ts0; /* * Set up the scheduler specific parts of thread0. */ ts0 = td_get_sched(&thread0); ts0->ts_ltick = ticks; ts0->ts_ftick = ticks; ts0->ts_slice = 0; } /* * This is only somewhat accurate since given many processes of the same * priority they will switch when their slices run out, which will be * at most sched_slice stathz ticks. */ int sched_rr_interval(void) { /* Convert sched_slice from stathz to hz. */ return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz)); } /* * Update the percent cpu tracking information when it is requested or * the total history exceeds the maximum. We keep a sliding history of * tick counts that slowly decays. This is less precise than the 4BSD * mechanism since it happens with less regular and frequent events. */ static void sched_pctcpu_update(struct td_sched *ts, int run) { int t = ticks; /* * The signed difference may be negative if the thread hasn't run for * over half of the ticks rollover period. */ if ((u_int)(t - ts->ts_ltick) >= SCHED_TICK_TARG) { ts->ts_ticks = 0; ts->ts_ftick = t - SCHED_TICK_TARG; } else if (t - ts->ts_ftick >= SCHED_TICK_MAX) { ts->ts_ticks = (ts->ts_ticks / (ts->ts_ltick - ts->ts_ftick)) * (ts->ts_ltick - (t - SCHED_TICK_TARG)); ts->ts_ftick = t - SCHED_TICK_TARG; } if (run) ts->ts_ticks += (t - ts->ts_ltick) << SCHED_TICK_SHIFT; ts->ts_ltick = t; } /* * Adjust the priority of a thread. Move it to the appropriate run-queue * if necessary. This is the back-end for several priority related * functions. */ static void sched_thread_priority(struct thread *td, u_char prio) { struct td_sched *ts; struct tdq *tdq; int oldpri; KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(curthread)); SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio); if (td != curthread && prio < td->td_priority) { KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread), "lend prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio, curthread); } ts = td_get_sched(td); THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; /* * If the priority has been elevated due to priority * propagation, we may have to move ourselves to a new * queue. This could be optimized to not re-add in some * cases. */ if (TD_ON_RUNQ(td) && prio < td->td_priority) { sched_rem(td); td->td_priority = prio; sched_add(td, SRQ_BORROWING); return; } /* * If the thread is currently running we may have to adjust the lowpri * information so other cpus are aware of our current priority. */ if (TD_IS_RUNNING(td)) { tdq = TDQ_CPU(ts->ts_cpu); oldpri = td->td_priority; td->td_priority = prio; if (prio < tdq->tdq_lowpri) tdq->tdq_lowpri = prio; else if (tdq->tdq_lowpri == oldpri) tdq_setlowpri(tdq, td); return; } td->td_priority = prio; } /* * Update a thread's priority when it is lent another thread's * priority. 
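The fork-time history rescaling just divided both terms by the same integer ratio, which preserves the runtime:slptime ratio (and therefore the score) while shrinking the weight of the inherited history. A minimal sketch, assuming a hypothetical SLP_RUN_FORK cap of 1000 units:

/*
 * Userspace sketch of sched_interact_fork()'s history scaling.
 * SLP_RUN_FORK is an assumed illustrative cap, not the kernel value.
 */
#include <stdio.h>

#define SLP_RUN_FORK 1000

struct hist { unsigned int runtime, slptime; };

static void
interact_fork_scale(struct hist *h)
{
        unsigned int ratio, sum;

        sum = h->runtime + h->slptime;
        if (sum > SLP_RUN_FORK) {
                /* Shrink both terms by the same integer ratio... */
                ratio = sum / SLP_RUN_FORK;
                h->runtime /= ratio;
                h->slptime /= ratio;
                /* ...so the ratio, and thus the score, is preserved. */
        }
}

int
main(void)
{
        struct hist h = { 6000, 2000 };

        interact_fork_scale(&h);                /* ratio = 8 */
        printf("%u %u\n", h.runtime, h.slptime);        /* 750 250 */
        return (0);
}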
*/ void sched_lend_prio(struct thread *td, u_char prio) { td->td_flags |= TDF_BORROWING; sched_thread_priority(td, prio); } /* * Restore a thread's priority when priority propagation is * over. The prio argument is the minimum priority the thread * needs to have to satisfy other possible priority lending * requests. If the thread's regular priority is less * important than prio, the thread will keep a priority boost * of prio. */ void sched_unlend_prio(struct thread *td, u_char prio) { u_char base_pri; if (td->td_base_pri >= PRI_MIN_TIMESHARE && td->td_base_pri <= PRI_MAX_TIMESHARE) base_pri = td->td_user_pri; else base_pri = td->td_base_pri; if (prio >= base_pri) { td->td_flags &= ~TDF_BORROWING; sched_thread_priority(td, base_pri); } else sched_lend_prio(td, prio); } /* * Standard entry for setting the priority to an absolute value. */ void sched_prio(struct thread *td, u_char prio) { u_char oldprio; /* First, update the base priority. */ td->td_base_pri = prio; /* * If the thread is borrowing another thread's priority, don't * ever lower the priority. */ if (td->td_flags & TDF_BORROWING && td->td_priority < prio) return; /* Change the real priority. */ oldprio = td->td_priority; sched_thread_priority(td, prio); /* * If the thread is on a turnstile, then let the turnstile update * its state. */ if (TD_ON_LOCK(td) && oldprio != prio) turnstile_adjust(td, oldprio); } /* * Set the base user priority; does not affect the current running priority. */ void sched_user_prio(struct thread *td, u_char prio) { td->td_base_user_pri = prio; if (td->td_lend_user_pri <= prio) return; td->td_user_pri = prio; } void sched_lend_user_prio(struct thread *td, u_char prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_lend_user_pri = prio; td->td_user_pri = min(prio, td->td_base_user_pri); if (td->td_priority > td->td_user_pri) sched_prio(td, td->td_user_pri); else if (td->td_priority != td->td_user_pri) td->td_flags |= TDF_NEEDRESCHED; } /* * Handle migration from sched_switch(). This happens only for * cpu binding. */ static struct mtx * sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags) { struct tdq *tdn; KASSERT(!CPU_ABSENT(td_get_sched(td)->ts_cpu), ("sched_switch_migrate: " "thread %s queued on absent CPU %d.", td->td_name, td_get_sched(td)->ts_cpu)); tdn = TDQ_CPU(td_get_sched(td)->ts_cpu); #ifdef SMP tdq_load_rem(tdq, td); /* * Do the lock dance required to avoid LOR. We grab an extra * spinlock nesting to prevent preemption while we're * not holding either run-queue lock. */ spinlock_enter(); thread_lock_block(td); /* This releases the lock on tdq. */ /* * Acquire both run-queue locks before placing the thread on the new * run-queue to avoid deadlocks created by placing a thread with a * blocked lock on the run-queue of a remote processor. The deadlock * occurs when a third processor attempts to lock the two queues in * question while the target processor is spinning with its own * run-queue lock held while waiting for the blocked lock to clear. */ tdq_lock_pair(tdn, tdq); tdq_add(tdn, td, flags); tdq_notify(tdn, td); TDQ_UNLOCK(tdn); spinlock_exit(); #endif return (TDQ_LOCKPTR(tdn)); } /* * Variadic version of thread_lock_unblock() that does not assume td_lock * is blocked. */ static inline void thread_unblock_switch(struct thread *td, struct mtx *mtx) { atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock, (uintptr_t)mtx); } /* * Switch threads. This function has to handle threads coming in while * blocked for some reason, running, or idle.
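The lend/unlend protocol backed by sched_thread_priority() reduces to a small state machine: a borrowed priority is marked with a flag, and unlending falls back to the base priority only when the base satisfies the strongest remaining request. The sketch below is a simplified userspace model (it omits the timeshare/user-priority distinction) with illustrative types:

/*
 * Userspace sketch of priority lending. Lower numbers are more
 * important, as in the kernel; names here are illustrative.
 */
#include <assert.h>
#include <stdbool.h>

struct fake_td {
        unsigned char base_pri; /* priority the thread asked for */
        unsigned char pri;      /* effective priority */
        bool borrowing;
};

static void
lend_prio(struct fake_td *td, unsigned char prio)
{
        td->borrowing = true;
        td->pri = prio;
}

static void
unlend_prio(struct fake_td *td, unsigned char prio)
{
        /*
         * prio is the strongest (lowest) remaining lending request;
         * return to the base priority only if it satisfies that.
         */
        if (prio >= td->base_pri) {
                td->borrowing = false;
                td->pri = td->base_pri;
        } else
                lend_prio(td, prio);
}

int
main(void)
{
        struct fake_td td = { .base_pri = 120, .pri = 120 };

        lend_prio(&td, 80);     /* donated by a more important waiter */
        unlend_prio(&td, 100);  /* another request still outstanding */
        assert(td.pri == 100 && td.borrowing);
        unlend_prio(&td, 130);  /* all requests weaker than the base */
        assert(td.pri == 120 && !td.borrowing);
        return (0);
}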
It also must deal with * migrating a thread from one queue to another as running threads may * be assigned elsewhere via binding. */ void sched_switch(struct thread *td, struct thread *newtd, int flags) { struct tdq *tdq; struct td_sched *ts; struct mtx *mtx; int srqflag; int cpuid, preempted; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(newtd == NULL, ("sched_switch: Unsupported newtd argument")); cpuid = PCPU_GET(cpuid); tdq = TDQ_CPU(cpuid); ts = td_get_sched(td); mtx = td->td_lock; sched_pctcpu_update(ts, 1); ts->ts_rltick = ticks; td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; preempted = (td->td_flags & TDF_SLICEEND) == 0 && (flags & SW_PREEMPT) != 0; td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND); td->td_owepreempt = 0; if (!TD_IS_IDLETHREAD(td)) tdq->tdq_switchcnt++; /* * The lock pointer in an idle thread should never change. Reset it * to CAN_RUN as well. */ if (TD_IS_IDLETHREAD(td)) { MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); TD_SET_CAN_RUN(td); } else if (TD_IS_RUNNING(td)) { MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); srqflag = preempted ? SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_OURSELF|SRQ_YIELDING; #ifdef SMP if (THREAD_CAN_MIGRATE(td) && !THREAD_CAN_SCHED(td, ts->ts_cpu)) ts->ts_cpu = sched_pickcpu(td, 0); #endif if (ts->ts_cpu == cpuid) tdq_runq_add(tdq, td, srqflag); else { KASSERT(THREAD_CAN_MIGRATE(td) || (ts->ts_flags & TSF_BOUND) != 0, ("Thread %p shouldn't migrate", td)); mtx = sched_switch_migrate(tdq, td, srqflag); } } else { /* This thread must be going to sleep. */ TDQ_LOCK(tdq); mtx = thread_lock_block(td); tdq_load_rem(tdq, td); } #if (KTR_COMPILE & KTR_SCHED) != 0 if (TD_IS_IDLETHREAD(td)) KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle", "prio:%d", td->td_priority); else KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td), "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg, "lockname:\"%s\"", td->td_lockname); #endif /* * We enter here with the thread blocked and assigned to the * appropriate cpu run-queue or sleep-queue and with the current * thread-queue locked. */ TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); newtd = choosethread(); /* * Call the MD code to switch contexts if necessary. */ if (td != newtd) { #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc); lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object); TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd; sched_pctcpu_update(td_get_sched(newtd), 0); #ifdef KDTRACE_HOOKS /* * If DTrace has set the active vtime enum to anything * other than INACTIVE (0), then it should have set the * function to call. */ if (dtrace_vtime_active) (*dtrace_vtime_switch_func)(newtd); #endif cpu_switch(td, newtd, mtx); /* * We may return from cpu_switch on a different cpu. However, * we always return with td_lock pointing to the current cpu's * run queue lock. */ cpuid = PCPU_GET(cpuid); tdq = TDQ_CPU(cpuid); lock_profile_obtain_lock_success( &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__); SDT_PROBE0(sched, , , on__cpu); #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); #endif } else { thread_unblock_switch(td, mtx); SDT_PROBE0(sched, , , remain__cpu); } KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); /* * Assert that all went well and return. 
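One detail worth isolating from sched_switch() is how a preemption is classified: SW_PREEMPT counts only when the slice had not already expired, so quantum expiry is not misread as an involuntary preemption. A small sketch with illustrative flag values:

/*
 * Userspace sketch of the "preempted" test at the top of
 * sched_switch(). Flag values are illustrative, not the kernel's.
 */
#include <stdbool.h>
#include <stdio.h>

#define TDF_SLICEEND_X  0x1     /* slice ran out on the last tick */
#define SW_PREEMPT_X    0x2     /* switch requested by a preemption */

static bool
is_preempted(int td_flags, int sw_flags)
{
        return ((td_flags & TDF_SLICEEND_X) == 0 &&
            (sw_flags & SW_PREEMPT_X) != 0);
}

int
main(void)
{
        printf("%d\n", is_preempted(0, SW_PREEMPT_X));                  /* 1 */
        printf("%d\n", is_preempted(TDF_SLICEEND_X, SW_PREEMPT_X));     /* 0 */
        return (0);
}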
*/ TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); td->td_oncpu = cpuid; } /* * Adjust thread priorities as a result of a nice request. */ void sched_nice(struct proc *p, int nice) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); sched_priority(td); sched_prio(td, td->td_base_user_pri); thread_unlock(td); } } /* * Record the sleep time for the interactivity scorer. */ void sched_sleep(struct thread *td, int prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_slptick = ticks; if (TD_IS_SUSPENDED(td) || prio >= PSOCK) td->td_flags |= TDF_CANSWAP; if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE) return; if (static_boost == 1 && prio) sched_prio(td, prio); else if (static_boost && td->td_priority > static_boost) sched_prio(td, static_boost); } /* * Schedule a thread to resume execution and record how long it voluntarily * slept. We also update the pctcpu, interactivity, and priority. */ void sched_wakeup(struct thread *td) { struct td_sched *ts; int slptick; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); td->td_flags &= ~TDF_CANSWAP; /* * If we slept for more than a tick update our interactivity and * priority. */ slptick = td->td_slptick; td->td_slptick = 0; if (slptick && slptick != ticks) { ts->ts_slptime += (ticks - slptick) << SCHED_TICK_SHIFT; sched_interact_update(td); sched_pctcpu_update(ts, 0); } /* * Reset the slice value since we slept and advanced the round-robin. */ ts->ts_slice = 0; sched_add(td, SRQ_BORING); } /* * Penalize the parent for creating a new child and initialize the child's * priority. */ void sched_fork(struct thread *td, struct thread *child) { THREAD_LOCK_ASSERT(td, MA_OWNED); sched_pctcpu_update(td_get_sched(td), 1); sched_fork_thread(td, child); /* * Penalize the parent and child for forking. */ sched_interact_fork(child); sched_priority(child); td_get_sched(td)->ts_runtime += tickincr; sched_interact_update(td); sched_priority(td); } /* * Fork a new thread, may be within the same process. */ void sched_fork_thread(struct thread *td, struct thread *child) { struct td_sched *ts; struct td_sched *ts2; struct tdq *tdq; tdq = TDQ_SELF(); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Initialize child. */ ts = td_get_sched(td); ts2 = td_get_sched(child); child->td_oncpu = NOCPU; child->td_lastcpu = NOCPU; child->td_lock = TDQ_LOCKPTR(tdq); child->td_cpuset = cpuset_ref(td->td_cpuset); ts2->ts_cpu = ts->ts_cpu; ts2->ts_flags = 0; /* * Grab our parents cpu estimation information. */ ts2->ts_ticks = ts->ts_ticks; ts2->ts_ltick = ts->ts_ltick; ts2->ts_ftick = ts->ts_ftick; /* * Do not inherit any borrowed priority from the parent. */ child->td_priority = child->td_base_pri; /* * And update interactivity score. */ ts2->ts_slptime = ts->ts_slptime; ts2->ts_runtime = ts->ts_runtime; /* Attempt to quickly learn interactivity. */ ts2->ts_slice = tdq_slice(tdq) - sched_slice_min; #ifdef KTR bzero(ts2->ts_name, sizeof(ts2->ts_name)); #endif } /* * Adjust the priority class of a thread. */ void sched_class(struct thread *td, int class) { THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_pri_class == class) return; td->td_pri_class = class; } /* * Return some of the child's priority and interactivity to the parent. 
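The wakeup-side bookkeeping converts the voluntary sleep interval into the same fixed-point units as runtime with a left shift. A minimal sketch, treating a shift of 10 (the value the tickincr comment in this file mentions) as an assumption:

/*
 * Userspace sketch of the sched_wakeup() sleep-time accounting.
 * TICK_SHIFT is assumed to be 10, matching the tickincr comment.
 */
#include <stdio.h>

#define TICK_SHIFT 10

struct fake_ts { unsigned int slptime; };

static void
record_sleep(struct fake_ts *ts, int slptick, int now)
{
        /* Only charge sleep time if we slept for at least one tick. */
        if (slptick != 0 && slptick != now)
                ts->slptime += (unsigned int)(now - slptick) << TICK_SHIFT;
}

int
main(void)
{
        struct fake_ts ts = { 0 };

        record_sleep(&ts, 100, 250);            /* slept 150 ticks */
        printf("slptime = %u\n", ts.slptime);   /* 150 << 10 = 153600 */
        return (0);
}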
*/ void sched_exit(struct proc *p, struct thread *child) { struct thread *td; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit", "prio:%d", child->td_priority); PROC_LOCK_ASSERT(p, MA_OWNED); td = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td, child); } /* * Penalize another thread for the time spent on this one. This helps to * worsen the priority and interactivity of processes which schedule batch * jobs such as make. This has little effect on the make process itself but * causes new processes spawned by it to receive worse scores immediately. */ void sched_exit_thread(struct thread *td, struct thread *child) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit", "prio:%d", child->td_priority); /* * Give the child's runtime to the parent without returning the * sleep time as a penalty to the parent. This causes shells that * launch expensive things to mark their children as expensive. */ thread_lock(td); td_get_sched(td)->ts_runtime += td_get_sched(child)->ts_runtime; sched_interact_update(td); sched_priority(td); thread_unlock(td); } void sched_preempt(struct thread *td) { struct tdq *tdq; SDT_PROBE2(sched, , , surrender, td, td->td_proc); thread_lock(td); tdq = TDQ_SELF(); TDQ_LOCK_ASSERT(tdq, MA_OWNED); tdq->tdq_ipipending = 0; if (td->td_priority > tdq->tdq_lowpri) { int flags; flags = SW_INVOL | SW_PREEMPT; if (td->td_critnest > 1) td->td_owepreempt = 1; else if (TD_IS_IDLETHREAD(td)) mi_switch(flags | SWT_REMOTEWAKEIDLE, NULL); else mi_switch(flags | SWT_REMOTEPREEMPT, NULL); } thread_unlock(td); } /* * Fix priorities on return to user-space. Priorities may be elevated due * to static priorities in msleep() or similar. */ void sched_userret(struct thread *td) { /* * XXX we cheat slightly on the locking here to avoid locking in * the usual case. Setting td_priority here is essentially an * incomplete workaround for not setting it properly elsewhere. * Now that some interrupt handlers are threads, not setting it * properly elsewhere can clobber it in the window between setting * it here and returning to user mode, so don't waste time setting * it perfectly here. */ KASSERT((td->td_flags & TDF_BORROWING) == 0, ("thread with borrowed priority returning to userland")); if (td->td_priority != td->td_user_pri) { thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; tdq_setlowpri(TDQ_SELF(), td); thread_unlock(td); } } /* * Handle a stathz tick. This is really only relevant for timeshare * threads. */ void sched_clock(struct thread *td) { struct tdq *tdq; struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); tdq = TDQ_SELF(); #ifdef SMP /* * We run the long term load balancer infrequently on the first cpu. */ if (balance_tdq == tdq) { if (balance_ticks && --balance_ticks == 0) sched_balance(); } #endif /* * Save the old switch count so we have a record of the last ticks * activity. Initialize the new switch count based on our load. * If there is some activity seed it to reflect that. */ tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt; tdq->tdq_switchcnt = tdq->tdq_load; /* * Advance the insert index once for each tick to ensure that all * threads get a chance to run. 
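The insert/removal index pair described here behaves like a calendar queue, as the code that follows shows: the insert index rotates once per tick, while the removal index may only catch up when its bucket has drained, so queued threads cannot be overtaken. A compact userspace model (RQ_NQS is 64 in FreeBSD's runq, assumed here):

/*
 * Userspace sketch of the timeshare calendar-queue indices.
 */
#include <stdio.h>

#define NQS 64

struct calq {
        int idx;                /* where new threads are inserted */
        int ridx;               /* where threads are removed */
        int count[NQS];         /* threads queued per bucket */
};

static void
tick(struct calq *q)
{
        if (q->idx == q->ridx) {
                q->idx = (q->idx + 1) % NQS;
                if (q->count[q->ridx] == 0)
                        q->ridx = q->idx;       /* bucket drained: follow */
        }
}

int
main(void)
{
        struct calq q = { .count = { [0] = 2 } };

        tick(&q);       /* bucket 0 still occupied: ridx stays behind */
        printf("idx=%d ridx=%d\n", q.idx, q.ridx);      /* idx=1 ridx=0 */
        return (0);
}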
*/ if (tdq->tdq_idx == tdq->tdq_ridx) { tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS; if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx])) tdq->tdq_ridx = tdq->tdq_idx; } ts = td_get_sched(td); sched_pctcpu_update(ts, 1); if (td->td_pri_class & PRI_FIFO_BIT) return; if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) { /* * We used a tick; charge it to the thread so * that we can compute our interactivity. */ td_get_sched(td)->ts_runtime += tickincr; sched_interact_update(td); sched_priority(td); } /* * Force a context switch if the current thread has used up a full * time slice (default is 100ms). */ if (!TD_IS_IDLETHREAD(td) && ++ts->ts_slice >= tdq_slice(tdq)) { ts->ts_slice = 0; td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND; } } u_int sched_estcpu(struct thread *td __unused) { return (0); } /* * Return whether the current CPU has runnable tasks. Used for in-kernel * cooperative idle threads. */ int sched_runnable(void) { struct tdq *tdq; int load; load = 1; tdq = TDQ_SELF(); if ((curthread->td_flags & TDF_IDLETD) != 0) { if (tdq->tdq_load > 0) goto out; } else if (tdq->tdq_load - 1 > 0) goto out; load = 0; out: return (load); } /* * Choose the highest priority thread to run. The thread is removed from * the run-queue while running however the load remains. For SMP we set * the tdq in the global idle bitmask if it idles here. */ struct thread * sched_choose(void) { struct thread *td; struct tdq *tdq; tdq = TDQ_SELF(); TDQ_LOCK_ASSERT(tdq, MA_OWNED); td = tdq_choose(tdq); if (td) { tdq_runq_rem(tdq, td); tdq->tdq_lowpri = td->td_priority; return (td); } tdq->tdq_lowpri = PRI_MAX_IDLE; return (PCPU_GET(idlethread)); } /* * Set owepreempt if necessary. Preemption never happens directly in ULE, * we always request it once we exit a critical section. */ static inline void sched_setpreempt(struct thread *td) { struct thread *ctd; int cpri; int pri; THREAD_LOCK_ASSERT(curthread, MA_OWNED); ctd = curthread; pri = td->td_priority; cpri = ctd->td_priority; if (pri < cpri) ctd->td_flags |= TDF_NEEDRESCHED; if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd)) return; if (!sched_shouldpreempt(pri, cpri, 0)) return; ctd->td_owepreempt = 1; } /* * Add a thread to a thread queue. Select the appropriate runq and add the * thread to it. This is the internal function called when the tdq is * predetermined. */ void tdq_add(struct tdq *tdq, struct thread *td, int flags) { TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), ("sched_add: bad thread state")); KASSERT(td->td_flags & TDF_INMEM, ("sched_add: thread swapped out")); if (td->td_priority < tdq->tdq_lowpri) tdq->tdq_lowpri = td->td_priority; tdq_runq_add(tdq, td, flags); tdq_load_add(tdq, td); } /* * Select the target thread queue and add a thread to it. Request * preemption or IPI a remote processor if required. */ void sched_add(struct thread *td, int flags) { struct tdq *tdq; #ifdef SMP int cpu; #endif KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup", KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, flags & SRQ_PREEMPTED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Recalculate the priority before we select the target cpu or * run-queue. 
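The per-tick slice charge at the end of sched_clock() can be modeled in a few lines: crossing the quantum resets the slice and requests a reschedule with the slice-end marker. Illustrative flag values:

/*
 * Userspace sketch of the slice-expiry check in sched_clock().
 */
#include <stdio.h>

#define F_NEEDRESCHED   0x1
#define F_SLICEEND      0x2

struct fake_td { int slice, flags; };

static void
charge_tick(struct fake_td *td, int quantum)
{
        if (++td->slice >= quantum) {
                td->slice = 0;
                td->flags |= F_NEEDRESCHED | F_SLICEEND;
        }
}

int
main(void)
{
        struct fake_td td = { 0, 0 };
        int t, quantum = 10;    /* ~100ms at stathz=100, illustrative */

        for (t = 0; t < 10; t++)
                charge_tick(&td, quantum);
        printf("flags=%#x slice=%d\n", td.flags, td.slice);     /* 0x3 0 */
        return (0);
}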
*/ if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_priority(td); #ifdef SMP /* * Pick the destination cpu and if it isn't ours transfer to the * target cpu. */ td_get_sched(td)->ts_cpu = curcpu; /* Pick something valid to start */ cpu = sched_pickcpu(td, flags); tdq = sched_setcpu(td, cpu, flags); tdq_add(tdq, td, flags); if (cpu != PCPU_GET(cpuid)) { tdq_notify(tdq, td); return; } #else tdq = TDQ_SELF(); TDQ_LOCK(tdq); /* * Now that the thread is moving to the run-queue, set the lock * to the scheduler's lock. */ thread_lock_set(td, TDQ_LOCKPTR(tdq)); tdq_add(tdq, td, flags); #endif if (!(flags & SRQ_YIELDING)) sched_setpreempt(td); } /* * Remove a thread from a run-queue without running it. This is used * when we're stealing a thread from a remote queue. Otherwise all threads * exit by calling sched_exit_thread() and sched_throw() themselves. */ void sched_rem(struct thread *td) { struct tdq *tdq; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem", "prio:%d", td->td_priority); SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL); tdq = TDQ_CPU(td_get_sched(td)->ts_cpu); TDQ_LOCK_ASSERT(tdq, MA_OWNED); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); KASSERT(TD_ON_RUNQ(td), ("sched_rem: thread not on run queue")); tdq_runq_rem(tdq, td); tdq_load_rem(tdq, td); TD_SET_CAN_RUN(td); if (td->td_priority == tdq->tdq_lowpri) tdq_setlowpri(tdq, NULL); } /* * Fetch cpu utilization information. Updates on demand. */ fixpt_t sched_pctcpu(struct thread *td) { fixpt_t pctcpu; struct td_sched *ts; pctcpu = 0; ts = td_get_sched(td); THREAD_LOCK_ASSERT(td, MA_OWNED); sched_pctcpu_update(ts, TD_IS_RUNNING(td)); if (ts->ts_ticks) { int rtick; /* How many rtick per second ? */ rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz); pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT; } return (pctcpu); } /* * Enforce affinity settings for a thread. Called after adjustments to * cpumask. */ void sched_affinity(struct thread *td) { #ifdef SMP struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); if (THREAD_CAN_SCHED(td, ts->ts_cpu)) return; if (TD_ON_RUNQ(td)) { sched_rem(td); sched_add(td, SRQ_BORING); return; } if (!TD_IS_RUNNING(td)) return; /* * Force a switch before returning to userspace. If the * target thread is not running locally send an ipi to force * the issue. */ td->td_flags |= TDF_NEEDRESCHED; if (td != curthread) ipi_cpu(ts->ts_cpu, IPI_PREEMPT); #endif } /* * Bind a thread to a target cpu. */ void sched_bind(struct thread *td, int cpu) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED); KASSERT(td == curthread, ("sched_bind: can only bind curthread")); ts = td_get_sched(td); if (ts->ts_flags & TSF_BOUND) sched_unbind(td); KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td)); ts->ts_flags |= TSF_BOUND; sched_pin(); if (PCPU_GET(cpuid) == cpu) return; ts->ts_cpu = cpu; /* When we return from mi_switch we'll be on the correct cpu. */ mi_switch(SW_VOL, NULL); } /* * Release a bound thread. */ void sched_unbind(struct thread *td) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(td == curthread, ("sched_unbind: can only bind curthread")); ts = td_get_sched(td); if ((ts->ts_flags & TSF_BOUND) == 0) return; ts->ts_flags &= ~TSF_BOUND; sched_unpin(); } int sched_is_bound(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); return (td_get_sched(td)->ts_flags & TSF_BOUND); } /* * Basic yield call. 
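The fixed-point conversion in sched_pctcpu() maps "charged stathz ticks per second" to a fixpt_t fraction of one CPU. A standalone sketch, mirroring FSHIFT/FSCALE from sys/param.h (FSHIFT is 11 there) and assuming hz=1000:

/*
 * Userspace sketch of the %CPU fixed-point conversion.
 */
#include <stdio.h>

#define FSHIFT  11
#define FSCALE  (1 << FSHIFT)

int
main(void)
{
        unsigned int hz = 1000, rtick, pctcpu;

        rtick = hz / 2;         /* charged half of the available ticks */
        pctcpu = ((unsigned long long)FSCALE *
            ((FSCALE * rtick) / hz)) >> FSHIFT;
        /* pctcpu / FSCALE ~= 0.5, i.e. 50% of one CPU. */
        printf("%u of %d (= %.2f)\n", pctcpu, FSCALE,
            (double)pctcpu / FSCALE);
        return (0);
}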
*/ void sched_relinquish(struct thread *td) { thread_lock(td); mi_switch(SW_VOL | SWT_RELINQUISH, NULL); thread_unlock(td); } /* * Return the total system load. */ int sched_load(void) { #ifdef SMP int total; int i; total = 0; CPU_FOREACH(i) total += TDQ_CPU(i)->tdq_sysload; return (total); #else return (TDQ_SELF()->tdq_sysload); #endif } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread) + sizeof(struct td_sched)); } #ifdef SMP #define TDQ_IDLESPIN(tdq) \ ((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0) #else #define TDQ_IDLESPIN(tdq) 1 #endif /* * The actual idle process. */ void sched_idletd(void *dummy) { struct thread *td; struct tdq *tdq; int oldswitchcnt, switchcnt; int i; mtx_assert(&Giant, MA_NOTOWNED); td = curthread; tdq = TDQ_SELF(); THREAD_NO_SLEEPING(); oldswitchcnt = -1; for (;;) { if (tdq->tdq_load) { thread_lock(td); mi_switch(SW_VOL | SWT_IDLE, NULL); thread_unlock(td); } switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; #ifdef SMP if (switchcnt != oldswitchcnt) { oldswitchcnt = switchcnt; if (tdq_idled(tdq) == 0) continue; } switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; #else oldswitchcnt = switchcnt; #endif /* * If we're switching very frequently, spin while checking * for load rather than entering a low power state that * may require an IPI. However, don't do any busy * loops while on SMT machines as this simply steals * cycles from cores doing useful work. */ if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) { for (i = 0; i < sched_idlespins; i++) { if (tdq->tdq_load) break; cpu_spinwait(); } } /* If there was context switch during spin, restart it. */ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; if (tdq->tdq_load != 0 || switchcnt != oldswitchcnt) continue; /* Run main MD idle handler. */ tdq->tdq_cpu_idle = 1; /* * Make sure that tdq_cpu_idle update is globally visible * before cpu_idle() read tdq_load. The order is important * to avoid race with tdq_notify. */ atomic_thread_fence_seq_cst(); cpu_idle(switchcnt * 4 > sched_idlespinthresh); tdq->tdq_cpu_idle = 0; /* * Account thread-less hardware interrupts and * other wakeup reasons equal to context switches. */ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; if (switchcnt != oldswitchcnt) continue; tdq->tdq_switchcnt++; oldswitchcnt++; } } /* * A CPU is entering for the first time or a thread is exiting. */ void sched_throw(struct thread *td) { struct thread *newtd; struct tdq *tdq; tdq = TDQ_SELF(); if (td == NULL) { /* Correct spinlock nesting and acquire the correct lock. */ TDQ_LOCK(tdq); spinlock_exit(); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); } else { MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); tdq_load_rem(tdq, td); lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object); td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; } KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); newtd = choosethread(); TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd; cpu_throw(td, newtd); /* doesn't return */ } /* * This is called from fork_exit(). Just acquire the correct locks and * let fork do the rest of the work. */ void sched_fork_exit(struct thread *td) { struct tdq *tdq; int cpuid; /* * Finish setting up thread glue so that it begins execution in a * non-nested critical section with the scheduler lock held. 
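The ordering requirement called out before cpu_idle() is a classic store-before-load race: the idle flag must be globally visible before the final load check, or a concurrent waker could miss both the flag and the recheck, leaving the CPU asleep on a nonempty queue. A userspace sketch with C11 atomics; names are illustrative stand-ins for tdq_cpu_idle/tdq_load:

/*
 * Userspace sketch of the publish-then-recheck idle handshake.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int cpu_idle_flag;
static atomic_int load;

static bool
may_sleep(void)
{
        atomic_store_explicit(&cpu_idle_flag, 1, memory_order_relaxed);
        /* Publish the flag before re-reading load (store/load ordering). */
        atomic_thread_fence(memory_order_seq_cst);
        if (atomic_load_explicit(&load, memory_order_relaxed) != 0) {
                atomic_store_explicit(&cpu_idle_flag, 0,
                    memory_order_relaxed);
                return (false); /* new work arrived: do not sleep */
        }
        return (true);          /* a waker will see the flag and IPI us */
}

int
main(void)
{
        return (may_sleep() ? 0 : 1);
}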
*/ cpuid = PCPU_GET(cpuid); tdq = TDQ_CPU(cpuid); if (TD_IS_IDLETHREAD(td)) td->td_lock = TDQ_LOCKPTR(tdq); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); td->td_oncpu = cpuid; TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); lock_profile_obtain_lock_success( &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); SDT_PROBE0(sched, , , on__cpu); } /* * Create on first use to catch odd startup conditions. */ char * sched_tdname(struct thread *td) { #ifdef KTR struct td_sched *ts; ts = td_get_sched(td); if (ts->ts_name[0] == '\0') snprintf(ts->ts_name, sizeof(ts->ts_name), "%s tid %d", td->td_name, td->td_tid); return (ts->ts_name); #else return (td->td_name); #endif } #ifdef KTR void sched_clear_tdname(struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); ts->ts_name[0] = '\0'; } #endif #ifdef SMP /* * Build the CPU topology dump string. Is recursively called to collect * the topology tree. */ static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg, int indent) { char cpusetbuf[CPUSETBUFSIZ]; int i, first; sbuf_printf(sb, "%*s<group level=\"%d\" cache-level=\"%d\">\n", indent, "", 1 + indent / 2, cg->cg_level); sbuf_printf(sb, "%*s <cpu count=\"%d\" mask=\"%s\">", indent, "", cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask)); first = TRUE; for (i = 0; i < MAXCPU; i++) { if (CPU_ISSET(i, &cg->cg_mask)) { if (!first) sbuf_printf(sb, ", "); else first = FALSE; sbuf_printf(sb, "%d", i); } } sbuf_printf(sb, "</cpu>\n"); if (cg->cg_flags != 0) { sbuf_printf(sb, "%*s <flags>", indent, ""); if ((cg->cg_flags & CG_FLAG_HTT) != 0) sbuf_printf(sb, "<flag name=\"HTT\">HTT group</flag>"); if ((cg->cg_flags & CG_FLAG_THREAD) != 0) sbuf_printf(sb, "<flag name=\"THREAD\">THREAD group</flag>"); if ((cg->cg_flags & CG_FLAG_SMT) != 0) sbuf_printf(sb, "<flag name=\"SMT\">SMT group</flag>"); sbuf_printf(sb, "</flags>\n"); } if (cg->cg_children > 0) { sbuf_printf(sb, "%*s <children>\n", indent, ""); for (i = 0; i < cg->cg_children; i++) sysctl_kern_sched_topology_spec_internal(sb, &cg->cg_child[i], indent+2); sbuf_printf(sb, "%*s </children>\n", indent, ""); } sbuf_printf(sb, "%*s</group>\n", indent, ""); return (0); } /* * Sysctl handler for retrieving topology dump. It's a wrapper for * the recursive sysctl_kern_sched_topology_spec_internal().
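For reference, the tree this handler emits is small; on a hypothetical two-core SMT machine, sysctl kern.sched.topology_spec would produce output roughly like the following (illustrative, not captured from a real system):

<groups>
 <group level="1" cache-level="0">
  <cpu count="4" mask="f">0, 1, 2, 3</cpu>
  <children>
   <group level="2" cache-level="2">
    <cpu count="2" mask="3">0, 1</cpu>
    <flags><flag name="THREAD">THREAD group</flag><flag name="SMT">SMT group</flag></flags>
   </group>
   <group level="2" cache-level="2">
    <cpu count="2" mask="c">2, 3</cpu>
    <flags><flag name="THREAD">THREAD group</flag><flag name="SMT">SMT group</flag></flags>
   </group>
  </children>
 </group>
</groups>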
*/ static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS) { struct sbuf *topo; int err; KASSERT(cpu_top != NULL, ("cpu_top isn't initialized")); topo = sbuf_new_for_sysctl(NULL, NULL, 512, req); if (topo == NULL) return (ENOMEM); sbuf_printf(topo, "<groups>\n"); err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1); sbuf_printf(topo, "</groups>\n"); if (err == 0) { err = sbuf_finish(topo); } sbuf_delete(topo); return (err); } #endif static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val, period; period = 1000000 / realstathz; new_val = period * sched_slice; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val <= 0) return (EINVAL); sched_slice = imax(1, (new_val + period / 2) / period); sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR; hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); return (0); } SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler"); SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0, "Scheduler name"); SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, sysctl_kern_quantum, "I", "Quantum for timeshare threads in microseconds"); SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "Quantum for timeshare threads in stathz ticks"); SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, "Interactivity score threshold"); SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh, 0, "Maximal (lowest) priority for preemption"); SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost, 0, "Assign static kernel priorities to sleeping threads"); SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins, 0, "Number of times idle thread will spin waiting for new work"); SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW, &sched_idlespinthresh, 0, "Threshold before we will permit idle thread spinning"); #ifdef SMP SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0, "Number of hz ticks to keep thread affinity for"); SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, "Enables the long-term load balancer"); SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW, &balance_interval, 0, "Average period in stathz ticks to run the long-term balancer"); SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0, "Attempts to steal work from other cores before idling"); SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0, "Minimum load on remote CPU before we'll steal"); SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A", "XML dump of detected CPU topology"); #endif /* ps compat. All cpu percentages from ULE are weighted. */ static int ccpu = 0; SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); Index: head/sys/kern/subr_acl_nfs4.c =================================================================== --- head/sys/kern/subr_acl_nfs4.c (revision 326270) +++ head/sys/kern/subr_acl_nfs4.c (revision 326271) @@ -1,1418 +1,1420 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008-2010 Edward Tomasz Napierała * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * ACL support routines specific to NFSv4 access control lists. These are * utility routines for code common across file systems implementing NFSv4 * ACLs. */ #ifdef _KERNEL #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #else #include #include #include #include #define KASSERT(a, b) assert(a) #define CTASSERT(a) #endif /* !_KERNEL */ #ifdef _KERNEL static void acl_nfs4_trivial_from_mode(struct acl *aclp, mode_t mode); static int acl_nfs4_old_semantics = 0; SYSCTL_INT(_vfs, OID_AUTO, acl_nfs4_old_semantics, CTLFLAG_RW, &acl_nfs4_old_semantics, 0, "Use pre-PSARC/2010/029 NFSv4 ACL semantics"); static struct { accmode_t accmode; int mask; } accmode2mask[] = {{VREAD, ACL_READ_DATA}, {VWRITE, ACL_WRITE_DATA}, {VAPPEND, ACL_APPEND_DATA}, {VEXEC, ACL_EXECUTE}, {VREAD_NAMED_ATTRS, ACL_READ_NAMED_ATTRS}, {VWRITE_NAMED_ATTRS, ACL_WRITE_NAMED_ATTRS}, {VDELETE_CHILD, ACL_DELETE_CHILD}, {VREAD_ATTRIBUTES, ACL_READ_ATTRIBUTES}, {VWRITE_ATTRIBUTES, ACL_WRITE_ATTRIBUTES}, {VDELETE, ACL_DELETE}, {VREAD_ACL, ACL_READ_ACL}, {VWRITE_ACL, ACL_WRITE_ACL}, {VWRITE_OWNER, ACL_WRITE_OWNER}, {VSYNCHRONIZE, ACL_SYNCHRONIZE}, {0, 0}}; static int _access_mask_from_accmode(accmode_t accmode) { int access_mask = 0, i; for (i = 0; accmode2mask[i].accmode != 0; i++) { if (accmode & accmode2mask[i].accmode) access_mask |= accmode2mask[i].mask; } /* * VAPPEND is just a modifier for VWRITE; if the caller asked * for 'VAPPEND | VWRITE', we want to check for ACL_APPEND_DATA only. */ if (access_mask & ACL_APPEND_DATA) access_mask &= ~ACL_WRITE_DATA; return (access_mask); } /* * Return 0, iff access is allowed, 1 otherwise. 
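The accmode_t translation above is a straight table walk with one special case for append. A minimal userspace sketch with illustrative bit values (not the kernel's):

/*
 * Userspace sketch of _access_mask_from_accmode()'s table walk.
 */
#include <stdio.h>

#define XVREAD          0x01
#define XVWRITE         0x02
#define XVAPPEND        0x04
#define XACL_READ       0x10
#define XACL_WRITE      0x20
#define XACL_APPEND     0x40

static const struct { int accmode, mask; } a2m[] = {
        { XVREAD, XACL_READ },
        { XVWRITE, XACL_WRITE },
        { XVAPPEND, XACL_APPEND },
        { 0, 0 }
};

static int
mask_from_accmode(int accmode)
{
        int i, mask = 0;

        for (i = 0; a2m[i].accmode != 0; i++)
                if (accmode & a2m[i].accmode)
                        mask |= a2m[i].mask;
        /* Append refines write: check only for the append ACL bit. */
        if (mask & XACL_APPEND)
                mask &= ~XACL_WRITE;
        return (mask);
}

int
main(void)
{
        printf("%#x\n", mask_from_accmode(XVWRITE | XVAPPEND)); /* 0x40 */
        return (0);
}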
*/ static int _acl_denies(const struct acl *aclp, int access_mask, struct ucred *cred, int file_uid, int file_gid, int *denied_explicitly) { int i; const struct acl_entry *entry; if (denied_explicitly != NULL) *denied_explicitly = 0; KASSERT(aclp->acl_cnt <= ACL_MAX_ENTRIES, ("aclp->acl_cnt <= ACL_MAX_ENTRIES")); for (i = 0; i < aclp->acl_cnt; i++) { entry = &(aclp->acl_entry[i]); if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW && entry->ae_entry_type != ACL_ENTRY_TYPE_DENY) continue; if (entry->ae_flags & ACL_ENTRY_INHERIT_ONLY) continue; switch (entry->ae_tag) { case ACL_USER_OBJ: if (file_uid != cred->cr_uid) continue; break; case ACL_USER: if (entry->ae_id != cred->cr_uid) continue; break; case ACL_GROUP_OBJ: if (!groupmember(file_gid, cred)) continue; break; case ACL_GROUP: if (!groupmember(entry->ae_id, cred)) continue; break; default: KASSERT(entry->ae_tag == ACL_EVERYONE, ("entry->ae_tag == ACL_EVERYONE")); } if (entry->ae_entry_type == ACL_ENTRY_TYPE_DENY) { if (entry->ae_perm & access_mask) { if (denied_explicitly != NULL) *denied_explicitly = 1; return (1); } } access_mask &= ~(entry->ae_perm); if (access_mask == 0) return (0); } if (access_mask == 0) return (0); return (1); } int vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *aclp, accmode_t accmode, struct ucred *cred, int *privused) { accmode_t priv_granted = 0; int denied, explicitly_denied, access_mask, is_directory, must_be_owner = 0; mode_t file_mode = 0; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND | VEXPLICIT_DENY | VREAD_NAMED_ATTRS | VWRITE_NAMED_ATTRS | VDELETE_CHILD | VREAD_ATTRIBUTES | VWRITE_ATTRIBUTES | VDELETE | VREAD_ACL | VWRITE_ACL | VWRITE_OWNER | VSYNCHRONIZE)) == 0, ("invalid bit in accmode")); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), ("VAPPEND without VWRITE")); if (privused != NULL) *privused = 0; if (accmode & VADMIN) must_be_owner = 1; /* * Ignore VSYNCHRONIZE permission. */ accmode &= ~VSYNCHRONIZE; access_mask = _access_mask_from_accmode(accmode); if (type == VDIR) is_directory = 1; else is_directory = 0; /* * File owner is always allowed to read and write the ACL * and basic attributes. This is to prevent a situation * where user would change ACL in a way that prevents him * from undoing the change. */ if (file_uid == cred->cr_uid) access_mask &= ~(ACL_READ_ACL | ACL_WRITE_ACL | ACL_READ_ATTRIBUTES | ACL_WRITE_ATTRIBUTES); /* * Ignore append permission for regular files; use write * permission instead. */ if (!is_directory && (access_mask & ACL_APPEND_DATA)) { access_mask &= ~ACL_APPEND_DATA; access_mask |= ACL_WRITE_DATA; } denied = _acl_denies(aclp, access_mask, cred, file_uid, file_gid, &explicitly_denied); if (must_be_owner) { if (file_uid != cred->cr_uid) denied = EPERM; } /* * For VEXEC, ensure that at least one execute bit is set for * non-directories. We have to check the mode here to stay * consistent with execve(2). See the test in * exec_check_permissions(). */ acl_nfs4_sync_mode_from_acl(&file_mode, aclp); if (!denied && !is_directory && (accmode & VEXEC) && (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) denied = EACCES; if (!denied) return (0); /* * Access failed. Iff it was not denied explicitly and * VEXPLICIT_DENY flag was specified, allow access. */ if ((accmode & VEXPLICIT_DENY) && explicitly_denied == 0) return (0); accmode &= ~VEXPLICIT_DENY; /* * No match. Try to use privileges, if there are any. 
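The evaluation order implemented by _acl_denies() is the essential NFSv4 semantic: entries are scanned first to last, an applicable DENY fails the request outright, matching entries clear the bits they cover, and leftover bits mean denial by default. The sketch below keeps only that control flow, dropping the who-matching (in the kernel, the mask clear after the deny check is equivalent to clearing on ALLOW entries only):

/*
 * Userspace sketch of the NFSv4 deny-walk. Simplified: no
 * who-matching, illustrative permission bits.
 */
#include <stdio.h>

enum etype { ALLOW, DENY };

struct ent { enum etype type; unsigned int perm; };

static int
denies(const struct ent *e, int cnt, unsigned int want)
{
        int i;

        for (i = 0; i < cnt; i++) {
                if (e[i].type == DENY && (e[i].perm & want))
                        return (1);     /* explicit deny wins */
                if (e[i].type == ALLOW) {
                        want &= ~e[i].perm;
                        if (want == 0)
                                return (0);     /* everything granted */
                }
        }
        return (want != 0);     /* leftover bits: denied by default */
}

int
main(void)
{
        struct ent acl[] = { { DENY, 0x2 }, { ALLOW, 0x3 } };

        printf("%d\n", denies(acl, 2, 0x1));    /* 0: read allowed */
        printf("%d\n", denies(acl, 2, 0x2));    /* 1: write denied first */
        return (0);
}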
*/ if (is_directory) { if ((accmode & VEXEC) && !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) priv_granted |= VEXEC; } else { /* * Ensure that at least one execute bit is on. Otherwise, * a privileged user will always succeed, and we don't want * this to happen unless the file really is executable. */ if ((accmode & VEXEC) && (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) priv_granted |= VEXEC; } if ((accmode & VREAD) && !priv_check_cred(cred, PRIV_VFS_READ, 0)) priv_granted |= VREAD; if ((accmode & (VWRITE | VAPPEND | VDELETE_CHILD)) && !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) priv_granted |= (VWRITE | VAPPEND | VDELETE_CHILD); if ((accmode & VADMIN_PERMS) && !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) priv_granted |= VADMIN_PERMS; if ((accmode & VSTAT_PERMS) && !priv_check_cred(cred, PRIV_VFS_STAT, 0)) priv_granted |= VSTAT_PERMS; if ((accmode & priv_granted) == accmode) { if (privused != NULL) *privused = 1; return (0); } if (accmode & (VADMIN_PERMS | VDELETE_CHILD | VDELETE)) denied = EPERM; else denied = EACCES; return (denied); } #endif /* _KERNEL */ static int _acl_entry_matches(struct acl_entry *entry, acl_tag_t tag, acl_perm_t perm, acl_entry_type_t entry_type) { if (entry->ae_tag != tag) return (0); if (entry->ae_id != ACL_UNDEFINED_ID) return (0); if (entry->ae_perm != perm) return (0); if (entry->ae_entry_type != entry_type) return (0); if (entry->ae_flags != 0) return (0); return (1); } static struct acl_entry * _acl_append(struct acl *aclp, acl_tag_t tag, acl_perm_t perm, acl_entry_type_t entry_type) { struct acl_entry *entry; KASSERT(aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES, ("aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES")); entry = &(aclp->acl_entry[aclp->acl_cnt]); aclp->acl_cnt++; entry->ae_tag = tag; entry->ae_id = ACL_UNDEFINED_ID; entry->ae_perm = perm; entry->ae_entry_type = entry_type; entry->ae_flags = 0; return (entry); } static struct acl_entry * _acl_duplicate_entry(struct acl *aclp, int entry_index) { int i; KASSERT(aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES, ("aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES")); for (i = aclp->acl_cnt; i > entry_index; i--) aclp->acl_entry[i] = aclp->acl_entry[i - 1]; aclp->acl_cnt++; return (&(aclp->acl_entry[entry_index + 1])); } static void acl_nfs4_sync_acl_from_mode_draft(struct acl *aclp, mode_t mode, int file_owner_id) { int i, meets, must_append; struct acl_entry *entry, *copy, *previous, *a1, *a2, *a3, *a4, *a5, *a6; mode_t amode; const int READ = 04; const int WRITE = 02; const int EXEC = 01; KASSERT(aclp->acl_cnt <= ACL_MAX_ENTRIES, ("aclp->acl_cnt <= ACL_MAX_ENTRIES")); /* * NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt * * 3.16.6.3. Applying a Mode to an Existing ACL */ /* * 1. For each ACE: */ for (i = 0; i < aclp->acl_cnt; i++) { entry = &(aclp->acl_entry[i]); /* * 1.1. If the type is neither ALLOW or DENY - skip. */ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW && entry->ae_entry_type != ACL_ENTRY_TYPE_DENY) continue; /* * 1.2. If ACL_ENTRY_INHERIT_ONLY is set - skip. */ if (entry->ae_flags & ACL_ENTRY_INHERIT_ONLY) continue; /* * 1.3. If ACL_ENTRY_FILE_INHERIT or ACL_ENTRY_DIRECTORY_INHERIT * are set: */ if (entry->ae_flags & (ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT)) { /* * 1.3.1. A copy of the current ACE is made, and placed * in the ACL immediately following the current * ACE. */ copy = _acl_duplicate_entry(aclp, i); /* * 1.3.2. In the first ACE, the flag * ACL_ENTRY_INHERIT_ONLY is set. */ entry->ae_flags |= ACL_ENTRY_INHERIT_ONLY; /* * 1.3.3. 
In the second ACE, the following flags * are cleared: * ACL_ENTRY_FILE_INHERIT, * ACL_ENTRY_DIRECTORY_INHERIT, * ACL_ENTRY_NO_PROPAGATE_INHERIT. */ copy->ae_flags &= ~(ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT | ACL_ENTRY_NO_PROPAGATE_INHERIT); /* * The algorithm continues on with the second ACE. */ i++; entry = copy; } /* * 1.4. If it's owner@, group@ or everyone@ entry, clear * ACL_READ_DATA, ACL_WRITE_DATA, ACL_APPEND_DATA * and ACL_EXECUTE. Continue to the next entry. */ if (entry->ae_tag == ACL_USER_OBJ || entry->ae_tag == ACL_GROUP_OBJ || entry->ae_tag == ACL_EVERYONE) { entry->ae_perm &= ~(ACL_READ_DATA | ACL_WRITE_DATA | ACL_APPEND_DATA | ACL_EXECUTE); continue; } /* * 1.5. Otherwise, if the "who" field did not match one * of OWNER@, GROUP@, EVERYONE@: * * 1.5.1. If the type is ALLOW, check the preceding ACE. * If it does not meet all of the following criteria: */ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW) continue; meets = 0; if (i > 0) { meets = 1; previous = &(aclp->acl_entry[i - 1]); /* * 1.5.1.1. The type field is DENY, */ if (previous->ae_entry_type != ACL_ENTRY_TYPE_DENY) meets = 0; /* * 1.5.1.2. The "who" field is the same as the current * ACE, * * 1.5.1.3. The flag bit ACE4_IDENTIFIER_GROUP * is the same as it is in the current ACE, * and no other flag bits are set, */ if (previous->ae_id != entry->ae_id || previous->ae_tag != entry->ae_tag) meets = 0; if (previous->ae_flags) meets = 0; /* * 1.5.1.4. The mask bits are a subset of the mask bits * of the current ACE, and are also subset of * the following: ACL_READ_DATA, * ACL_WRITE_DATA, ACL_APPEND_DATA, ACL_EXECUTE */ if (previous->ae_perm & ~(entry->ae_perm)) meets = 0; if (previous->ae_perm & ~(ACL_READ_DATA | ACL_WRITE_DATA | ACL_APPEND_DATA | ACL_EXECUTE)) meets = 0; } if (!meets) { /* * Then the ACE of type DENY, with a who equal * to the current ACE, flag bits equal to * ( & ) * and no mask bits, is prepended. */ previous = entry; entry = _acl_duplicate_entry(aclp, i); /* Adjust counter, as we've just added an entry. */ i++; previous->ae_tag = entry->ae_tag; previous->ae_id = entry->ae_id; previous->ae_flags = entry->ae_flags; previous->ae_perm = 0; previous->ae_entry_type = ACL_ENTRY_TYPE_DENY; } /* * 1.5.2. The following modifications are made to the prepended * ACE. The intent is to mask the following ACE * to disallow ACL_READ_DATA, ACL_WRITE_DATA, * ACL_APPEND_DATA, or ACL_EXECUTE, based upon the group * permissions of the new mode. As a special case, * if the ACE matches the current owner of the file, * the owner bits are used, rather than the group bits. * This is reflected in the algorithm below. */ amode = mode >> 3; /* * If ACE4_IDENTIFIER_GROUP is not set, and the "who" field * in ACE matches the owner of the file, we shift amode three * more bits, in order to have the owner permission bits * placed in the three low order bits of amode. */ if (entry->ae_tag == ACL_USER && entry->ae_id == file_owner_id) amode = amode >> 3; if (entry->ae_perm & ACL_READ_DATA) { if (amode & READ) previous->ae_perm &= ~ACL_READ_DATA; else previous->ae_perm |= ACL_READ_DATA; } if (entry->ae_perm & ACL_WRITE_DATA) { if (amode & WRITE) previous->ae_perm &= ~ACL_WRITE_DATA; else previous->ae_perm |= ACL_WRITE_DATA; } if (entry->ae_perm & ACL_APPEND_DATA) { if (amode & WRITE) previous->ae_perm &= ~ACL_APPEND_DATA; else previous->ae_perm |= ACL_APPEND_DATA; } if (entry->ae_perm & ACL_EXECUTE) { if (amode & EXEC) previous->ae_perm &= ~ACL_EXECUTE; else previous->ae_perm |= ACL_EXECUTE; } /* * 1.5.3. 
If ACE4_IDENTIFIER_GROUP is set in the flags * of the ALLOW ace: * * XXX: This point is not there in the Falkner's draft. */ if (entry->ae_tag == ACL_GROUP && entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) { mode_t extramode, ownermode; extramode = (mode >> 3) & 07; ownermode = mode >> 6; extramode &= ~ownermode; if (extramode) { if (extramode & READ) { entry->ae_perm &= ~ACL_READ_DATA; previous->ae_perm &= ~ACL_READ_DATA; } if (extramode & WRITE) { entry->ae_perm &= ~(ACL_WRITE_DATA | ACL_APPEND_DATA); previous->ae_perm &= ~(ACL_WRITE_DATA | ACL_APPEND_DATA); } if (extramode & EXEC) { entry->ae_perm &= ~ACL_EXECUTE; previous->ae_perm &= ~ACL_EXECUTE; } } } } /* * 2. If there at least six ACEs, the final six ACEs are examined. * If they are not equal to what we want, append six ACEs. */ must_append = 0; if (aclp->acl_cnt < 6) { must_append = 1; } else { a6 = &(aclp->acl_entry[aclp->acl_cnt - 1]); a5 = &(aclp->acl_entry[aclp->acl_cnt - 2]); a4 = &(aclp->acl_entry[aclp->acl_cnt - 3]); a3 = &(aclp->acl_entry[aclp->acl_cnt - 4]); a2 = &(aclp->acl_entry[aclp->acl_cnt - 5]); a1 = &(aclp->acl_entry[aclp->acl_cnt - 6]); if (!_acl_entry_matches(a1, ACL_USER_OBJ, 0, ACL_ENTRY_TYPE_DENY)) must_append = 1; if (!_acl_entry_matches(a2, ACL_USER_OBJ, ACL_WRITE_ACL | ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS, ACL_ENTRY_TYPE_ALLOW)) must_append = 1; if (!_acl_entry_matches(a3, ACL_GROUP_OBJ, 0, ACL_ENTRY_TYPE_DENY)) must_append = 1; if (!_acl_entry_matches(a4, ACL_GROUP_OBJ, 0, ACL_ENTRY_TYPE_ALLOW)) must_append = 1; if (!_acl_entry_matches(a5, ACL_EVERYONE, ACL_WRITE_ACL | ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS, ACL_ENTRY_TYPE_DENY)) must_append = 1; if (!_acl_entry_matches(a6, ACL_EVERYONE, ACL_READ_ACL | ACL_READ_ATTRIBUTES | ACL_READ_NAMED_ATTRS | ACL_SYNCHRONIZE, ACL_ENTRY_TYPE_ALLOW)) must_append = 1; } if (must_append) { KASSERT(aclp->acl_cnt + 6 <= ACL_MAX_ENTRIES, ("aclp->acl_cnt <= ACL_MAX_ENTRIES")); a1 = _acl_append(aclp, ACL_USER_OBJ, 0, ACL_ENTRY_TYPE_DENY); a2 = _acl_append(aclp, ACL_USER_OBJ, ACL_WRITE_ACL | ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS, ACL_ENTRY_TYPE_ALLOW); a3 = _acl_append(aclp, ACL_GROUP_OBJ, 0, ACL_ENTRY_TYPE_DENY); a4 = _acl_append(aclp, ACL_GROUP_OBJ, 0, ACL_ENTRY_TYPE_ALLOW); a5 = _acl_append(aclp, ACL_EVERYONE, ACL_WRITE_ACL | ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS, ACL_ENTRY_TYPE_DENY); a6 = _acl_append(aclp, ACL_EVERYONE, ACL_READ_ACL | ACL_READ_ATTRIBUTES | ACL_READ_NAMED_ATTRS | ACL_SYNCHRONIZE, ACL_ENTRY_TYPE_ALLOW); KASSERT(a1 != NULL && a2 != NULL && a3 != NULL && a4 != NULL && a5 != NULL && a6 != NULL, ("couldn't append to ACL.")); } /* * 3. The final six ACEs are adjusted according to the incoming mode. 
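The "final six" test above reduces to comparing the ACL tail against six fixed (tag, permissions, type) tuples. A compact sketch with illustrative stand-ins for the ACL_* constants:

/*
 * Userspace sketch of the canonical-six tail check.
 */
#include <stdbool.h>
#include <stdio.h>

enum xtag { OWNER, GROUP, EVERYONE };
enum xtype { DENY, ALLOW };

struct ent { enum xtag tag; int perm; enum xtype type; int flags, id; };

#define UNDEF_ID        (-1)
#define P_WRITE_STUFF   0x1     /* illustrative: write acl/owner/attrs */
#define P_READ_STUFF    0x2     /* illustrative: read acl/attrs/sync */

static bool
entry_matches(const struct ent *e, enum xtag tag, int perm, enum xtype type)
{
        return (e->tag == tag && e->perm == perm && e->type == type &&
            e->flags == 0 && e->id == UNDEF_ID);
}

static bool
has_canonical_six(const struct ent *a, int cnt)
{
        if (cnt < 6)
                return (false);
        a += cnt - 6;   /* examine the final six entries */
        return (entry_matches(&a[0], OWNER, 0, DENY) &&
            entry_matches(&a[1], OWNER, P_WRITE_STUFF, ALLOW) &&
            entry_matches(&a[2], GROUP, 0, DENY) &&
            entry_matches(&a[3], GROUP, 0, ALLOW) &&
            entry_matches(&a[4], EVERYONE, P_WRITE_STUFF, DENY) &&
            entry_matches(&a[5], EVERYONE, P_READ_STUFF, ALLOW));
}

int
main(void)
{
        struct ent six[6] = {
                { OWNER, 0, DENY, 0, UNDEF_ID },
                { OWNER, P_WRITE_STUFF, ALLOW, 0, UNDEF_ID },
                { GROUP, 0, DENY, 0, UNDEF_ID },
                { GROUP, 0, ALLOW, 0, UNDEF_ID },
                { EVERYONE, P_WRITE_STUFF, DENY, 0, UNDEF_ID },
                { EVERYONE, P_READ_STUFF, ALLOW, 0, UNDEF_ID },
        };

        printf("%d\n", has_canonical_six(six, 6));      /* 1 */
        return (0);
}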
*/ if (mode & S_IRUSR) a2->ae_perm |= ACL_READ_DATA; else a1->ae_perm |= ACL_READ_DATA; if (mode & S_IWUSR) a2->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA); else a1->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA); if (mode & S_IXUSR) a2->ae_perm |= ACL_EXECUTE; else a1->ae_perm |= ACL_EXECUTE; if (mode & S_IRGRP) a4->ae_perm |= ACL_READ_DATA; else a3->ae_perm |= ACL_READ_DATA; if (mode & S_IWGRP) a4->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA); else a3->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA); if (mode & S_IXGRP) a4->ae_perm |= ACL_EXECUTE; else a3->ae_perm |= ACL_EXECUTE; if (mode & S_IROTH) a6->ae_perm |= ACL_READ_DATA; else a5->ae_perm |= ACL_READ_DATA; if (mode & S_IWOTH) a6->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA); else a5->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA); if (mode & S_IXOTH) a6->ae_perm |= ACL_EXECUTE; else a5->ae_perm |= ACL_EXECUTE; } #ifdef _KERNEL void acl_nfs4_sync_acl_from_mode(struct acl *aclp, mode_t mode, int file_owner_id) { if (acl_nfs4_old_semantics) acl_nfs4_sync_acl_from_mode_draft(aclp, mode, file_owner_id); else acl_nfs4_trivial_from_mode(aclp, mode); } #endif /* _KERNEL */ void acl_nfs4_sync_mode_from_acl(mode_t *_mode, const struct acl *aclp) { int i; mode_t old_mode = *_mode, mode = 0, seen = 0; const struct acl_entry *entry; KASSERT(aclp->acl_cnt <= ACL_MAX_ENTRIES, ("aclp->acl_cnt <= ACL_MAX_ENTRIES")); /* * NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt * * 3.16.6.1. Recomputing mode upon SETATTR of ACL */ for (i = 0; i < aclp->acl_cnt; i++) { entry = &(aclp->acl_entry[i]); if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW && entry->ae_entry_type != ACL_ENTRY_TYPE_DENY) continue; if (entry->ae_flags & ACL_ENTRY_INHERIT_ONLY) continue; if (entry->ae_tag == ACL_USER_OBJ) { if ((entry->ae_perm & ACL_READ_DATA) && ((seen & S_IRUSR) == 0)) { seen |= S_IRUSR; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IRUSR; } if ((entry->ae_perm & ACL_WRITE_DATA) && ((seen & S_IWUSR) == 0)) { seen |= S_IWUSR; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IWUSR; } if ((entry->ae_perm & ACL_EXECUTE) && ((seen & S_IXUSR) == 0)) { seen |= S_IXUSR; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IXUSR; } } else if (entry->ae_tag == ACL_GROUP_OBJ) { if ((entry->ae_perm & ACL_READ_DATA) && ((seen & S_IRGRP) == 0)) { seen |= S_IRGRP; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IRGRP; } if ((entry->ae_perm & ACL_WRITE_DATA) && ((seen & S_IWGRP) == 0)) { seen |= S_IWGRP; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IWGRP; } if ((entry->ae_perm & ACL_EXECUTE) && ((seen & S_IXGRP) == 0)) { seen |= S_IXGRP; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IXGRP; } } else if (entry->ae_tag == ACL_EVERYONE) { if (entry->ae_perm & ACL_READ_DATA) { if ((seen & S_IRUSR) == 0) { seen |= S_IRUSR; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IRUSR; } if ((seen & S_IRGRP) == 0) { seen |= S_IRGRP; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IRGRP; } if ((seen & S_IROTH) == 0) { seen |= S_IROTH; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IROTH; } } if (entry->ae_perm & ACL_WRITE_DATA) { if ((seen & S_IWUSR) == 0) { seen |= S_IWUSR; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IWUSR; } if ((seen & S_IWGRP) == 0) { seen |= S_IWGRP; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IWGRP; } if ((seen & S_IWOTH) == 0) { seen |= S_IWOTH; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IWOTH; } } if 
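The mode recomputation in acl_nfs4_sync_mode_from_acl() hinges on its "seen" mask: only the first entry mentioning a given rwx bit decides it, and everyone@ can settle owner, group, and other bits at once. Reduced here to the owner read bit for clarity:

/*
 * Userspace sketch of first-match-wins mode recomputation.
 */
#include <stdio.h>

enum etype { ALLOW, DENY };

struct ent { int is_owner; enum etype type; int reads; };

static int
owner_read_bit(const struct ent *e, int cnt)
{
        int i, seen = 0, mode = 0;

        for (i = 0; i < cnt; i++) {
                if (!e[i].is_owner || !e[i].reads || seen)
                        continue;
                seen = 1;       /* later entries cannot override */
                if (e[i].type == ALLOW)
                        mode = 1;
        }
        return (mode);
}

int
main(void)
{
        struct ent acl[] = {
                { 1, DENY, 1 },         /* owner@ denied read first... */
                { 1, ALLOW, 1 },        /* ...so this allow is ignored */
        };

        printf("%d\n", owner_read_bit(acl, 2)); /* 0 */
        return (0);
}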
(entry->ae_perm & ACL_EXECUTE) { if ((seen & S_IXUSR) == 0) { seen |= S_IXUSR; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IXUSR; } if ((seen & S_IXGRP) == 0) { seen |= S_IXGRP; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IXGRP; } if ((seen & S_IXOTH) == 0) { seen |= S_IXOTH; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IXOTH; } } } } *_mode = mode | (old_mode & ACL_PRESERVE_MASK); } #ifdef _KERNEL /* * Calculate inherited ACL in a manner compatible with NFSv4 Minor Version 1, * draft-ietf-nfsv4-minorversion1-03.txt. */ static void acl_nfs4_compute_inherited_acl_draft(const struct acl *parent_aclp, struct acl *child_aclp, mode_t mode, int file_owner_id, int is_directory) { int i, flags; const struct acl_entry *parent_entry; struct acl_entry *entry, *copy; KASSERT(child_aclp->acl_cnt == 0, ("child_aclp->acl_cnt == 0")); KASSERT(parent_aclp->acl_cnt <= ACL_MAX_ENTRIES, ("parent_aclp->acl_cnt <= ACL_MAX_ENTRIES")); /* * NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt * * 3.16.6.2. Applying the mode given to CREATE or OPEN * to an inherited ACL */ /* * 1. Form an ACL that is the concatenation of all inheritable ACEs. */ for (i = 0; i < parent_aclp->acl_cnt; i++) { parent_entry = &(parent_aclp->acl_entry[i]); flags = parent_entry->ae_flags; /* * Entry is not inheritable at all. */ if ((flags & (ACL_ENTRY_DIRECTORY_INHERIT | ACL_ENTRY_FILE_INHERIT)) == 0) continue; /* * We're creating a file, but entry is not inheritable * by files. */ if (!is_directory && (flags & ACL_ENTRY_FILE_INHERIT) == 0) continue; /* * Entry is inheritable only by files, but has NO_PROPAGATE * flag set, and we're creating a directory, so it wouldn't * propagate to any file in that directory anyway. */ if (is_directory && (flags & ACL_ENTRY_DIRECTORY_INHERIT) == 0 && (flags & ACL_ENTRY_NO_PROPAGATE_INHERIT)) continue; KASSERT(child_aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES, ("child_aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES")); child_aclp->acl_entry[child_aclp->acl_cnt] = *parent_entry; child_aclp->acl_cnt++; } /* * 2. For each entry in the new ACL, adjust its flags, possibly * creating two entries in place of one. */ for (i = 0; i < child_aclp->acl_cnt; i++) { entry = &(child_aclp->acl_entry[i]); /* * This is not in the specification, but SunOS * apparently does that. */ if (((entry->ae_flags & ACL_ENTRY_NO_PROPAGATE_INHERIT) || !is_directory) && entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER); /* * 2.A. If the ACL_ENTRY_NO_PROPAGATE_INHERIT is set, or if the object * being created is not a directory, then clear the * following flags: ACL_ENTRY_NO_PROPAGATE_INHERIT, * ACL_ENTRY_FILE_INHERIT, ACL_ENTRY_DIRECTORY_INHERIT, * ACL_ENTRY_INHERIT_ONLY. */ if (entry->ae_flags & ACL_ENTRY_NO_PROPAGATE_INHERIT || !is_directory) { entry->ae_flags &= ~(ACL_ENTRY_NO_PROPAGATE_INHERIT | ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT | ACL_ENTRY_INHERIT_ONLY); /* * Continue on to the next ACE. */ continue; } /* * 2.B. If the object is a directory and ACL_ENTRY_FILE_INHERIT * is set, but ACL_ENTRY_NO_PROPAGATE_INHERIT is not set, ensure * that ACL_ENTRY_INHERIT_ONLY is set. Continue to the * next ACE. Otherwise... */ /* * XXX: Read it again and make sure what does the "otherwise" * apply to. */ if (is_directory && (entry->ae_flags & ACL_ENTRY_FILE_INHERIT) && ((entry->ae_flags & ACL_ENTRY_DIRECTORY_INHERIT) == 0)) { entry->ae_flags |= ACL_ENTRY_INHERIT_ONLY; continue; } /* * 2.C. 
If the type of the ACE is neither ALLOW nor deny, * then continue. */ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW && entry->ae_entry_type != ACL_ENTRY_TYPE_DENY) continue; /* * 2.D. Copy the original ACE into a second, adjacent ACE. */ copy = _acl_duplicate_entry(child_aclp, i); /* * 2.E. On the first ACE, ensure that ACL_ENTRY_INHERIT_ONLY * is set. */ entry->ae_flags |= ACL_ENTRY_INHERIT_ONLY; /* * 2.F. On the second ACE, clear the following flags: * ACL_ENTRY_NO_PROPAGATE_INHERIT, ACL_ENTRY_FILE_INHERIT, * ACL_ENTRY_DIRECTORY_INHERIT, ACL_ENTRY_INHERIT_ONLY. */ copy->ae_flags &= ~(ACL_ENTRY_NO_PROPAGATE_INHERIT | ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT | ACL_ENTRY_INHERIT_ONLY); /* * 2.G. On the second ACE, if the type is ALLOW, * an implementation MAY clear the following * mask bits: ACL_WRITE_ACL, ACL_WRITE_OWNER. */ if (copy->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) copy->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER); /* * Increment the counter to skip the copied entry. */ i++; } /* * 3. To ensure that the mode is honored, apply the algorithm describe * in Section 2.16.6.3, using the mode that is to be used for file * creation. */ acl_nfs4_sync_acl_from_mode(child_aclp, mode, file_owner_id); } #endif /* _KERNEL */ /* * Populate the ACL with entries inherited from parent_aclp. */ static void acl_nfs4_inherit_entries(const struct acl *parent_aclp, struct acl *child_aclp, mode_t mode, int file_owner_id, int is_directory) { int i, flags, tag; const struct acl_entry *parent_entry; struct acl_entry *entry; KASSERT(parent_aclp->acl_cnt <= ACL_MAX_ENTRIES, ("parent_aclp->acl_cnt <= ACL_MAX_ENTRIES")); for (i = 0; i < parent_aclp->acl_cnt; i++) { parent_entry = &(parent_aclp->acl_entry[i]); flags = parent_entry->ae_flags; tag = parent_entry->ae_tag; /* * Don't inherit owner@, group@, or everyone@ entries. */ if (tag == ACL_USER_OBJ || tag == ACL_GROUP_OBJ || tag == ACL_EVERYONE) continue; /* * Entry is not inheritable at all. */ if ((flags & (ACL_ENTRY_DIRECTORY_INHERIT | ACL_ENTRY_FILE_INHERIT)) == 0) continue; /* * We're creating a file, but entry is not inheritable * by files. */ if (!is_directory && (flags & ACL_ENTRY_FILE_INHERIT) == 0) continue; /* * Entry is inheritable only by files, but has NO_PROPAGATE * flag set, and we're creating a directory, so it wouldn't * propagate to any file in that directory anyway. */ if (is_directory && (flags & ACL_ENTRY_DIRECTORY_INHERIT) == 0 && (flags & ACL_ENTRY_NO_PROPAGATE_INHERIT)) continue; /* * Entry qualifies for being inherited. */ KASSERT(child_aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES, ("child_aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES")); entry = &(child_aclp->acl_entry[child_aclp->acl_cnt]); *entry = *parent_entry; child_aclp->acl_cnt++; entry->ae_flags &= ~ACL_ENTRY_INHERIT_ONLY; entry->ae_flags |= ACL_ENTRY_INHERITED; /* * If the type of the ACE is neither ALLOW nor DENY, * then leave it as it is and proceed to the next one. */ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW && entry->ae_entry_type != ACL_ENTRY_TYPE_DENY) continue; /* * If the ACL_ENTRY_NO_PROPAGATE_INHERIT is set, or if * the object being created is not a directory, then clear * the following flags: ACL_ENTRY_NO_PROPAGATE_INHERIT, * ACL_ENTRY_FILE_INHERIT, ACL_ENTRY_DIRECTORY_INHERIT, * ACL_ENTRY_INHERIT_ONLY. 
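The three inheritability tests used here (and in acl_nfs4_inherit_entries()) compose into one predicate; a minimal sketch with illustrative flag values:

/*
 * Userspace sketch of the inheritability filter.
 */
#include <stdbool.h>
#include <stdio.h>

#define FL_FILE_INHERIT 0x1
#define FL_DIR_INHERIT  0x2
#define FL_NO_PROPAGATE 0x4

static bool
inheritable(int flags, bool is_directory)
{
        if ((flags & (FL_FILE_INHERIT | FL_DIR_INHERIT)) == 0)
                return (false); /* not inheritable at all */
        if (!is_directory && (flags & FL_FILE_INHERIT) == 0)
                return (false); /* files inherit only file entries */
        if (is_directory && (flags & FL_DIR_INHERIT) == 0 &&
            (flags & FL_NO_PROPAGATE))
                return (false); /* could never reach a file anyway */
        return (true);
}

int
main(void)
{
        printf("%d\n", inheritable(FL_FILE_INHERIT, false));    /* 1 */
        printf("%d\n", inheritable(FL_FILE_INHERIT | FL_NO_PROPAGATE,
            true));                                             /* 0 */
        return (0);
}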
*/ if (entry->ae_flags & ACL_ENTRY_NO_PROPAGATE_INHERIT || !is_directory) { entry->ae_flags &= ~(ACL_ENTRY_NO_PROPAGATE_INHERIT | ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT | ACL_ENTRY_INHERIT_ONLY); } /* * If the object is a directory and ACL_ENTRY_FILE_INHERIT * is set, but ACL_ENTRY_DIRECTORY_INHERIT is not set, ensure * that ACL_ENTRY_INHERIT_ONLY is set. */ if (is_directory && (entry->ae_flags & ACL_ENTRY_FILE_INHERIT) && ((entry->ae_flags & ACL_ENTRY_DIRECTORY_INHERIT) == 0)) { entry->ae_flags |= ACL_ENTRY_INHERIT_ONLY; } if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW && (entry->ae_flags & ACL_ENTRY_INHERIT_ONLY) == 0) { /* * Some permissions must never be inherited. */ entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER | ACL_WRITE_NAMED_ATTRS | ACL_WRITE_ATTRIBUTES); /* * Others must be masked according to the file mode. */ if ((mode & S_IRGRP) == 0) entry->ae_perm &= ~ACL_READ_DATA; if ((mode & S_IWGRP) == 0) entry->ae_perm &= ~(ACL_WRITE_DATA | ACL_APPEND_DATA); if ((mode & S_IXGRP) == 0) entry->ae_perm &= ~ACL_EXECUTE; } } } /* * Calculate inherited ACL in a manner compatible with PSARC/2010/029. * It's also being used to calculate a trivial ACL, by inheriting from * a NULL ACL. */ static void acl_nfs4_compute_inherited_acl_psarc(const struct acl *parent_aclp, struct acl *aclp, mode_t mode, int file_owner_id, int is_directory) { acl_perm_t user_allow_first = 0, user_deny = 0, group_deny = 0; acl_perm_t user_allow, group_allow, everyone_allow; KASSERT(aclp->acl_cnt == 0, ("aclp->acl_cnt == 0")); user_allow = group_allow = everyone_allow = ACL_READ_ACL | ACL_READ_ATTRIBUTES | ACL_READ_NAMED_ATTRS | ACL_SYNCHRONIZE; user_allow |= ACL_WRITE_ACL | ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS; if (mode & S_IRUSR) user_allow |= ACL_READ_DATA; if (mode & S_IWUSR) user_allow |= (ACL_WRITE_DATA | ACL_APPEND_DATA); if (mode & S_IXUSR) user_allow |= ACL_EXECUTE; if (mode & S_IRGRP) group_allow |= ACL_READ_DATA; if (mode & S_IWGRP) group_allow |= (ACL_WRITE_DATA | ACL_APPEND_DATA); if (mode & S_IXGRP) group_allow |= ACL_EXECUTE; if (mode & S_IROTH) everyone_allow |= ACL_READ_DATA; if (mode & S_IWOTH) everyone_allow |= (ACL_WRITE_DATA | ACL_APPEND_DATA); if (mode & S_IXOTH) everyone_allow |= ACL_EXECUTE; user_deny = ((group_allow | everyone_allow) & ~user_allow); group_deny = everyone_allow & ~group_allow; user_allow_first = group_deny & ~user_deny; if (user_allow_first != 0) _acl_append(aclp, ACL_USER_OBJ, user_allow_first, ACL_ENTRY_TYPE_ALLOW); if (user_deny != 0) _acl_append(aclp, ACL_USER_OBJ, user_deny, ACL_ENTRY_TYPE_DENY); if (group_deny != 0) _acl_append(aclp, ACL_GROUP_OBJ, group_deny, ACL_ENTRY_TYPE_DENY); if (parent_aclp != NULL) acl_nfs4_inherit_entries(parent_aclp, aclp, mode, file_owner_id, is_directory); _acl_append(aclp, ACL_USER_OBJ, user_allow, ACL_ENTRY_TYPE_ALLOW); _acl_append(aclp, ACL_GROUP_OBJ, group_allow, ACL_ENTRY_TYPE_ALLOW); _acl_append(aclp, ACL_EVERYONE, everyone_allow, ACL_ENTRY_TYPE_ALLOW); } #ifdef _KERNEL void acl_nfs4_compute_inherited_acl(const struct acl *parent_aclp, struct acl *child_aclp, mode_t mode, int file_owner_id, int is_directory) { if (acl_nfs4_old_semantics) acl_nfs4_compute_inherited_acl_draft(parent_aclp, child_aclp, mode, file_owner_id, is_directory); else acl_nfs4_compute_inherited_acl_psarc(parent_aclp, child_aclp, mode, file_owner_id, is_directory); } #endif /* _KERNEL */ /* * Calculate trivial ACL in a manner compatible with PSARC/2010/029. 
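To see why acl_nfs4_compute_inherited_acl_psarc() above may need deny entries, and sometimes an owner@ allow ahead of them, consider mode 0647 (owner rw-, group r--, other rwx). Since ACL entries are evaluated in order, the group@ deny of write would also hit the owner unless the owner is allowed write first; that is exactly what user_allow_first captures. A standalone re-run of the arithmetic, with local stand-ins for the ACL_* permission bits:

	#include <stdio.h>

	#define READ_DATA	0x01
	#define WRITE_DATA	0x02
	#define APPEND_DATA	0x04
	#define EXECUTE		0x08

	int
	main(void)
	{
		/* mode 0647: owner rw-, group r--, other rwx */
		int user_allow = READ_DATA | WRITE_DATA | APPEND_DATA;
		int group_allow = READ_DATA;
		int everyone_allow = READ_DATA | WRITE_DATA | APPEND_DATA |
		    EXECUTE;
		int user_deny, group_deny, user_allow_first;

		/* Same arithmetic as acl_nfs4_compute_inherited_acl_psarc(). */
		user_deny = (group_allow | everyone_allow) & ~user_allow;
		group_deny = everyone_allow & ~group_allow;
		user_allow_first = group_deny & ~user_deny;

		printf("user_deny %#x group_deny %#x user_allow_first %#x\n",
		    user_deny, group_deny, user_allow_first);
		/* Prints: user_deny 0x8 group_deny 0xe user_allow_first 0x6 */
		return (0);
	}

So the owner is denied execute, the group is denied write/append/execute, and write/append must be allowed to the owner before the group@ deny entry is appended.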
* Note that this results in an ACL different from (but semantically * equal to) the "canonical six" trivial ACL computed using algorithm * described in draft-ietf-nfsv4-minorversion1-03.txt, 3.16.6.2. */ static void acl_nfs4_trivial_from_mode(struct acl *aclp, mode_t mode) { aclp->acl_cnt = 0; acl_nfs4_compute_inherited_acl_psarc(NULL, aclp, mode, -1, -1); } #ifndef _KERNEL /* * This routine is used by libc to implement acl_strip_np(3) * and acl_is_trivial_np(3). */ void acl_nfs4_trivial_from_mode_libc(struct acl *aclp, int mode, int canonical_six) { aclp->acl_cnt = 0; if (canonical_six) acl_nfs4_sync_acl_from_mode_draft(aclp, mode, -1); else acl_nfs4_trivial_from_mode(aclp, mode); } #endif /* !_KERNEL */ #ifdef _KERNEL static int _acls_are_equal(const struct acl *a, const struct acl *b) { int i; const struct acl_entry *entrya, *entryb; if (a->acl_cnt != b->acl_cnt) return (0); for (i = 0; i < b->acl_cnt; i++) { entrya = &(a->acl_entry[i]); entryb = &(b->acl_entry[i]); if (entrya->ae_tag != entryb->ae_tag || entrya->ae_id != entryb->ae_id || entrya->ae_perm != entryb->ae_perm || entrya->ae_entry_type != entryb->ae_entry_type || entrya->ae_flags != entryb->ae_flags) return (0); } return (1); } /* * This routine is used to determine whether to remove extended attribute * that stores ACL contents. */ int acl_nfs4_is_trivial(const struct acl *aclp, int file_owner_id) { int trivial; mode_t tmpmode = 0; struct acl *tmpaclp; if (aclp->acl_cnt > 6) return (0); /* * Compute the mode from the ACL, then compute new ACL from that mode. * If the ACLs are identical, then the ACL is trivial. * * XXX: I guess there is a faster way to do this. However, even * this slow implementation significantly speeds things up * for files that don't have non-trivial ACLs - it's critical * for performance to not use EA when they are not needed. * * First try the PSARC/2010/029 semantics. */ tmpaclp = acl_alloc(M_WAITOK | M_ZERO); acl_nfs4_sync_mode_from_acl(&tmpmode, aclp); acl_nfs4_trivial_from_mode(tmpaclp, tmpmode); trivial = _acls_are_equal(aclp, tmpaclp); if (trivial) { acl_free(tmpaclp); return (trivial); } /* * Check if it's a draft-ietf-nfsv4-minorversion1-03.txt trivial ACL. */ tmpaclp->acl_cnt = 0; acl_nfs4_sync_acl_from_mode_draft(tmpaclp, tmpmode, file_owner_id); trivial = _acls_are_equal(aclp, tmpaclp); acl_free(tmpaclp); return (trivial); } #endif /* _KERNEL */ int acl_nfs4_check(const struct acl *aclp, int is_directory) { int i; const struct acl_entry *entry; /* * The spec doesn't seem to say anything about ACL validity. * It seems there is not much to do here. There is even no need * to count "owner@" or "everyone@" (ACL_USER_OBJ and ACL_EVERYONE) * entries, as there can be several of them and that's perfectly * valid. There can be none of them too. Really. */ if (aclp->acl_cnt > ACL_MAX_ENTRIES || aclp->acl_cnt <= 0) return (EINVAL); for (i = 0; i < aclp->acl_cnt; i++) { entry = &(aclp->acl_entry[i]); switch (entry->ae_tag) { case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_EVERYONE: if (entry->ae_id != ACL_UNDEFINED_ID) return (EINVAL); break; case ACL_USER: case ACL_GROUP: if (entry->ae_id == ACL_UNDEFINED_ID) return (EINVAL); break; default: return (EINVAL); } if ((entry->ae_perm | ACL_NFS4_PERM_BITS) != ACL_NFS4_PERM_BITS) return (EINVAL); /* * Disallow ACL_ENTRY_TYPE_AUDIT and ACL_ENTRY_TYPE_ALARM for now. 
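The permission check in acl_nfs4_check() above uses the subset idiom (x | mask) != mask, which is true exactly when x carries a bit outside the mask. A quick standalone check of the idiom:

	#include <assert.h>

	int
	main(void)
	{
		const unsigned mask = 0x0f;

		assert((0x05u | mask) == mask);	/* 0x05 lies within the mask */
		assert((0x15u | mask) != mask);	/* 0x10 falls outside it */
		return (0);
	}

The same pattern recurs below for ae_flags against ACL_FLAGS_BITS, and again in acl_posix1e_check() against ACL_PERM_BITS.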
*/ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW && entry->ae_entry_type != ACL_ENTRY_TYPE_DENY) return (EINVAL); if ((entry->ae_flags | ACL_FLAGS_BITS) != ACL_FLAGS_BITS) return (EINVAL); /* Disallow unimplemented flags. */ if (entry->ae_flags & (ACL_ENTRY_SUCCESSFUL_ACCESS | ACL_ENTRY_FAILED_ACCESS)) return (EINVAL); /* Disallow flags not allowed for ordinary files. */ if (!is_directory) { if (entry->ae_flags & (ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT | ACL_ENTRY_NO_PROPAGATE_INHERIT | ACL_ENTRY_INHERIT_ONLY)) return (EINVAL); } } return (0); } #ifdef _KERNEL static int acl_nfs4_modload(module_t module, int what, void *arg) { int ret; ret = 0; switch (what) { case MOD_LOAD: case MOD_SHUTDOWN: break; case MOD_QUIESCE: /* XXX TODO */ ret = 0; break; case MOD_UNLOAD: /* XXX TODO */ ret = 0; break; default: ret = EINVAL; break; } return (ret); } static moduledata_t acl_nfs4_mod = { "acl_nfs4", acl_nfs4_modload, NULL }; /* * XXX TODO: which subsystem, order? */ DECLARE_MODULE(acl_nfs4, acl_nfs4_mod, SI_SUB_VFS, SI_ORDER_FIRST); MODULE_VERSION(acl_nfs4, 1); #endif /* _KERNEL */ Index: head/sys/kern/subr_acl_posix1e.c =================================================================== --- head/sys/kern/subr_acl_posix1e.c (revision 326270) +++ head/sys/kern/subr_acl_posix1e.c (revision 326271) @@ -1,691 +1,693 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1999-2006 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Developed by the TrustedBSD Project. * * ACL support routines specific to POSIX.1e access control lists. These are * utility routines for code common across file systems implementing POSIX.1e * ACLs. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include /* * Implement a version of vaccess() that understands POSIX.1e ACL semantics; * the access ACL has already been prepared for evaluation by the file system * and is passed via 'uid', 'gid', and 'acl'. Return 0 on success, else an * errno value. 
*/ int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, accmode_t accmode, struct ucred *cred, int *privused) { struct acl_entry *acl_other, *acl_mask; accmode_t dac_granted; accmode_t priv_granted; accmode_t acl_mask_granted; int group_matched, i; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), ("VAPPEND without VWRITE")); /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. Otherwise, attempt to * use privileges granted via priv_granted. In some cases, which * privileges to use may be ambiguous due to "best match", in which * case fall back on first match for the time being. */ if (privused != NULL) *privused = 0; /* * Determine privileges now, but don't apply until we've found a DAC * entry that matches but has failed to allow access. * * XXXRW: Ideally, we'd determine the privileges required before * asking for them. */ priv_granted = 0; if (type == VDIR) { if ((accmode & VEXEC) && !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) priv_granted |= VEXEC; } else { /* * Ensure that at least one execute bit is on. Otherwise, * a privileged user will always succeed, and we don't want * this to happen unless the file really is executable. */ if ((accmode & VEXEC) && (acl_posix1e_acl_to_mode(acl) & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) priv_granted |= VEXEC; } if ((accmode & VREAD) && !priv_check_cred(cred, PRIV_VFS_READ, 0)) priv_granted |= VREAD; if (((accmode & VWRITE) || (accmode & VAPPEND)) && !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) priv_granted |= (VWRITE | VAPPEND); if ((accmode & VADMIN) && !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) priv_granted |= VADMIN; /* * The owner matches if the effective uid associated with the * credential matches that of the ACL_USER_OBJ entry. While we're * doing the first scan, also cache the location of the ACL_MASK and * ACL_OTHER entries, preventing some future iterations. */ acl_mask = acl_other = NULL; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: if (file_uid != cred->cr_uid) break; dac_granted = 0; dac_granted |= VADMIN; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); /* * XXXRW: Do privilege lookup here. */ if ((accmode & (dac_granted | priv_granted)) == accmode) { if (privused != NULL) *privused = 1; return (0); } goto error; case ACL_MASK: acl_mask = &acl->acl_entry[i]; break; case ACL_OTHER: acl_other = &acl->acl_entry[i]; break; default: break; } } /* * An ACL_OTHER entry should always exist in a valid access ACL. If * it doesn't, then generate a serious failure. For now, this means * a debugging message and EPERM, but in the future should probably * be a panic. */ if (acl_other == NULL) { /* * XXX This should never happen */ printf("vaccess_acl_posix1e: ACL_OTHER missing\n"); return (EPERM); } /* * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields are * masked by an ACL_MASK entry, if any. As such, first identify the * ACL_MASK field, then iterate through identifying potential user * matches, then group matches. If there is no ACL_MASK, assume that * the mask allows all requests to succeed. 
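The masking that follows is a plain bitwise AND clamping each user/group grant: ACL_USER_OBJ and ACL_OTHER are never clamped. For instance, an ACL_USER entry granting rw- under an ACL_MASK of r-- yields an effective grant of r--, so a request for rw fails on that entry. A minimal standalone illustration, with local stand-ins for the V* access bits:

	#include <stdio.h>

	#define VEXEC	0x1
	#define VREAD	0x2
	#define VWRITE	0x4

	int
	main(void)
	{
		int dac_granted = VREAD | VWRITE;	/* ACL_USER entry: rw- */
		int acl_mask_granted = VREAD;		/* ACL_MASK entry:  r-- */
		int accmode = VREAD | VWRITE;		/* caller wants rw */

		dac_granted &= acl_mask_granted;	/* effective grant: r-- */
		printf("%s\n", (accmode & dac_granted) == accmode ?
		    "granted" : "denied");		/* prints "denied" */
		return (0);
	}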
*/ if (acl_mask != NULL) { acl_mask_granted = 0; if (acl_mask->ae_perm & ACL_EXECUTE) acl_mask_granted |= VEXEC; if (acl_mask->ae_perm & ACL_READ) acl_mask_granted |= VREAD; if (acl_mask->ae_perm & ACL_WRITE) acl_mask_granted |= (VWRITE | VAPPEND); } else acl_mask_granted = VEXEC | VREAD | VWRITE | VAPPEND; /* * Check ACL_USER ACL entries. There will either be one or no * matches; if there is one, we accept or reject based on the * match; otherwise, we continue on to groups. */ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER: if (acl->acl_entry[i].ae_id != cred->cr_uid) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); dac_granted &= acl_mask_granted; if ((accmode & dac_granted) == accmode) return (0); /* * XXXRW: Do privilege lookup here. */ if ((accmode & (dac_granted | priv_granted)) != accmode) goto error; if (privused != NULL) *privused = 1; return (0); } } /* * Group match is best-match, not first-match, so find a "best" * match. Iterate across, testing each potential group match. Make * sure we keep track of whether we found a match or not, so that we * know if we should try again with any available privilege, or if we * should move on to ACL_OTHER. */ group_matched = 0; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: if (!groupmember(file_gid, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); dac_granted &= acl_mask_granted; if ((accmode & dac_granted) == accmode) return (0); group_matched = 1; break; case ACL_GROUP: if (!groupmember(acl->acl_entry[i].ae_id, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); dac_granted &= acl_mask_granted; if ((accmode & dac_granted) == accmode) return (0); group_matched = 1; break; default: break; } } if (group_matched == 1) { /* * There was a match, but it did not grant rights via pure * DAC. Try again, this time with privilege. */ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: if (!groupmember(file_gid, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); dac_granted &= acl_mask_granted; /* * XXXRW: Do privilege lookup here. */ if ((accmode & (dac_granted | priv_granted)) != accmode) break; if (privused != NULL) *privused = 1; return (0); case ACL_GROUP: if (!groupmember(acl->acl_entry[i].ae_id, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); dac_granted &= acl_mask_granted; /* * XXXRW: Do privilege lookup here. 
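A consequence of the "best-match" group loops above is that each matching entry must satisfy the whole request by itself; permissions are never accumulated across entries. A request that only the union of two group grants would cover is therefore denied, as this standalone sketch shows:

	#include <stdio.h>

	#define VREAD	0x2
	#define VWRITE	0x4

	int
	main(void)
	{
		/* Two matching ACL_GROUP entries: one grants r, one w. */
		int grants[2] = { VREAD, VWRITE };
		int accmode = VREAD | VWRITE;
		int i, ok = 0;

		for (i = 0; i < 2; i++)
			if ((accmode & grants[i]) == accmode)
				ok = 1;	/* never taken: no single entry grants rw */
		printf("%s\n", ok ? "granted" : "denied");	/* "denied" */
		return (0);
	}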
*/ if ((accmode & (dac_granted | priv_granted)) != accmode) break; if (privused != NULL) *privused = 1; return (0); default: break; } } /* * Even with privilege, group membership was not sufficient. * Return failure. */ goto error; } /* * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER. */ dac_granted = 0; if (acl_other->ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl_other->ae_perm & ACL_READ) dac_granted |= VREAD; if (acl_other->ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); /* * XXXRW: Do privilege lookup here. */ if ((accmode & (dac_granted | priv_granted)) == accmode) { if (privused != NULL) *privused = 1; return (0); } error: return ((accmode & VADMIN) ? EPERM : EACCES); } /* * For the purposes of filesystems maintaining the _OBJ entries in an inode * with a mode_t field, this routine converts a mode_t entry to an * acl_perm_t. */ acl_perm_t acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode) { acl_perm_t perm = 0; switch(tag) { case ACL_USER_OBJ: if (mode & S_IXUSR) perm |= ACL_EXECUTE; if (mode & S_IRUSR) perm |= ACL_READ; if (mode & S_IWUSR) perm |= ACL_WRITE; return (perm); case ACL_GROUP_OBJ: if (mode & S_IXGRP) perm |= ACL_EXECUTE; if (mode & S_IRGRP) perm |= ACL_READ; if (mode & S_IWGRP) perm |= ACL_WRITE; return (perm); case ACL_OTHER: if (mode & S_IXOTH) perm |= ACL_EXECUTE; if (mode & S_IROTH) perm |= ACL_READ; if (mode & S_IWOTH) perm |= ACL_WRITE; return (perm); default: printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag); return (0); } } /* * Given inode information (uid, gid, mode), return an acl entry of the * appropriate type. */ struct acl_entry acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode) { struct acl_entry acl_entry; acl_entry.ae_tag = tag; acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode); acl_entry.ae_entry_type = 0; acl_entry.ae_flags = 0; switch(tag) { case ACL_USER_OBJ: acl_entry.ae_id = uid; break; case ACL_GROUP_OBJ: acl_entry.ae_id = gid; break; case ACL_OTHER: acl_entry.ae_id = ACL_UNDEFINED_ID; break; default: acl_entry.ae_id = ACL_UNDEFINED_ID; printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag); } return (acl_entry); } /* * Utility function to generate a file mode given appropriate ACL entries. */ mode_t acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry, struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry) { mode_t mode; mode = 0; if (acl_user_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXUSR; if (acl_user_obj_entry->ae_perm & ACL_READ) mode |= S_IRUSR; if (acl_user_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWUSR; if (acl_group_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXGRP; if (acl_group_obj_entry->ae_perm & ACL_READ) mode |= S_IRGRP; if (acl_group_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWGRP; if (acl_other_entry->ae_perm & ACL_EXECUTE) mode |= S_IXOTH; if (acl_other_entry->ae_perm & ACL_READ) mode |= S_IROTH; if (acl_other_entry->ae_perm & ACL_WRITE) mode |= S_IWOTH; return (mode); } /* * Utility function to generate a file mode given a complete POSIX.1e access * ACL. Note that if the ACL is improperly formed, this may result in a * panic. */ mode_t acl_posix1e_acl_to_mode(struct acl *acl) { struct acl_entry *acl_mask, *acl_user_obj, *acl_group_obj, *acl_other; int i; /* * Find the ACL entries relevant to a POSIX permission mode. 
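The three conversion helpers above compose naturally: applying acl_posix1e_mode_to_entry() once per base tag synthesizes the trivial access ACL for an inode. A kernel-context sketch (the function name acl_from_inode_sketch is made up for illustration):

	/*
	 * Build the three mandatory base entries of a POSIX.1e access
	 * ACL from an inode's uid/gid/mode.
	 */
	static void
	acl_from_inode_sketch(struct acl *aclp, uid_t uid, gid_t gid,
	    mode_t mode)
	{
		aclp->acl_cnt = 3;
		aclp->acl_entry[0] = acl_posix1e_mode_to_entry(ACL_USER_OBJ,
		    uid, gid, mode);
		aclp->acl_entry[1] = acl_posix1e_mode_to_entry(ACL_GROUP_OBJ,
		    uid, gid, mode);
		aclp->acl_entry[2] = acl_posix1e_mode_to_entry(ACL_OTHER,
		    uid, gid, mode);
	}

Feeding the result back through acl_posix1e_acl_to_mode() reproduces the permission bits of the original mode, which is the round-trip property these helpers are built around.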
*/ acl_user_obj = acl_group_obj = acl_other = acl_mask = NULL; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: acl_user_obj = &acl->acl_entry[i]; break; case ACL_GROUP_OBJ: acl_group_obj = &acl->acl_entry[i]; break; case ACL_OTHER: acl_other = &acl->acl_entry[i]; break; case ACL_MASK: acl_mask = &acl->acl_entry[i]; break; case ACL_USER: case ACL_GROUP: break; default: panic("acl_posix1e_acl_to_mode: bad ae_tag"); } } if (acl_user_obj == NULL || acl_group_obj == NULL || acl_other == NULL) panic("acl_posix1e_acl_to_mode: missing base ae_tags"); /* * POSIX.1e specifies that if there is an ACL_MASK entry, we replace * the mode "group" bits with its permissions. If there isn't, we * use the ACL_GROUP_OBJ permissions. */ if (acl_mask != NULL) return (acl_posix1e_perms_to_mode(acl_user_obj, acl_mask, acl_other)); else return (acl_posix1e_perms_to_mode(acl_user_obj, acl_group_obj, acl_other)); } /* * Perform a syntactic check of the ACL, sufficient to allow an implementing * filesystem to determine if it should accept this and rely on the POSIX.1e * ACL properties. */ int acl_posix1e_check(struct acl *acl) { int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group; int num_acl_mask, num_acl_other, i; /* * Verify that the number of entries does not exceed the maximum * defined for acl_t. * * Verify that the correct number of various sorts of ae_tags are * present: * Exactly one ACL_USER_OBJ * Exactly one ACL_GROUP_OBJ * Exactly one ACL_OTHER * If any ACL_USER or ACL_GROUP entries appear, then exactly one * ACL_MASK entry must also appear. * * Verify that all ae_perm entries are in ACL_PERM_BITS. * * Verify all ae_tag entries are understood by this implementation. * * Note: Does not check for uniqueness of qualifier (ae_id) field. */ num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group = num_acl_mask = num_acl_other = 0; if (acl->acl_cnt > ACL_MAX_ENTRIES) return (EINVAL); for (i = 0; i < acl->acl_cnt; i++) { /* * Check for a valid tag. */ switch(acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_user_obj++; break; case ACL_GROUP_OBJ: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_group_obj++; break; case ACL_USER: if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) return (EINVAL); num_acl_user++; break; case ACL_GROUP: if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) return (EINVAL); num_acl_group++; break; case ACL_OTHER: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_other++; break; case ACL_MASK: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_mask++; break; default: return (EINVAL); } /* * Check for valid perm entries. */ if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) != ACL_PERM_BITS) return (EINVAL); } if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) || (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1)) return (EINVAL); if (((num_acl_group != 0) || (num_acl_user != 0)) && (num_acl_mask != 1)) return (EINVAL); return (0); } /* * Given a requested mode for a new object, and a default ACL, combine the * two to produce a new mode. Be careful not to clear any bits that aren't * intended to be affected by the POSIX.1e ACL. 
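The composition described above amounts to an AND of the creation mode and the default ACL's mode within the override range, with everything else preserved. A worked standalone example, assuming (as the mask names suggest) that ACL_OVERRIDE_MASK covers exactly the 0777 permission bits; the OVERRIDE_MASK/PRESERVE_MASK names below are local stand-ins:

	#include <stdio.h>
	#include <sys/stat.h>

	#define OVERRIDE_MASK	(S_IRWXU | S_IRWXG | S_IRWXO)	/* 0777 */
	#define PRESERVE_MASK	(~OVERRIDE_MASK)

	int
	main(void)
	{
		mode_t cmode = 0666;		/* mode requested by the caller */
		mode_t dacl_mode = 0750;	/* mode implied by the default ACL */
		mode_t mode;

		mode = (cmode & PRESERVE_MASK) |
		    (OVERRIDE_MASK & cmode & dacl_mode);
		printf("0%o\n", (unsigned)mode);	/* prints 0640 */
		return (0);
	}

A bit appears in the result only when it is present in both inputs: 0666 & 0750 leaves the owner rw, group r, other nothing.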
Eventually, this might also * take the cmask as an argument, if we push that down into * per-filesystem code. */ mode_t acl_posix1e_newfilemode(mode_t cmode, struct acl *dacl) { mode_t mode; mode = cmode; /* * The current composition policy is that a permission bit must be * set in *both* the ACL and the requested creation mode for it to * appear in the resulting mode/ACL. First clear any possibly * affected bits, then reconstruct. */ mode &= ACL_PRESERVE_MASK; mode |= (ACL_OVERRIDE_MASK & cmode & acl_posix1e_acl_to_mode(dacl)); return (mode); } static int acl_posix1e_modload(module_t mod, int what, void *arg) { int ret; ret = 0; switch (what) { case MOD_LOAD: case MOD_SHUTDOWN: break; case MOD_QUIESCE: /* XXX TODO */ ret = 0; break; case MOD_UNLOAD: /* XXX TODO */ ret = 0; break; default: ret = EINVAL; break; } return (ret); } static moduledata_t acl_posix1e_mod = { "acl_posix1e", acl_posix1e_modload, NULL }; DECLARE_MODULE(acl_posix1e, acl_posix1e_mod, SI_SUB_VFS, SI_ORDER_FIRST); MODULE_VERSION(acl_posix1e, 1); Index: head/sys/kern/subr_bufring.c =================================================================== --- head/sys/kern/subr_bufring.c (revision 326270) +++ head/sys/kern/subr_bufring.c (revision 326271) @@ -1,65 +1,67 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2007, 2008 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include struct buf_ring * buf_ring_alloc(int count, struct malloc_type *type, int flags, struct mtx *lock) { struct buf_ring *br; KASSERT(powerof2(count), ("buf ring must be size power of 2")); br = malloc(sizeof(struct buf_ring) + count*sizeof(caddr_t), type, flags|M_ZERO); if (br == NULL) return (NULL); #ifdef DEBUG_BUFRING br->br_lock = lock; #endif br->br_prod_size = br->br_cons_size = count; br->br_prod_mask = br->br_cons_mask = count-1; br->br_prod_head = br->br_cons_head = 0; br->br_prod_tail = br->br_cons_tail = 0; return (br); } void buf_ring_free(struct buf_ring *br, struct malloc_type *type) { free(br, type); } Index: head/sys/kern/subr_bus.c =================================================================== --- head/sys/kern/subr_bus.c (revision 326270) +++ head/sys/kern/subr_bus.c (revision 326271) @@ -1,5625 +1,5627 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1997,1998,2003 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_bus.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_NODE(_hw, OID_AUTO, bus, CTLFLAG_RW, NULL, NULL); SYSCTL_ROOT_NODE(OID_AUTO, dev, CTLFLAG_RW, NULL, NULL); /* * Used to attach drivers to devclasses. 
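Stepping back to buf_ring_alloc() above: the powerof2() KASSERT exists so that producer and consumer indices can wrap with a cheap AND against br_prod_mask/br_cons_mask (count - 1) instead of a modulo. A quick standalone check of that equivalence:

	#include <assert.h>

	int
	main(void)
	{
		const int count = 8;		/* must be a power of two */
		const int mask = count - 1;	/* br_prod_mask / br_cons_mask */
		int i;

		/*
		 * With a power-of-two size, AND-ing with the mask equals
		 * taking the index modulo the ring size.
		 */
		for (i = 0; i < 4 * count; i++)
			assert((i & mask) == i % count);
		return (0);
	}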
*/ typedef struct driverlink *driverlink_t; struct driverlink { kobj_class_t driver; TAILQ_ENTRY(driverlink) link; /* list of drivers in devclass */ int pass; TAILQ_ENTRY(driverlink) passlink; }; /* * Forward declarations */ typedef TAILQ_HEAD(devclass_list, devclass) devclass_list_t; typedef TAILQ_HEAD(driver_list, driverlink) driver_list_t; typedef TAILQ_HEAD(device_list, device) device_list_t; struct devclass { TAILQ_ENTRY(devclass) link; devclass_t parent; /* parent in devclass hierarchy */ driver_list_t drivers; /* bus devclasses store drivers for bus */ char *name; device_t *devices; /* array of devices indexed by unit */ int maxunit; /* size of devices array */ int flags; #define DC_HAS_CHILDREN 1 struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; }; /** * @brief Implementation of device. */ struct device { /* * A device is a kernel object. The first field must be the * current ops table for the object. */ KOBJ_FIELDS; /* * Device hierarchy. */ TAILQ_ENTRY(device) link; /**< list of devices in parent */ TAILQ_ENTRY(device) devlink; /**< global device list membership */ device_t parent; /**< parent of this device */ device_list_t children; /**< list of child devices */ /* * Details of this device. */ driver_t *driver; /**< current driver */ devclass_t devclass; /**< current device class */ int unit; /**< current unit number */ char* nameunit; /**< name+unit e.g. foodev0 */ char* desc; /**< driver specific description */ int busy; /**< count of calls to device_busy() */ device_state_t state; /**< current device state */ uint32_t devflags; /**< api level flags for device_get_flags() */ u_int flags; /**< internal device flags */ u_int order; /**< order from device_add_child_ordered() */ void *ivars; /**< instance variables */ void *softc; /**< current driver's variables */ struct sysctl_ctx_list sysctl_ctx; /**< state for sysctl variables */ struct sysctl_oid *sysctl_tree; /**< state for sysctl variables */ }; static MALLOC_DEFINE(M_BUS, "bus", "Bus data structures"); static MALLOC_DEFINE(M_BUS_SC, "bus-sc", "Bus data structures, softc"); static void devctl2_init(void); #define DRIVERNAME(d) ((d)? d->name : "no driver") #define DEVCLANAME(d) ((d)? d->name : "no devclass") #ifdef BUS_DEBUG static int bus_debug = 1; SYSCTL_INT(_debug, OID_AUTO, bus_debug, CTLFLAG_RWTUN, &bus_debug, 0, "Bus debug level"); #define PDEBUG(a) if (bus_debug) {printf("%s:%d: ", __func__, __LINE__), printf a; printf("\n");} #define DEVICENAME(d) ((d)? device_get_name(d): "no device") /** * Produce the indenting, indent*2 spaces plus a '.' ahead of that to * prevent syslog from deleting initial spaces */ #define indentprintf(p) do { int iJ; printf("."); for (iJ=0; iJparent ? 
dc->parent->name : ""; break; default: return (EINVAL); } return (SYSCTL_OUT_STR(req, value)); } static void devclass_sysctl_init(devclass_t dc) { if (dc->sysctl_tree != NULL) return; sysctl_ctx_init(&dc->sysctl_ctx); dc->sysctl_tree = SYSCTL_ADD_NODE(&dc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dev), OID_AUTO, dc->name, CTLFLAG_RD, NULL, ""); SYSCTL_ADD_PROC(&dc->sysctl_ctx, SYSCTL_CHILDREN(dc->sysctl_tree), OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD, dc, DEVCLASS_SYSCTL_PARENT, devclass_sysctl_handler, "A", "parent class"); } enum { DEVICE_SYSCTL_DESC, DEVICE_SYSCTL_DRIVER, DEVICE_SYSCTL_LOCATION, DEVICE_SYSCTL_PNPINFO, DEVICE_SYSCTL_PARENT, }; static int device_sysctl_handler(SYSCTL_HANDLER_ARGS) { device_t dev = (device_t)arg1; const char *value; char *buf; int error; buf = NULL; switch (arg2) { case DEVICE_SYSCTL_DESC: value = dev->desc ? dev->desc : ""; break; case DEVICE_SYSCTL_DRIVER: value = dev->driver ? dev->driver->name : ""; break; case DEVICE_SYSCTL_LOCATION: value = buf = malloc(1024, M_BUS, M_WAITOK | M_ZERO); bus_child_location_str(dev, buf, 1024); break; case DEVICE_SYSCTL_PNPINFO: value = buf = malloc(1024, M_BUS, M_WAITOK | M_ZERO); bus_child_pnpinfo_str(dev, buf, 1024); break; case DEVICE_SYSCTL_PARENT: value = dev->parent ? dev->parent->nameunit : ""; break; default: return (EINVAL); } error = SYSCTL_OUT_STR(req, value); if (buf != NULL) free(buf, M_BUS); return (error); } static void device_sysctl_init(device_t dev) { devclass_t dc = dev->devclass; int domain; if (dev->sysctl_tree != NULL) return; devclass_sysctl_init(dc); sysctl_ctx_init(&dev->sysctl_ctx); dev->sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&dev->sysctl_ctx, SYSCTL_CHILDREN(dc->sysctl_tree), OID_AUTO, dev->nameunit + strlen(dc->name), CTLFLAG_RD, NULL, "", "device_index"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%desc", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_DESC, device_sysctl_handler, "A", "device description"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%driver", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_DRIVER, device_sysctl_handler, "A", "device driver name"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%location", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_LOCATION, device_sysctl_handler, "A", "device location relative to parent"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%pnpinfo", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_PNPINFO, device_sysctl_handler, "A", "device identification"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_PARENT, device_sysctl_handler, "A", "parent device"); if (bus_get_domain(dev, &domain) == 0) SYSCTL_ADD_INT(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%domain", CTLFLAG_RD, NULL, domain, "NUMA domain"); } static void device_sysctl_update(device_t dev) { devclass_t dc = dev->devclass; if (dev->sysctl_tree == NULL) return; sysctl_rename_oid(dev->sysctl_tree, dev->nameunit + strlen(dc->name)); } static void device_sysctl_fini(device_t dev) { if (dev->sysctl_tree == NULL) return; sysctl_ctx_free(&dev->sysctl_ctx); dev->sysctl_tree = NULL; } /* * /dev/devctl implementation */ /* * This design allows only one reader for /dev/devctl. This is not desirable * in the long run, but will get a lot of hair out of this implementation. * Maybe we should make this device a clonable device. 
* * Also note: we specifically do not attach a device to the device_t tree * to avoid potential chicken and egg problems. One could argue that all * of this belongs to the root node. One could also further argue that the * sysctl interface that we have now might more properly be an ioctl * interface, but at this stage of the game, I'm not inclined to rock that * boat. * * I'm also not sure whether the SIGIO support is done correctly, as * I copied it from a driver that had SIGIO support that likely hasn't been * tested since 3.4 or 2.2.8! */ /* Deprecated way to adjust queue length */ static int sysctl_devctl_disable(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, sysctl_devctl_disable, "I", "devctl disable -- deprecated"); #define DEVCTL_DEFAULT_QUEUE_LEN 1000 static int sysctl_devctl_queue(SYSCTL_HANDLER_ARGS); static int devctl_queue_length = DEVCTL_DEFAULT_QUEUE_LEN; SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_queue, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, sysctl_devctl_queue, "I", "devctl queue length"); static d_open_t devopen; static d_close_t devclose; static d_read_t devread; static d_ioctl_t devioctl; static d_poll_t devpoll; static d_kqfilter_t devkqfilter; static struct cdevsw dev_cdevsw = { .d_version = D_VERSION, .d_open = devopen, .d_close = devclose, .d_read = devread, .d_ioctl = devioctl, .d_poll = devpoll, .d_kqfilter = devkqfilter, .d_name = "devctl", }; struct dev_event_info { char *dei_data; TAILQ_ENTRY(dev_event_info) dei_link; }; TAILQ_HEAD(devq, dev_event_info); static struct dev_softc { int inuse; int nonblock; int queued; int async; struct mtx mtx; struct cv cv; struct selinfo sel; struct devq devq; struct sigio *sigio; } devsoftc; static void filt_devctl_detach(struct knote *kn); static int filt_devctl_read(struct knote *kn, long hint); struct filterops devctl_rfiltops = { .f_isfd = 1, .f_detach = filt_devctl_detach, .f_event = filt_devctl_read, }; static struct cdev *devctl_dev; static void devinit(void) { devctl_dev = make_dev_credf(MAKEDEV_ETERNAL, &dev_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, "devctl"); mtx_init(&devsoftc.mtx, "dev mtx", "devd", MTX_DEF); cv_init(&devsoftc.cv, "dev cv"); TAILQ_INIT(&devsoftc.devq); knlist_init_mtx(&devsoftc.sel.si_note, &devsoftc.mtx); devctl2_init(); } static int devopen(struct cdev *dev, int oflags, int devtype, struct thread *td) { mtx_lock(&devsoftc.mtx); if (devsoftc.inuse) { mtx_unlock(&devsoftc.mtx); return (EBUSY); } /* move to init */ devsoftc.inuse = 1; mtx_unlock(&devsoftc.mtx); return (0); } static int devclose(struct cdev *dev, int fflag, int devtype, struct thread *td) { mtx_lock(&devsoftc.mtx); devsoftc.inuse = 0; devsoftc.nonblock = 0; devsoftc.async = 0; cv_broadcast(&devsoftc.cv); funsetown(&devsoftc.sigio); mtx_unlock(&devsoftc.mtx); return (0); } /* * The read channel for this device is used to report changes to * userland in realtime. We are required to free the data as well as * the n1 object because we allocate them separately. Also note that * we return one record at a time. If you try to read this device a * character at a time, you will lose the rest of the data. Listening * programs are expected to cope. 
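Because each read(2) returns exactly one record and the records are built in 1024-byte buffers (see devaddq() further down), a listener simply reads with a generously sized buffer; a short one silently loses the tail of the record. A minimal userland sketch of such a listener, assuming a FreeBSD system and sufficient privilege to open the node; note that only one process may hold /dev/devctl open at a time, per devopen() above:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		char buf[1025];
		ssize_t n;
		int fd;

		fd = open("/dev/devctl", O_RDONLY);
		if (fd == -1) {
			perror("open");
			return (1);
		}
		/* One event record per read, e.g. "+uart0 at ... on acpi0". */
		while ((n = read(fd, buf, sizeof(buf) - 1)) > 0) {
			buf[n] = '\0';
			fputs(buf, stdout);
		}
		close(fd);
		return (0);
	}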
*/ static int devread(struct cdev *dev, struct uio *uio, int ioflag) { struct dev_event_info *n1; int rv; mtx_lock(&devsoftc.mtx); while (TAILQ_EMPTY(&devsoftc.devq)) { if (devsoftc.nonblock) { mtx_unlock(&devsoftc.mtx); return (EAGAIN); } rv = cv_wait_sig(&devsoftc.cv, &devsoftc.mtx); if (rv) { /* * Need to translate ERESTART to EINTR here? -- jake */ mtx_unlock(&devsoftc.mtx); return (rv); } } n1 = TAILQ_FIRST(&devsoftc.devq); TAILQ_REMOVE(&devsoftc.devq, n1, dei_link); devsoftc.queued--; mtx_unlock(&devsoftc.mtx); rv = uiomove(n1->dei_data, strlen(n1->dei_data), uio); free(n1->dei_data, M_BUS); free(n1, M_BUS); return (rv); } static int devioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { switch (cmd) { case FIONBIO: if (*(int*)data) devsoftc.nonblock = 1; else devsoftc.nonblock = 0; return (0); case FIOASYNC: if (*(int*)data) devsoftc.async = 1; else devsoftc.async = 0; return (0); case FIOSETOWN: return fsetown(*(int *)data, &devsoftc.sigio); case FIOGETOWN: *(int *)data = fgetown(&devsoftc.sigio); return (0); /* (un)Support for other fcntl() calls. */ case FIOCLEX: case FIONCLEX: case FIONREAD: default: break; } return (ENOTTY); } static int devpoll(struct cdev *dev, int events, struct thread *td) { int revents = 0; mtx_lock(&devsoftc.mtx); if (events & (POLLIN | POLLRDNORM)) { if (!TAILQ_EMPTY(&devsoftc.devq)) revents = events & (POLLIN | POLLRDNORM); else selrecord(td, &devsoftc.sel); } mtx_unlock(&devsoftc.mtx); return (revents); } static int devkqfilter(struct cdev *dev, struct knote *kn) { int error; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &devctl_rfiltops; knlist_add(&devsoftc.sel.si_note, kn, 0); error = 0; } else error = EINVAL; return (error); } static void filt_devctl_detach(struct knote *kn) { knlist_remove(&devsoftc.sel.si_note, kn, 0); } static int filt_devctl_read(struct knote *kn, long hint) { kn->kn_data = devsoftc.queued; return (kn->kn_data != 0); } /** * @brief Return whether the userland process is running */ boolean_t devctl_process_running(void) { return (devsoftc.inuse == 1); } /** * @brief Queue data to be read from the devctl device * * Generic interface to queue data to the devctl device. It is * assumed that @p data is properly formatted. It is further assumed * that @p data is allocated using the M_BUS malloc type. */ void devctl_queue_data_f(char *data, int flags) { struct dev_event_info *n1 = NULL, *n2 = NULL; if (strlen(data) == 0) goto out; if (devctl_queue_length == 0) goto out; n1 = malloc(sizeof(*n1), M_BUS, flags); if (n1 == NULL) goto out; n1->dei_data = data; mtx_lock(&devsoftc.mtx); if (devctl_queue_length == 0) { mtx_unlock(&devsoftc.mtx); free(n1->dei_data, M_BUS); free(n1, M_BUS); return; } /* Leave at least one spot in the queue... */ while (devsoftc.queued > devctl_queue_length - 1) { n2 = TAILQ_FIRST(&devsoftc.devq); TAILQ_REMOVE(&devsoftc.devq, n2, dei_link); free(n2->dei_data, M_BUS); free(n2, M_BUS); devsoftc.queued--; } TAILQ_INSERT_TAIL(&devsoftc.devq, n1, dei_link); devsoftc.queued++; cv_broadcast(&devsoftc.cv); KNOTE_LOCKED(&devsoftc.sel.si_note, 0); mtx_unlock(&devsoftc.mtx); selwakeup(&devsoftc.sel); if (devsoftc.async && devsoftc.sigio != NULL) pgsigio(&devsoftc.sigio, SIGIO, 0); return; out: /* * We have to free data on all error paths since the caller * assumes it will be free'd when this item is dequeued. 
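The ownership contract stated above is worth spelling out: the string handed to devctl_queue_data() must come from malloc(9) with type M_BUS, because the queue frees it with that type either on dequeue or on its error paths, and the caller must not touch it afterwards. A kernel-context sketch (queue_example_event and its message are invented for illustration):

	static void
	queue_example_event(void)
	{
		char *msg;

		msg = malloc(64, M_BUS, M_NOWAIT);
		if (msg == NULL)
			return;		/* event is simply dropped */
		snprintf(msg, 64, "!system=EXAMPLE subsystem=DEMO type=HELLO\n");
		devctl_queue_data(msg);	/* ownership passes here; do not free msg */
	}

For most callers the devctl_notify() wrapper below is preferable, since it performs the allocation and formatting itself.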
*/ free(data, M_BUS); return; } void devctl_queue_data(char *data) { devctl_queue_data_f(data, M_NOWAIT); } /** * @brief Send a 'notification' to userland, using standard ways */ void devctl_notify_f(const char *system, const char *subsystem, const char *type, const char *data, int flags) { int len = 0; char *msg; if (system == NULL) return; /* BOGUS! Must specify system. */ if (subsystem == NULL) return; /* BOGUS! Must specify subsystem. */ if (type == NULL) return; /* BOGUS! Must specify type. */ len += strlen(" system=") + strlen(system); len += strlen(" subsystem=") + strlen(subsystem); len += strlen(" type=") + strlen(type); /* add in the data message plus newline. */ if (data != NULL) len += strlen(data); len += 3; /* '!', '\n', and NUL */ msg = malloc(len, M_BUS, flags); if (msg == NULL) return; /* Drop it on the floor */ if (data != NULL) snprintf(msg, len, "!system=%s subsystem=%s type=%s %s\n", system, subsystem, type, data); else snprintf(msg, len, "!system=%s subsystem=%s type=%s\n", system, subsystem, type); devctl_queue_data_f(msg, flags); } void devctl_notify(const char *system, const char *subsystem, const char *type, const char *data) { devctl_notify_f(system, subsystem, type, data, M_NOWAIT); } /* * Common routine that tries to make sending messages as easy as possible. * We allocate memory for the data, copy strings into that, but do not * free it unless there's an error. The dequeue part of the driver should * free the data. We don't send data when the device is disabled. We do * send data, even when we have no listeners, because we wish to avoid * races relating to startup and restart of listening applications. * * devaddq is designed to string together the type of event, with the * object of that event, plus the plug and play info and location info * for that event. This is likely most useful for devices, but less * useful for other consumers of this interface. Those should use * the devctl_queue_data() interface instead. */ static void devaddq(const char *type, const char *what, device_t dev) { char *data = NULL; char *loc = NULL; char *pnp = NULL; const char *parstr; if (!devctl_queue_length)/* Rare race, but lost races safely discard */ return; data = malloc(1024, M_BUS, M_NOWAIT); if (data == NULL) goto bad; /* get the bus specific location of this device */ loc = malloc(1024, M_BUS, M_NOWAIT); if (loc == NULL) goto bad; *loc = '\0'; bus_child_location_str(dev, loc, 1024); /* Get the bus specific pnp info of this device */ pnp = malloc(1024, M_BUS, M_NOWAIT); if (pnp == NULL) goto bad; *pnp = '\0'; bus_child_pnpinfo_str(dev, pnp, 1024); /* Get the parent of this device, or / if high enough in the tree. */ if (device_get_parent(dev) == NULL) parstr = "."; /* Or '/' ? */ else parstr = device_get_nameunit(device_get_parent(dev)); /* String it all together. */ snprintf(data, 1024, "%s%s at %s %s on %s\n", type, what, loc, pnp, parstr); free(loc, M_BUS); free(pnp, M_BUS); devctl_queue_data(data); return; bad: free(pnp, M_BUS); free(loc, M_BUS); free(data, M_BUS); return; } /* * A device was added to the tree. We are called just after it successfully * attaches (that is, probe and attach success for this device). No call * is made if a device is merely parented into the tree. See devnomatch * if probe fails. If attach fails, no notification is sent (but maybe * we should have a different message for this). */ static void devadded(device_t dev) { devaddq("+", device_get_nameunit(dev), dev); } /* * A device was removed from the tree. 
We are called just before this * happens. */ static void devremoved(device_t dev) { devaddq("-", device_get_nameunit(dev), dev); } /* * Called when there's no match for this device. This is only called * the first time that no match happens, so we don't keep getting this * message. Should that prove to be undesirable, we can change it. * This is called when all drivers that can attach to a given bus * decline to accept this device. Other errors may not be detected. */ static void devnomatch(device_t dev) { devaddq("?", "", dev); } static int sysctl_devctl_disable(SYSCTL_HANDLER_ARGS) { struct dev_event_info *n1; int dis, error; dis = (devctl_queue_length == 0); error = sysctl_handle_int(oidp, &dis, 0, req); if (error || !req->newptr) return (error); if (mtx_initialized(&devsoftc.mtx)) mtx_lock(&devsoftc.mtx); if (dis) { while (!TAILQ_EMPTY(&devsoftc.devq)) { n1 = TAILQ_FIRST(&devsoftc.devq); TAILQ_REMOVE(&devsoftc.devq, n1, dei_link); free(n1->dei_data, M_BUS); free(n1, M_BUS); } devsoftc.queued = 0; devctl_queue_length = 0; } else { devctl_queue_length = DEVCTL_DEFAULT_QUEUE_LEN; } if (mtx_initialized(&devsoftc.mtx)) mtx_unlock(&devsoftc.mtx); return (0); } static int sysctl_devctl_queue(SYSCTL_HANDLER_ARGS) { struct dev_event_info *n1; int q, error; q = devctl_queue_length; error = sysctl_handle_int(oidp, &q, 0, req); if (error || !req->newptr) return (error); if (q < 0) return (EINVAL); if (mtx_initialized(&devsoftc.mtx)) mtx_lock(&devsoftc.mtx); devctl_queue_length = q; while (devsoftc.queued > devctl_queue_length) { n1 = TAILQ_FIRST(&devsoftc.devq); TAILQ_REMOVE(&devsoftc.devq, n1, dei_link); free(n1->dei_data, M_BUS); free(n1, M_BUS); devsoftc.queued--; } if (mtx_initialized(&devsoftc.mtx)) mtx_unlock(&devsoftc.mtx); return (0); } /** * @brief safely quotes strings that might have double quotes in them. * * The devctl protocol relies on quoted strings having matching quotes. * This routine quotes any internal quotes so the resulting string * is safe to pass to snprintf to construct, for example pnp info strings. * Strings are always terminated with a NUL, but may be truncated if longer * than @p len bytes after quotes. * * @param dst Buffer to hold the string. Must be at least @p len bytes long * @param src Original buffer. * @param len Length of buffer pointed to by @dst, including trailing NUL */ void devctl_safe_quote(char *dst, const char *src, size_t len) { char *walker = dst, *ep = dst + len - 1; if (len == 0) return; while (src != NULL && walker < ep) { if (*src == '"' || *src == '\\') { if (ep - walker < 2) break; *walker++ = '\\'; } *walker++ = *src++; } *walker = '\0'; } /* End of /dev/devctl code */ static TAILQ_HEAD(,device) bus_data_devices; static int bus_data_generation = 1; static kobj_method_t null_methods[] = { KOBJMETHOD_END }; DEFINE_CLASS(null, null_methods, 0); /* * Bus pass implementation */ static driver_list_t passes = TAILQ_HEAD_INITIALIZER(passes); int bus_current_pass = BUS_PASS_ROOT; /** * @internal * @brief Register the pass level of a new driver attachment * * Register a new driver attachment's pass level. If no driver * attachment with the same pass level has been added, then @p new * will be added to the global passes list. * * @param new the new driver attachment */ static void driver_register_pass(struct driverlink *new) { struct driverlink *dl; /* We only consider pass numbers during boot. */ if (bus_current_pass == BUS_PASS_DEFAULT) return; /* * Walk the passes list. If we already know about this pass * then there is nothing to do. 
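The pass level being recorded here originates with the driver's registration: most drivers register implicitly at BUS_PASS_DEFAULT, while those that must attach during an earlier scan pass an explicit level. A sketch, assuming the EARLY_DRIVER_MODULE() macro signature of this vintage; the "foo" driver, its methods, and its softc are entirely hypothetical:

	static device_method_t foo_methods[] = {
		DEVMETHOD(device_probe,		foo_probe),
		DEVMETHOD(device_attach,	foo_attach),
		DEVMETHOD_END
	};

	static driver_t foo_driver = {
		"foo",
		foo_methods,
		sizeof(struct foo_softc)
	};

	static devclass_t foo_devclass;

	/*
	 * Participate as soon as bus_set_pass() raises the level to
	 * BUS_PASS_BUS, instead of waiting for the final
	 * BUS_PASS_DEFAULT scan.
	 */
	EARLY_DRIVER_MODULE(foo, nexus, foo_driver, foo_devclass, 0, 0,
	    BUS_PASS_BUS);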
If we don't, then insert this * driver link into the list. */ TAILQ_FOREACH(dl, &passes, passlink) { if (dl->pass < new->pass) continue; if (dl->pass == new->pass) return; TAILQ_INSERT_BEFORE(dl, new, passlink); return; } TAILQ_INSERT_TAIL(&passes, new, passlink); } /** * @brief Raise the current bus pass * * Raise the current bus pass level to @p pass. Call the BUS_NEW_PASS() * method on the root bus to kick off a new device tree scan for each * new pass level that has at least one driver. */ void bus_set_pass(int pass) { struct driverlink *dl; if (bus_current_pass > pass) panic("Attempt to lower bus pass level"); TAILQ_FOREACH(dl, &passes, passlink) { /* Skip pass values below the current pass level. */ if (dl->pass <= bus_current_pass) continue; /* * Bail once we hit a driver with a pass level that is * too high. */ if (dl->pass > pass) break; /* * Raise the pass level to the next level and rescan * the tree. */ bus_current_pass = dl->pass; BUS_NEW_PASS(root_bus); } /* * If there isn't a driver registered for the requested pass, * then bus_current_pass might still be less than 'pass'. Set * it to 'pass' in that case. */ if (bus_current_pass < pass) bus_current_pass = pass; KASSERT(bus_current_pass == pass, ("Failed to update bus pass level")); } /* * Devclass implementation */ static devclass_list_t devclasses = TAILQ_HEAD_INITIALIZER(devclasses); /** * @internal * @brief Find or create a device class * * If a device class with the name @p classname exists, return it, * otherwise if @p create is non-zero create and return a new device * class. * * If @p parentname is non-NULL, the parent of the devclass is set to * the devclass of that name. * * @param classname the devclass name to find or create * @param parentname the parent devclass name or @c NULL * @param create non-zero to create a devclass */ static devclass_t devclass_find_internal(const char *classname, const char *parentname, int create) { devclass_t dc; PDEBUG(("looking for %s", classname)); if (!classname) return (NULL); TAILQ_FOREACH(dc, &devclasses, link) { if (!strcmp(dc->name, classname)) break; } if (create && !dc) { PDEBUG(("creating %s", classname)); dc = malloc(sizeof(struct devclass) + strlen(classname) + 1, M_BUS, M_NOWAIT | M_ZERO); if (!dc) return (NULL); dc->parent = NULL; dc->name = (char*) (dc + 1); strcpy(dc->name, classname); TAILQ_INIT(&dc->drivers); TAILQ_INSERT_TAIL(&devclasses, dc, link); bus_data_generation_update(); } /* * If a parent class is specified, then set that as our parent so * that this devclass will support drivers for the parent class as * well. If the parent class has the same name don't do this though * as it creates a cycle that can trigger an infinite loop in * device_probe_child() if a device exists for which there is no * suitable driver. */ if (parentname && dc && !dc->parent && strcmp(classname, parentname) != 0) { dc->parent = devclass_find_internal(parentname, NULL, TRUE); dc->parent->flags |= DC_HAS_CHILDREN; } return (dc); } /** * @brief Create a device class * * If a device class with the name @p classname exists, return it, * otherwise create and return a new device class. * * @param classname the devclass name to find or create */ devclass_t devclass_create(const char *classname) { return (devclass_find_internal(classname, NULL, TRUE)); } /** * @brief Find a device class * * If a device class with the name @p classname exists, return it, * otherwise return @c NULL. 
* * @param classname the devclass name to find */ devclass_t devclass_find(const char *classname) { return (devclass_find_internal(classname, NULL, FALSE)); } /** * @brief Register that a device driver has been added to a devclass * * Register that a device driver has been added to a devclass. This * is called by devclass_add_driver to accomplish the recursive * notification of all the children classes of dc, as well as dc. * Each layer will have BUS_DRIVER_ADDED() called for all instances of * the devclass. * * We do a full search here of the devclass list at each iteration * level to save storing children-lists in the devclass structure. If * we ever move beyond a few dozen devices doing this, we may need to * reevaluate... * * @param dc the devclass to edit * @param driver the driver that was just added */ static void devclass_driver_added(devclass_t dc, driver_t *driver) { devclass_t parent; int i; /* * Call BUS_DRIVER_ADDED for any existing buses in this class. */ for (i = 0; i < dc->maxunit; i++) if (dc->devices[i] && device_is_attached(dc->devices[i])) BUS_DRIVER_ADDED(dc->devices[i], driver); /* * Walk through the children classes. Since we only keep a * single parent pointer around, we walk the entire list of * devclasses looking for children. We set the * DC_HAS_CHILDREN flag when a child devclass is created on * the parent, so we only walk the list for those devclasses * that have children. */ if (!(dc->flags & DC_HAS_CHILDREN)) return; parent = dc; TAILQ_FOREACH(dc, &devclasses, link) { if (dc->parent == parent) devclass_driver_added(dc, driver); } } /** * @brief Add a device driver to a device class * * Add a device driver to a devclass. This is normally called * automatically by DRIVER_MODULE(). The BUS_DRIVER_ADDED() method of * all devices in the devclass will be called to allow them to attempt * to re-probe any unmatched children. * * @param dc the devclass to edit * @param driver the driver to register */ int devclass_add_driver(devclass_t dc, driver_t *driver, int pass, devclass_t *dcp) { driverlink_t dl; const char *parentname; PDEBUG(("%s", DRIVERNAME(driver))); /* Don't allow invalid pass values. */ if (pass <= BUS_PASS_ROOT) return (EINVAL); dl = malloc(sizeof *dl, M_BUS, M_NOWAIT|M_ZERO); if (!dl) return (ENOMEM); /* * Compile the driver's methods. Also increase the reference count * so that the class doesn't get freed when the last instance * goes. This means we can safely use static methods and avoids a * double-free in devclass_delete_driver. */ kobj_class_compile((kobj_class_t) driver); /* * If the driver has any base classes, make the * devclass inherit from the devclass of the driver's * first base class. This will allow the system to * search for drivers in both devclasses for children * of a device using this driver. */ if (driver->baseclasses) parentname = driver->baseclasses[0]->name; else parentname = NULL; *dcp = devclass_find_internal(driver->name, parentname, TRUE); dl->driver = driver; TAILQ_INSERT_TAIL(&dc->drivers, dl, link); driver->refs++; /* XXX: kobj_mtx */ dl->pass = pass; driver_register_pass(dl); devclass_driver_added(dc, driver); bus_data_generation_update(); return (0); } /** * @brief Register that a device driver has been deleted from a devclass * * Register that a device driver has been removed from a devclass. * This is called by devclass_delete_driver to accomplish the * recursive notification of all the children classes of busclass, as * well as busclass. 
Each layer will attempt to detach the driver * from any devices that are children of the bus's devclass. The function * will return an error if a device fails to detach. * * We do a full search here of the devclass list at each iteration * level to save storing children-lists in the devclass structure. If * we ever move beyond a few dozen devices doing this, we may need to * reevaluate... * * @param busclass the devclass of the parent bus * @param dc the devclass of the driver being deleted * @param driver the driver being deleted */ static int devclass_driver_deleted(devclass_t busclass, devclass_t dc, driver_t *driver) { devclass_t parent; device_t dev; int error, i; /* * Disassociate from any devices. We iterate through all the * devices in the devclass of the driver and detach any which are * using the driver and which have a parent in the devclass which * we are deleting from. * * Note that since a driver can be in multiple devclasses, we * should not detach devices which are not children of devices in * the affected devclass. */ for (i = 0; i < dc->maxunit; i++) { if (dc->devices[i]) { dev = dc->devices[i]; if (dev->driver == driver && dev->parent && dev->parent->devclass == busclass) { if ((error = device_detach(dev)) != 0) return (error); BUS_PROBE_NOMATCH(dev->parent, dev); devnomatch(dev); dev->flags |= DF_DONENOMATCH; } } } /* * Walk through the children classes. Since we only keep a * single parent pointer around, we walk the entire list of * devclasses looking for children. We set the * DC_HAS_CHILDREN flag when a child devclass is created on * the parent, so we only walk the list for those devclasses * that have children. */ if (!(busclass->flags & DC_HAS_CHILDREN)) return (0); parent = busclass; TAILQ_FOREACH(busclass, &devclasses, link) { if (busclass->parent == parent) { error = devclass_driver_deleted(busclass, dc, driver); if (error) return (error); } } return (0); } /** * @brief Delete a device driver from a device class * * Delete a device driver from a devclass. This is normally called * automatically by DRIVER_MODULE(). * * If the driver is currently attached to any devices, * devclass_delete_driver() will first attempt to detach from each * device. If one of the detach calls fails, the driver will not be * deleted. * * @param dc the devclass to edit * @param driver the driver to unregister */ int devclass_delete_driver(devclass_t busclass, driver_t *driver) { devclass_t dc = devclass_find(driver->name); driverlink_t dl; int error; PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass))); if (!dc) return (0); /* * Find the link structure in the bus' list of drivers. */ TAILQ_FOREACH(dl, &busclass->drivers, link) { if (dl->driver == driver) break; } if (!dl) { PDEBUG(("%s not found in %s list", driver->name, busclass->name)); return (ENOENT); } error = devclass_driver_deleted(busclass, dc, driver); if (error != 0) return (error); TAILQ_REMOVE(&busclass->drivers, dl, link); free(dl, M_BUS); /* XXX: kobj_mtx */ driver->refs--; if (driver->refs == 0) kobj_class_free((kobj_class_t) driver); bus_data_generation_update(); return (0); } /** * @brief Quiesces a set of device drivers from a device class * * Quiesce a device driver from a devclass. This is normally called * automatically by DRIVER_MODULE(). * * If the driver is currently attached to any devices, * devclass_quiesce_driver() will first attempt to quiesce each * device. 
* * @param dc the devclass to edit * @param driver the driver to unregister */ static int devclass_quiesce_driver(devclass_t busclass, driver_t *driver) { devclass_t dc = devclass_find(driver->name); driverlink_t dl; device_t dev; int i; int error; PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass))); if (!dc) return (0); /* * Find the link structure in the bus' list of drivers. */ TAILQ_FOREACH(dl, &busclass->drivers, link) { if (dl->driver == driver) break; } if (!dl) { PDEBUG(("%s not found in %s list", driver->name, busclass->name)); return (ENOENT); } /* * Quiesce all devices. We iterate through all the devices in * the devclass of the driver and quiesce any which are using * the driver and which have a parent in the devclass which we * are quiescing. * * Note that since a driver can be in multiple devclasses, we * should not quiesce devices which are not children of * devices in the affected devclass. */ for (i = 0; i < dc->maxunit; i++) { if (dc->devices[i]) { dev = dc->devices[i]; if (dev->driver == driver && dev->parent && dev->parent->devclass == busclass) { if ((error = device_quiesce(dev)) != 0) return (error); } } } return (0); } /** * @internal */ static driverlink_t devclass_find_driver_internal(devclass_t dc, const char *classname) { driverlink_t dl; PDEBUG(("%s in devclass %s", classname, DEVCLANAME(dc))); TAILQ_FOREACH(dl, &dc->drivers, link) { if (!strcmp(dl->driver->name, classname)) return (dl); } PDEBUG(("not found")); return (NULL); } /** * @brief Return the name of the devclass */ const char * devclass_get_name(devclass_t dc) { return (dc->name); } /** * @brief Find a device given a unit number * * @param dc the devclass to search * @param unit the unit number to search for * * @returns the device with the given unit number or @c * NULL if there is no such device */ device_t devclass_get_device(devclass_t dc, int unit) { if (dc == NULL || unit < 0 || unit >= dc->maxunit) return (NULL); return (dc->devices[unit]); } /** * @brief Find the softc field of a device given a unit number * * @param dc the devclass to search * @param unit the unit number to search for * * @returns the softc field of the device with the given * unit number or @c NULL if there is no such * device */ void * devclass_get_softc(devclass_t dc, int unit) { device_t dev; dev = devclass_get_device(dc, unit); if (!dev) return (NULL); return (device_get_softc(dev)); } /** * @brief Get a list of devices in the devclass * * An array containing a list of all the devices in the given devclass * is allocated and returned in @p *devlistp. The number of devices * in the array is returned in @p *devcountp. The caller should free * the array using @c free(p, M_TEMP), even if @p *devcountp is 0. 
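 *
 * A usage sketch (the "foo" devclass name is hypothetical, not taken
 * from this file):
 *
 * @code
 * devclass_t dc;
 * device_t *devs;
 * int count, i;
 *
 * dc = devclass_find("foo");
 * if (dc != NULL && devclass_get_devices(dc, &devs, &count) == 0) {
 *	for (i = 0; i < count; i++)
 *		device_printf(devs[i], "found\n");
 *	free(devs, M_TEMP);
 * }
 * @endcode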
* * @param dc the devclass to examine * @param devlistp points at location for array pointer return * value * @param devcountp points at location for array size return value * * @retval 0 success * @retval ENOMEM the array allocation failed */ int devclass_get_devices(devclass_t dc, device_t **devlistp, int *devcountp) { int count, i; device_t *list; count = devclass_get_count(dc); list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO); if (!list) return (ENOMEM); count = 0; for (i = 0; i < dc->maxunit; i++) { if (dc->devices[i]) { list[count] = dc->devices[i]; count++; } } *devlistp = list; *devcountp = count; return (0); } /** * @brief Get a list of drivers in the devclass * * An array containing a list of pointers to all the drivers in the * given devclass is allocated and returned in @p *listp. The number * of drivers in the array is returned in @p *countp. The caller should * free the array using @c free(p, M_TEMP). * * @param dc the devclass to examine * @param listp gives location for array pointer return value * @param countp gives location for number of array elements * return value * * @retval 0 success * @retval ENOMEM the array allocation failed */ int devclass_get_drivers(devclass_t dc, driver_t ***listp, int *countp) { driverlink_t dl; driver_t **list; int count; count = 0; TAILQ_FOREACH(dl, &dc->drivers, link) count++; list = malloc(count * sizeof(driver_t *), M_TEMP, M_NOWAIT); if (list == NULL) return (ENOMEM); count = 0; TAILQ_FOREACH(dl, &dc->drivers, link) { list[count] = dl->driver; count++; } *listp = list; *countp = count; return (0); } /** * @brief Get the number of devices in a devclass * * @param dc the devclass to examine */ int devclass_get_count(devclass_t dc) { int count, i; count = 0; for (i = 0; i < dc->maxunit; i++) if (dc->devices[i]) count++; return (count); } /** * @brief Get the maximum unit number used in a devclass * * Note that this is one greater than the highest currently-allocated * unit. If a null devclass_t is passed in, -1 is returned to indicate * that not even the devclass has been allocated yet. * * @param dc the devclass to examine */ int devclass_get_maxunit(devclass_t dc) { if (dc == NULL) return (-1); return (dc->maxunit); } /** * @brief Find a free unit number in a devclass * * This function searches for the first unused unit number greater * than or equal to @p unit. * * @param dc the devclass to examine * @param unit the first unit number to check */ int devclass_find_free_unit(devclass_t dc, int unit) { if (dc == NULL) return (unit); while (unit < dc->maxunit && dc->devices[unit] != NULL) unit++; return (unit); } /** * @brief Set the parent of a devclass * * The parent class is normally initialised automatically by * DRIVER_MODULE(). * * @param dc the devclass to edit * @param pdc the new parent devclass */ void devclass_set_parent(devclass_t dc, devclass_t pdc) { dc->parent = pdc; } /** * @brief Get the parent of a devclass * * @param dc the devclass to examine */ devclass_t devclass_get_parent(devclass_t dc) { return (dc->parent); } struct sysctl_ctx_list * devclass_get_sysctl_ctx(devclass_t dc) { return (&dc->sysctl_ctx); } struct sysctl_oid * devclass_get_sysctl_tree(devclass_t dc) { return (dc->sysctl_tree); } /** * @internal * @brief Allocate a unit number * * On entry, @p *unitp is the desired unit number (or @c -1 if any * will do). The allocated unit number is returned in @p *unitp.
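 *
 * Unit selection interacts with wiring hints: a hypothetical
 * /boot/device.hints entry such as
 *
 * @code
 * hint.foo.0.at="pci0"
 * @endcode
 *
 * wires unit 0 of a "foo" devclass, so the allocator will skip that
 * slot when choosing a unit for an unwired device (sketch; the "foo"
 * name is illustrative only).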
* @param dc the devclass to allocate from * @param unitp points at the location for the allocated unit * number * * @retval 0 success * @retval EEXIST the requested unit number is already allocated * @retval ENOMEM memory allocation failure */ static int devclass_alloc_unit(devclass_t dc, device_t dev, int *unitp) { const char *s; int unit = *unitp; PDEBUG(("unit %d in devclass %s", unit, DEVCLANAME(dc))); /* Ask the parent bus if it wants to wire this device. */ if (unit == -1) BUS_HINT_DEVICE_UNIT(device_get_parent(dev), dev, dc->name, &unit); /* If we were given a wired unit number, check for existing device */ /* XXX imp XXX */ if (unit != -1) { if (unit >= 0 && unit < dc->maxunit && dc->devices[unit] != NULL) { if (bootverbose) printf("%s: %s%d already exists; skipping it\n", dc->name, dc->name, *unitp); return (EEXIST); } } else { /* Unwired device, find the next available slot for it */ unit = 0; for (unit = 0;; unit++) { /* If there is an "at" hint for a unit then skip it. */ if (resource_string_value(dc->name, unit, "at", &s) == 0) continue; /* If this device slot is already in use, skip it. */ if (unit < dc->maxunit && dc->devices[unit] != NULL) continue; break; } } /* * We've selected a unit beyond the length of the table, so let's * extend the table to make room for all units up to and including * this one. */ if (unit >= dc->maxunit) { device_t *newlist, *oldlist; int newsize; oldlist = dc->devices; newsize = roundup((unit + 1), MINALLOCSIZE / sizeof(device_t)); newlist = malloc(sizeof(device_t) * newsize, M_BUS, M_NOWAIT); if (!newlist) return (ENOMEM); if (oldlist != NULL) bcopy(oldlist, newlist, sizeof(device_t) * dc->maxunit); bzero(newlist + dc->maxunit, sizeof(device_t) * (newsize - dc->maxunit)); dc->devices = newlist; dc->maxunit = newsize; if (oldlist != NULL) free(oldlist, M_BUS); } PDEBUG(("now: unit %d in devclass %s", unit, DEVCLANAME(dc))); *unitp = unit; return (0); } /** * @internal * @brief Add a device to a devclass * * A unit number is allocated for the device (using the device's * preferred unit number if any) and the device is registered in the * devclass. This allows the device to be looked up by its unit * number, e.g. by decoding a dev_t minor number. * * @param dc the devclass to add to * @param dev the device to add * * @retval 0 success * @retval EEXIST the requested unit number is already allocated * @retval ENOMEM memory allocation failure */ static int devclass_add_device(devclass_t dc, device_t dev) { int buflen, error; PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); buflen = snprintf(NULL, 0, "%s%d$", dc->name, INT_MAX); if (buflen < 0) return (ENOMEM); dev->nameunit = malloc(buflen, M_BUS, M_NOWAIT|M_ZERO); if (!dev->nameunit) return (ENOMEM); if ((error = devclass_alloc_unit(dc, dev, &dev->unit)) != 0) { free(dev->nameunit, M_BUS); dev->nameunit = NULL; return (error); } dc->devices[dev->unit] = dev; dev->devclass = dc; snprintf(dev->nameunit, buflen, "%s%d", dc->name, dev->unit); return (0); } /** * @internal * @brief Delete a device from a devclass * * The device is removed from the devclass's device list and its unit * number is freed. 
* @param dc the devclass to delete from * @param dev the device to delete * * @retval 0 success */ static int devclass_delete_device(devclass_t dc, device_t dev) { if (!dc || !dev) return (0); PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); if (dev->devclass != dc || dc->devices[dev->unit] != dev) panic("devclass_delete_device: inconsistent device class"); dc->devices[dev->unit] = NULL; if (dev->flags & DF_WILDCARD) dev->unit = -1; dev->devclass = NULL; free(dev->nameunit, M_BUS); dev->nameunit = NULL; return (0); } /** * @internal * @brief Make a new device and add it as a child of @p parent * * @param parent the parent of the new device * @param name the devclass name of the new device or @c NULL * to leave the devclass unspecified * @param unit the unit number of the new device or @c -1 to * leave the unit number unspecified * * @returns the new device */ static device_t make_device(device_t parent, const char *name, int unit) { device_t dev; devclass_t dc; PDEBUG(("%s at %s as unit %d", name, DEVICENAME(parent), unit)); if (name) { dc = devclass_find_internal(name, NULL, TRUE); if (!dc) { printf("make_device: can't find device class %s\n", name); return (NULL); } } else { dc = NULL; } dev = malloc(sizeof(*dev), M_BUS, M_NOWAIT|M_ZERO); if (!dev) return (NULL); dev->parent = parent; TAILQ_INIT(&dev->children); kobj_init((kobj_t) dev, &null_class); dev->driver = NULL; dev->devclass = NULL; dev->unit = unit; dev->nameunit = NULL; dev->desc = NULL; dev->busy = 0; dev->devflags = 0; dev->flags = DF_ENABLED; dev->order = 0; if (unit == -1) dev->flags |= DF_WILDCARD; if (name) { dev->flags |= DF_FIXEDCLASS; if (devclass_add_device(dc, dev)) { kobj_delete((kobj_t) dev, M_BUS); return (NULL); } } dev->ivars = NULL; dev->softc = NULL; dev->state = DS_NOTPRESENT; TAILQ_INSERT_TAIL(&bus_data_devices, dev, devlink); bus_data_generation_update(); return (dev); } /** * @internal * @brief Print a description of a device. */ static int device_print_child(device_t dev, device_t child) { int retval = 0; if (device_is_alive(child)) retval += BUS_PRINT_CHILD(dev, child); else retval += device_printf(child, " not found\n"); return (retval); } /** * @brief Create a new device * * This creates a new device and adds it as a child of an existing * parent device. The new device will be added after the last existing * child with order zero. * * @param dev the device which will be the parent of the * new child device * @param name devclass name for new device or @c NULL if not * specified * @param unit unit number for new device or @c -1 if not * specified * * @returns the new device */ device_t device_add_child(device_t dev, const char *name, int unit) { return (device_add_child_ordered(dev, 0, name, unit)); } /** * @brief Create a new device * * This creates a new device and adds it as a child of an existing * parent device. The new device will be added after the last existing * child with the same order.
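 *
 * A sketch of typical use from a bus driver's DEVICE_IDENTIFY()
 * method (the "foo" names are hypothetical, not part of this file):
 *
 * @code
 * static void
 * foo_identify(driver_t *driver, device_t parent)
 * {
 *	if (device_find_child(parent, "foo", -1) == NULL)
 *		device_add_child(parent, "foo", -1);
 * }
 * @endcode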
* * @param dev the device which will be the parent of the * new child device * @param order a value which is used to partially sort the * children of @p dev - devices created using * lower values of @p order appear first in @p * dev's list of children * @param name devclass name for new device or @c NULL if not * specified * @param unit unit number for new device or @c -1 if not * specified * * @returns the new device */ device_t device_add_child_ordered(device_t dev, u_int order, const char *name, int unit) { device_t child; device_t place; PDEBUG(("%s at %s with order %u as unit %d", name, DEVICENAME(dev), order, unit)); KASSERT(name != NULL || unit == -1, ("child device with wildcard name and specific unit number")); child = make_device(dev, name, unit); if (child == NULL) return (child); child->order = order; TAILQ_FOREACH(place, &dev->children, link) { if (place->order > order) break; } if (place) { /* * The device 'place' is the first device whose order is * greater than the new child. */ TAILQ_INSERT_BEFORE(place, child, link); } else { /* * The new child's order is greater than or equal to the order * of any existing device. Add the child to the tail of the list. */ TAILQ_INSERT_TAIL(&dev->children, child, link); } bus_data_generation_update(); return (child); } /** * @brief Delete a device * * This function deletes a device along with all of its children. If * the device currently has a driver attached to it, the device is * detached first using device_detach(). * * @param dev the parent device * @param child the device to delete * * @retval 0 success * @retval non-zero a unix error code describing the error */ int device_delete_child(device_t dev, device_t child) { int error; device_t grandchild; PDEBUG(("%s from %s", DEVICENAME(child), DEVICENAME(dev))); /* detach the child (the grandchildren's parent) before deleting them */ if ((error = device_detach(child)) != 0) return (error); /* remove children second */ while ((grandchild = TAILQ_FIRST(&child->children)) != NULL) { error = device_delete_child(child, grandchild); if (error) return (error); } if (child->devclass) devclass_delete_device(child->devclass, child); if (child->parent) BUS_CHILD_DELETED(dev, child); TAILQ_REMOVE(&dev->children, child, link); TAILQ_REMOVE(&bus_data_devices, child, devlink); kobj_delete((kobj_t) child, M_BUS); bus_data_generation_update(); return (0); } /** * @brief Delete all children devices of the given device, if any. * * This function deletes all children devices of the given device, if * any, using the device_delete_child() function for each device it * finds. If a child device cannot be deleted, this function will * return an error code. * * @param dev the parent device * * @retval 0 success * @retval non-zero a device would not detach */ int device_delete_children(device_t dev) { device_t child; int error; PDEBUG(("Deleting all children of %s", DEVICENAME(dev))); error = 0; while ((child = TAILQ_FIRST(&dev->children)) != NULL) { error = device_delete_child(dev, child); if (error) { PDEBUG(("Failed deleting %s", DEVICENAME(child))); break; } } return (error); } /** * @brief Find a device given a unit number * * This is similar to devclass_get_devices() but only searches for * devices which have @p dev as a parent. * * @param dev the parent device to search * @param unit the unit number to search for. If the unit is -1, * return the first child of @p dev which has name * @p classname (that is, the one with the lowest unit.)
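 *
 * For example (sketch, with a hypothetical "foo" class), the call
 *
 * @code
 * child = device_find_child(bus, "foo", 0);
 * @endcode
 *
 * returns foo0 if it is a child of @p bus, and @c NULL otherwise.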
* * @returns the device with the given unit number or @c * NULL if there is no such device */ device_t device_find_child(device_t dev, const char *classname, int unit) { devclass_t dc; device_t child; dc = devclass_find(classname); if (!dc) return (NULL); if (unit != -1) { child = devclass_get_device(dc, unit); if (child && child->parent == dev) return (child); } else { for (unit = 0; unit < devclass_get_maxunit(dc); unit++) { child = devclass_get_device(dc, unit); if (child && child->parent == dev) return (child); } } return (NULL); } /** * @internal */ static driverlink_t first_matching_driver(devclass_t dc, device_t dev) { if (dev->devclass) return (devclass_find_driver_internal(dc, dev->devclass->name)); return (TAILQ_FIRST(&dc->drivers)); } /** * @internal */ static driverlink_t next_matching_driver(devclass_t dc, device_t dev, driverlink_t last) { if (dev->devclass) { driverlink_t dl; for (dl = TAILQ_NEXT(last, link); dl; dl = TAILQ_NEXT(dl, link)) if (!strcmp(dev->devclass->name, dl->driver->name)) return (dl); return (NULL); } return (TAILQ_NEXT(last, link)); } /** * @internal */ int device_probe_child(device_t dev, device_t child) { devclass_t dc; driverlink_t best = NULL; driverlink_t dl; int result, pri = 0; int hasclass = (child->devclass != NULL); GIANT_REQUIRED; dc = dev->devclass; if (!dc) panic("device_probe_child: parent device has no devclass"); /* * If the state is already probed, then return. However, don't * return if we can rebid this object. */ if (child->state == DS_ALIVE && (child->flags & DF_REBID) == 0) return (0); for (; dc; dc = dc->parent) { for (dl = first_matching_driver(dc, child); dl; dl = next_matching_driver(dc, child, dl)) { /* If this driver's pass is too high, then ignore it. */ if (dl->pass > bus_current_pass) continue; PDEBUG(("Trying %s", DRIVERNAME(dl->driver))); result = device_set_driver(child, dl->driver); if (result == ENOMEM) return (result); else if (result != 0) continue; if (!hasclass) { if (device_set_devclass(child, dl->driver->name) != 0) { char const * devname = device_get_name(child); if (devname == NULL) devname = "(unknown)"; printf("driver bug: Unable to set " "devclass (class: %s " "devname: %s)\n", dl->driver->name, devname); (void)device_set_driver(child, NULL); continue; } } /* Fetch any flags for the device before probing. */ resource_int_value(dl->driver->name, child->unit, "flags", &child->devflags); result = DEVICE_PROBE(child); /* Reset flags and devclass before the next probe. */ child->devflags = 0; if (!hasclass) (void)device_set_devclass(child, NULL); /* * If the driver returns SUCCESS, there can be * no higher match for this device. */ if (result == 0) { best = dl; pri = 0; break; } /* * Reset DF_QUIET in case this driver doesn't * end up as the best driver. */ device_verbose(child); /* * Probes that return BUS_PROBE_NOWILDCARD or lower * only match on devices whose driver was explicitly * specified. */ if (result <= BUS_PROBE_NOWILDCARD && !(child->flags & DF_FIXEDCLASS)) { result = ENXIO; } /* * The driver returned an error so it * certainly doesn't match. */ if (result > 0) { (void)device_set_driver(child, NULL); continue; } /* * A priority lower than SUCCESS, remember the * best matching driver. Initialise the value * of pri for the first match. */ if (best == NULL || result > pri) { best = dl; pri = result; continue; } } /* * If we have an unambiguous match in this devclass, * don't look in the parent. */ if (best && pri == 0) break; } /* * If we found a driver, change state and initialise the devclass. 
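 *
 * For reference, the selection above is driven by each candidate
 * driver's DEVICE_PROBE() method returning one of the BUS_PROBE_*
 * priorities (or an errno).  A hypothetical probe method (sketch
 * only; FOO_DEVID and the foo names are not part of this file):
 *
 *	static int
 *	foo_probe(device_t dev)
 *	{
 *		if (pci_get_devid(dev) != FOO_DEVID)
 *			return (ENXIO);
 *		device_set_desc(dev, "Foo controller");
 *		return (BUS_PROBE_DEFAULT);
 *	}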
*/ /* XXX What happens if we rebid and got no best? */ if (best) { /* * If this device was attached, and we were asked to * rescan, and it is a different driver, then we have * to detach the old driver and reattach this new one. * Note, we don't have to check for DF_REBID here * because if the state is > DS_ALIVE, we know it must * be. * * This assumes that all DF_REBID drivers can have * their probe routine called at any time and that * they are idempotent as well as completely benign in * normal operations. * * We also have to make sure that the detach * succeeded, otherwise we fail the operation (or * maybe it should just fail silently? I'm torn). */ if (child->state > DS_ALIVE && best->driver != child->driver) if ((result = device_detach(child)) != 0) return (result); /* Set the winning driver, devclass, and flags. */ if (!child->devclass) { result = device_set_devclass(child, best->driver->name); if (result != 0) return (result); } result = device_set_driver(child, best->driver); if (result != 0) return (result); resource_int_value(best->driver->name, child->unit, "flags", &child->devflags); if (pri < 0) { /* * A bit bogus. Call the probe method again to make * sure that we have the right description. */ DEVICE_PROBE(child); #if 0 child->flags |= DF_REBID; #endif } else child->flags &= ~DF_REBID; child->state = DS_ALIVE; bus_data_generation_update(); return (0); } return (ENXIO); } /** * @brief Return the parent of a device */ device_t device_get_parent(device_t dev) { return (dev->parent); } /** * @brief Get a list of children of a device * * An array containing a list of all the children of the given device * is allocated and returned in @p *devlistp. The number of devices * in the array is returned in @p *devcountp. The caller should free * the array using @c free(p, M_TEMP). * * @param dev the device to examine * @param devlistp points at location for array pointer return * value * @param devcountp points at location for array size return value * * @retval 0 success * @retval ENOMEM the array allocation failed */ int device_get_children(device_t dev, device_t **devlistp, int *devcountp) { int count; device_t child; device_t *list; count = 0; TAILQ_FOREACH(child, &dev->children, link) { count++; } if (count == 0) { *devlistp = NULL; *devcountp = 0; return (0); } list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO); if (!list) return (ENOMEM); count = 0; TAILQ_FOREACH(child, &dev->children, link) { list[count] = child; count++; } *devlistp = list; *devcountp = count; return (0); } /** * @brief Return the current driver for the device or @c NULL if there * is no driver currently attached */ driver_t * device_get_driver(device_t dev) { return (dev->driver); } /** * @brief Return the current devclass for the device or @c NULL if * there is none. */ devclass_t device_get_devclass(device_t dev) { return (dev->devclass); } /** * @brief Return the name of the device's devclass or @c NULL if there * is none. */ const char * device_get_name(device_t dev) { if (dev != NULL && dev->devclass) return (devclass_get_name(dev->devclass)); return (NULL); } /** * @brief Return a string containing the device's devclass name * followed by an ascii representation of the device's unit number * (e.g. @c "foo2"). */ const char * device_get_nameunit(device_t dev) { return (dev->nameunit); } /** * @brief Return the device's unit number.
*/ int device_get_unit(device_t dev) { return (dev->unit); } /** * @brief Return the device's description string */ const char * device_get_desc(device_t dev) { return (dev->desc); } /** * @brief Return the device's flags */ uint32_t device_get_flags(device_t dev) { return (dev->devflags); } struct sysctl_ctx_list * device_get_sysctl_ctx(device_t dev) { return (&dev->sysctl_ctx); } struct sysctl_oid * device_get_sysctl_tree(device_t dev) { return (dev->sysctl_tree); } /** * @brief Print the name of the device followed by a colon and a space * * @returns the number of characters printed */ int device_print_prettyname(device_t dev) { const char *name = device_get_name(dev); if (name == NULL) return (printf("unknown: ")); return (printf("%s%d: ", name, device_get_unit(dev))); } /** * @brief Print the name of the device followed by a colon, a space * and the result of calling vprintf() with the value of @p fmt and * the following arguments. * * @returns the number of characters printed */ int device_printf(device_t dev, const char * fmt, ...) { va_list ap; int retval; retval = device_print_prettyname(dev); va_start(ap, fmt); retval += vprintf(fmt, ap); va_end(ap); return (retval); } /** * @internal */ static void device_set_desc_internal(device_t dev, const char* desc, int copy) { if (dev->desc && (dev->flags & DF_DESCMALLOCED)) { free(dev->desc, M_BUS); dev->flags &= ~DF_DESCMALLOCED; dev->desc = NULL; } if (copy && desc) { dev->desc = malloc(strlen(desc) + 1, M_BUS, M_NOWAIT); if (dev->desc) { strcpy(dev->desc, desc); dev->flags |= DF_DESCMALLOCED; } } else { /* Avoid a -Wcast-qual warning */ dev->desc = (char *)(uintptr_t) desc; } bus_data_generation_update(); } /** * @brief Set the device's description * * The value of @c desc should be a string constant that will not * change (at least until the description is changed in a subsequent * call to device_set_desc() or device_set_desc_copy()). */ void device_set_desc(device_t dev, const char* desc) { device_set_desc_internal(dev, desc, FALSE); } /** * @brief Set the device's description * * The string pointed to by @c desc is copied. Use this function if * the device description is generated, (e.g. with sprintf()). */ void device_set_desc_copy(device_t dev, const char* desc) { device_set_desc_internal(dev, desc, TRUE); } /** * @brief Set the device's flags */ void device_set_flags(device_t dev, uint32_t flags) { dev->devflags = flags; } /** * @brief Return the device's softc field * * The softc is allocated and zeroed when a driver is attached, based * on the size field of the driver. */ void * device_get_softc(device_t dev) { return (dev->softc); } /** * @brief Set the device's softc field * * Most drivers do not need to use this since the softc is allocated * automatically when the driver is attached. */ void device_set_softc(device_t dev, void *softc) { if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) free(dev->softc, M_BUS_SC); dev->softc = softc; if (dev->softc) dev->flags |= DF_EXTERNALSOFTC; else dev->flags &= ~DF_EXTERNALSOFTC; } /** * @brief Free claimed softc * * Most drivers do not need to use this since the softc is freed * automatically when the driver is detached. */ void device_free_softc(void *softc) { free(softc, M_BUS_SC); } /** * @brief Claim softc * * This function can be used to let the driver free the automatically * allocated softc using "device_free_softc()". This function is * useful when the driver is refcounting the softc and the softc * cannot be freed when the "device_detach" method is called. 
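 *
 * A sketch of the intended pattern (hypothetical "foo" softc with a
 * "refs" refcount field; not part of this file): DEVICE_ATTACH()
 * calls device_claim_softc(dev), and whoever drops the last
 * reference then frees the softc itself:
 *
 * @code
 * static void
 * foo_softc_release(struct foo_softc *sc)
 * {
 *	if (refcount_release(&sc->refs))
 *		device_free_softc(sc);
 * }
 * @endcode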
*/ void device_claim_softc(device_t dev) { if (dev->softc) dev->flags |= DF_EXTERNALSOFTC; else dev->flags &= ~DF_EXTERNALSOFTC; } /** * @brief Get the device's ivars field * * The ivars field is used by the parent device to store per-device * state (e.g. the physical location of the device or a list of * resources). */ void * device_get_ivars(device_t dev) { KASSERT(dev != NULL, ("device_get_ivars(NULL, ...)")); return (dev->ivars); } /** * @brief Set the device's ivars field */ void device_set_ivars(device_t dev, void * ivars) { KASSERT(dev != NULL, ("device_set_ivars(NULL, ...)")); dev->ivars = ivars; } /** * @brief Return the device's state */ device_state_t device_get_state(device_t dev) { return (dev->state); } /** * @brief Set the DF_ENABLED flag for the device */ void device_enable(device_t dev) { dev->flags |= DF_ENABLED; } /** * @brief Clear the DF_ENABLED flag for the device */ void device_disable(device_t dev) { dev->flags &= ~DF_ENABLED; } /** * @brief Increment the busy counter for the device */ void device_busy(device_t dev) { if (dev->state < DS_ATTACHING) panic("device_busy: called for unattached device"); if (dev->busy == 0 && dev->parent) device_busy(dev->parent); dev->busy++; if (dev->state == DS_ATTACHED) dev->state = DS_BUSY; } /** * @brief Decrement the busy counter for the device */ void device_unbusy(device_t dev) { if (dev->busy != 0 && dev->state != DS_BUSY && dev->state != DS_ATTACHING) panic("device_unbusy: called for non-busy device %s", device_get_nameunit(dev)); dev->busy--; if (dev->busy == 0) { if (dev->parent) device_unbusy(dev->parent); if (dev->state == DS_BUSY) dev->state = DS_ATTACHED; } } /** * @brief Set the DF_QUIET flag for the device */ void device_quiet(device_t dev) { dev->flags |= DF_QUIET; } /** * @brief Clear the DF_QUIET flag for the device */ void device_verbose(device_t dev) { dev->flags &= ~DF_QUIET; } /** * @brief Return non-zero if the DF_QUIET flag is set on the device */ int device_is_quiet(device_t dev) { return ((dev->flags & DF_QUIET) != 0); } /** * @brief Return non-zero if the DF_ENABLED flag is set on the device */ int device_is_enabled(device_t dev) { return ((dev->flags & DF_ENABLED) != 0); } /** * @brief Return non-zero if the device was successfully probed */ int device_is_alive(device_t dev) { return (dev->state >= DS_ALIVE); } /** * @brief Return non-zero if the device currently has a driver * attached to it */ int device_is_attached(device_t dev) { return (dev->state >= DS_ATTACHED); } /** * @brief Return non-zero if the device is currently suspended. */ int device_is_suspended(device_t dev) { return ((dev->flags & DF_SUSPENDED) != 0); } /** * @brief Set the devclass of a device * @see devclass_add_device(). */ int device_set_devclass(device_t dev, const char *classname) { devclass_t dc; int error; if (!classname) { if (dev->devclass) devclass_delete_device(dev->devclass, dev); return (0); } if (dev->devclass) { printf("device_set_devclass: device class already set\n"); return (EINVAL); } dc = devclass_find_internal(classname, NULL, TRUE); if (!dc) return (ENOMEM); error = devclass_add_device(dc, dev); bus_data_generation_update(); return (error); } /** * @brief Set the devclass of a device and mark the devclass fixed. 
* @see device_set_devclass() */ int device_set_devclass_fixed(device_t dev, const char *classname) { int error; if (classname == NULL) return (EINVAL); error = device_set_devclass(dev, classname); if (error) return (error); dev->flags |= DF_FIXEDCLASS; return (0); } /** * @brief Set the driver of a device * * @retval 0 success * @retval EBUSY the device already has a driver attached * @retval ENOMEM a memory allocation failure occurred */ int device_set_driver(device_t dev, driver_t *driver) { if (dev->state >= DS_ATTACHED) return (EBUSY); if (dev->driver == driver) return (0); if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) { free(dev->softc, M_BUS_SC); dev->softc = NULL; } device_set_desc(dev, NULL); kobj_delete((kobj_t) dev, NULL); dev->driver = driver; if (driver) { kobj_init((kobj_t) dev, (kobj_class_t) driver); if (!(dev->flags & DF_EXTERNALSOFTC) && driver->size > 0) { dev->softc = malloc(driver->size, M_BUS_SC, M_NOWAIT | M_ZERO); if (!dev->softc) { kobj_delete((kobj_t) dev, NULL); kobj_init((kobj_t) dev, &null_class); dev->driver = NULL; return (ENOMEM); } } } else { kobj_init((kobj_t) dev, &null_class); } bus_data_generation_update(); return (0); } /** * @brief Probe a device and return the status. * * This function is the core of the device autoconfiguration * system. Its purpose is to select a suitable driver for a device and * then call that driver to initialise the hardware appropriately. The * driver is selected by calling the DEVICE_PROBE() method of a set of * candidate drivers and then choosing the driver which returned the * best value. This driver is then attached to the device using * device_attach(). * * The set of suitable drivers is taken from the list of drivers in * the parent device's devclass. If the device was originally created * with a specific class name (see device_add_child()), only drivers * with that name are probed, otherwise all drivers in the devclass * are probed. If no drivers return successful probe values in the * parent devclass, the search continues in the parent of that * devclass (see devclass_get_parent()) if any. * * @param dev the device to initialise * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code * @retval -1 Device already attached */ int device_probe(device_t dev) { int error; GIANT_REQUIRED; if (dev->state >= DS_ALIVE && (dev->flags & DF_REBID) == 0) return (-1); if (!(dev->flags & DF_ENABLED)) { if (bootverbose && device_get_name(dev) != NULL) { device_print_prettyname(dev); printf("not probed (disabled)\n"); } return (-1); } if ((error = device_probe_child(dev->parent, dev)) != 0) { if (bus_current_pass == BUS_PASS_DEFAULT && !(dev->flags & DF_DONENOMATCH)) { BUS_PROBE_NOMATCH(dev->parent, dev); devnomatch(dev); dev->flags |= DF_DONENOMATCH; } return (error); } return (0); } /** * @brief Probe a device and attach a driver if possible * * Calls device_probe() and attaches if that was successful. */ int device_probe_and_attach(device_t dev) { int error; GIANT_REQUIRED; error = device_probe(dev); if (error == -1) return (0); else if (error != 0) return (error); CURVNET_SET_QUIET(vnet0); error = device_attach(dev); CURVNET_RESTORE(); return (error); } /** * @brief Attach a device driver to a device * * This function is a wrapper around the DEVICE_ATTACH() driver * method.
In addition to calling DEVICE_ATTACH(), it initialises the * device's sysctl tree, optionally prints a description of the device * and queues a notification event for user-based device management * services. * * Normally this function is only called internally from * device_probe_and_attach(). * * @param dev the device to initialise * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code */ int device_attach(device_t dev) { uint64_t attachtime; int error; if (resource_disabled(dev->driver->name, dev->unit)) { device_disable(dev); if (bootverbose) device_printf(dev, "disabled via hints entry\n"); return (ENXIO); } device_sysctl_init(dev); if (!device_is_quiet(dev)) device_print_child(dev->parent, dev); attachtime = get_cyclecount(); dev->state = DS_ATTACHING; if ((error = DEVICE_ATTACH(dev)) != 0) { printf("device_attach: %s%d attach returned %d\n", dev->driver->name, dev->unit, error); if (!(dev->flags & DF_FIXEDCLASS)) devclass_delete_device(dev->devclass, dev); (void)device_set_driver(dev, NULL); device_sysctl_fini(dev); KASSERT(dev->busy == 0, ("attach failed but busy")); dev->state = DS_NOTPRESENT; return (error); } attachtime = get_cyclecount() - attachtime; /* * 4 bits per device is a reasonable value for desktop and server * hardware with good get_cyclecount() implementations, but WILL * need to be adjusted on other platforms. */ #define RANDOM_PROBE_BIT_GUESS 4 if (bootverbose) printf("random: harvesting attach, %zu bytes (%d bits) from %s%d\n", sizeof(attachtime), RANDOM_PROBE_BIT_GUESS, dev->driver->name, dev->unit); random_harvest_direct(&attachtime, sizeof(attachtime), RANDOM_PROBE_BIT_GUESS, RANDOM_ATTACH); device_sysctl_update(dev); if (dev->busy) dev->state = DS_BUSY; else dev->state = DS_ATTACHED; dev->flags &= ~DF_DONENOMATCH; EVENTHANDLER_INVOKE(device_attach, dev); devadded(dev); return (0); } /** * @brief Detach a driver from a device * * This function is a wrapper around the DEVICE_DETACH() driver * method. If the call to DEVICE_DETACH() succeeds, it calls * BUS_CHILD_DETACHED() for the parent of @p dev, queues a * notification event for user-based device management services and * cleans up the device's sysctl tree. * * @param dev the device to un-initialise * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code */ int device_detach(device_t dev) { int error; GIANT_REQUIRED; PDEBUG(("%s", DEVICENAME(dev))); if (dev->state == DS_BUSY) return (EBUSY); if (dev->state != DS_ATTACHED) return (0); EVENTHANDLER_INVOKE(device_detach, dev, EVHDEV_DETACH_BEGIN); if ((error = DEVICE_DETACH(dev)) != 0) { EVENTHANDLER_INVOKE(device_detach, dev, EVHDEV_DETACH_FAILED); return (error); } else { EVENTHANDLER_INVOKE(device_detach, dev, EVHDEV_DETACH_COMPLETE); } devremoved(dev); if (!device_is_quiet(dev)) device_printf(dev, "detached\n"); if (dev->parent) BUS_CHILD_DETACHED(dev->parent, dev); if (!(dev->flags & DF_FIXEDCLASS)) devclass_delete_device(dev->devclass, dev); device_verbose(dev); dev->state = DS_NOTPRESENT; (void)device_set_driver(dev, NULL); device_sysctl_fini(dev); return (0); } /** * @brief Tells a driver to quiesce itself. * * This function is a wrapper around the DEVICE_QUIESCE() driver * method. If the call to DEVICE_QUIESCE() succeeds, the driver has * stopped generating new activity and it should be safe to detach the * device afterwards.
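 *
 * A sketch of typical use, quiescing before a detach is attempted
 * (hypothetical flow, not taken from this file):
 *
 * @code
 * error = device_quiesce(dev);
 * if (error == 0)
 *	error = device_detach(dev);
 * @endcode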
* * @param dev the device to quiesce * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code */ int device_quiesce(device_t dev) { PDEBUG(("%s", DEVICENAME(dev))); if (dev->state == DS_BUSY) return (EBUSY); if (dev->state != DS_ATTACHED) return (0); return (DEVICE_QUIESCE(dev)); } /** * @brief Notify a device of system shutdown * * This function calls the DEVICE_SHUTDOWN() driver method if the * device currently has an attached driver. * * @returns the value returned by DEVICE_SHUTDOWN() */ int device_shutdown(device_t dev) { if (dev->state < DS_ATTACHED) return (0); return (DEVICE_SHUTDOWN(dev)); } /** * @brief Set the unit number of a device * * This function can be used to override the unit number used for a * device (e.g. to wire a device to a pre-configured unit number). */ int device_set_unit(device_t dev, int unit) { devclass_t dc; int err; dc = device_get_devclass(dev); if (unit < dc->maxunit && dc->devices[unit]) return (EBUSY); err = devclass_delete_device(dc, dev); if (err) return (err); dev->unit = unit; err = devclass_add_device(dc, dev); if (err) return (err); bus_data_generation_update(); return (0); } /*======================================*/ /* * Some useful method implementations to make life easier for bus drivers. */ void resource_init_map_request_impl(struct resource_map_request *args, size_t sz) { bzero(args, sz); args->size = sz; args->memattr = VM_MEMATTR_UNCACHEABLE; } /** * @brief Initialise a resource list. * * @param rl the resource list to initialise */ void resource_list_init(struct resource_list *rl) { STAILQ_INIT(rl); } /** * @brief Reclaim memory used by a resource list. * * This function frees the memory for all resource entries on the list * (if any). * * @param rl the resource list to free */ void resource_list_free(struct resource_list *rl) { struct resource_list_entry *rle; while ((rle = STAILQ_FIRST(rl)) != NULL) { if (rle->res) panic("resource_list_free: resource entry is busy"); STAILQ_REMOVE_HEAD(rl, link); free(rle, M_BUS); } } /** * @brief Add a resource entry. * * This function adds a resource entry using the given @p type, @p * start, @p end and @p count values. A rid value is chosen by * searching sequentially for the first unused rid starting at zero. * * @param rl the resource list to edit * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param start the start address of the resource * @param end the end address of the resource * @param count XXX end-start+1 */ int resource_list_add_next(struct resource_list *rl, int type, rman_res_t start, rman_res_t end, rman_res_t count) { int rid; rid = 0; while (resource_list_find(rl, type, rid) != NULL) rid++; resource_list_add(rl, type, rid, start, end, count); return (rid); } /** * @brief Add or modify a resource entry. * * If an existing entry exists with the same type and rid, it will be * modified using the given values of @p start, @p end and @p * count. If no entry exists, a new one will be created using the * given values. The resource list entry that matches is then returned. * * @param rl the resource list to edit * @param type the resource entry type (e.g. 
SYS_RES_MEMORY) * @param rid the resource identifier * @param start the start address of the resource * @param end the end address of the resource * @param count XXX end-start+1 */ struct resource_list_entry * resource_list_add(struct resource_list *rl, int type, int rid, rman_res_t start, rman_res_t end, rman_res_t count) { struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (!rle) { rle = malloc(sizeof(struct resource_list_entry), M_BUS, M_NOWAIT); if (!rle) panic("resource_list_add: can't record entry"); STAILQ_INSERT_TAIL(rl, rle, link); rle->type = type; rle->rid = rid; rle->res = NULL; rle->flags = 0; } if (rle->res) panic("resource_list_add: resource entry is busy"); rle->start = start; rle->end = end; rle->count = count; return (rle); } /** * @brief Determine if a resource entry is busy. * * Returns true if a resource entry is busy meaning that it has an * associated resource that is not an unallocated "reserved" resource. * * @param rl the resource list to search * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * * @returns Non-zero if the entry is busy, zero otherwise. */ int resource_list_busy(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (rle == NULL || rle->res == NULL) return (0); if ((rle->flags & (RLE_RESERVED | RLE_ALLOCATED)) == RLE_RESERVED) { KASSERT(!(rman_get_flags(rle->res) & RF_ACTIVE), ("reserved resource is active")); return (0); } return (1); } /** * @brief Determine if a resource entry is reserved. * * Returns true if a resource entry is reserved meaning that it has an * associated "reserved" resource. The resource can either be * allocated or unallocated. * * @param rl the resource list to search * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * * @returns Non-zero if the entry is reserved, zero otherwise. */ int resource_list_reserved(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (rle != NULL && rle->flags & RLE_RESERVED) return (1); return (0); } /** * @brief Find a resource entry by type and rid. * * @param rl the resource list to search * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * * @returns the resource entry pointer or NULL if there is no such * entry. */ struct resource_list_entry * resource_list_find(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle; STAILQ_FOREACH(rle, rl, link) { if (rle->type == type && rle->rid == rid) return (rle); } return (NULL); } /** * @brief Delete a resource entry. * * @param rl the resource list to edit * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier */ void resource_list_delete(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle = resource_list_find(rl, type, rid); if (rle) { if (rle->res != NULL) panic("resource_list_delete: resource has not been released"); STAILQ_REMOVE(rl, rle, resource_list_entry, link); free(rle, M_BUS); } } /** * @brief Allocate a reserved resource * * This can be used by buses to force the allocation of resources * that are always active in the system even if they are not allocated * by a driver (e.g. PCI BARs). This function is usually called when * adding a new child to the bus. The resource is allocated from the * parent bus when it is reserved. 
The resource list entry is marked * with RLE_RESERVED to note that it is a reserved resource. * * Subsequent attempts to allocate the resource with * resource_list_alloc() will succeed the first time and will set * RLE_ALLOCATED to note that it has been allocated. When a reserved * resource that has been allocated is released with * resource_list_release(), the RLE_ALLOCATED flag is cleared, but * the actual resource remains allocated. The resource can be released to * the parent bus by calling resource_list_unreserve(). * * @param rl the resource list to allocate from * @param bus the parent device of @p child * @param child the device for which the resource is being reserved * @param type the type of resource to allocate * @param rid a pointer to the resource identifier * @param start hint at the start of the resource range - pass * @c 0 for any start address * @param end hint at the end of the resource range - pass * @c ~0 for any end address * @param count hint at the size of range required - pass @c 1 * for any size * @param flags any extra flags to control the resource * allocation - see @c RF_XXX flags in * for details * * @returns the resource which was allocated or @c NULL if no * resource could be allocated */ struct resource * resource_list_reserve(struct resource_list *rl, device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); struct resource *r; if (passthrough) panic( "resource_list_reserve() should only be called for direct children"); if (flags & RF_ACTIVE) panic( "resource_list_reserve() should only reserve inactive resources"); r = resource_list_alloc(rl, bus, child, type, rid, start, end, count, flags); if (r != NULL) { rle = resource_list_find(rl, type, *rid); rle->flags |= RLE_RESERVED; } return (r); } /** * @brief Helper function for implementing BUS_ALLOC_RESOURCE() * * Implement BUS_ALLOC_RESOURCE() by looking up a resource from the list * and passing the allocation up to the parent of @p bus. This assumes * that the first entry of @c device_get_ivars(child) is a struct * resource_list. This also handles 'passthrough' allocations where a * child is a remote descendant of bus by passing the allocation up to * the parent of bus. * * Typically, a bus driver would store a list of child resources * somewhere in the child device's ivars (see device_get_ivars()) and * its implementation of BUS_ALLOC_RESOURCE() would find that list and * then call resource_list_alloc() to perform the allocation.
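 *
 * A sketch of such a method (the "foo" bus and its ivars layout are
 * hypothetical, not part of this file):
 *
 * @code
 * static struct resource *
 * foo_alloc_resource(device_t bus, device_t child, int type, int *rid,
 *     rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
 * {
 *	struct foo_ivars *ivars = device_get_ivars(child);
 *
 *	return (resource_list_alloc(&ivars->resources, bus, child,
 *	    type, rid, start, end, count, flags));
 * }
 * @endcode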
* * @param rl the resource list to allocate from * @param bus the parent device of @p child * @param child the device which is requesting an allocation * @param type the type of resource to allocate * @param rid a pointer to the resource identifier * @param start hint at the start of the resource range - pass * @c 0 for any start address * @param end hint at the end of the resource range - pass * @c ~0 for any end address * @param count hint at the size of range required - pass @c 1 * for any size * @param flags any extra flags to control the resource * allocation - see @c RF_XXX flags in * for details * * @returns the resource which was allocated or @c NULL if no * resource could be allocated */ struct resource * resource_list_alloc(struct resource_list *rl, device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); int isdefault = RMAN_IS_DEFAULT_RANGE(start, end); if (passthrough) { return (BUS_ALLOC_RESOURCE(device_get_parent(bus), child, type, rid, start, end, count, flags)); } rle = resource_list_find(rl, type, *rid); if (!rle) return (NULL); /* no resource of that type/rid */ if (rle->res) { if (rle->flags & RLE_RESERVED) { if (rle->flags & RLE_ALLOCATED) return (NULL); if ((flags & RF_ACTIVE) && bus_activate_resource(child, type, *rid, rle->res) != 0) return (NULL); rle->flags |= RLE_ALLOCATED; return (rle->res); } device_printf(bus, "resource entry %#x type %d for child %s is busy\n", *rid, type, device_get_nameunit(child)); return (NULL); } if (isdefault) { start = rle->start; count = ulmax(count, rle->count); end = ulmax(rle->end, start + count - 1); } rle->res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child, type, rid, start, end, count, flags); /* * Record the new range. */ if (rle->res) { rle->start = rman_get_start(rle->res); rle->end = rman_get_end(rle->res); rle->count = count; } return (rle->res); } /** * @brief Helper function for implementing BUS_RELEASE_RESOURCE() * * Implement BUS_RELEASE_RESOURCE() using a resource list. Normally * used with resource_list_alloc(). 
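 *
 * The matching BUS_RELEASE_RESOURCE() method for the hypothetical
 * "foo" bus sketched above might be:
 *
 * @code
 * static int
 * foo_release_resource(device_t bus, device_t child, int type, int rid,
 *     struct resource *r)
 * {
 *	struct foo_ivars *ivars = device_get_ivars(child);
 *
 *	return (resource_list_release(&ivars->resources, bus, child,
 *	    type, rid, r));
 * }
 * @endcode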
* * @param rl the resource list which was allocated from * @param bus the parent device of @p child * @param child the device which is requesting a release * @param type the type of resource to release * @param rid the resource identifier * @param res the resource to release * * @retval 0 success * @retval non-zero a standard unix error code indicating what * error condition prevented the operation */ int resource_list_release(struct resource_list *rl, device_t bus, device_t child, int type, int rid, struct resource *res) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); int error; if (passthrough) { return (BUS_RELEASE_RESOURCE(device_get_parent(bus), child, type, rid, res)); } rle = resource_list_find(rl, type, rid); if (!rle) panic("resource_list_release: can't find resource"); if (!rle->res) panic("resource_list_release: resource entry is not busy"); if (rle->flags & RLE_RESERVED) { if (rle->flags & RLE_ALLOCATED) { if (rman_get_flags(res) & RF_ACTIVE) { error = bus_deactivate_resource(child, type, rid, res); if (error) return (error); } rle->flags &= ~RLE_ALLOCATED; return (0); } return (EINVAL); } error = BUS_RELEASE_RESOURCE(device_get_parent(bus), child, type, rid, res); if (error) return (error); rle->res = NULL; return (0); } /** * @brief Release all active resources of a given type * * Release all active resources of a specified type. This is intended * to be used to cleanup resources leaked by a driver after detach or * a failed attach. * * @param rl the resource list which was allocated from * @param bus the parent device of @p child * @param child the device whose active resources are being released * @param type the type of resources to release * * @retval 0 success * @retval EBUSY at least one resource was active */ int resource_list_release_active(struct resource_list *rl, device_t bus, device_t child, int type) { struct resource_list_entry *rle; int error, retval; retval = 0; STAILQ_FOREACH(rle, rl, link) { if (rle->type != type) continue; if (rle->res == NULL) continue; if ((rle->flags & (RLE_RESERVED | RLE_ALLOCATED)) == RLE_RESERVED) continue; retval = EBUSY; error = resource_list_release(rl, bus, child, type, rman_get_rid(rle->res), rle->res); if (error != 0) device_printf(bus, "Failed to release active resource: %d\n", error); } return (retval); } /** * @brief Fully release a reserved resource * * Fully releases a resource reserved via resource_list_reserve(). 
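 *
 * A sketch of the pairing (hypothetical, not taken from this file):
 * a bus that earlier reserved a child's memory range with
 * resource_list_reserve(rl, bus, child, SYS_RES_MEMORY, &rid, start,
 * end, count, 0) gives the range back to the parent with:
 *
 * @code
 * resource_list_unreserve(rl, bus, child, SYS_RES_MEMORY, rid);
 * @endcode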
* * @param rl the resource list which was allocated from * @param bus the parent device of @p child * @param child the device whose reserved resource is being released * @param type the type of resource to release * @param rid the resource identifier * * @retval 0 success * @retval non-zero a standard unix error code indicating what * error condition prevented the operation */ int resource_list_unreserve(struct resource_list *rl, device_t bus, device_t child, int type, int rid) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); if (passthrough) panic( "resource_list_unreserve() should only be called for direct children"); rle = resource_list_find(rl, type, rid); if (!rle) panic("resource_list_unreserve: can't find resource"); if (!(rle->flags & RLE_RESERVED)) return (EINVAL); if (rle->flags & RLE_ALLOCATED) return (EBUSY); rle->flags &= ~RLE_RESERVED; return (resource_list_release(rl, bus, child, type, rid, rle->res)); } /** * @brief Print a description of resources in a resource list * * Print all resources of a specified type, for use in BUS_PRINT_CHILD(). * The name is printed if at least one resource of the given type is available. * The format is used to print resource start and end. * * @param rl the resource list to print * @param name the name of @p type, e.g. @c "memory" * @param type the type of resource entry to print * @param format printf(9) format string to print resource * start and end values * * @returns the number of characters printed */ int resource_list_print_type(struct resource_list *rl, const char *name, int type, const char *format) { struct resource_list_entry *rle; int printed, retval; printed = 0; retval = 0; /* Yes, this is kinda cheating */ STAILQ_FOREACH(rle, rl, link) { if (rle->type == type) { if (printed == 0) retval += printf(" %s ", name); else retval += printf(","); printed++; retval += printf(format, rle->start); if (rle->count > 1) { retval += printf("-"); retval += printf(format, rle->start + rle->count - 1); } } } return (retval); } /** * @brief Releases all the resources in a list. * * @param rl The resource list to purge. * * @returns nothing */ void resource_list_purge(struct resource_list *rl) { struct resource_list_entry *rle; while ((rle = STAILQ_FIRST(rl)) != NULL) { if (rle->res) bus_release_resource(rman_get_device(rle->res), rle->type, rle->rid, rle->res); STAILQ_REMOVE_HEAD(rl, link); free(rle, M_BUS); } } device_t bus_generic_add_child(device_t dev, u_int order, const char *name, int unit) { return (device_add_child_ordered(dev, order, name, unit)); } /** * @brief Helper function for implementing DEVICE_PROBE() * * This function can be used to help implement the DEVICE_PROBE() for * a bus (i.e. a device which has other devices attached to it). It * calls the DEVICE_IDENTIFY() method of each driver in the device's * devclass. */ int bus_generic_probe(device_t dev) { devclass_t dc = dev->devclass; driverlink_t dl; TAILQ_FOREACH(dl, &dc->drivers, link) { /* * If this driver's pass is too high, then ignore it. * For most drivers in the default pass, this will * never be true. For early-pass buses, only the * identify routines of eligible drivers are called * when this routine runs. Drivers for later * passes should have their identify routines called * on early-pass buses during BUS_NEW_PASS().
*/ if (dl->pass > bus_current_pass) continue; DEVICE_IDENTIFY(dl->driver, dev); } return (0); } /** * @brief Helper function for implementing DEVICE_ATTACH() * * This function can be used to help implement the DEVICE_ATTACH() for * a bus. It calls device_probe_and_attach() for each of the device's * children. */ int bus_generic_attach(device_t dev) { device_t child; TAILQ_FOREACH(child, &dev->children, link) { device_probe_and_attach(child); } return (0); } /** * @brief Helper function for implementing DEVICE_DETACH() * * This function can be used to help implement the DEVICE_DETACH() for * a bus. It calls device_detach() for each of the device's * children. */ int bus_generic_detach(device_t dev) { device_t child; int error; if (dev->state != DS_ATTACHED) return (EBUSY); TAILQ_FOREACH(child, &dev->children, link) { if ((error = device_detach(child)) != 0) return (error); } return (0); } /** * @brief Helper function for implementing DEVICE_SHUTDOWN() * * This function can be used to help implement the DEVICE_SHUTDOWN() * for a bus. It calls device_shutdown() for each of the device's * children. */ int bus_generic_shutdown(device_t dev) { device_t child; TAILQ_FOREACH(child, &dev->children, link) { device_shutdown(child); } return (0); } /** * @brief Default function for suspending a child device. * * This function is to be used by a bus's DEVICE_SUSPEND_CHILD(). */ int bus_generic_suspend_child(device_t dev, device_t child) { int error; error = DEVICE_SUSPEND(child); if (error == 0) child->flags |= DF_SUSPENDED; return (error); } /** * @brief Default function for resuming a child device. * * This function is to be used by a bus's DEVICE_RESUME_CHILD(). */ int bus_generic_resume_child(device_t dev, device_t child) { DEVICE_RESUME(child); child->flags &= ~DF_SUSPENDED; return (0); } /** * @brief Helper function for implementing DEVICE_SUSPEND() * * This function can be used to help implement the DEVICE_SUSPEND() * for a bus. It calls DEVICE_SUSPEND() for each of the device's * children. If any call to DEVICE_SUSPEND() fails, the suspend * operation is aborted and any devices which were suspended are * resumed immediately by calling their DEVICE_RESUME() methods. */ int bus_generic_suspend(device_t dev) { int error; device_t child, child2; TAILQ_FOREACH(child, &dev->children, link) { error = BUS_SUSPEND_CHILD(dev, child); if (error) { for (child2 = TAILQ_FIRST(&dev->children); child2 && child2 != child; child2 = TAILQ_NEXT(child2, link)) BUS_RESUME_CHILD(dev, child2); return (error); } } return (0); } /** * @brief Helper function for implementing DEVICE_RESUME() * * This function can be used to help implement the DEVICE_RESUME() for * a bus. It calls DEVICE_RESUME() on each of the device's children. */ int bus_generic_resume(device_t dev) { device_t child; TAILQ_FOREACH(child, &dev->children, link) { BUS_RESUME_CHILD(dev, child); /* if resume fails, there's nothing we can usefully do... */ } return (0); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function prints the first part of the ascii representation of * @p child, including its name, unit and description (if any - see * device_set_desc()). * * @returns the number of characters printed */ int bus_print_child_header(device_t dev, device_t child) { int retval = 0; if (device_get_desc(child)) { retval += device_printf(child, "<%s>", device_get_desc(child)); } else { retval += printf("%s", device_get_nameunit(child)); } return (retval); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). 
* * This function prints the last part of the ascii representation of * @p child, which consists of the string @c " on " followed by the * name and unit of @p dev. * * @returns the number of characters printed */ int bus_print_child_footer(device_t dev, device_t child) { return (printf(" on %s\n", device_get_nameunit(dev))); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function prints out the VM domain for the given device. * * @returns the number of characters printed */ int bus_print_child_domain(device_t dev, device_t child) { int domain; /* No domain? Don't print anything */ if (BUS_GET_DOMAIN(dev, child, &domain) != 0) return (0); return (printf(" numa-domain %d", domain)); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function simply calls bus_print_child_header() followed by * bus_print_child_domain() and bus_print_child_footer(). * * @returns the number of characters printed */ int bus_generic_print_child(device_t dev, device_t child) { int retval = 0; retval += bus_print_child_header(dev, child); retval += bus_print_child_domain(dev, child); retval += bus_print_child_footer(dev, child); return (retval); } /** * @brief Stub function for implementing BUS_READ_IVAR(). * * @returns ENOENT */ int bus_generic_read_ivar(device_t dev, device_t child, int index, uintptr_t * result) { return (ENOENT); } /** * @brief Stub function for implementing BUS_WRITE_IVAR(). * * @returns ENOENT */ int bus_generic_write_ivar(device_t dev, device_t child, int index, uintptr_t value) { return (ENOENT); } /** * @brief Stub function for implementing BUS_GET_RESOURCE_LIST(). * * @returns NULL */ struct resource_list * bus_generic_get_resource_list(device_t dev, device_t child) { return (NULL); } /** * @brief Helper function for implementing BUS_DRIVER_ADDED(). * * This implementation of BUS_DRIVER_ADDED() simply calls the driver's * DEVICE_IDENTIFY() method to allow it to add new children to the bus * and then calls device_probe_and_attach() for each unattached child. */ void bus_generic_driver_added(device_t dev, driver_t *driver) { device_t child; DEVICE_IDENTIFY(driver, dev); TAILQ_FOREACH(child, &dev->children, link) { if (child->state == DS_NOTPRESENT || (child->flags & DF_REBID)) device_probe_and_attach(child); } } /** * @brief Helper function for implementing BUS_NEW_PASS(). * * This implementation of BUS_NEW_PASS() first calls the identify * routines for any drivers that probe at the current pass. Then it * walks the list of devices for this bus. If a device is already * attached, then it calls BUS_NEW_PASS() on that device. If the * device is not already attached, it attempts to attach a driver to * it. */ void bus_generic_new_pass(device_t dev) { driverlink_t dl; devclass_t dc; device_t child; dc = dev->devclass; TAILQ_FOREACH(dl, &dc->drivers, link) { if (dl->pass == bus_current_pass) DEVICE_IDENTIFY(dl->driver, dev); } TAILQ_FOREACH(child, &dev->children, link) { if (child->state >= DS_ATTACHED) BUS_NEW_PASS(child); else if (child->state == DS_NOTPRESENT) device_probe_and_attach(child); } } /** * @brief Helper function for implementing BUS_SETUP_INTR(). * * This simple implementation of BUS_SETUP_INTR() simply calls the * BUS_SETUP_INTR() method of the parent of @p dev. */ int bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq, int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep) { /* Propagate up the bus hierarchy until someone handles it.
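	 * Bridge drivers with no special interrupt needs can simply
	 * list this helper in their method table (a sketch):
	 *
	 *	DEVMETHOD(bus_setup_intr, bus_generic_setup_intr),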
*/ if (dev->parent) return (BUS_SETUP_INTR(dev->parent, child, irq, flags, filter, intr, arg, cookiep)); return (EINVAL); } /** * @brief Helper function for implementing BUS_TEARDOWN_INTR(). * * This simple implementation of BUS_TEARDOWN_INTR() simply calls the * BUS_TEARDOWN_INTR() method of the parent of @p dev. */ int bus_generic_teardown_intr(device_t dev, device_t child, struct resource *irq, void *cookie) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_TEARDOWN_INTR(dev->parent, child, irq, cookie)); return (EINVAL); } /** * @brief Helper function for implementing BUS_ADJUST_RESOURCE(). * * This simple implementation of BUS_ADJUST_RESOURCE() simply calls the * BUS_ADJUST_RESOURCE() method of the parent of @p dev. */ int bus_generic_adjust_resource(device_t dev, device_t child, int type, struct resource *r, rman_res_t start, rman_res_t end) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_ADJUST_RESOURCE(dev->parent, child, type, r, start, end)); return (EINVAL); } /** * @brief Helper function for implementing BUS_ALLOC_RESOURCE(). * * This simple implementation of BUS_ALLOC_RESOURCE() simply calls the * BUS_ALLOC_RESOURCE() method of the parent of @p dev. */ struct resource * bus_generic_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_ALLOC_RESOURCE(dev->parent, child, type, rid, start, end, count, flags)); return (NULL); } /** * @brief Helper function for implementing BUS_RELEASE_RESOURCE(). * * This simple implementation of BUS_RELEASE_RESOURCE() simply calls the * BUS_RELEASE_RESOURCE() method of the parent of @p dev. */ int bus_generic_release_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_RELEASE_RESOURCE(dev->parent, child, type, rid, r)); return (EINVAL); } /** * @brief Helper function for implementing BUS_ACTIVATE_RESOURCE(). * * This simple implementation of BUS_ACTIVATE_RESOURCE() simply calls the * BUS_ACTIVATE_RESOURCE() method of the parent of @p dev. */ int bus_generic_activate_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_ACTIVATE_RESOURCE(dev->parent, child, type, rid, r)); return (EINVAL); } /** * @brief Helper function for implementing BUS_DEACTIVATE_RESOURCE(). * * This simple implementation of BUS_DEACTIVATE_RESOURCE() simply calls the * BUS_DEACTIVATE_RESOURCE() method of the parent of @p dev. */ int bus_generic_deactivate_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_DEACTIVATE_RESOURCE(dev->parent, child, type, rid, r)); return (EINVAL); } /** * @brief Helper function for implementing BUS_MAP_RESOURCE(). * * This simple implementation of BUS_MAP_RESOURCE() simply calls the * BUS_MAP_RESOURCE() method of the parent of @p dev. */ int bus_generic_map_resource(device_t dev, device_t child, int type, struct resource *r, struct resource_map_request *args, struct resource_map *map) { /* Propagate up the bus hierarchy until someone handles it. 
*/ if (dev->parent) return (BUS_MAP_RESOURCE(dev->parent, child, type, r, args, map)); return (EINVAL); } /** * @brief Helper function for implementing BUS_UNMAP_RESOURCE(). * * This simple implementation of BUS_UNMAP_RESOURCE() simply calls the * BUS_UNMAP_RESOURCE() method of the parent of @p dev. */ int bus_generic_unmap_resource(device_t dev, device_t child, int type, struct resource *r, struct resource_map *map) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_UNMAP_RESOURCE(dev->parent, child, type, r, map)); return (EINVAL); } /** * @brief Helper function for implementing BUS_BIND_INTR(). * * This simple implementation of BUS_BIND_INTR() simply calls the * BUS_BIND_INTR() method of the parent of @p dev. */ int bus_generic_bind_intr(device_t dev, device_t child, struct resource *irq, int cpu) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_BIND_INTR(dev->parent, child, irq, cpu)); return (EINVAL); } /** * @brief Helper function for implementing BUS_CONFIG_INTR(). * * This simple implementation of BUS_CONFIG_INTR() simply calls the * BUS_CONFIG_INTR() method of the parent of @p dev. */ int bus_generic_config_intr(device_t dev, int irq, enum intr_trigger trig, enum intr_polarity pol) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_CONFIG_INTR(dev->parent, irq, trig, pol)); return (EINVAL); } /** * @brief Helper function for implementing BUS_DESCRIBE_INTR(). * * This simple implementation of BUS_DESCRIBE_INTR() simply calls the * BUS_DESCRIBE_INTR() method of the parent of @p dev. */ int bus_generic_describe_intr(device_t dev, device_t child, struct resource *irq, void *cookie, const char *descr) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_DESCRIBE_INTR(dev->parent, child, irq, cookie, descr)); return (EINVAL); } /** * @brief Helper function for implementing BUS_GET_CPUS(). * * This simple implementation of BUS_GET_CPUS() simply calls the * BUS_GET_CPUS() method of the parent of @p dev. */ int bus_generic_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize, cpuset_t *cpuset) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent != NULL) return (BUS_GET_CPUS(dev->parent, child, op, setsize, cpuset)); return (EINVAL); } /** * @brief Helper function for implementing BUS_GET_DMA_TAG(). * * This simple implementation of BUS_GET_DMA_TAG() simply calls the * BUS_GET_DMA_TAG() method of the parent of @p dev. */ bus_dma_tag_t bus_generic_get_dma_tag(device_t dev, device_t child) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent != NULL) return (BUS_GET_DMA_TAG(dev->parent, child)); return (NULL); } /** * @brief Helper function for implementing BUS_GET_BUS_TAG(). * * This simple implementation of BUS_GET_BUS_TAG() simply calls the * BUS_GET_BUS_TAG() method of the parent of @p dev. */ bus_space_tag_t bus_generic_get_bus_tag(device_t dev, device_t child) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent != NULL) return (BUS_GET_BUS_TAG(dev->parent, child)); return ((bus_space_tag_t)0); } /** * @brief Helper function for implementing BUS_GET_RESOURCE(). * * This implementation of BUS_GET_RESOURCE() uses the * resource_list_find() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list to * search. 
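 *
 * A minimal consumer sketch (hypothetical driver code, shown only to
 * illustrate the path from bus_get_resource(9) down to this helper):
 *
 * @code
 *	rman_res_t start, count;
 *
 *	if (bus_get_resource(dev, SYS_RES_MEMORY, 0, &start, &count) == 0)
 *		device_printf(dev, "mem at %#jx (%ju bytes)\n",
 *		    (uintmax_t)start, (uintmax_t)count);
 * @endcode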
*/ int bus_generic_rl_get_resource(device_t dev, device_t child, int type, int rid, rman_res_t *startp, rman_res_t *countp) { struct resource_list * rl = NULL; struct resource_list_entry * rle = NULL; rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (EINVAL); rle = resource_list_find(rl, type, rid); if (!rle) return (ENOENT); if (startp) *startp = rle->start; if (countp) *countp = rle->count; return (0); } /** * @brief Helper function for implementing BUS_SET_RESOURCE(). * * This implementation of BUS_SET_RESOURCE() uses the * resource_list_add() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list to * edit. */ int bus_generic_rl_set_resource(device_t dev, device_t child, int type, int rid, rman_res_t start, rman_res_t count) { struct resource_list * rl = NULL; rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (EINVAL); resource_list_add(rl, type, rid, start, (start + count - 1), count); return (0); } /** * @brief Helper function for implementing BUS_DELETE_RESOURCE(). * * This implementation of BUS_DELETE_RESOURCE() uses the * resource_list_delete() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list to * edit. */ void bus_generic_rl_delete_resource(device_t dev, device_t child, int type, int rid) { struct resource_list * rl = NULL; rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return; resource_list_delete(rl, type, rid); return; } /** * @brief Helper function for implementing BUS_RELEASE_RESOURCE(). * * This implementation of BUS_RELEASE_RESOURCE() uses the * resource_list_release() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list. */ int bus_generic_rl_release_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { struct resource_list * rl = NULL; if (device_get_parent(child) != dev) return (BUS_RELEASE_RESOURCE(device_get_parent(dev), child, type, rid, r)); rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (EINVAL); return (resource_list_release(rl, dev, child, type, rid, r)); } /** * @brief Helper function for implementing BUS_ALLOC_RESOURCE(). * * This implementation of BUS_ALLOC_RESOURCE() uses the * resource_list_alloc() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list. */ struct resource * bus_generic_rl_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource_list * rl = NULL; if (device_get_parent(child) != dev) return (BUS_ALLOC_RESOURCE(device_get_parent(dev), child, type, rid, start, end, count, flags)); rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (NULL); return (resource_list_alloc(rl, dev, child, type, rid, start, end, count, flags)); } /** * @brief Helper function for implementing BUS_CHILD_PRESENT(). * * This simple implementation of BUS_CHILD_PRESENT() simply calls the * BUS_CHILD_PRESENT() method of the parent of @p dev. */ int bus_generic_child_present(device_t dev, device_t child) { return (BUS_CHILD_PRESENT(device_get_parent(dev), dev)); } int bus_generic_get_domain(device_t dev, device_t child, int *domain) { if (dev->parent) return (BUS_GET_DOMAIN(dev->parent, dev, domain)); return (ENOENT); } /** * @brief Helper function for implementing BUS_RESCAN(). * * This null implementation of BUS_RESCAN() always fails to indicate * the bus does not support rescanning. 
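 *
 * A bus driver that cannot rescan lists this stub directly in its
 * method table (a sketch):
 *
 * @code
 *	DEVMETHOD(bus_rescan, bus_null_rescan),
 * @endcode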
*/ int bus_null_rescan(device_t dev) { return (ENXIO); } /* * Some convenience functions to make it easier for drivers to use the * resource-management functions. All these really do is hide the * indirection through the parent's method table, making for slightly * less-wordy code. In the future, it might make sense for this code * to maintain some sort of a list of resources allocated by each device. */ int bus_alloc_resources(device_t dev, struct resource_spec *rs, struct resource **res) { int i; for (i = 0; rs[i].type != -1; i++) res[i] = NULL; for (i = 0; rs[i].type != -1; i++) { res[i] = bus_alloc_resource_any(dev, rs[i].type, &rs[i].rid, rs[i].flags); if (res[i] == NULL && !(rs[i].flags & RF_OPTIONAL)) { bus_release_resources(dev, rs, res); return (ENXIO); } } return (0); } void bus_release_resources(device_t dev, const struct resource_spec *rs, struct resource **res) { int i; for (i = 0; rs[i].type != -1; i++) if (res[i] != NULL) { bus_release_resource( dev, rs[i].type, rs[i].rid, res[i]); res[i] = NULL; } } /** * @brief Wrapper function for BUS_ALLOC_RESOURCE(). * * This function simply calls the BUS_ALLOC_RESOURCE() method of the * parent of @p dev. */ struct resource * bus_alloc_resource(device_t dev, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource *res; if (dev->parent == NULL) return (NULL); res = BUS_ALLOC_RESOURCE(dev->parent, dev, type, rid, start, end, count, flags); return (res); } /** * @brief Wrapper function for BUS_ADJUST_RESOURCE(). * * This function simply calls the BUS_ADJUST_RESOURCE() method of the * parent of @p dev. */ int bus_adjust_resource(device_t dev, int type, struct resource *r, rman_res_t start, rman_res_t end) { if (dev->parent == NULL) return (EINVAL); return (BUS_ADJUST_RESOURCE(dev->parent, dev, type, r, start, end)); } /** * @brief Wrapper function for BUS_ACTIVATE_RESOURCE(). * * This function simply calls the BUS_ACTIVATE_RESOURCE() method of the * parent of @p dev. */ int bus_activate_resource(device_t dev, int type, int rid, struct resource *r) { if (dev->parent == NULL) return (EINVAL); return (BUS_ACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); } /** * @brief Wrapper function for BUS_DEACTIVATE_RESOURCE(). * * This function simply calls the BUS_DEACTIVATE_RESOURCE() method of the * parent of @p dev. */ int bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r) { if (dev->parent == NULL) return (EINVAL); return (BUS_DEACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); } /** * @brief Wrapper function for BUS_MAP_RESOURCE(). * * This function simply calls the BUS_MAP_RESOURCE() method of the * parent of @p dev. */ int bus_map_resource(device_t dev, int type, struct resource *r, struct resource_map_request *args, struct resource_map *map) { if (dev->parent == NULL) return (EINVAL); return (BUS_MAP_RESOURCE(dev->parent, dev, type, r, args, map)); } /** * @brief Wrapper function for BUS_UNMAP_RESOURCE(). * * This function simply calls the BUS_UNMAP_RESOURCE() method of the * parent of @p dev. */ int bus_unmap_resource(device_t dev, int type, struct resource *r, struct resource_map *map) { if (dev->parent == NULL) return (EINVAL); return (BUS_UNMAP_RESOURCE(dev->parent, dev, type, r, map)); } /** * @brief Wrapper function for BUS_RELEASE_RESOURCE(). * * This function simply calls the BUS_RELEASE_RESOURCE() method of the * parent of @p dev. 
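 *
 * In a driver this is typically paired with bus_alloc_resource_any()
 * (a sketch; the "sc" softc fields are hypothetical):
 *
 * @code
 *	sc->mem_rid = 0;
 *	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
 *	    &sc->mem_rid, RF_ACTIVE);
 *	...
 *	if (sc->mem_res != NULL)
 *		bus_release_resource(dev, SYS_RES_MEMORY, sc->mem_rid,
 *		    sc->mem_res);
 * @endcode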
*/ int bus_release_resource(device_t dev, int type, int rid, struct resource *r) { int rv; if (dev->parent == NULL) return (EINVAL); rv = BUS_RELEASE_RESOURCE(dev->parent, dev, type, rid, r); return (rv); } /** * @brief Wrapper function for BUS_SETUP_INTR(). * * This function simply calls the BUS_SETUP_INTR() method of the * parent of @p dev. */ int bus_setup_intr(device_t dev, struct resource *r, int flags, driver_filter_t filter, driver_intr_t handler, void *arg, void **cookiep) { int error; if (dev->parent == NULL) return (EINVAL); error = BUS_SETUP_INTR(dev->parent, dev, r, flags, filter, handler, arg, cookiep); if (error != 0) return (error); if (handler != NULL && !(flags & INTR_MPSAFE)) device_printf(dev, "[GIANT-LOCKED]\n"); return (0); } /** * @brief Wrapper function for BUS_TEARDOWN_INTR(). * * This function simply calls the BUS_TEARDOWN_INTR() method of the * parent of @p dev. */ int bus_teardown_intr(device_t dev, struct resource *r, void *cookie) { if (dev->parent == NULL) return (EINVAL); return (BUS_TEARDOWN_INTR(dev->parent, dev, r, cookie)); } /** * @brief Wrapper function for BUS_BIND_INTR(). * * This function simply calls the BUS_BIND_INTR() method of the * parent of @p dev. */ int bus_bind_intr(device_t dev, struct resource *r, int cpu) { if (dev->parent == NULL) return (EINVAL); return (BUS_BIND_INTR(dev->parent, dev, r, cpu)); } /** * @brief Wrapper function for BUS_DESCRIBE_INTR(). * * This function first formats the requested description into a * temporary buffer and then calls the BUS_DESCRIBE_INTR() method of * the parent of @p dev. */ int bus_describe_intr(device_t dev, struct resource *irq, void *cookie, const char *fmt, ...) { va_list ap; char descr[MAXCOMLEN + 1]; if (dev->parent == NULL) return (EINVAL); va_start(ap, fmt); vsnprintf(descr, sizeof(descr), fmt, ap); va_end(ap); return (BUS_DESCRIBE_INTR(dev->parent, dev, irq, cookie, descr)); } /** * @brief Wrapper function for BUS_SET_RESOURCE(). * * This function simply calls the BUS_SET_RESOURCE() method of the * parent of @p dev. */ int bus_set_resource(device_t dev, int type, int rid, rman_res_t start, rman_res_t count) { return (BUS_SET_RESOURCE(device_get_parent(dev), dev, type, rid, start, count)); } /** * @brief Wrapper function for BUS_GET_RESOURCE(). * * This function simply calls the BUS_GET_RESOURCE() method of the * parent of @p dev. */ int bus_get_resource(device_t dev, int type, int rid, rman_res_t *startp, rman_res_t *countp) { return (BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, startp, countp)); } /** * @brief Wrapper function for BUS_GET_RESOURCE(). * * This function simply calls the BUS_GET_RESOURCE() method of the * parent of @p dev and returns the start value. */ rman_res_t bus_get_resource_start(device_t dev, int type, int rid) { rman_res_t start; rman_res_t count; int error; error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, &start, &count); if (error) return (0); return (start); } /** * @brief Wrapper function for BUS_GET_RESOURCE(). * * This function simply calls the BUS_GET_RESOURCE() method of the * parent of @p dev and returns the count value. */ rman_res_t bus_get_resource_count(device_t dev, int type, int rid) { rman_res_t start; rman_res_t count; int error; error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, &start, &count); if (error) return (0); return (count); } /** * @brief Wrapper function for BUS_DELETE_RESOURCE(). * * This function simply calls the BUS_DELETE_RESOURCE() method of the * parent of @p dev. 
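 *
 * Together with bus_set_resource() this lets a bus edit a child's
 * resource list, e.g. assign IRQ 5 to rid 0 and later remove it
 * (a sketch):
 *
 * @code
 *	bus_set_resource(child, SYS_RES_IRQ, 0, 5, 1);
 *	...
 *	bus_delete_resource(child, SYS_RES_IRQ, 0);
 * @endcode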
*/ void bus_delete_resource(device_t dev, int type, int rid) { BUS_DELETE_RESOURCE(device_get_parent(dev), dev, type, rid); } /** * @brief Wrapper function for BUS_CHILD_PRESENT(). * * This function simply calls the BUS_CHILD_PRESENT() method of the * parent of @p child. */ int bus_child_present(device_t child) { return (BUS_CHILD_PRESENT(device_get_parent(child), child)); } /** * @brief Wrapper function for BUS_CHILD_PNPINFO_STR(). * * This function simply calls the BUS_CHILD_PNPINFO_STR() method of the * parent of @p child. */ int bus_child_pnpinfo_str(device_t child, char *buf, size_t buflen) { device_t parent; parent = device_get_parent(child); if (parent == NULL) { *buf = '\0'; return (0); } return (BUS_CHILD_PNPINFO_STR(parent, child, buf, buflen)); } /** * @brief Wrapper function for BUS_CHILD_LOCATION_STR(). * * This function simply calls the BUS_CHILD_LOCATION_STR() method of the * parent of @p child. */ int bus_child_location_str(device_t child, char *buf, size_t buflen) { device_t parent; parent = device_get_parent(child); if (parent == NULL) { *buf = '\0'; return (0); } return (BUS_CHILD_LOCATION_STR(parent, child, buf, buflen)); } /** * @brief Wrapper function for BUS_GET_CPUS(). * * This function simply calls the BUS_GET_CPUS() method of the * parent of @p dev. */ int bus_get_cpus(device_t dev, enum cpu_sets op, size_t setsize, cpuset_t *cpuset) { device_t parent; parent = device_get_parent(dev); if (parent == NULL) return (EINVAL); return (BUS_GET_CPUS(parent, dev, op, setsize, cpuset)); } /** * @brief Wrapper function for BUS_GET_DMA_TAG(). * * This function simply calls the BUS_GET_DMA_TAG() method of the * parent of @p dev. */ bus_dma_tag_t bus_get_dma_tag(device_t dev) { device_t parent; parent = device_get_parent(dev); if (parent == NULL) return (NULL); return (BUS_GET_DMA_TAG(parent, dev)); } /** * @brief Wrapper function for BUS_GET_BUS_TAG(). * * This function simply calls the BUS_GET_BUS_TAG() method of the * parent of @p dev. */ bus_space_tag_t bus_get_bus_tag(device_t dev) { device_t parent; parent = device_get_parent(dev); if (parent == NULL) return ((bus_space_tag_t)0); return (BUS_GET_BUS_TAG(parent, dev)); } /** * @brief Wrapper function for BUS_GET_DOMAIN(). * * This function simply calls the BUS_GET_DOMAIN() method of the * parent of @p dev. */ int bus_get_domain(device_t dev, int *domain) { return (BUS_GET_DOMAIN(device_get_parent(dev), dev, domain)); } /* Resume all devices and then notify userland that we're up again. */ static int root_resume(device_t dev) { int error; error = bus_generic_resume(dev); if (error == 0) devctl_notify("kern", "power", "resume", NULL); return (error); } static int root_print_child(device_t dev, device_t child) { int retval = 0; retval += bus_print_child_header(dev, child); retval += printf("\n"); return (retval); } static int root_setup_intr(device_t dev, device_t child, struct resource *irq, int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep) { /* * If an interrupt mapping gets to here something bad has happened. */ panic("root_setup_intr"); } /* * If we get here, assume that the device is permanent and really is * present in the system. Removable bus drivers are expected to intercept * this call long before it gets here. We return -1 so that drivers that * really care can check vs -1 or some ERRNO returned higher in the food * chain.
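 *
 * A caller that cares can test explicitly (a sketch): after
 *
 *	int present = bus_child_present(dev);
 *
 * a value of -1 means the bus cannot tell and the device should be
 * assumed present, 0 means the device is definitely absent, and any
 * other non-zero value means it is definitely present.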
*/ static int root_child_present(device_t dev, device_t child) { return (-1); } static int root_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize, cpuset_t *cpuset) { switch (op) { case INTR_CPUS: /* Default to returning the set of all CPUs. */ if (setsize != sizeof(cpuset_t)) return (EINVAL); *cpuset = all_cpus; return (0); default: return (EINVAL); } } static kobj_method_t root_methods[] = { /* Device interface */ KOBJMETHOD(device_shutdown, bus_generic_shutdown), KOBJMETHOD(device_suspend, bus_generic_suspend), KOBJMETHOD(device_resume, root_resume), /* Bus interface */ KOBJMETHOD(bus_print_child, root_print_child), KOBJMETHOD(bus_read_ivar, bus_generic_read_ivar), KOBJMETHOD(bus_write_ivar, bus_generic_write_ivar), KOBJMETHOD(bus_setup_intr, root_setup_intr), KOBJMETHOD(bus_child_present, root_child_present), KOBJMETHOD(bus_get_cpus, root_get_cpus), KOBJMETHOD_END }; static driver_t root_driver = { "root", root_methods, 1, /* no softc */ }; device_t root_bus; devclass_t root_devclass; static int root_bus_module_handler(module_t mod, int what, void* arg) { switch (what) { case MOD_LOAD: TAILQ_INIT(&bus_data_devices); kobj_class_compile((kobj_class_t) &root_driver); root_bus = make_device(NULL, "root", 0); root_bus->desc = "System root bus"; kobj_init((kobj_t) root_bus, (kobj_class_t) &root_driver); root_bus->driver = &root_driver; root_bus->state = DS_ATTACHED; root_devclass = devclass_find_internal("root", NULL, FALSE); devinit(); return (0); case MOD_SHUTDOWN: device_shutdown(root_bus); return (0); default: return (EOPNOTSUPP); } return (0); } static moduledata_t root_bus_mod = { "rootbus", root_bus_module_handler, NULL }; DECLARE_MODULE(rootbus, root_bus_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); /** * @brief Automatically configure devices * * This function begins the autoconfiguration process by calling * device_probe_and_attach() for each child of the @c root0 device. */ void root_bus_configure(void) { PDEBUG((".")); /* Eventually this will be split up, but this is sufficient for now. */ bus_set_pass(BUS_PASS_DEFAULT); } /** * @brief Module handler for registering device drivers * * This module handler is used to automatically register device * drivers when modules are loaded. 
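 * Drivers normally reach this handler through the DRIVER_MODULE()
 * macro rather than by calling it directly; e.g. a hypothetical
 * "foo" driver on pci would declare
 *
 *	DRIVER_MODULE(foo, pci, foo_driver, foo_devclass, NULL, NULL);
 *
 * which generates the driver_module_data passed in via @p arg.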
If @p what is MOD_LOAD, it calls * devclass_add_driver() for the driver described by the * driver_module_data structure pointed to by @p arg. */ int driver_module_handler(module_t mod, int what, void *arg) { struct driver_module_data *dmd; devclass_t bus_devclass; kobj_class_t driver; int error, pass; dmd = (struct driver_module_data *)arg; bus_devclass = devclass_find_internal(dmd->dmd_busname, NULL, TRUE); error = 0; switch (what) { case MOD_LOAD: if (dmd->dmd_chainevh) error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); pass = dmd->dmd_pass; driver = dmd->dmd_driver; PDEBUG(("Loading module: driver %s on bus %s (pass %d)", DRIVERNAME(driver), dmd->dmd_busname, pass)); error = devclass_add_driver(bus_devclass, driver, pass, dmd->dmd_devclass); break; case MOD_UNLOAD: PDEBUG(("Unloading module: driver %s from bus %s", DRIVERNAME(dmd->dmd_driver), dmd->dmd_busname)); error = devclass_delete_driver(bus_devclass, dmd->dmd_driver); if (!error && dmd->dmd_chainevh) error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); break; case MOD_QUIESCE: PDEBUG(("Quiesce module: driver %s from bus %s", DRIVERNAME(dmd->dmd_driver), dmd->dmd_busname)); error = devclass_quiesce_driver(bus_devclass, dmd->dmd_driver); if (!error && dmd->dmd_chainevh) error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); break; default: error = EOPNOTSUPP; break; } return (error); } /** * @brief Enumerate all hinted devices for this bus. * * Walks through the hints for this bus and calls the bus_hinted_child * routine for each one it finds. It searches first for the specific * bus that's being probed for hinted children (e.g. isa0), and then for * generic children (e.g. isa). * * @param bus the bus device to enumerate */ void bus_enumerate_hinted_children(device_t bus) { int i; const char *dname, *busname; int dunit; /* * enumerate all devices on the specific bus */ busname = device_get_nameunit(bus); i = 0; while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0) BUS_HINTED_CHILD(bus, dname, dunit); /* * and all the generic ones. */ busname = device_get_name(bus); i = 0; while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0) BUS_HINTED_CHILD(bus, dname, dunit); } #ifdef BUS_DEBUG /* the _short versions avoid iteration by not calling anything that prints * more than oneliners. I love oneliners. */ static void print_device_short(device_t dev, int indent) { if (!dev) return; indentprintf(("device %d: <%s> %sparent,%schildren,%s%s%s%s%s,%sivars,%ssoftc,busy=%d\n", dev->unit, dev->desc, (dev->parent? "":"no "), (TAILQ_EMPTY(&dev->children)? "no ":""), (dev->flags&DF_ENABLED? "enabled,":"disabled,"), (dev->flags&DF_FIXEDCLASS? "fixed,":""), (dev->flags&DF_WILDCARD? "wildcard,":""), (dev->flags&DF_DESCMALLOCED? "descmalloced,":""), (dev->flags&DF_REBID? "rebiddable,":""), (dev->ivars? "":"no "), (dev->softc?
"":"no "), dev->busy)); } static void print_device(device_t dev, int indent) { if (!dev) return; print_device_short(dev, indent); indentprintf(("Parent:\n")); print_device_short(dev->parent, indent+1); indentprintf(("Driver:\n")); print_driver_short(dev->driver, indent+1); indentprintf(("Devclass:\n")); print_devclass_short(dev->devclass, indent+1); } void print_device_tree_short(device_t dev, int indent) /* print the device and all its children (indented) */ { device_t child; if (!dev) return; print_device_short(dev, indent); TAILQ_FOREACH(child, &dev->children, link) { print_device_tree_short(child, indent+1); } } void print_device_tree(device_t dev, int indent) /* print the device and all its children (indented) */ { device_t child; if (!dev) return; print_device(dev, indent); TAILQ_FOREACH(child, &dev->children, link) { print_device_tree(child, indent+1); } } static void print_driver_short(driver_t *driver, int indent) { if (!driver) return; indentprintf(("driver %s: softc size = %zd\n", driver->name, driver->size)); } static void print_driver(driver_t *driver, int indent) { if (!driver) return; print_driver_short(driver, indent); } static void print_driver_list(driver_list_t drivers, int indent) { driverlink_t driver; TAILQ_FOREACH(driver, &drivers, link) { print_driver(driver->driver, indent); } } static void print_devclass_short(devclass_t dc, int indent) { if ( !dc ) return; indentprintf(("devclass %s: max units = %d\n", dc->name, dc->maxunit)); } static void print_devclass(devclass_t dc, int indent) { int i; if ( !dc ) return; print_devclass_short(dc, indent); indentprintf(("Drivers:\n")); print_driver_list(dc->drivers, indent+1); indentprintf(("Devices:\n")); for (i = 0; i < dc->maxunit; i++) if (dc->devices[i]) print_device(dc->devices[i], indent+1); } void print_devclass_list_short(void) { devclass_t dc; printf("Short listing of devclasses, drivers & devices:\n"); TAILQ_FOREACH(dc, &devclasses, link) { print_devclass_short(dc, 0); } } void print_devclass_list(void) { devclass_t dc; printf("Full listing of devclasses, drivers & devices:\n"); TAILQ_FOREACH(dc, &devclasses, link) { print_devclass(dc, 0); } } #endif /* * User-space access to the device tree. * * We implement a small set of nodes: * * hw.bus Single integer read method to obtain the * current generation count. * hw.bus.devices Reads the entire device tree in flat space. * hw.bus.rman Resource manager interface * * We might like to add the ability to scan devclasses and/or drivers to * determine what else is currently loaded/available. */ static int sysctl_bus(SYSCTL_HANDLER_ARGS) { struct u_businfo ubus; ubus.ub_version = BUS_USER_VERSION; ubus.ub_generation = bus_data_generation; return (SYSCTL_OUT(req, &ubus, sizeof(ubus))); } SYSCTL_NODE(_hw_bus, OID_AUTO, info, CTLFLAG_RW, sysctl_bus, "bus-related data"); static int sysctl_devices(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; int index; device_t dev; struct u_device udev; /* XXX this is a bit big */ int error; if (namelen != 2) return (EINVAL); if (bus_data_generation_check(name[0])) return (EINVAL); index = name[1]; /* * Scan the list of devices, looking for the requested index. */ TAILQ_FOREACH(dev, &bus_data_devices, devlink) { if (index-- == 0) break; } if (dev == NULL) return (ENOENT); /* * Populate the return array. 
*/ bzero(&udev, sizeof(udev)); udev.dv_handle = (uintptr_t)dev; udev.dv_parent = (uintptr_t)dev->parent; if (dev->nameunit != NULL) strlcpy(udev.dv_name, dev->nameunit, sizeof(udev.dv_name)); if (dev->desc != NULL) strlcpy(udev.dv_desc, dev->desc, sizeof(udev.dv_desc)); if (dev->driver != NULL && dev->driver->name != NULL) strlcpy(udev.dv_drivername, dev->driver->name, sizeof(udev.dv_drivername)); bus_child_pnpinfo_str(dev, udev.dv_pnpinfo, sizeof(udev.dv_pnpinfo)); bus_child_location_str(dev, udev.dv_location, sizeof(udev.dv_location)); udev.dv_devflags = dev->devflags; udev.dv_flags = dev->flags; udev.dv_state = dev->state; error = SYSCTL_OUT(req, &udev, sizeof(udev)); return (error); } SYSCTL_NODE(_hw_bus, OID_AUTO, devices, CTLFLAG_RD, sysctl_devices, "system device tree"); int bus_data_generation_check(int generation) { if (generation != bus_data_generation) return (1); /* XXX generate optimised lists here? */ return (0); } void bus_data_generation_update(void) { bus_data_generation++; } int bus_free_resource(device_t dev, int type, struct resource *r) { if (r == NULL) return (0); return (bus_release_resource(dev, type, rman_get_rid(r), r)); } device_t device_lookup_by_name(const char *name) { device_t dev; TAILQ_FOREACH(dev, &bus_data_devices, devlink) { if (dev->nameunit != NULL && strcmp(dev->nameunit, name) == 0) return (dev); } return (NULL); } /* * /dev/devctl2 implementation. The existing /dev/devctl device has * implicit semantics on open, so it could not be reused for this. * Another option would be to call this /dev/bus? */ static int find_device(struct devreq *req, device_t *devp) { device_t dev; /* * First, ensure that the name is nul terminated. */ if (memchr(req->dr_name, '\0', sizeof(req->dr_name)) == NULL) return (EINVAL); /* * Second, try to find an attached device whose name matches * 'name'. */ dev = device_lookup_by_name(req->dr_name); if (dev != NULL) { *devp = dev; return (0); } /* Finally, give device enumerators a chance. */ dev = NULL; EVENTHANDLER_INVOKE(dev_lookup, req->dr_name, &dev); if (dev == NULL) return (ENOENT); *devp = dev; return (0); } static bool driver_exists(device_t bus, const char *driver) { devclass_t dc; for (dc = bus->devclass; dc != NULL; dc = dc->parent) { if (devclass_find_driver_internal(dc, driver) != NULL) return (true); } return (false); } static int devctl2_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct devreq *req; device_t dev; int error, old; /* Locate the device to control. */ mtx_lock(&Giant); req = (struct devreq *)data; switch (cmd) { case DEV_ATTACH: case DEV_DETACH: case DEV_ENABLE: case DEV_DISABLE: case DEV_SUSPEND: case DEV_RESUME: case DEV_SET_DRIVER: case DEV_CLEAR_DRIVER: case DEV_RESCAN: case DEV_DELETE: error = priv_check(td, PRIV_DRIVER); if (error == 0) error = find_device(req, &dev); break; default: error = ENOTTY; break; } if (error) { mtx_unlock(&Giant); return (error); } /* Perform the requested operation. */ switch (cmd) { case DEV_ATTACH: if (device_is_attached(dev) && (dev->flags & DF_REBID) == 0) error = EBUSY; else if (!device_is_enabled(dev)) error = ENXIO; else error = device_probe_and_attach(dev); break; case DEV_DETACH: if (!device_is_attached(dev)) { error = ENXIO; break; } if (!(req->dr_flags & DEVF_FORCE_DETACH)) { error = device_quiesce(dev); if (error) break; } error = device_detach(dev); break; case DEV_ENABLE: if (device_is_enabled(dev)) { error = EBUSY; break; } /* * If the device has been probed but not attached (e.g. 
* when it has been disabled by a loader hint), just * attach the device rather than doing a full probe. */ device_enable(dev); if (device_is_alive(dev)) { /* * If the device was disabled via a hint, clear * the hint. */ if (resource_disabled(dev->driver->name, dev->unit)) resource_unset_value(dev->driver->name, dev->unit, "disabled"); error = device_attach(dev); } else error = device_probe_and_attach(dev); break; case DEV_DISABLE: if (!device_is_enabled(dev)) { error = ENXIO; break; } if (!(req->dr_flags & DEVF_FORCE_DETACH)) { error = device_quiesce(dev); if (error) break; } /* * Force DF_FIXEDCLASS on around detach to preserve * the existing name. */ old = dev->flags; dev->flags |= DF_FIXEDCLASS; error = device_detach(dev); if (!(old & DF_FIXEDCLASS)) dev->flags &= ~DF_FIXEDCLASS; if (error == 0) device_disable(dev); break; case DEV_SUSPEND: if (device_is_suspended(dev)) { error = EBUSY; break; } if (device_get_parent(dev) == NULL) { error = EINVAL; break; } error = BUS_SUSPEND_CHILD(device_get_parent(dev), dev); break; case DEV_RESUME: if (!device_is_suspended(dev)) { error = EINVAL; break; } if (device_get_parent(dev) == NULL) { error = EINVAL; break; } error = BUS_RESUME_CHILD(device_get_parent(dev), dev); break; case DEV_SET_DRIVER: { devclass_t dc; char driver[128]; error = copyinstr(req->dr_data, driver, sizeof(driver), NULL); if (error) break; if (driver[0] == '\0') { error = EINVAL; break; } if (dev->devclass != NULL && strcmp(driver, dev->devclass->name) == 0) /* XXX: Could possibly force DF_FIXEDCLASS on? */ break; /* * Scan drivers for this device's bus looking for at * least one matching driver. */ if (dev->parent == NULL) { error = EINVAL; break; } if (!driver_exists(dev->parent, driver)) { error = ENOENT; break; } dc = devclass_create(driver); if (dc == NULL) { error = ENOMEM; break; } /* Detach device if necessary. */ if (device_is_attached(dev)) { if (req->dr_flags & DEVF_SET_DRIVER_DETACH) error = device_detach(dev); else error = EBUSY; if (error) break; } /* Clear any previously-fixed device class and unit. */ if (dev->flags & DF_FIXEDCLASS) devclass_delete_device(dev->devclass, dev); dev->flags |= DF_WILDCARD; dev->unit = -1; /* Force the new device class. 
*/ error = devclass_add_device(dc, dev); if (error) break; dev->flags |= DF_FIXEDCLASS; error = device_probe_and_attach(dev); break; } case DEV_CLEAR_DRIVER: if (!(dev->flags & DF_FIXEDCLASS)) { error = 0; break; } if (device_is_attached(dev)) { if (req->dr_flags & DEVF_CLEAR_DRIVER_DETACH) error = device_detach(dev); else error = EBUSY; if (error) break; } dev->flags &= ~DF_FIXEDCLASS; dev->flags |= DF_WILDCARD; devclass_delete_device(dev->devclass, dev); error = device_probe_and_attach(dev); break; case DEV_RESCAN: if (!device_is_attached(dev)) { error = ENXIO; break; } error = BUS_RESCAN(dev); break; case DEV_DELETE: { device_t parent; parent = device_get_parent(dev); if (parent == NULL) { error = EINVAL; break; } if (!(req->dr_flags & DEVF_FORCE_DELETE)) { if (bus_child_present(dev) != 0) { error = EBUSY; break; } } error = device_delete_child(parent, dev); break; } } mtx_unlock(&Giant); return (error); } static struct cdevsw devctl2_cdevsw = { .d_version = D_VERSION, .d_ioctl = devctl2_ioctl, .d_name = "devctl2", }; static void devctl2_init(void) { make_dev_credf(MAKEDEV_ETERNAL, &devctl2_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, "devctl2"); } #ifdef DDB DB_SHOW_COMMAND(device, db_show_device) { device_t dev; if (!have_addr) return; dev = (device_t)addr; db_printf("name: %s\n", device_get_nameunit(dev)); db_printf(" driver: %s\n", DRIVERNAME(dev->driver)); db_printf(" class: %s\n", DEVCLANAME(dev->devclass)); db_printf(" addr: %p\n", dev); db_printf(" parent: %p\n", dev->parent); db_printf(" softc: %p\n", dev->softc); db_printf(" ivars: %p\n", dev->ivars); } DB_SHOW_ALL_COMMAND(devices, db_show_all_devices) { device_t dev; TAILQ_FOREACH(dev, &bus_data_devices, devlink) { db_show_device((db_expr_t)dev, true, count, modif); } } #endif Index: head/sys/kern/subr_bus_dma.c =================================================================== --- head/sys/kern/subr_bus_dma.c (revision 326270) +++ head/sys/kern/subr_bus_dma.c (revision 326271) @@ -1,569 +1,571 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 EMC Corp. * All rights reserved. * * Copyright (c) 1997, 1998 Justin T. Gibbs. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_bus.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Load up data starting at offset within a region specified by a * list of virtual address ranges until either the length or the region * is exhausted. */ static int _bus_dmamap_load_vlist(bus_dma_tag_t dmat, bus_dmamap_t map, bus_dma_segment_t *list, int sglist_cnt, struct pmap *pmap, int *nsegs, int flags, size_t offset, size_t length) { int error; error = 0; for (; sglist_cnt > 0 && length != 0; sglist_cnt--, list++) { char *addr; size_t ds_len; KASSERT((offset < list->ds_len), ("Invalid mid-segment offset")); addr = (char *)(uintptr_t)list->ds_addr + offset; ds_len = list->ds_len - offset; offset = 0; if (ds_len > length) ds_len = length; length -= ds_len; KASSERT((ds_len != 0), ("Segment length is zero")); error = _bus_dmamap_load_buffer(dmat, map, addr, ds_len, pmap, flags, NULL, nsegs); if (error) break; } return (error); } /* * Load a list of physical addresses. */ static int _bus_dmamap_load_plist(bus_dma_tag_t dmat, bus_dmamap_t map, bus_dma_segment_t *list, int sglist_cnt, int *nsegs, int flags) { int error; error = 0; for (; sglist_cnt > 0; sglist_cnt--, list++) { error = _bus_dmamap_load_phys(dmat, map, (vm_paddr_t)list->ds_addr, list->ds_len, flags, NULL, nsegs); if (error) break; } return (error); } /* * Load an mbuf chain. */ static int _bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m0, bus_dma_segment_t *segs, int *nsegs, int flags) { struct mbuf *m; int error; error = 0; for (m = m0; m != NULL && error == 0; m = m->m_next) { if (m->m_len > 0) { error = _bus_dmamap_load_buffer(dmat, map, m->m_data, m->m_len, kernel_pmap, flags | BUS_DMA_LOAD_MBUF, segs, nsegs); } } CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d", __func__, dmat, flags, error, *nsegs); return (error); } /* * Load from block io. */ static int _bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio, int *nsegs, int flags) { if ((bio->bio_flags & BIO_VLIST) != 0) { bus_dma_segment_t *segs = (bus_dma_segment_t *)bio->bio_data; return (_bus_dmamap_load_vlist(dmat, map, segs, bio->bio_ma_n, kernel_pmap, nsegs, flags, bio->bio_ma_offset, bio->bio_bcount)); } if ((bio->bio_flags & BIO_UNMAPPED) != 0) return (_bus_dmamap_load_ma(dmat, map, bio->bio_ma, bio->bio_bcount, bio->bio_ma_offset, flags, NULL, nsegs)); return (_bus_dmamap_load_buffer(dmat, map, bio->bio_data, bio->bio_bcount, kernel_pmap, flags, NULL, nsegs)); } int bus_dmamap_load_ma_triv(bus_dma_tag_t dmat, bus_dmamap_t map, struct vm_page **ma, bus_size_t tlen, int ma_offs, int flags, bus_dma_segment_t *segs, int *segp) { vm_paddr_t paddr; bus_size_t len; int error, i; error = 0; for (i = 0; tlen > 0; i++, tlen -= len) { len = min(PAGE_SIZE - ma_offs, tlen); paddr = VM_PAGE_TO_PHYS(ma[i]) + ma_offs; error = _bus_dmamap_load_phys(dmat, map, paddr, len, flags, segs, segp); if (error != 0) break; ma_offs = 0; } return (error); } /* * Load a cam control block.
*/ static int _bus_dmamap_load_ccb(bus_dma_tag_t dmat, bus_dmamap_t map, union ccb *ccb, int *nsegs, int flags) { struct ccb_hdr *ccb_h; void *data_ptr; int error; uint32_t dxfer_len; uint16_t sglist_cnt; error = 0; ccb_h = &ccb->ccb_h; switch (ccb_h->func_code) { case XPT_SCSI_IO: { struct ccb_scsiio *csio; csio = &ccb->csio; data_ptr = csio->data_ptr; dxfer_len = csio->dxfer_len; sglist_cnt = csio->sglist_cnt; break; } case XPT_CONT_TARGET_IO: { struct ccb_scsiio *ctio; ctio = &ccb->ctio; data_ptr = ctio->data_ptr; dxfer_len = ctio->dxfer_len; sglist_cnt = ctio->sglist_cnt; break; } case XPT_ATA_IO: { struct ccb_ataio *ataio; ataio = &ccb->ataio; data_ptr = ataio->data_ptr; dxfer_len = ataio->dxfer_len; sglist_cnt = 0; break; } case XPT_NVME_IO: case XPT_NVME_ADMIN: { struct ccb_nvmeio *nvmeio; nvmeio = &ccb->nvmeio; data_ptr = nvmeio->data_ptr; dxfer_len = nvmeio->dxfer_len; sglist_cnt = nvmeio->sglist_cnt; break; } default: panic("_bus_dmamap_load_ccb: Unsupported func code %d", ccb_h->func_code); } switch ((ccb_h->flags & CAM_DATA_MASK)) { case CAM_DATA_VADDR: error = _bus_dmamap_load_buffer(dmat, map, data_ptr, dxfer_len, kernel_pmap, flags, NULL, nsegs); break; case CAM_DATA_PADDR: error = _bus_dmamap_load_phys(dmat, map, (vm_paddr_t)(uintptr_t)data_ptr, dxfer_len, flags, NULL, nsegs); break; case CAM_DATA_SG: error = _bus_dmamap_load_vlist(dmat, map, (bus_dma_segment_t *)data_ptr, sglist_cnt, kernel_pmap, nsegs, flags, 0, dxfer_len); break; case CAM_DATA_SG_PADDR: error = _bus_dmamap_load_plist(dmat, map, (bus_dma_segment_t *)data_ptr, sglist_cnt, nsegs, flags); break; case CAM_DATA_BIO: error = _bus_dmamap_load_bio(dmat, map, (struct bio *)data_ptr, nsegs, flags); break; default: panic("_bus_dmamap_load_ccb: flags 0x%X unimplemented", ccb_h->flags); } return (error); } /* * Load a uio. */ static int _bus_dmamap_load_uio(bus_dma_tag_t dmat, bus_dmamap_t map, struct uio *uio, int *nsegs, int flags) { bus_size_t resid; bus_size_t minlen; struct iovec *iov; pmap_t pmap; caddr_t addr; int error, i; if (uio->uio_segflg == UIO_USERSPACE) { KASSERT(uio->uio_td != NULL, ("bus_dmamap_load_uio: USERSPACE but no proc")); pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace); } else pmap = kernel_pmap; resid = uio->uio_resid; iov = uio->uio_iov; error = 0; for (i = 0; i < uio->uio_iovcnt && resid != 0 && !error; i++) { /* * Now at the first iovec to load. Load each iovec * until we have exhausted the residual count. */ addr = (caddr_t) iov[i].iov_base; minlen = resid < iov[i].iov_len ? resid : iov[i].iov_len; if (minlen > 0) { error = _bus_dmamap_load_buffer(dmat, map, addr, minlen, pmap, flags, NULL, nsegs); resid -= minlen; } } return (error); } /* * Map the buffer buf into bus space using the dmamap map. 
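 *
 * A minimal caller sketch (the "mydev" and "sc" names are
 * hypothetical, not part of this file); the callback receives the
 * resulting segment list:
 *
 *	static void
 *	mydev_dma_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
 *	{
 *		bus_addr_t *busaddrp = arg;
 *
 *		if (error == 0 && nseg == 1)
 *			*busaddrp = segs[0].ds_addr;
 *	}
 *
 *	error = bus_dmamap_load(sc->dma_tag, sc->dma_map, buf, buflen,
 *	    mydev_dma_cb, &sc->busaddr, BUS_DMA_NOWAIT);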
*/ int bus_dmamap_load(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf, bus_size_t buflen, bus_dmamap_callback_t *callback, void *callback_arg, int flags) { bus_dma_segment_t *segs; struct memdesc mem; int error; int nsegs; if ((flags & BUS_DMA_NOWAIT) == 0) { mem = memdesc_vaddr(buf, buflen); _bus_dmamap_waitok(dmat, map, &mem, callback, callback_arg); } nsegs = -1; error = _bus_dmamap_load_buffer(dmat, map, buf, buflen, kernel_pmap, flags, NULL, &nsegs); nsegs++; CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d", __func__, dmat, flags, error, nsegs); if (error == EINPROGRESS) return (error); segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error); if (error) (*callback)(callback_arg, segs, 0, error); else (*callback)(callback_arg, segs, nsegs, 0); /* * Return ENOMEM to the caller so that it can pass it up the stack. * This error only happens when NOWAIT is set, so deferral is disabled. */ if (error == ENOMEM) return (error); return (0); } int bus_dmamap_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m0, bus_dmamap_callback2_t *callback, void *callback_arg, int flags) { bus_dma_segment_t *segs; int nsegs, error; M_ASSERTPKTHDR(m0); flags |= BUS_DMA_NOWAIT; nsegs = -1; error = _bus_dmamap_load_mbuf_sg(dmat, map, m0, NULL, &nsegs, flags); ++nsegs; segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error); if (error) (*callback)(callback_arg, segs, 0, 0, error); else (*callback)(callback_arg, segs, nsegs, m0->m_pkthdr.len, error); CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d", __func__, dmat, flags, error, nsegs); return (error); } int bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m0, bus_dma_segment_t *segs, int *nsegs, int flags) { int error; flags |= BUS_DMA_NOWAIT; *nsegs = -1; error = _bus_dmamap_load_mbuf_sg(dmat, map, m0, segs, nsegs, flags); ++*nsegs; _bus_dmamap_complete(dmat, map, segs, *nsegs, error); return (error); } int bus_dmamap_load_uio(bus_dma_tag_t dmat, bus_dmamap_t map, struct uio *uio, bus_dmamap_callback2_t *callback, void *callback_arg, int flags) { bus_dma_segment_t *segs; int nsegs, error; flags |= BUS_DMA_NOWAIT; nsegs = -1; error = _bus_dmamap_load_uio(dmat, map, uio, &nsegs, flags); nsegs++; segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error); if (error) (*callback)(callback_arg, segs, 0, 0, error); else (*callback)(callback_arg, segs, nsegs, uio->uio_resid, error); CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d", __func__, dmat, flags, error, nsegs); return (error); } int bus_dmamap_load_ccb(bus_dma_tag_t dmat, bus_dmamap_t map, union ccb *ccb, bus_dmamap_callback_t *callback, void *callback_arg, int flags) { bus_dma_segment_t *segs; struct ccb_hdr *ccb_h; struct memdesc mem; int error; int nsegs; ccb_h = &ccb->ccb_h; if ((ccb_h->flags & CAM_DIR_MASK) == CAM_DIR_NONE) { callback(callback_arg, NULL, 0, 0); return (0); } if ((flags & BUS_DMA_NOWAIT) == 0) { mem = memdesc_ccb(ccb); _bus_dmamap_waitok(dmat, map, &mem, callback, callback_arg); } nsegs = -1; error = _bus_dmamap_load_ccb(dmat, map, ccb, &nsegs, flags); nsegs++; CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d", __func__, dmat, flags, error, nsegs); if (error == EINPROGRESS) return (error); segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error); if (error) (*callback)(callback_arg, segs, 0, error); else (*callback)(callback_arg, segs, nsegs, error); /* * Return ENOMEM to the caller so that it can pass it up the stack. * This error only happens when NOWAIT is set, so deferral is disabled. 
*/ if (error == ENOMEM) return (error); return (0); } int bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio, bus_dmamap_callback_t *callback, void *callback_arg, int flags) { bus_dma_segment_t *segs; struct memdesc mem; int error; int nsegs; if ((flags & BUS_DMA_NOWAIT) == 0) { mem = memdesc_bio(bio); _bus_dmamap_waitok(dmat, map, &mem, callback, callback_arg); } nsegs = -1; error = _bus_dmamap_load_bio(dmat, map, bio, &nsegs, flags); nsegs++; CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d", __func__, dmat, flags, error, nsegs); if (error == EINPROGRESS) return (error); segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error); if (error) (*callback)(callback_arg, segs, 0, error); else (*callback)(callback_arg, segs, nsegs, error); /* * Return ENOMEM to the caller so that it can pass it up the stack. * This error only happens when NOWAIT is set, so deferral is disabled. */ if (error == ENOMEM) return (error); return (0); } int bus_dmamap_load_mem(bus_dma_tag_t dmat, bus_dmamap_t map, struct memdesc *mem, bus_dmamap_callback_t *callback, void *callback_arg, int flags) { bus_dma_segment_t *segs; int error; int nsegs; if ((flags & BUS_DMA_NOWAIT) == 0) _bus_dmamap_waitok(dmat, map, mem, callback, callback_arg); nsegs = -1; error = 0; switch (mem->md_type) { case MEMDESC_VADDR: error = _bus_dmamap_load_buffer(dmat, map, mem->u.md_vaddr, mem->md_opaque, kernel_pmap, flags, NULL, &nsegs); break; case MEMDESC_PADDR: error = _bus_dmamap_load_phys(dmat, map, mem->u.md_paddr, mem->md_opaque, flags, NULL, &nsegs); break; case MEMDESC_VLIST: error = _bus_dmamap_load_vlist(dmat, map, mem->u.md_list, mem->md_opaque, kernel_pmap, &nsegs, flags, 0, SIZE_T_MAX); break; case MEMDESC_PLIST: error = _bus_dmamap_load_plist(dmat, map, mem->u.md_list, mem->md_opaque, &nsegs, flags); break; case MEMDESC_BIO: error = _bus_dmamap_load_bio(dmat, map, mem->u.md_bio, &nsegs, flags); break; case MEMDESC_UIO: error = _bus_dmamap_load_uio(dmat, map, mem->u.md_uio, &nsegs, flags); break; case MEMDESC_MBUF: error = _bus_dmamap_load_mbuf_sg(dmat, map, mem->u.md_mbuf, NULL, &nsegs, flags); break; case MEMDESC_CCB: error = _bus_dmamap_load_ccb(dmat, map, mem->u.md_ccb, &nsegs, flags); break; } nsegs++; CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d", __func__, dmat, flags, error, nsegs); if (error == EINPROGRESS) return (error); segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error); if (error) (*callback)(callback_arg, segs, 0, error); else (*callback)(callback_arg, segs, nsegs, 0); /* * Return ENOMEM to the caller so that it can pass it up the stack. * This error only happens when NOWAIT is set, so deferral is disabled. */ if (error == ENOMEM) return (error); return (0); } Index: head/sys/kern/subr_busdma_bufalloc.c =================================================================== --- head/sys/kern/subr_busdma_bufalloc.c (revision 326270) +++ head/sys/kern/subr_busdma_bufalloc.c (revision 326271) @@ -1,174 +1,176 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 Ian Lepore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Buffer allocation support routines for bus_dmamem_alloc implementations. */ #include #include #include #include #include #include #include #include #include /* * We manage buffer zones up to a page in size. Buffers larger than a page can * be managed by one of the kernel's page-oriented memory allocation routines as * efficiently as what we can do here. Also, a page is the largest size for * which we can guarantee contiguity when using uma, and contiguity is one of the * requirements we have to fulfill. */ #define MIN_ZONE_BUFSIZE 32 #define MAX_ZONE_BUFSIZE PAGE_SIZE /* * The static array of 12 bufzones is big enough to handle all the zones for the * smallest supported allocation size of 32 through the largest supported page * size of 64K. If you up the biggest page size number, up the array size too. * Basically the size of the array needs to be log2(maxsize)-log2(minsize)+1, * but I don't know of an easy way to express that as a compile-time constant. */ #if PAGE_SIZE > 65536 #error Unsupported page size #endif struct busdma_bufalloc { bus_size_t min_size; size_t num_zones; struct busdma_bufzone buf_zones[12]; }; busdma_bufalloc_t busdma_bufalloc_create(const char *name, bus_size_t minimum_alignment, uma_alloc alloc_func, uma_free free_func, u_int32_t zcreate_flags) { struct busdma_bufalloc *ba; struct busdma_bufzone *bz; int i; bus_size_t cursize; ba = malloc(sizeof(struct busdma_bufalloc), M_DEVBUF, M_ZERO | M_WAITOK); ba->min_size = MAX(MIN_ZONE_BUFSIZE, minimum_alignment); /* * Each uma zone is created with an alignment of size-1, meaning that * the alignment is equal to the size (i.e., 64 byte buffers are aligned * to 64 byte boundaries, etc). This allows for a fast, efficient test * when deciding whether a pool buffer meets the constraints of a given * tag used for allocation: the buffer is usable if tag->alignment <= * bufzone->size.
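	 *
	 * For example, the 64-byte zone is created with a uma
	 * alignment mask of 63, so every buffer it returns sits on a
	 * 64-byte boundary; a tag requiring 16- or 32-byte alignment
	 * can draw from it, while a tag requiring 128-byte alignment
	 * must be satisfied from a larger zone.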
*/ for (i = 0, bz = ba->buf_zones, cursize = ba->min_size; i < nitems(ba->buf_zones) && cursize <= MAX_ZONE_BUFSIZE; ++i, ++bz, cursize <<= 1) { snprintf(bz->name, sizeof(bz->name), "dma %.10s %ju", name, (uintmax_t)cursize); bz->size = cursize; bz->umazone = uma_zcreate(bz->name, bz->size, NULL, NULL, NULL, NULL, bz->size - 1, zcreate_flags); if (bz->umazone == NULL) { busdma_bufalloc_destroy(ba); return (NULL); } if (alloc_func != NULL) uma_zone_set_allocf(bz->umazone, alloc_func); if (free_func != NULL) uma_zone_set_freef(bz->umazone, free_func); ++ba->num_zones; } return (ba); } void busdma_bufalloc_destroy(busdma_bufalloc_t ba) { struct busdma_bufzone *bz; int i; if (ba == NULL) return; for (i = 0, bz = ba->buf_zones; i < ba->num_zones; ++i, ++bz) { uma_zdestroy(bz->umazone); } free(ba, M_DEVBUF); } struct busdma_bufzone * busdma_bufalloc_findzone(busdma_bufalloc_t ba, bus_size_t size) { struct busdma_bufzone *bz; int i; if (size > MAX_ZONE_BUFSIZE) return (NULL); for (i = 0, bz = ba->buf_zones; i < ba->num_zones; ++i, ++bz) { if (bz->size >= size) return (bz); } panic("Didn't find a buffer zone of the right size"); } void * busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size, uint8_t *pflag, int wait) { #ifdef VM_MEMATTR_UNCACHEABLE /* Inform UMA that this allocator uses kernel_arena/object. */ *pflag = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_attr(kernel_arena, size, wait, 0, BUS_SPACE_MAXADDR, VM_MEMATTR_UNCACHEABLE)); #else panic("VM_MEMATTR_UNCACHEABLE unavailable"); #endif /* VM_MEMATTR_UNCACHEABLE */ } void busdma_bufalloc_free_uncacheable(void *item, vm_size_t size, uint8_t pflag) { kmem_free(kernel_arena, (vm_offset_t)item, size); } Index: head/sys/kern/subr_capability.c =================================================================== --- head/sys/kern/subr_capability.c (revision 326270) +++ head/sys/kern/subr_capability.c (revision 326271) @@ -1,307 +1,309 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 FreeBSD Foundation * All rights reserved. * * This software was developed by Pawel Jakub Dawidek under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Note that this file is compiled into the kernel and into libc. 
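 * (Hence the assert() definition just below: it becomes KASSERT() in
 * kernel builds, while userland builds into libc get the ordinary
 * assert(3).)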
*/ #include #include #ifdef _KERNEL #include #include #else /* !_KERNEL */ #include #include #include #include #include #endif #ifdef _KERNEL #define assert(exp) KASSERT((exp), ("%s:%u", __func__, __LINE__)) #endif #define CAPARSIZE_MIN (CAP_RIGHTS_VERSION_00 + 2) #define CAPARSIZE_MAX (CAP_RIGHTS_VERSION + 2) static __inline int right_to_index(uint64_t right) { static const int bit2idx[] = { -1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; int idx; idx = CAPIDXBIT(right); assert(idx >= 0 && idx < sizeof(bit2idx) / sizeof(bit2idx[0])); return (bit2idx[idx]); } static void cap_rights_vset(cap_rights_t *rights, va_list ap) { uint64_t right; int i, n; assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00); n = CAPARSIZE(rights); assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); for (;;) { right = (uint64_t)va_arg(ap, unsigned long long); if (right == 0) break; assert(CAPRVER(right) == 0); i = right_to_index(right); assert(i >= 0); assert(i < n); assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right)); rights->cr_rights[i] |= right; assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right)); } } static void cap_rights_vclear(cap_rights_t *rights, va_list ap) { uint64_t right; int i, n; assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00); n = CAPARSIZE(rights); assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); for (;;) { right = (uint64_t)va_arg(ap, unsigned long long); if (right == 0) break; assert(CAPRVER(right) == 0); i = right_to_index(right); assert(i >= 0); assert(i < n); assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right)); rights->cr_rights[i] &= ~(right & 0x01FFFFFFFFFFFFFFULL); assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right)); } } static bool cap_rights_is_vset(const cap_rights_t *rights, va_list ap) { uint64_t right; int i, n; assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00); n = CAPARSIZE(rights); assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); for (;;) { right = (uint64_t)va_arg(ap, unsigned long long); if (right == 0) break; assert(CAPRVER(right) == 0); i = right_to_index(right); assert(i >= 0); assert(i < n); assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right)); if ((rights->cr_rights[i] & right) != right) return (false); } return (true); } cap_rights_t * __cap_rights_init(int version, cap_rights_t *rights, ...) { unsigned int n; va_list ap; assert(version == CAP_RIGHTS_VERSION_00); n = version + 2; assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); CAP_NONE(rights); va_start(ap, rights); cap_rights_vset(rights, ap); va_end(ap); return (rights); } cap_rights_t * __cap_rights_set(cap_rights_t *rights, ...) { va_list ap; assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00); va_start(ap, rights); cap_rights_vset(rights, ap); va_end(ap); return (rights); } cap_rights_t * __cap_rights_clear(cap_rights_t *rights, ...) { va_list ap; assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00); va_start(ap, rights); cap_rights_vclear(rights, ap); va_end(ap); return (rights); } bool __cap_rights_is_set(const cap_rights_t *rights, ...) 
{ va_list ap; bool ret; assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00); va_start(ap, rights); ret = cap_rights_is_vset(rights, ap); va_end(ap); return (ret); } bool cap_rights_is_valid(const cap_rights_t *rights) { cap_rights_t allrights; int i, j; if (CAPVER(rights) != CAP_RIGHTS_VERSION_00) return (false); if (CAPARSIZE(rights) < CAPARSIZE_MIN || CAPARSIZE(rights) > CAPARSIZE_MAX) { return (false); } CAP_ALL(&allrights); if (!cap_rights_contains(&allrights, rights)) return (false); for (i = 0; i < CAPARSIZE(rights); i++) { j = right_to_index(rights->cr_rights[i]); if (i != j) return (false); if (i > 0) { if (CAPRVER(rights->cr_rights[i]) != 0) return (false); } } return (true); } cap_rights_t * cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src) { unsigned int i, n; assert(CAPVER(dst) == CAP_RIGHTS_VERSION_00); assert(CAPVER(src) == CAP_RIGHTS_VERSION_00); assert(CAPVER(dst) == CAPVER(src)); assert(cap_rights_is_valid(src)); assert(cap_rights_is_valid(dst)); n = CAPARSIZE(dst); assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); for (i = 0; i < n; i++) dst->cr_rights[i] |= src->cr_rights[i]; assert(cap_rights_is_valid(src)); assert(cap_rights_is_valid(dst)); return (dst); } cap_rights_t * cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src) { unsigned int i, n; assert(CAPVER(dst) == CAP_RIGHTS_VERSION_00); assert(CAPVER(src) == CAP_RIGHTS_VERSION_00); assert(CAPVER(dst) == CAPVER(src)); assert(cap_rights_is_valid(src)); assert(cap_rights_is_valid(dst)); n = CAPARSIZE(dst); assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); for (i = 0; i < n; i++) { dst->cr_rights[i] &= ~(src->cr_rights[i] & 0x01FFFFFFFFFFFFFFULL); } assert(cap_rights_is_valid(src)); assert(cap_rights_is_valid(dst)); return (dst); } bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little) { unsigned int i, n; assert(CAPVER(big) == CAP_RIGHTS_VERSION_00); assert(CAPVER(little) == CAP_RIGHTS_VERSION_00); assert(CAPVER(big) == CAPVER(little)); n = CAPARSIZE(big); assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); for (i = 0; i < n; i++) { if ((big->cr_rights[i] & little->cr_rights[i]) != little->cr_rights[i]) { return (false); } } return (true); } Index: head/sys/kern/subr_counter.c =================================================================== --- head/sys/kern/subr_counter.c (revision 326270) +++ head/sys/kern/subr_counter.c (revision 326271) @@ -1,177 +1,179 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #define IN_SUBR_COUNTER_C #include void counter_u64_zero(counter_u64_t c) { counter_u64_zero_inline(c); } uint64_t counter_u64_fetch(counter_u64_t c) { return (counter_u64_fetch_inline(c)); } counter_u64_t counter_u64_alloc(int flags) { counter_u64_t r; r = uma_zalloc(pcpu_zone_64, flags); if (r != NULL) counter_u64_zero(r); return (r); } void counter_u64_free(counter_u64_t c) { uma_zfree(pcpu_zone_64, c); } int sysctl_handle_counter_u64(SYSCTL_HANDLER_ARGS) { uint64_t out; int error; out = counter_u64_fetch(*(counter_u64_t *)arg1); error = SYSCTL_OUT(req, &out, sizeof(uint64_t)); if (error || !req->newptr) return (error); /* * Any write attempt to a counter zeroes it. */ counter_u64_zero(*(counter_u64_t *)arg1); return (0); } int sysctl_handle_counter_u64_array(SYSCTL_HANDLER_ARGS) { uint64_t *out; int error; out = malloc(arg2 * sizeof(uint64_t), M_TEMP, M_WAITOK); for (int i = 0; i < arg2; i++) out[i] = counter_u64_fetch(((counter_u64_t *)arg1)[i]); error = SYSCTL_OUT(req, out, arg2 * sizeof(uint64_t)); free(out, M_TEMP); if (error || !req->newptr) return (error); /* * Any write attempt to a counter zeroes it. */ for (int i = 0; i < arg2; i++) counter_u64_zero(((counter_u64_t *)arg1)[i]); return (0); } /* * MP-friendly version of ppsratecheck(). * * Returns non-negative if we are in the rate, negative otherwise. * 0 - rate limit not reached. * -1 - rate limit reached. * >0 - rate limit was reached before, and was just reset. The return value * is number of events since last reset. */ int64_t counter_ratecheck(struct counter_rate *cr, int64_t limit) { int64_t val; int now; val = cr->cr_over; now = ticks; if (now - cr->cr_ticks >= hz) { /* * Time to clear the structure, we are in the next second. * First try unlocked read, and then proceed with atomic. */ if ((cr->cr_lock == 0) && atomic_cmpset_acq_int(&cr->cr_lock, 0, 1)) { /* * Check if another thread has just gone through the * reset sequence before us. */ if (now - cr->cr_ticks >= hz) { val = counter_u64_fetch(cr->cr_rate); counter_u64_zero(cr->cr_rate); cr->cr_over = 0; cr->cr_ticks = now; if (val <= limit) val = 0; } atomic_store_rel_int(&cr->cr_lock, 0); } else /* * We failed to lock; in this case another thread may * be running counter_u64_zero(), so it is not safe * to do an update and we skip it. */ return (val); } counter_u64_add(cr->cr_rate, 1); if (cr->cr_over != 0) return (-1); if (counter_u64_fetch(cr->cr_rate) > limit) val = cr->cr_over = -1; return (val); } Index: head/sys/kern/subr_devstat.c =================================================================== --- head/sys/kern/subr_devstat.c (revision 326270) +++ head/sys/kern/subr_devstat.c (revision 326271) @@ -1,580 +1,582 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry. * All rights reserved.
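A short consumer sketch for the counter_ratecheck() routine above may help; this is a hedged illustration only (the event source, the limit of 100 events per second, and the init-time allocation of cr_rate with counter_u64_alloc() are assumptions, not code from this file):

	static struct counter_rate my_rate;	/* cr_rate allocated at init */

	static void
	my_event(void)
	{
		/* A negative return means the per-second limit was hit. */
		if (counter_ratecheck(&my_rate, 100) < 0)
			return;
		/* ... handle the event ... */
	}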
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include SDT_PROVIDER_DEFINE(io); SDT_PROBE_DEFINE2(io, , , start, "struct bio *", "struct devstat *"); SDT_PROBE_DEFINE2(io, , , done, "struct bio *", "struct devstat *"); SDT_PROBE_DEFINE2(io, , , wait__start, "struct bio *", "struct devstat *"); SDT_PROBE_DEFINE2(io, , , wait__done, "struct bio *", "struct devstat *"); #define DTRACE_DEVSTAT_START() SDT_PROBE2(io, , , start, NULL, ds) #define DTRACE_DEVSTAT_BIO_START() SDT_PROBE2(io, , , start, bp, ds) #define DTRACE_DEVSTAT_DONE() SDT_PROBE2(io, , , done, NULL, ds) #define DTRACE_DEVSTAT_BIO_DONE() SDT_PROBE2(io, , , done, bp, ds) #define DTRACE_DEVSTAT_WAIT_START() SDT_PROBE2(io, , , wait__start, NULL, ds) #define DTRACE_DEVSTAT_WAIT_DONE() SDT_PROBE2(io, , , wait__done, NULL, ds) static int devstat_num_devs; static long devstat_generation = 1; static int devstat_version = DEVSTAT_VERSION; static int devstat_current_devnumber; static struct mtx devstat_mutex; MTX_SYSINIT(devstat_mutex, &devstat_mutex, "devstat", MTX_DEF); static struct devstatlist device_statq = STAILQ_HEAD_INITIALIZER(device_statq); static struct devstat *devstat_alloc(void); static void devstat_free(struct devstat *); static void devstat_add_entry(struct devstat *ds, const void *dev_name, int unit_number, uint32_t block_size, devstat_support_flags flags, devstat_type_flags device_type, devstat_priority priority); /* * Allocate a devstat and initialize it */ struct devstat * devstat_new_entry(const void *dev_name, int unit_number, uint32_t block_size, devstat_support_flags flags, devstat_type_flags device_type, devstat_priority priority) { struct devstat *ds; mtx_assert(&devstat_mutex, MA_NOTOWNED); ds = devstat_alloc(); mtx_lock(&devstat_mutex); if (unit_number == -1) { ds->unit_number = unit_number; ds->id = dev_name; binuptime(&ds->creation_time); devstat_generation++; } else { devstat_add_entry(ds, dev_name, unit_number, block_size, flags, device_type, priority); } mtx_unlock(&devstat_mutex); return (ds); } /* * Take a malloced and zeroed 
devstat structure given to us, fill it in * and add it to the queue of devices. */ static void devstat_add_entry(struct devstat *ds, const void *dev_name, int unit_number, uint32_t block_size, devstat_support_flags flags, devstat_type_flags device_type, devstat_priority priority) { struct devstatlist *devstat_head; struct devstat *ds_tmp; mtx_assert(&devstat_mutex, MA_OWNED); devstat_num_devs++; devstat_head = &device_statq; /* * Priority sort. Each driver passes in its priority when it adds * its devstat entry. Drivers are sorted first by priority, and * then by probe order. * * For the first device, we just insert it, since the priority * doesn't really matter yet. Subsequent devices are inserted into * the list using the order outlined above. */ if (devstat_num_devs == 1) STAILQ_INSERT_TAIL(devstat_head, ds, dev_links); else { STAILQ_FOREACH(ds_tmp, devstat_head, dev_links) { struct devstat *ds_next; ds_next = STAILQ_NEXT(ds_tmp, dev_links); /* * If we find a break between higher and lower * priority items, and if this item fits in the * break, insert it. This also applies if the * "lower priority item" is the end of the list. */ if ((priority <= ds_tmp->priority) && ((ds_next == NULL) || (priority > ds_next->priority))) { STAILQ_INSERT_AFTER(devstat_head, ds_tmp, ds, dev_links); break; } else if (priority > ds_tmp->priority) { /* * If this is the case, we should be able * to insert ourselves at the head of the * list. If we can't, something is wrong. */ if (ds_tmp == STAILQ_FIRST(devstat_head)) { STAILQ_INSERT_HEAD(devstat_head, ds, dev_links); break; } else { STAILQ_INSERT_TAIL(devstat_head, ds, dev_links); printf("devstat_add_entry: HELP! " "sorting problem detected " "for name %p unit %d\n", dev_name, unit_number); break; } } } } ds->device_number = devstat_current_devnumber++; ds->unit_number = unit_number; strlcpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN); ds->block_size = block_size; ds->flags = flags; ds->device_type = device_type; ds->priority = priority; binuptime(&ds->creation_time); devstat_generation++; } /* * Remove a devstat structure from the list of devices. */ void devstat_remove_entry(struct devstat *ds) { struct devstatlist *devstat_head; mtx_assert(&devstat_mutex, MA_NOTOWNED); if (ds == NULL) return; mtx_lock(&devstat_mutex); devstat_head = &device_statq; /* Remove this entry from the devstat queue */ atomic_add_acq_int(&ds->sequence1, 1); if (ds->unit_number != -1) { devstat_num_devs--; STAILQ_REMOVE(devstat_head, ds, devstat, dev_links); } devstat_free(ds); devstat_generation++; mtx_unlock(&devstat_mutex); } /* * Record a transaction start. * * See comments for devstat_end_transaction(). Ordering is very important * here. */ void devstat_start_transaction(struct devstat *ds, struct bintime *now) { mtx_assert(&devstat_mutex, MA_NOTOWNED); /* sanity check */ if (ds == NULL) return; atomic_add_acq_int(&ds->sequence1, 1); /* * We only want to set the start time when we are going from idle * to busy. The start time is really the start of the latest busy * period. 
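 * (Illustrative timeline, not part of the original comment: if I/O A is
 * started at t=0 and completed at t=4 while I/O B runs from t=1 to t=3,
 * busy_from is set only at t=0.  Each completion accumulates the time
 * since busy_from and then advances it, so busy_time grows by 3 at t=3
 * and by 1 at t=4: four time units of busy time in total, not six.)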
*/ if (ds->start_count == ds->end_count) { if (now != NULL) ds->busy_from = *now; else binuptime(&ds->busy_from); } ds->start_count++; atomic_add_rel_int(&ds->sequence0, 1); DTRACE_DEVSTAT_START(); } void devstat_start_transaction_bio(struct devstat *ds, struct bio *bp) { mtx_assert(&devstat_mutex, MA_NOTOWNED); /* sanity check */ if (ds == NULL) return; binuptime(&bp->bio_t0); devstat_start_transaction(ds, &bp->bio_t0); DTRACE_DEVSTAT_BIO_START(); } /* * Record the ending of a transaction, and increment the various counters. * * Ordering in this function, and in devstat_start_transaction(), is VERY * important. The idea here is to run without locks, so we are very * careful to only modify some fields on the way "down" (i.e. at * transaction start) and some fields on the way "up" (i.e. at transaction * completion). One exception is busy_from, which we only modify in * devstat_start_transaction() when there are no outstanding transactions, * and thus it can't be modified in devstat_end_transaction() * simultaneously. * * The sequence0 and sequence1 fields are provided to enable an application * spying on the structures with mmap(2) to tell when a structure is in a * consistent state or not. * * For this to work 100% reliably, it is important that the two fields * are at opposite ends of the structure and that they are incremented * in the opposite order of how a memcpy(3) in userland would copy them. * We assume that the copying happens front to back, but there is actually * no way short of writing your own memcpy(3) replacement to guarantee * this will be the case. * * In addition to this, being a kind of lock, they must be updated with * atomic instructions using appropriate memory barriers. */ void devstat_end_transaction(struct devstat *ds, uint32_t bytes, devstat_tag_type tag_type, devstat_trans_flags flags, struct bintime *now, struct bintime *then) { struct bintime dt, lnow; /* sanity check */ if (ds == NULL) return; if (now == NULL) { now = &lnow; binuptime(now); } atomic_add_acq_int(&ds->sequence1, 1); /* Update byte and operations counts */ ds->bytes[flags] += bytes; ds->operations[flags]++; /* * Keep a count of the various tag types sent. */ if ((ds->flags & DEVSTAT_NO_ORDERED_TAGS) == 0 && tag_type != DEVSTAT_TAG_NONE) ds->tag_types[tag_type]++; if (then != NULL) { /* Update duration of operations */ dt = *now; bintime_sub(&dt, then); bintime_add(&ds->duration[flags], &dt); } /* Accumulate busy time */ dt = *now; bintime_sub(&dt, &ds->busy_from); bintime_add(&ds->busy_time, &dt); ds->busy_from = *now; ds->end_count++; atomic_add_rel_int(&ds->sequence0, 1); DTRACE_DEVSTAT_DONE(); } void devstat_end_transaction_bio(struct devstat *ds, struct bio *bp) { devstat_end_transaction_bio_bt(ds, bp, NULL); } void devstat_end_transaction_bio_bt(struct devstat *ds, struct bio *bp, struct bintime *now) { devstat_trans_flags flg; /* sanity check */ if (ds == NULL) return; if (bp->bio_cmd == BIO_DELETE) flg = DEVSTAT_FREE; else if ((bp->bio_cmd == BIO_READ) || ((bp->bio_cmd == BIO_ZONE) && (bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES))) flg = DEVSTAT_READ; else if (bp->bio_cmd == BIO_WRITE) flg = DEVSTAT_WRITE; else flg = DEVSTAT_NO_DATA; devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid, DEVSTAT_TAG_SIMPLE, flg, now, &bp->bio_t0); DTRACE_DEVSTAT_BIO_DONE(); } /* * This is the sysctl handler for the devstat package.
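 * (Hedged aside on the sequence0/sequence1 scheme described above, not
 * code from this file: a userland reader of the mmap(2)-exported
 * structures can simply retry until it observes a stable copy, e.g.
 *
 *	struct devstat snap;
 *
 *	do {
 *		memcpy(&snap, mapped_ds, sizeof(snap));
 *	} while (snap.sequence0 != snap.sequence1);
 *
 * where mapped_ds is an assumed pointer into the mapped statistics page.)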
The data pushed out * on the kern.devstat.all sysctl variable consists of the current devstat * generation number, and then an array of devstat structures, one for each * device in the system. * * This is more cryptic than obvious, but basically we neither can nor * want to hold the devstat_mutex for any amount of time, so we grab it * only when we need to and keep an eye on devstat_generation all the time. */ static int sysctl_devstat(SYSCTL_HANDLER_ARGS) { int error; long mygen; struct devstat *nds; mtx_assert(&devstat_mutex, MA_NOTOWNED); /* * XXX devstat_generation should really be "volatile" but that * XXX freaks out the sysctl macro below. The places where we * XXX change it and inspect it are bracketed in the mutex which * XXX guarantees us proper write barriers. I don't believe the * XXX compiler is allowed to optimize mygen away across calls * XXX to other functions, so the following is believed to be safe. */ mygen = devstat_generation; error = SYSCTL_OUT(req, &mygen, sizeof(mygen)); if (devstat_num_devs == 0) return(0); if (error != 0) return (error); mtx_lock(&devstat_mutex); nds = STAILQ_FIRST(&device_statq); if (mygen != devstat_generation) error = EBUSY; mtx_unlock(&devstat_mutex); if (error != 0) return (error); for (;nds != NULL;) { error = SYSCTL_OUT(req, nds, sizeof(struct devstat)); if (error != 0) return (error); mtx_lock(&devstat_mutex); if (mygen != devstat_generation) error = EBUSY; else nds = STAILQ_NEXT(nds, dev_links); mtx_unlock(&devstat_mutex); if (error != 0) return (error); } return(error); } /* * Sysctl entries for devstat. The first one is a node that all the rest * hang off of. */ static SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD, NULL, "Device Statistics"); SYSCTL_PROC(_kern_devstat, OID_AUTO, all, CTLFLAG_RD|CTLTYPE_OPAQUE, NULL, 0, sysctl_devstat, "S,devstat", "All devices in the devstat list"); /* * Export the number of devices in the system so that userland utilities * can determine how much memory to allocate to hold all the devices. */ SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD, &devstat_num_devs, 0, "Number of devices in the devstat list"); SYSCTL_LONG(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD, &devstat_generation, 0, "Devstat list generation"); SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD, &devstat_version, 0, "Devstat list version number"); /* * Allocator for struct devstat structures. We sub-allocate these from pages * which we get from malloc.
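 * (Worked example with assumed numbers, not taken from this file: if
 * PAGE_SIZE is 4096 and sizeof(struct devstat) happened to be 288,
 * statsperpage below would be 4096 / 288 = 14, leaving the final 64
 * bytes of each page unused.)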
These pages are exported for mmap(2)'ing through * a miniature device driver. */ #define statsperpage (PAGE_SIZE / sizeof(struct devstat)) static d_mmap_t devstat_mmap; static struct cdevsw devstat_cdevsw = { .d_version = D_VERSION, .d_mmap = devstat_mmap, .d_name = "devstat", }; struct statspage { TAILQ_ENTRY(statspage) list; struct devstat *stat; u_int nfree; }; static TAILQ_HEAD(, statspage) pagelist = TAILQ_HEAD_INITIALIZER(pagelist); static MALLOC_DEFINE(M_DEVSTAT, "devstat", "Device statistics"); static int devstat_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr) { struct statspage *spp; if (nprot != VM_PROT_READ) return (-1); mtx_lock(&devstat_mutex); TAILQ_FOREACH(spp, &pagelist, list) { if (offset == 0) { *paddr = vtophys(spp->stat); mtx_unlock(&devstat_mutex); return (0); } offset -= PAGE_SIZE; } mtx_unlock(&devstat_mutex); return (-1); } static struct devstat * devstat_alloc(void) { struct devstat *dsp; struct statspage *spp, *spp2; u_int u; static int once; mtx_assert(&devstat_mutex, MA_NOTOWNED); if (!once) { make_dev_credf(MAKEDEV_ETERNAL | MAKEDEV_CHECKNAME, &devstat_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0444, DEVSTAT_DEVICE_NAME); once = 1; } spp2 = NULL; mtx_lock(&devstat_mutex); for (;;) { TAILQ_FOREACH(spp, &pagelist, list) { if (spp->nfree > 0) break; } if (spp != NULL) break; mtx_unlock(&devstat_mutex); spp2 = malloc(sizeof *spp, M_DEVSTAT, M_ZERO | M_WAITOK); spp2->stat = malloc(PAGE_SIZE, M_DEVSTAT, M_ZERO | M_WAITOK); spp2->nfree = statsperpage; /* * If free statspages were added while the lock was released * just reuse them. */ mtx_lock(&devstat_mutex); TAILQ_FOREACH(spp, &pagelist, list) if (spp->nfree > 0) break; if (spp == NULL) { spp = spp2; /* * It would make more sense to add the new page at the * head, but the order of the list determines the * sequence of the mapping, so we can't do that. */ TAILQ_INSERT_TAIL(&pagelist, spp, list); } else break; } dsp = spp->stat; for (u = 0; u < statsperpage; u++) { if (dsp->allocated == 0) break; dsp++; } spp->nfree--; dsp->allocated = 1; mtx_unlock(&devstat_mutex); if (spp2 != NULL && spp2 != spp) { free(spp2->stat, M_DEVSTAT); free(spp2, M_DEVSTAT); } return (dsp); } static void devstat_free(struct devstat *dsp) { struct statspage *spp; mtx_assert(&devstat_mutex, MA_OWNED); bzero(dsp, sizeof *dsp); TAILQ_FOREACH(spp, &pagelist, list) { if (dsp >= spp->stat && dsp < (spp->stat + statsperpage)) { spp->nfree++; return; } } } SYSCTL_INT(_debug_sizeof, OID_AUTO, devstat, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct devstat), "sizeof(struct devstat)"); Index: head/sys/kern/subr_dummy_vdso_tc.c =================================================================== --- head/sys/kern/subr_dummy_vdso_tc.c (revision 326270) +++ head/sys/kern/subr_dummy_vdso_tc.c (revision 326271) @@ -1,50 +1,52 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright 2012 Konstantin Belousov . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include uint32_t cpu_fill_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc) { return (0); } #ifdef COMPAT_FREEBSD32 uint32_t cpu_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32, struct timecounter *tc) { return (0); } #endif Index: head/sys/kern/subr_eventhandler.c =================================================================== --- head/sys/kern/subr_eventhandler.c (revision 326270) +++ head/sys/kern/subr_eventhandler.c (revision 326271) @@ -1,315 +1,317 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1999 Michael Smith * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_EVENTHANDLER, "eventhandler", "Event handler records"); /* List of all eventhandler lists */ static TAILQ_HEAD(, eventhandler_list) eventhandler_lists; static int eventhandler_lists_initted = 0; static struct mtx eventhandler_mutex; struct eventhandler_entry_generic { struct eventhandler_entry ee; void (* func)(void); }; static struct eventhandler_list *_eventhandler_find_list(const char *name); /* * Initialize the eventhandler mutex and list. 
*/ static void eventhandler_init(void *dummy __unused) { TAILQ_INIT(&eventhandler_lists); mtx_init(&eventhandler_mutex, "eventhandler", NULL, MTX_DEF); atomic_store_rel_int(&eventhandler_lists_initted, 1); } SYSINIT(eventhandlers, SI_SUB_EVENTHANDLER, SI_ORDER_FIRST, eventhandler_init, NULL); static struct eventhandler_list * eventhandler_find_or_create_list(const char *name) { struct eventhandler_list *list, *new_list; /* look for a matching, existing list */ list = _eventhandler_find_list(name); /* Do we need to create the list? */ if (list == NULL) { mtx_unlock(&eventhandler_mutex); new_list = malloc(sizeof(*new_list) + strlen(name) + 1, M_EVENTHANDLER, M_WAITOK | M_ZERO); /* If someone else created it already, then use that one. */ mtx_lock(&eventhandler_mutex); list = _eventhandler_find_list(name); if (list != NULL) { free(new_list, M_EVENTHANDLER); } else { CTR2(KTR_EVH, "%s: creating list \"%s\"", __func__, name); list = new_list; TAILQ_INIT(&list->el_entries); mtx_init(&list->el_lock, name, "eventhandler list", MTX_DEF); list->el_name = (char *)(list + 1); strcpy(list->el_name, name); TAILQ_INSERT_HEAD(&eventhandler_lists, list, el_link); } } return (list); } /* * Insertion is O(n) due to the priority scan, but optimises to O(1) * if all priorities are identical. */ static eventhandler_tag eventhandler_register_internal(struct eventhandler_list *list, const char *name, eventhandler_tag epn) { struct eventhandler_entry *ep; KASSERT(eventhandler_lists_initted, ("eventhandler registered too early")); KASSERT(epn != NULL, ("%s: cannot register NULL event", __func__)); /* Do we need to find/create the list? */ if (list == NULL) { mtx_lock(&eventhandler_mutex); list = eventhandler_find_or_create_list(name); mtx_unlock(&eventhandler_mutex); } KASSERT(epn->ee_priority != EHE_DEAD_PRIORITY, ("%s: handler for %s registered with dead priority", __func__, name)); /* sort it into the list */ CTR4(KTR_EVH, "%s: adding item %p (function %p) to \"%s\"", __func__, epn, ((struct eventhandler_entry_generic *)epn)->func, name); EHL_LOCK(list); TAILQ_FOREACH(ep, &list->el_entries, ee_link) { if (ep->ee_priority != EHE_DEAD_PRIORITY && epn->ee_priority < ep->ee_priority) { TAILQ_INSERT_BEFORE(ep, epn, ee_link); break; } } if (ep == NULL) TAILQ_INSERT_TAIL(&list->el_entries, epn, ee_link); EHL_UNLOCK(list); return(epn); } eventhandler_tag eventhandler_register(struct eventhandler_list *list, const char *name, void *func, void *arg, int priority) { struct eventhandler_entry_generic *eg; /* allocate an entry for this handler, populate it */ eg = malloc(sizeof(struct eventhandler_entry_generic), M_EVENTHANDLER, M_WAITOK | M_ZERO); eg->func = func; eg->ee.ee_arg = arg; eg->ee.ee_priority = priority; return (eventhandler_register_internal(list, name, &eg->ee)); } #ifdef VIMAGE struct eventhandler_entry_generic_vimage { struct eventhandler_entry ee; vimage_iterator_func_t func; /* Vimage iterator function. */ struct eventhandler_entry_vimage v_ee; /* Original func, arg. 
*/ }; eventhandler_tag vimage_eventhandler_register(struct eventhandler_list *list, const char *name, void *func, void *arg, int priority, vimage_iterator_func_t iterfunc) { struct eventhandler_entry_generic_vimage *eg; /* allocate an entry for this handler, populate it */ eg = malloc(sizeof(struct eventhandler_entry_generic_vimage), M_EVENTHANDLER, M_WAITOK | M_ZERO); eg->func = iterfunc; eg->v_ee.func = func; eg->v_ee.ee_arg = arg; eg->ee.ee_arg = &eg->v_ee; eg->ee.ee_priority = priority; return (eventhandler_register_internal(list, name, &eg->ee)); } #endif static void _eventhandler_deregister(struct eventhandler_list *list, eventhandler_tag tag, bool wait) { struct eventhandler_entry *ep = tag; EHL_LOCK_ASSERT(list, MA_OWNED); if (ep != NULL) { /* remove just this entry */ if (list->el_runcount == 0) { CTR3(KTR_EVH, "%s: removing item %p from \"%s\"", __func__, ep, list->el_name); TAILQ_REMOVE(&list->el_entries, ep, ee_link); free(ep, M_EVENTHANDLER); } else { CTR3(KTR_EVH, "%s: marking item %p from \"%s\" as dead", __func__, ep, list->el_name); ep->ee_priority = EHE_DEAD_PRIORITY; } } else { /* remove entire list */ if (list->el_runcount == 0) { CTR2(KTR_EVH, "%s: removing all items from \"%s\"", __func__, list->el_name); while (!TAILQ_EMPTY(&list->el_entries)) { ep = TAILQ_FIRST(&list->el_entries); TAILQ_REMOVE(&list->el_entries, ep, ee_link); free(ep, M_EVENTHANDLER); } } else { CTR2(KTR_EVH, "%s: marking all items from \"%s\" as dead", __func__, list->el_name); TAILQ_FOREACH(ep, &list->el_entries, ee_link) ep->ee_priority = EHE_DEAD_PRIORITY; } } while (wait && list->el_runcount > 0) mtx_sleep(list, &list->el_lock, 0, "evhrm", 0); EHL_UNLOCK(list); } void eventhandler_deregister(struct eventhandler_list *list, eventhandler_tag tag) { _eventhandler_deregister(list, tag, true); } void eventhandler_deregister_nowait(struct eventhandler_list *list, eventhandler_tag tag) { _eventhandler_deregister(list, tag, false); } /* * Internal version for use when eventhandler list is already locked. */ static struct eventhandler_list * _eventhandler_find_list(const char *name) { struct eventhandler_list *list; mtx_assert(&eventhandler_mutex, MA_OWNED); TAILQ_FOREACH(list, &eventhandler_lists, el_link) { if (!strcmp(name, list->el_name)) break; } return (list); } /* * Lookup a "slow" list by name. Returns with the list locked. */ struct eventhandler_list * eventhandler_find_list(const char *name) { struct eventhandler_list *list; if (!eventhandler_lists_initted) return(NULL); /* scan looking for the requested list */ mtx_lock(&eventhandler_mutex); list = _eventhandler_find_list(name); if (list != NULL) EHL_LOCK(list); mtx_unlock(&eventhandler_mutex); return(list); } /* * Prune "dead" entries from an eventhandler list. */ void eventhandler_prune_list(struct eventhandler_list *list) { struct eventhandler_entry *ep, *en; int pruned = 0; CTR2(KTR_EVH, "%s: pruning list \"%s\"", __func__, list->el_name); EHL_LOCK_ASSERT(list, MA_OWNED); TAILQ_FOREACH_SAFE(ep, &list->el_entries, ee_link, en) { if (ep->ee_priority == EHE_DEAD_PRIORITY) { TAILQ_REMOVE(&list->el_entries, ep, ee_link); free(ep, M_EVENTHANDLER); pruned++; } } if (pruned > 0) wakeup(list); } /* * Create (or get the existing) list so the pointer can be stored by * EVENTHANDLER_LIST_DEFINE. 
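 * (Illustrative consumer, not from this file: handlers are normally
 * registered through the wrapper macros rather than by calling the
 * functions above directly, e.g. for the standard shutdown_post_sync
 * event:
 *
 *	static void
 *	my_shutdown(void *arg, int howto)
 *	{
 *		... flush driver state ...
 *	}
 *
 *	EVENTHANDLER_REGISTER(shutdown_post_sync, my_shutdown, NULL,
 *	    SHUTDOWN_PRI_DEFAULT);
 * )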
struct eventhandler_list * eventhandler_create_list(const char *name) { struct eventhandler_list *list; KASSERT(eventhandler_lists_initted, ("eventhandler list created too early")); mtx_lock(&eventhandler_mutex); list = eventhandler_find_or_create_list(name); mtx_unlock(&eventhandler_mutex); return (list); } Index: head/sys/kern/subr_fattime.c =================================================================== --- head/sys/kern/subr_fattime.c (revision 326270) +++ head/sys/kern/subr_fattime.c (revision 326271) @@ -1,307 +1,309 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2006 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * * Convert MS-DOS FAT format timestamps to and from unix timespecs * * FAT filestamps originally consisted of two 16 bit integers, encoded like * this: * * yyyyyyymmmmddddd (year - 1980, month, day) * * hhhhhmmmmmmsssss (hour, minutes, seconds divided by two) * * Subsequently even Microsoft realized that files could be accessed in less * than two seconds and a byte was added containing: * * sfffffff (second mod two, 100ths of second) * * FAT timestamps are in the local timezone, with no indication of which * timezone, much less whether daylight saving time applies. * * Later on again, in Windows NT, timestamps were defined relative to GMT. * * Purists will point out that UTC replaced GMT for such uses around * half a century ago, already then. Ironically "NT" was an abbreviation of * "New Technology". Anyway... * * The 'utc' argument determines whether the resulting FATTIME timestamp * should be on the UTC or local timezone calendar. * * The conversion functions below cut time into four-year leap-year * cycles rather than single years and use table lookups inside those * cycles to get the months and years sorted out. * * Obviously we cannot calculate the correct table index going from * a posix seconds count to Y/M/D, but we can get pretty close by * dividing the daycount by 32 (giving a too low index), and then * adjusting upwards a couple of steps if necessary. * * FAT timestamps have 7 bits for the year and start at 1980, so * they can represent up to 2107 which means that the non-leap-year * 2100 must be handled. * * XXX: As long as time_t is 32 bits this is not relevant or easily * XXX: testable.
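 * (Worked example, added for illustration: 2017-11-27 12:34:56 packs
 * into the two 16-bit fields described above as
 *
 *	date = (2017 - 1980) << 9 | 11 << 5 | 27 = 0x4b7b
 *	time = 12 << 11 | 34 << 5 | (56 / 2) = 0x645c
 *
 * with the odd second and the 100ths of a second carried in the extra
 * byte.)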
Revisit when time_t grows bigger. * XXX: grepfodder: 64 bit time_t, y2100, y2.1k, 2100, leap year * */ #include #include #include #include #define DAY (24 * 60 * 60) /* Length of day in seconds */ #define YEAR 365 /* Length of normal year */ #define LYC (4 * YEAR + 1) /* Length of 4 year leap-year cycle */ #define T1980 (10 * 365 + 2) /* Days from 1970 to 1980 */ /* End of month is N days from start of (normal) year */ #define JAN 31 #define FEB (JAN + 28) #define MAR (FEB + 31) #define APR (MAR + 30) #define MAY (APR + 31) #define JUN (MAY + 30) #define JUL (JUN + 31) #define AUG (JUL + 31) #define SEP (AUG + 30) #define OCT (SEP + 31) #define NOV (OCT + 30) #define DEC (NOV + 31) /* Table of months in a 4 year leap-year cycle */ #define ENC(y,m) (((y) << 9) | ((m) << 5)) static const struct { uint16_t days; /* month start in days relative to cycle */ uint16_t coded; /* encoded year + month information */ } mtab[48] = { { 0 + 0 * YEAR, ENC(0, 1) }, { JAN + 0 * YEAR, ENC(0, 2) }, { FEB + 0 * YEAR + 1, ENC(0, 3) }, { MAR + 0 * YEAR + 1, ENC(0, 4) }, { APR + 0 * YEAR + 1, ENC(0, 5) }, { MAY + 0 * YEAR + 1, ENC(0, 6) }, { JUN + 0 * YEAR + 1, ENC(0, 7) }, { JUL + 0 * YEAR + 1, ENC(0, 8) }, { AUG + 0 * YEAR + 1, ENC(0, 9) }, { SEP + 0 * YEAR + 1, ENC(0, 10) }, { OCT + 0 * YEAR + 1, ENC(0, 11) }, { NOV + 0 * YEAR + 1, ENC(0, 12) }, { DEC + 0 * YEAR + 1, ENC(1, 1) }, { JAN + 1 * YEAR + 1, ENC(1, 2) }, { FEB + 1 * YEAR + 1, ENC(1, 3) }, { MAR + 1 * YEAR + 1, ENC(1, 4) }, { APR + 1 * YEAR + 1, ENC(1, 5) }, { MAY + 1 * YEAR + 1, ENC(1, 6) }, { JUN + 1 * YEAR + 1, ENC(1, 7) }, { JUL + 1 * YEAR + 1, ENC(1, 8) }, { AUG + 1 * YEAR + 1, ENC(1, 9) }, { SEP + 1 * YEAR + 1, ENC(1, 10) }, { OCT + 1 * YEAR + 1, ENC(1, 11) }, { NOV + 1 * YEAR + 1, ENC(1, 12) }, { DEC + 1 * YEAR + 1, ENC(2, 1) }, { JAN + 2 * YEAR + 1, ENC(2, 2) }, { FEB + 2 * YEAR + 1, ENC(2, 3) }, { MAR + 2 * YEAR + 1, ENC(2, 4) }, { APR + 2 * YEAR + 1, ENC(2, 5) }, { MAY + 2 * YEAR + 1, ENC(2, 6) }, { JUN + 2 * YEAR + 1, ENC(2, 7) }, { JUL + 2 * YEAR + 1, ENC(2, 8) }, { AUG + 2 * YEAR + 1, ENC(2, 9) }, { SEP + 2 * YEAR + 1, ENC(2, 10) }, { OCT + 2 * YEAR + 1, ENC(2, 11) }, { NOV + 2 * YEAR + 1, ENC(2, 12) }, { DEC + 2 * YEAR + 1, ENC(3, 1) }, { JAN + 3 * YEAR + 1, ENC(3, 2) }, { FEB + 3 * YEAR + 1, ENC(3, 3) }, { MAR + 3 * YEAR + 1, ENC(3, 4) }, { APR + 3 * YEAR + 1, ENC(3, 5) }, { MAY + 3 * YEAR + 1, ENC(3, 6) }, { JUN + 3 * YEAR + 1, ENC(3, 7) }, { JUL + 3 * YEAR + 1, ENC(3, 8) }, { AUG + 3 * YEAR + 1, ENC(3, 9) }, { SEP + 3 * YEAR + 1, ENC(3, 10) }, { OCT + 3 * YEAR + 1, ENC(3, 11) }, { NOV + 3 * YEAR + 1, ENC(3, 12) } }; void timespec2fattime(struct timespec *tsp, int utc, uint16_t *ddp, uint16_t *dtp, uint8_t *dhp) { time_t t1; unsigned t2, l, m; t1 = tsp->tv_sec; if (!utc) t1 -= utc_offset(); if (dhp != NULL) *dhp = (tsp->tv_sec & 1) * 100 + tsp->tv_nsec / 10000000; if (dtp != NULL) { *dtp = (t1 / 2) % 30; *dtp |= ((t1 / 60) % 60) << 5; *dtp |= ((t1 / 3600) % 24) << 11; } if (ddp != NULL) { t2 = t1 / DAY; if (t2 < T1980) { /* Impossible date, truncate to 1980-01-01 */ *ddp = 0x0021; } else { t2 -= T1980; /* * 2100 is not a leap year. * XXX: a 32 bit time_t can not get us here. 
*/ if (t2 >= ((2100 - 1980) / 4 * LYC + FEB)) t2++; /* Account for full leapyear cycles */ l = t2 / LYC; *ddp = (l * 4) << 9; t2 -= l * LYC; /* Find approximate table entry */ m = t2 / 32; /* Find correct table entry */ while (m < 47 && mtab[m + 1].days <= t2) m++; /* Get year + month from the table */ *ddp += mtab[m].coded; /* And apply the day in the month */ t2 -= mtab[m].days - 1; *ddp |= t2; } } } /* * Table indexed by the bottom two bits of year + four bits of the month * from the FAT timestamp, returning number of days into 4 year long * leap-year cycle */ #define DCOD(m, y, l) ((m) + YEAR * (y) + (l)) static const uint16_t daytab[64] = { 0, DCOD( 0, 0, 0), DCOD(JAN, 0, 0), DCOD(FEB, 0, 1), DCOD(MAR, 0, 1), DCOD(APR, 0, 1), DCOD(MAY, 0, 1), DCOD(JUN, 0, 1), DCOD(JUL, 0, 1), DCOD(AUG, 0, 1), DCOD(SEP, 0, 1), DCOD(OCT, 0, 1), DCOD(NOV, 0, 1), DCOD(DEC, 0, 1), 0, 0, 0, DCOD( 0, 1, 1), DCOD(JAN, 1, 1), DCOD(FEB, 1, 1), DCOD(MAR, 1, 1), DCOD(APR, 1, 1), DCOD(MAY, 1, 1), DCOD(JUN, 1, 1), DCOD(JUL, 1, 1), DCOD(AUG, 1, 1), DCOD(SEP, 1, 1), DCOD(OCT, 1, 1), DCOD(NOV, 1, 1), DCOD(DEC, 1, 1), 0, 0, 0, DCOD( 0, 2, 1), DCOD(JAN, 2, 1), DCOD(FEB, 2, 1), DCOD(MAR, 2, 1), DCOD(APR, 2, 1), DCOD(MAY, 2, 1), DCOD(JUN, 2, 1), DCOD(JUL, 2, 1), DCOD(AUG, 2, 1), DCOD(SEP, 2, 1), DCOD(OCT, 2, 1), DCOD(NOV, 2, 1), DCOD(DEC, 2, 1), 0, 0, 0, DCOD( 0, 3, 1), DCOD(JAN, 3, 1), DCOD(FEB, 3, 1), DCOD(MAR, 3, 1), DCOD(APR, 3, 1), DCOD(MAY, 3, 1), DCOD(JUN, 3, 1), DCOD(JUL, 3, 1), DCOD(AUG, 3, 1), DCOD(SEP, 3, 1), DCOD(OCT, 3, 1), DCOD(NOV, 3, 1), DCOD(DEC, 3, 1), 0, 0 }; void fattime2timespec(unsigned dd, unsigned dt, unsigned dh, int utc, struct timespec *tsp) { unsigned day; /* Unpack time fields */ tsp->tv_sec = (dt & 0x1f) << 1; tsp->tv_sec += ((dt & 0x7e0) >> 5) * 60; tsp->tv_sec += ((dt & 0xf800) >> 11) * 3600; tsp->tv_sec += dh / 100; tsp->tv_nsec = (dh % 100) * 10000000; /* Day of month */ day = (dd & 0x1f) - 1; /* Full leap-year cycles */ day += LYC * ((dd >> 11) & 0x1f); /* Month offset from leap-year cycle */ day += daytab[(dd >> 5) & 0x3f]; /* * 2100 is not a leap year. * XXX: a 32 bit time_t can not get us here. 
*/ if (day >= ((2100 - 1980) / 4 * LYC + FEB)) day--; /* Align with time_t epoch */ day += T1980; tsp->tv_sec += DAY * day; if (!utc) tsp->tv_sec += utc_offset(); } #ifdef TEST_DRIVER #include #include #include int main(int argc __unused, char **argv __unused) { int i; struct timespec ts; struct tm tm; double a; uint16_t d, t; uint8_t p; char buf[100]; for (i = 0; i < 10000; i++) { do { ts.tv_sec = random(); } while (ts.tv_sec < T1980 * 86400); ts.tv_nsec = random() % 1000000000; printf("%10jd.%03ld -- ", (intmax_t)ts.tv_sec, ts.tv_nsec / 1000000); gmtime_r(&ts.tv_sec, &tm); strftime(buf, sizeof buf, "%Y %m %d %H %M %S", &tm); printf("%s -- ", buf); a = ts.tv_sec + ts.tv_nsec * 1e-9; d = t = p = 0; timespec2fattime(&ts, 1, &d, &t, &p); printf("%04x %04x %02x -- ", d, t, p); printf("%3d %02d %02d %02d %02d %02d -- ", ((d >> 9) & 0x7f) + 1980, (d >> 5) & 0x0f, (d >> 0) & 0x1f, (t >> 11) & 0x1f, (t >> 5) & 0x3f, ((t >> 0) & 0x1f) * 2); ts.tv_sec = ts.tv_nsec = 0; fattime2timespec(d, t, p, 1, &ts); printf("%10jd.%03ld == ", (intmax_t)ts.tv_sec, ts.tv_nsec / 1000000); gmtime_r(&ts.tv_sec, &tm); strftime(buf, sizeof buf, "%Y %m %d %H %M %S", &tm); printf("%s -- ", buf); a -= ts.tv_sec + ts.tv_nsec * 1e-9; printf("%.3f", a); printf("\n"); } return (0); } #endif /* TEST_DRIVER */ Index: head/sys/kern/subr_firmware.c =================================================================== --- head/sys/kern/subr_firmware.c (revision 326270) +++ head/sys/kern/subr_firmware.c (revision 326271) @@ -1,526 +1,528 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005-2008, Sam Leffler * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Loadable firmware support. See sys/sys/firmware.h and firmware(9) * for more details on the subsystem. * * 'struct firmware' is the user-visible part of the firmware table. * Additional internal information is stored in a 'struct priv_fw' * (currently a static array). A slot is in use if FW_INUSE is true: */ #define FW_INUSE(p) ((p)->file != NULL || (p)->fw.name != NULL) /* * fw.name != NULL when an image is registered; file != NULL for * autoloaded images whose handling has not been completed.
* * The state of a slot evolves as follows: * firmware_register --> fw.name = image_name * (autoloaded image) --> file = module reference * firmware_unregister --> fw.name = NULL * (unloadentry complete) --> file = NULL * * In order for the above to work, the 'file' field must remain * unchanged in firmware_unregister(). * * Images residing in the same module are linked to each other * through the 'parent' argument of firmware_register(). * One image (typically, one with the same name as the module to let * the autoloading mechanism work) is considered the parent image for * all other images in the same module. Children affect the refcount * on the parent image preventing improper unloading of the image itself. */ struct priv_fw { int refcnt; /* reference count */ /* * parent entry, see above. Set on firmware_register(), * cleared on firmware_unregister(). */ struct priv_fw *parent; int flags; /* record FIRMWARE_UNLOAD requests */ #define FW_UNLOAD 0x100 /* * 'file' is private info managed by the autoload/unload code. * Set at the end of firmware_get(), cleared only in the * firmware_unload_task, so the latter can depend on its value even * while the lock is not held. */ linker_file_t file; /* module file, if autoloaded */ /* * 'fw' is the externally visible image information. * We do not make it the first field in priv_fw, to avoid the * temptation of casting pointers to each other. * Use PRIV_FW(fw) to get a pointer to the container of fw. * Beware, PRIV_FW does not work for a NULL pointer. */ struct firmware fw; /* externally visible information */ }; /* * PRIV_FW returns the pointer to the container of struct firmware *x. * Cast to intptr_t to override the 'const' attribute of x */ #define PRIV_FW(x) ((struct priv_fw *) \ ((intptr_t)(x) - offsetof(struct priv_fw, fw)) ) /* * At the moment we use a static array as backing store for the registry. * Should we move to a dynamic structure, keep in mind that we cannot * reallocate the array because pointers are held externally. * A list may work, though. */ #define FIRMWARE_MAX 50 static struct priv_fw firmware_table[FIRMWARE_MAX]; /* * Firmware module operations are handled in a separate task as they * might sleep and they require directory context to do i/o. */ static struct taskqueue *firmware_tq; static struct task firmware_unload_task; /* * This mutex protects accesses to the firmware table. */ static struct mtx firmware_mtx; MTX_SYSINIT(firmware, &firmware_mtx, "firmware table", MTX_DEF); /* * Helper function to look up a name. * As a side effect, it sets the pointer to a free slot, if any. * This way we can concentrate most of the registry scanning in * this function, which makes it easier to replace the registry * with some other data structure. */ static struct priv_fw * lookup(const char *name, struct priv_fw **empty_slot) { struct priv_fw *fp = NULL; struct priv_fw *dummy; int i; if (empty_slot == NULL) empty_slot = &dummy; *empty_slot = NULL; for (i = 0; i < FIRMWARE_MAX; i++) { fp = &firmware_table[i]; if (fp->fw.name != NULL && strcasecmp(name, fp->fw.name) == 0) break; else if (!FW_INUSE(fp)) *empty_slot = fp; } return (i < FIRMWARE_MAX) ? fp : NULL; } /* * Register a firmware image with the specified name. The * image name must not already be registered. If this is a * subimage then parent refers to a previously registered * image that this should be associated with.
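 * (Illustrative, with made-up image names: a module "wifw" could
 * register a parent image and a subimage as
 *
 *	parent = firmware_register("wifw", data0, size0, 1, NULL);
 *	(void)firmware_register("wifw_init", data1, size1, 1, parent);
 *
 * and a driver would later consume and release an image with
 *
 *	fw = firmware_get("wifw");
 *	...
 *	firmware_put(fw, FIRMWARE_UNLOAD);
 * )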
*/ const struct firmware * firmware_register(const char *imagename, const void *data, size_t datasize, unsigned int version, const struct firmware *parent) { struct priv_fw *match, *frp; char *str; str = strdup(imagename, M_TEMP); mtx_lock(&firmware_mtx); /* * Do a lookup to make sure the name is unique or find a free slot. */ match = lookup(imagename, &frp); if (match != NULL) { mtx_unlock(&firmware_mtx); printf("%s: image %s already registered!\n", __func__, imagename); free(str, M_TEMP); return NULL; } if (frp == NULL) { mtx_unlock(&firmware_mtx); printf("%s: cannot register image %s, firmware table full!\n", __func__, imagename); free(str, M_TEMP); return NULL; } bzero(frp, sizeof(*frp)); /* start from a clean record */ frp->fw.name = str; frp->fw.data = data; frp->fw.datasize = datasize; frp->fw.version = version; if (parent != NULL) frp->parent = PRIV_FW(parent); mtx_unlock(&firmware_mtx); if (bootverbose) printf("firmware: '%s' version %u: %zu bytes loaded at %p\n", imagename, version, datasize, data); return &frp->fw; } /* * Unregister/remove a firmware image. If there are outstanding * references an error is returned and the image is not removed * from the registry. */ int firmware_unregister(const char *imagename) { struct priv_fw *fp; int err; mtx_lock(&firmware_mtx); fp = lookup(imagename, NULL); if (fp == NULL) { /* * It is ok for the lookup to fail; this can happen * when a module is unloaded on last reference and the * module unload handler unregisters each of its * firmware images. */ err = 0; } else if (fp->refcnt != 0) { /* cannot unregister */ err = EBUSY; } else { linker_file_t x = fp->file; /* save value */ /* * Clear the whole entry with bzero to make sure we * do not forget anything. Then restore 'file' which is * non-null for autoloaded images. */ free((void *) (uintptr_t) fp->fw.name, M_TEMP); bzero(fp, sizeof(struct priv_fw)); fp->file = x; err = 0; } mtx_unlock(&firmware_mtx); return err; } static void loadimage(void *arg, int npending) { struct thread *td = curthread; char *imagename = arg; struct priv_fw *fp; linker_file_t result; int error; /* synchronize with the thread that dispatched us */ mtx_lock(&firmware_mtx); mtx_unlock(&firmware_mtx); if (td->td_proc->p_fd->fd_rdir == NULL) { printf("%s: root not mounted yet, no way to load image\n", imagename); goto done; } error = linker_reference_module(imagename, NULL, &result); if (error != 0) { printf("%s: could not load firmware image, error %d\n", imagename, error); goto done; } mtx_lock(&firmware_mtx); fp = lookup(imagename, NULL); if (fp == NULL || fp->file != NULL) { mtx_unlock(&firmware_mtx); if (fp == NULL) printf("%s: firmware image loaded, " "but did not register\n", imagename); (void) linker_release_module(imagename, NULL, NULL); goto done; } fp->file = result; /* record the module identity */ mtx_unlock(&firmware_mtx); done: wakeup_one(imagename); /* we're done */ } /* * Look up and potentially load the specified firmware image. * If the firmware is not found in the registry, try to load a kernel * module named as the image name. * If the firmware is located, a reference is returned. The caller must * release this reference for the image to be eligible for removal/unload. */ const struct firmware * firmware_get(const char *imagename) { struct task fwload_task; struct thread *td; struct priv_fw *fp; mtx_lock(&firmware_mtx); fp = lookup(imagename, NULL); if (fp != NULL) goto found; /* * Image not present, try to load the module holding it.
*/ td = curthread; if (priv_check(td, PRIV_FIRMWARE_LOAD) != 0 || securelevel_gt(td->td_ucred, 0) != 0) { mtx_unlock(&firmware_mtx); printf("%s: insufficient privileges to " "load firmware image %s\n", __func__, imagename); return NULL; } /* * Defer load to a thread with known context. linker_reference_module * may do filesystem i/o which requires root & current dirs, etc. * Also, we must not hold any mutexes across this call, which is problematic. */ if (!cold) { TASK_INIT(&fwload_task, 0, loadimage, __DECONST(void *, imagename)); taskqueue_enqueue(firmware_tq, &fwload_task); msleep(__DECONST(void *, imagename), &firmware_mtx, 0, "fwload", 0); } /* * After attempting to load the module, see if the image is registered. */ fp = lookup(imagename, NULL); if (fp == NULL) { mtx_unlock(&firmware_mtx); return NULL; } found: /* common exit point on success */ if (fp->refcnt == 0 && fp->parent != NULL) fp->parent->refcnt++; fp->refcnt++; mtx_unlock(&firmware_mtx); return &fp->fw; } /* * Release a reference to a firmware image returned by firmware_get. * The caller may specify, with the FIRMWARE_UNLOAD flag, its desire * to release the resource, but the flag is only advisory. * * If this is the last reference to the firmware image, and this is an * autoloaded module, wake up the firmware_unload_task to figure out * what to do with the associated module. */ void firmware_put(const struct firmware *p, int flags) { struct priv_fw *fp = PRIV_FW(p); mtx_lock(&firmware_mtx); fp->refcnt--; if (fp->refcnt == 0) { if (fp->parent != NULL) fp->parent->refcnt--; if (flags & FIRMWARE_UNLOAD) fp->flags |= FW_UNLOAD; if (fp->file) taskqueue_enqueue(firmware_tq, &firmware_unload_task); } mtx_unlock(&firmware_mtx); } /* * Set up directory state for the firmware_tq thread so we can do i/o. */ static void set_rootvnode(void *arg, int npending) { pwd_ensure_dirs(); free(arg, M_TEMP); } /* * Event handler called on mounting of /; bounce a task * into the task queue thread to set up its directories. */ static void firmware_mountroot(void *arg) { struct task *setroot_task; setroot_task = malloc(sizeof(struct task), M_TEMP, M_NOWAIT); if (setroot_task != NULL) { TASK_INIT(setroot_task, 0, set_rootvnode, setroot_task); taskqueue_enqueue(firmware_tq, setroot_task); } else printf("%s: no memory for task!\n", __func__); } EVENTHANDLER_DEFINE(mountroot, firmware_mountroot, NULL, 0); /* * The body of the task in charge of unloading autoloaded modules * that are not needed anymore. * Images can be cross-linked so we may need to make multiple passes, * but the time we spend in the loop is bounded because we clear entries * as we touch them. */ static void unloadentry(void *unused1, int unused2) { int limit = FIRMWARE_MAX; int i; /* current cycle */ mtx_lock(&firmware_mtx); /* * Scan the table. limit is set to make sure we make another * full sweep after matching an entry that requires unloading. */ for (i = 0; i < limit; i++) { struct priv_fw *fp; int err; fp = &firmware_table[i % FIRMWARE_MAX]; if (fp->fw.name == NULL || fp->file == NULL || fp->refcnt != 0 || (fp->flags & FW_UNLOAD) == 0) continue; /* * Found an entry. Now: * 1. bump up limit to make sure we make another full round; * 2. clear FW_UNLOAD so we don't try this entry again; * 3. release the lock while trying to unload the module. * 'file' remains set so that the entry cannot be reused * in the meantime (it also means that fp->file will * not change while we release the lock).
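Stepping back to the public API for a moment: firmware_get() and firmware_put() above pair up in a consumer roughly as in this hypothetical driver sketch (names are made up, the device-specific upload is elided):

/* Hypothetical consumer, e.g. a driver's attach/detach paths. */
static const struct firmware *myfw_ref;

static int
mydrv_load_fw(void)
{
	/* May sleep while myfw.ko is autoloaded via the taskqueue. */
	myfw_ref = firmware_get("myfw");
	if (myfw_ref == NULL)
		return (ENOENT);
	/* ... copy myfw_ref->data (myfw_ref->datasize bytes) to the device ... */
	return (0);
}

static void
mydrv_unload_fw(void)
{
	/* Advisory flag: also try to unload the backing module. */
	firmware_put(myfw_ref, FIRMWARE_UNLOAD);
}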
*/ limit = i + FIRMWARE_MAX; /* make another full round */ fp->flags &= ~FW_UNLOAD; /* do not try again */ mtx_unlock(&firmware_mtx); err = linker_release_module(NULL, NULL, fp->file); mtx_lock(&firmware_mtx); /* * We rely on the module to call firmware_unregister() * on unload to actually release the entry. * If err = 0 we can drop our reference as the system * accepted it. Otherwise unloading failed (e.g. the * module itself gave an error) so our reference is * still valid. */ if (err == 0) fp->file = NULL; } mtx_unlock(&firmware_mtx); } /* * Module glue. */ static int firmware_modevent(module_t mod, int type, void *unused) { struct priv_fw *fp; int i, err; switch (type) { case MOD_LOAD: TASK_INIT(&firmware_unload_task, 0, unloadentry, NULL); firmware_tq = taskqueue_create("taskqueue_firmware", M_WAITOK, taskqueue_thread_enqueue, &firmware_tq); /* NB: use our own loop routine that sets up context */ (void) taskqueue_start_threads(&firmware_tq, 1, PWAIT, "firmware taskq"); if (rootvnode != NULL) { /* * Root is already mounted so we won't get an event; * simulate one here. */ firmware_mountroot(NULL); } return 0; case MOD_UNLOAD: /* request all autoloaded modules to be released */ mtx_lock(&firmware_mtx); for (i = 0; i < FIRMWARE_MAX; i++) { fp = &firmware_table[i]; fp->flags |= FW_UNLOAD; } mtx_unlock(&firmware_mtx); taskqueue_enqueue(firmware_tq, &firmware_unload_task); taskqueue_drain(firmware_tq, &firmware_unload_task); err = 0; for (i = 0; i < FIRMWARE_MAX; i++) { fp = &firmware_table[i]; if (fp->fw.name != NULL) { printf("%s: image %p ref %d still active slot %d\n", __func__, fp->fw.name, fp->refcnt, i); err = EINVAL; } } if (err == 0) taskqueue_free(firmware_tq); return err; } return EINVAL; } static moduledata_t firmware_mod = { "firmware", firmware_modevent, NULL }; DECLARE_MODULE(firmware, firmware_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); MODULE_VERSION(firmware, 1); Index: head/sys/kern/subr_hints.c =================================================================== --- head/sys/kern/subr_hints.c (revision 326270) +++ head/sys/kern/subr_hints.c (revision 326271) @@ -1,491 +1,493 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2000,2001 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include /* * Access functions for device resources. */ static int checkmethod = 1; static int use_kenv; static char *hintp; /* * Define the kern.hintmode sysctl, which only accepts the value 2, causing a * switch from static KENV mode to dynamic KENV. So systems that have hints * compiled into the kernel will be able to see/modify the KENV (and hints too). */ static int sysctl_hintmode(SYSCTL_HANDLER_ARGS) { const char *cp; char *line, *eq; int eqidx, error, from_kenv, i, value; from_kenv = 0; cp = kern_envp; value = hintmode; /* Fetch candidate for new hintmode value */ error = sysctl_handle_int(oidp, &value, 0, req); if (error || req->newptr == NULL) return (error); if (value != 2) /* Only accept switching to hintmode 2 */ return (EINVAL); /* Migrate from static to dynamic hints */ switch (hintmode) { case 0: if (dynamic_kenv) { /* * Already dynamic. Assign hintmode 2 so we do not * check again in the future. */ hintmode = 2; return (0); } from_kenv = 1; cp = kern_envp; break; case 1: cp = static_hints; break; case 2: /* Nothing to do, hintmode already 2 */ return (0); } while (cp) { i = strlen(cp); if (i == 0) break; eq = strchr(cp, '='); if ((from_kenv && strncmp(cp, "hint.", 5) != 0) || eq == NULL) { /* Skip non-hint kenv entries and malformed lines; always advance or we would loop forever. */ cp += i + 1; continue; } eqidx = eq - cp; line = malloc(i + 1, M_TEMP, M_WAITOK); strcpy(line, cp); line[eqidx] = '\0'; kern_setenv(line, line + eqidx + 1); free(line, M_TEMP); cp += i + 1; } hintmode = value; use_kenv = 1; return (0); } SYSCTL_PROC(_kern, OID_AUTO, hintmode, CTLTYPE_INT|CTLFLAG_RW, &hintmode, 0, sysctl_hintmode, "I", "Get/set current hintmode"); /* * Evil wildcarding resource string lookup. * This walks the supplied env string table and returns a match. * The start point can be remembered for incremental searches.
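The migration loop above just splits each name=value hint string at the first '=' and re-enters it into the dynamic kenv. A standalone model of that split; the example hint is made up:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	const char *cp = "hint.uart.0.irq=4";	/* example hint, made up */
	const char *eq;
	char *line;

	eq = strchr(cp, '=');
	if (eq == NULL)
		return (1);			/* malformed: skip it */
	line = strdup(cp);
	line[eq - cp] = '\0';			/* split name from value */
	printf("kern_setenv(\"%s\", \"%s\")\n", line, line + (eq - cp) + 1);
	free(line);
	return (0);
}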
*/ static int res_find(int *line, int *startln, const char *name, int *unit, const char *resname, const char *value, const char **ret_name, int *ret_namelen, int *ret_unit, const char **ret_resname, int *ret_resnamelen, const char **ret_value) { int n = 0, hit, i = 0; char r_name[32]; int r_unit; char r_resname[32]; char r_value[128]; const char *s, *cp; char *p; if (checkmethod) { hintp = NULL; switch (hintmode) { case 0: /* loader hints in environment only */ break; case 1: /* static hints only */ hintp = static_hints; checkmethod = 0; break; case 2: /* fallback mode */ if (dynamic_kenv) { mtx_lock(&kenv_lock); cp = kenvp[0]; for (i = 0; cp != NULL; cp = kenvp[++i]) { if (!strncmp(cp, "hint.", 5)) { use_kenv = 1; checkmethod = 0; break; } } mtx_unlock(&kenv_lock); } else { cp = kern_envp; while (cp) { if (strncmp(cp, "hint.", 5) == 0) { cp = NULL; hintp = kern_envp; break; } while (*cp != '\0') cp++; cp++; if (*cp == '\0') { cp = NULL; hintp = static_hints; break; } } } break; default: break; } if (hintp == NULL) { if (dynamic_kenv) { use_kenv = 1; checkmethod = 0; } else hintp = kern_envp; } } if (use_kenv) { mtx_lock(&kenv_lock); i = 0; cp = kenvp[0]; if (cp == NULL) { mtx_unlock(&kenv_lock); return (ENOENT); } } else cp = hintp; while (cp) { hit = 1; (*line)++; if (strncmp(cp, "hint.", 5) != 0) hit = 0; else n = sscanf(cp, "hint.%32[^.].%d.%32[^=]=%127s", r_name, &r_unit, r_resname, r_value); if (hit && n != 4) { printf("CONFIG: invalid hint '%s'\n", cp); p = strchr(cp, 'h'); *p = 'H'; hit = 0; } if (hit && startln && *startln >= 0 && *line < *startln) hit = 0; if (hit && name && strcmp(name, r_name) != 0) hit = 0; if (hit && unit && *unit != r_unit) hit = 0; if (hit && resname && strcmp(resname, r_resname) != 0) hit = 0; if (hit && value && strcmp(value, r_value) != 0) hit = 0; if (hit) break; if (use_kenv) { cp = kenvp[++i]; if (cp == NULL) break; } else { while (*cp != '\0') cp++; cp++; if (*cp == '\0') { cp = NULL; break; } } } if (use_kenv) mtx_unlock(&kenv_lock); if (cp == NULL) return ENOENT; s = cp; /* This is a bit of a hack, but at least is reentrant */ /* Note that it returns some !unterminated! strings. */ s = strchr(s, '.') + 1; /* start of device */ if (ret_name) *ret_name = s; s = strchr(s, '.') + 1; /* start of unit */ if (ret_namelen && ret_name) *ret_namelen = s - *ret_name - 1; /* device length */ if (ret_unit) *ret_unit = r_unit; s = strchr(s, '.') + 1; /* start of resname */ if (ret_resname) *ret_resname = s; s = strchr(s, '=') + 1; /* start of value */ if (ret_resnamelen && ret_resname) *ret_resnamelen = s - *ret_resname - 1; /* value len */ if (ret_value) *ret_value = s; if (startln) /* line number for anchor */ *startln = *line + 1; return 0; } /* * Search all the data sources for matches to our query. We look for * dynamic hints first as overrides for static or fallback hints. 
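The sscanf() pattern inside res_find() is effectively the whole hint grammar: driver name, unit number, resource name, value. A userland check of what it accepts; note the destination buffers here reserve an extra byte, since a %32[...] conversion can store 32 characters plus the terminating NUL:

#include <stdio.h>

int main(void)
{
	char name[33], resname[33], value[128];
	int n, unit;

	n = sscanf("hint.uart.0.flags=0x10", "hint.%32[^.].%d.%32[^=]=%127s",
	    name, &unit, resname, value);
	if (n == 4)	/* all four fields parsed: a well-formed hint */
		printf("%s unit %d: %s = %s\n", name, unit, resname, value);
	return (0);
}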
*/ static int resource_find(int *line, int *startln, const char *name, int *unit, const char *resname, const char *value, const char **ret_name, int *ret_namelen, int *ret_unit, const char **ret_resname, int *ret_resnamelen, const char **ret_value) { int i; int un; *line = 0; /* Search for exact unit matches first */ i = res_find(line, startln, name, unit, resname, value, ret_name, ret_namelen, ret_unit, ret_resname, ret_resnamelen, ret_value); if (i == 0) return 0; if (unit == NULL) return ENOENT; /* If we are still here, search for wildcard matches */ un = -1; i = res_find(line, startln, name, &un, resname, value, ret_name, ret_namelen, ret_unit, ret_resname, ret_resnamelen, ret_value); if (i == 0) return 0; return ENOENT; } int resource_int_value(const char *name, int unit, const char *resname, int *result) { int error; const char *str; char *op; unsigned long val; int line; line = 0; error = resource_find(&line, NULL, name, &unit, resname, NULL, NULL, NULL, NULL, NULL, NULL, &str); if (error) return error; if (*str == '\0') return EFTYPE; val = strtoul(str, &op, 0); if (*op != '\0') return EFTYPE; *result = val; return 0; } int resource_long_value(const char *name, int unit, const char *resname, long *result) { int error; const char *str; char *op; unsigned long val; int line; line = 0; error = resource_find(&line, NULL, name, &unit, resname, NULL, NULL, NULL, NULL, NULL, NULL, &str); if (error) return error; if (*str == '\0') return EFTYPE; val = strtoul(str, &op, 0); if (*op != '\0') return EFTYPE; *result = val; return 0; } int resource_string_value(const char *name, int unit, const char *resname, const char **result) { int error; const char *str; int line; line = 0; error = resource_find(&line, NULL, name, &unit, resname, NULL, NULL, NULL, NULL, NULL, NULL, &str); if (error) return error; *result = str; return 0; } /* * This is a bit nasty, but allows us to not modify the env strings. */ static const char * resource_string_copy(const char *s, int len) { static char stringbuf[256]; static int offset = 0; const char *ret; if (len == 0) len = strlen(s); if (len > 255) return NULL; if ((offset + len + 1) > 255) offset = 0; bcopy(s, &stringbuf[offset], len); stringbuf[offset + len] = '\0'; ret = &stringbuf[offset]; offset += len + 1; return ret; } /* * err = resource_find_match(&anchor, &name, &unit, resname, value) * Iteratively fetch a list of devices wired "at" something * res and value are restrictions. eg: "at", "scbus0". * For practical purposes, res = required, value = optional. * *name and *unit are set. * set *anchor to zero before starting. */ int resource_find_match(int *anchor, const char **name, int *unit, const char *resname, const char *value) { const char *found_name; int found_namelen; int found_unit; int ret; int newln; newln = *anchor; ret = resource_find(anchor, &newln, NULL, NULL, resname, value, &found_name, &found_namelen, &found_unit, NULL, NULL, NULL); if (ret == 0) { *name = resource_string_copy(found_name, found_namelen); *unit = found_unit; } *anchor = newln; return ret; } /* * err = resource_find_dev(&anchor, name, &unit, res, value); * Iterate through a list of devices, returning their unit numbers. * res and value are optional restrictions. eg: "at", "scbus0". * *unit is set to the value. * set *anchor to zero before starting. 
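Typical driver-side use of the accessors above, sketched with a hypothetical device name:

/* Hypothetical probe path consulting hints for "mydev". */
static void
mydev_report_hints(int unit)
{
	const char *at;
	int irq;

	if (resource_int_value("mydev", unit, "irq", &irq) == 0)
		printf("mydev%d: wired to irq %d\n", unit, irq);
	if (resource_string_value("mydev", unit, "at", &at) == 0)
		printf("mydev%d: hinted at %s\n", unit, at);
}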
*/ int resource_find_dev(int *anchor, const char *name, int *unit, const char *resname, const char *value) { int found_unit; int newln; int ret; newln = *anchor; ret = resource_find(anchor, &newln, name, NULL, resname, value, NULL, NULL, &found_unit, NULL, NULL, NULL); if (ret == 0) { *unit = found_unit; } *anchor = newln; return ret; } /* * Check to see if a device is disabled via a disabled hint. */ int resource_disabled(const char *name, int unit) { int error, value; error = resource_int_value(name, unit, "disabled", &value); if (error) return (0); return (value); } /* * Clear a value associated with a device by removing it from * the kernel environment. This only removes a hint for an * exact unit. */ int resource_unset_value(const char *name, int unit, const char *resname) { char varname[128]; const char *retname, *retvalue; int error, line; size_t len; line = 0; error = resource_find(&line, NULL, name, &unit, resname, NULL, &retname, NULL, NULL, NULL, NULL, &retvalue); if (error) return (error); retname -= strlen("hint."); len = retvalue - retname - 1; if (len > sizeof(varname) - 1) return (ENAMETOOLONG); memcpy(varname, retname, len); varname[len] = '\0'; return (kern_unsetenv(varname)); } Index: head/sys/kern/subr_kdb.c =================================================================== --- head/sys/kern/subr_kdb.c (revision 326270) +++ head/sys/kern/subr_kdb.c (revision 326271) @@ -1,675 +1,677 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004 The FreeBSD Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_kdb.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif u_char __read_frequently kdb_active = 0; static void *kdb_jmpbufp = NULL; struct kdb_dbbe *kdb_dbbe = NULL; static struct pcb kdb_pcb; struct pcb *kdb_thrctx = NULL; struct thread *kdb_thread = NULL; struct trapframe *kdb_frame = NULL; #ifdef BREAK_TO_DEBUGGER #define KDB_BREAK_TO_DEBUGGER 1 #else #define KDB_BREAK_TO_DEBUGGER 0 #endif #ifdef ALT_BREAK_TO_DEBUGGER #define KDB_ALT_BREAK_TO_DEBUGGER 1 #else #define KDB_ALT_BREAK_TO_DEBUGGER 0 #endif static int kdb_break_to_debugger = KDB_BREAK_TO_DEBUGGER; static int kdb_alt_break_to_debugger = KDB_ALT_BREAK_TO_DEBUGGER; KDB_BACKEND(null, NULL, NULL, NULL, NULL); SET_DECLARE(kdb_dbbe_set, struct kdb_dbbe); static int kdb_sysctl_available(SYSCTL_HANDLER_ARGS); static int kdb_sysctl_current(SYSCTL_HANDLER_ARGS); static int kdb_sysctl_enter(SYSCTL_HANDLER_ARGS); static int kdb_sysctl_panic(SYSCTL_HANDLER_ARGS); static int kdb_sysctl_trap(SYSCTL_HANDLER_ARGS); static int kdb_sysctl_trap_code(SYSCTL_HANDLER_ARGS); static SYSCTL_NODE(_debug, OID_AUTO, kdb, CTLFLAG_RW, NULL, "KDB nodes"); SYSCTL_PROC(_debug_kdb, OID_AUTO, available, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, kdb_sysctl_available, "A", "list of available KDB backends"); SYSCTL_PROC(_debug_kdb, OID_AUTO, current, CTLTYPE_STRING | CTLFLAG_RW, NULL, 0, kdb_sysctl_current, "A", "currently selected KDB backend"); SYSCTL_PROC(_debug_kdb, OID_AUTO, enter, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0, kdb_sysctl_enter, "I", "set to enter the debugger"); SYSCTL_PROC(_debug_kdb, OID_AUTO, panic, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0, kdb_sysctl_panic, "I", "set to panic the kernel"); SYSCTL_PROC(_debug_kdb, OID_AUTO, trap, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0, kdb_sysctl_trap, "I", "set to cause a page fault via data access"); SYSCTL_PROC(_debug_kdb, OID_AUTO, trap_code, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0, kdb_sysctl_trap_code, "I", "set to cause a page fault via code access"); SYSCTL_INT(_debug_kdb, OID_AUTO, break_to_debugger, CTLFLAG_RWTUN | CTLFLAG_SECURE, &kdb_break_to_debugger, 0, "Enable break to debugger"); SYSCTL_INT(_debug_kdb, OID_AUTO, alt_break_to_debugger, CTLFLAG_RWTUN | CTLFLAG_SECURE, &kdb_alt_break_to_debugger, 0, "Enable alternative break to debugger"); /* * Flag to indicate to debuggers why the debugger was entered. 
*/ const char * volatile kdb_why = KDB_WHY_UNSET; static int kdb_sysctl_available(SYSCTL_HANDLER_ARGS) { struct kdb_dbbe **iter; struct sbuf sbuf; int error; sbuf_new_for_sysctl(&sbuf, NULL, 64, req); SET_FOREACH(iter, kdb_dbbe_set) { if ((*iter)->dbbe_active == 0) sbuf_printf(&sbuf, "%s ", (*iter)->dbbe_name); } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } static int kdb_sysctl_current(SYSCTL_HANDLER_ARGS) { char buf[16]; int error; if (kdb_dbbe != NULL) strlcpy(buf, kdb_dbbe->dbbe_name, sizeof(buf)); else *buf = '\0'; error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); if (kdb_active) return (EBUSY); return (kdb_dbbe_select(buf)); } static int kdb_sysctl_enter(SYSCTL_HANDLER_ARGS) { int error, i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); if (kdb_active) return (EBUSY); kdb_enter(KDB_WHY_SYSCTL, "sysctl debug.kdb.enter"); return (0); } static int kdb_sysctl_panic(SYSCTL_HANDLER_ARGS) { int error, i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); panic("kdb_sysctl_panic"); return (0); } static int kdb_sysctl_trap(SYSCTL_HANDLER_ARGS) { int error, i; int *addr = (int *)0x10; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); return (*addr); } static int kdb_sysctl_trap_code(SYSCTL_HANDLER_ARGS) { int error, i; void (*fp)(u_int, u_int, u_int) = (void *)0xdeadc0de; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); (*fp)(0x11111111, 0x22222222, 0x33333333); return (0); } void kdb_panic(const char *msg) { printf("KDB: panic\n"); panic("%s", msg); } void kdb_reboot(void) { printf("KDB: reboot requested\n"); shutdown_nice(0); } /* * Solaris implements a new BREAK that is initiated by the character sequence * CR ~ ^B, similar to the familiar pattern used on Sun servers by the * Remote Console. * * Note that this function may be called from almost anywhere, with interrupts * disabled and with unknown locks held, so it must not access data other than * its arguments. It's up to the caller to ensure that the state variable is * consistent. */ #define KEY_CR 13 /* CR '\r' */ #define KEY_TILDE 126 /* ~ */ #define KEY_CRTLB 2 /* ^B */ #define KEY_CRTLP 16 /* ^P */ #define KEY_CRTLR 18 /* ^R */ /* States of the KDB "alternate break sequence" detecting state machine. */ enum { KDB_ALT_BREAK_SEEN_NONE, KDB_ALT_BREAK_SEEN_CR, KDB_ALT_BREAK_SEEN_CR_TILDE, }; int kdb_break(void) { if (!kdb_break_to_debugger) return (0); kdb_enter(KDB_WHY_BREAK, "Break to debugger"); return (KDB_REQ_DEBUGGER); } static int kdb_alt_break_state(int key, int *state) { int brk; /* All states transition to KDB_ALT_BREAK_SEEN_CR on a CR.
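The recognizer consumes one key at a time and only fires on the exact CR ~ ^B (or ^P / ^R) tail; any other key drops back to the start state. A compressed userland model of the same state machine, covering the debugger case only:

#include <stdio.h>

enum { SEEN_NONE, SEEN_CR, SEEN_CR_TILDE };

/* Model of kdb_alt_break_state(): returns nonzero on a full match. */
static int
alt_break(int key, int *state)
{
	if (key == '\r') {
		*state = SEEN_CR;		/* CR always (re)arms */
		return (0);
	}
	if (*state == SEEN_CR && key == '~') {
		*state = SEEN_CR_TILDE;
		return (0);
	}
	if (*state == SEEN_CR_TILDE && key == 0x02) {	/* ^B */
		*state = SEEN_NONE;
		return (1);			/* break to debugger */
	}
	*state = SEEN_NONE;			/* anything else resets */
	return (0);
}

int main(void)
{
	const int keys[] = { 'x', '\r', '~', 0x02 };
	int i, st = SEEN_NONE;

	for (i = 0; i < 4; i++)
		if (alt_break(keys[i], &st))
			printf("sequence matched at key %d\n", i);
	return (0);
}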
*/ if (key == KEY_CR) { *state = KDB_ALT_BREAK_SEEN_CR; return (0); } brk = 0; switch (*state) { case KDB_ALT_BREAK_SEEN_CR: *state = KDB_ALT_BREAK_SEEN_NONE; if (key == KEY_TILDE) *state = KDB_ALT_BREAK_SEEN_CR_TILDE; break; case KDB_ALT_BREAK_SEEN_CR_TILDE: *state = KDB_ALT_BREAK_SEEN_NONE; if (key == KEY_CRTLB) brk = KDB_REQ_DEBUGGER; else if (key == KEY_CRTLP) brk = KDB_REQ_PANIC; else if (key == KEY_CRTLR) brk = KDB_REQ_REBOOT; break; case KDB_ALT_BREAK_SEEN_NONE: default: *state = KDB_ALT_BREAK_SEEN_NONE; break; } return (brk); } static int kdb_alt_break_internal(int key, int *state, int force_gdb) { int brk; if (!kdb_alt_break_to_debugger) return (0); brk = kdb_alt_break_state(key, state); switch (brk) { case KDB_REQ_DEBUGGER: if (force_gdb) kdb_dbbe_select("gdb"); kdb_enter(KDB_WHY_BREAK, "Break to debugger"); break; case KDB_REQ_PANIC: if (force_gdb) kdb_dbbe_select("gdb"); kdb_panic("Panic sequence on console"); break; case KDB_REQ_REBOOT: kdb_reboot(); break; } return (0); } int kdb_alt_break(int key, int *state) { return (kdb_alt_break_internal(key, state, 0)); } /* * This variation on kdb_alt_break() is used only by dcons, which has its own * configuration flag to force GDB use regardless of the global KDB * configuration. */ int kdb_alt_break_gdb(int key, int *state) { return (kdb_alt_break_internal(key, state, 1)); } /* * Print a backtrace of the calling thread. The backtrace is generated by * the selected debugger, provided it supports backtraces. If no debugger * is selected or the current debugger does not support backtraces, this * function silently returns. */ void kdb_backtrace(void) { if (kdb_dbbe != NULL && kdb_dbbe->dbbe_trace != NULL) { printf("KDB: stack backtrace:\n"); kdb_dbbe->dbbe_trace(); } #ifdef STACK else { struct stack st; printf("KDB: stack backtrace:\n"); stack_zero(&st); stack_save(&st); stack_print_ddb(&st); } #endif } /* * Similar to kdb_backtrace() except that it prints a backtrace of an * arbitrary thread rather than the calling thread. */ void kdb_backtrace_thread(struct thread *td) { if (kdb_dbbe != NULL && kdb_dbbe->dbbe_trace_thread != NULL) { printf("KDB: stack backtrace of thread %d:\n", td->td_tid); kdb_dbbe->dbbe_trace_thread(td); } #ifdef STACK else { struct stack st; printf("KDB: stack backtrace of thread %d:\n", td->td_tid); stack_zero(&st); stack_save_td(&st, td); stack_print_ddb(&st); } #endif } /* * Set/change the current backend. */ int kdb_dbbe_select(const char *name) { struct kdb_dbbe *be, **iter; SET_FOREACH(iter, kdb_dbbe_set) { be = *iter; if (be->dbbe_active == 0 && strcmp(be->dbbe_name, name) == 0) { kdb_dbbe = be; return (0); } } return (EINVAL); } /* * Enter the currently selected debugger. If a message has been provided, * it is printed first. If the debugger does not support the enter method, * it is entered by using breakpoint(), which enters the debugger through * kdb_trap(). The 'why' argument will contain a more mechanically usable * string than 'msg', and is relied upon by DDB scripting to identify the * reason for entering the debugger so that the right script can be run. */ void kdb_enter(const char *why, const char *msg) { if (kdb_dbbe != NULL && kdb_active == 0) { if (msg != NULL) printf("KDB: enter: %s\n", msg); kdb_why = why; breakpoint(); kdb_why = KDB_WHY_UNSET; } } /* * Initialize the kernel debugger interface. */ void kdb_init(void) { struct kdb_dbbe *be, **iter; int cur_pri, pri; kdb_active = 0; kdb_dbbe = NULL; cur_pri = -1; SET_FOREACH(iter, kdb_dbbe_set) { be = *iter; pri = (be->dbbe_init != NULL) ? 
be->dbbe_init() : -1; be->dbbe_active = (pri >= 0) ? 0 : -1; if (pri > cur_pri) { cur_pri = pri; kdb_dbbe = be; } } if (kdb_dbbe != NULL) { printf("KDB: debugger backends:"); SET_FOREACH(iter, kdb_dbbe_set) { be = *iter; if (be->dbbe_active == 0) printf(" %s", be->dbbe_name); } printf("\n"); printf("KDB: current backend: %s\n", kdb_dbbe->dbbe_name); } } /* * Handle contexts. */ void * kdb_jmpbuf(jmp_buf new) { void *old; old = kdb_jmpbufp; kdb_jmpbufp = new; return (old); } void kdb_reenter(void) { if (!kdb_active || kdb_jmpbufp == NULL) return; printf("KDB: reentering\n"); kdb_backtrace(); longjmp(kdb_jmpbufp, 1); /* NOTREACHED */ } /* * Thread related support functions. */ struct pcb * kdb_thr_ctx(struct thread *thr) { #if defined(SMP) && defined(KDB_STOPPEDPCB) struct pcpu *pc; #endif if (thr == curthread) return (&kdb_pcb); #if defined(SMP) && defined(KDB_STOPPEDPCB) STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (pc->pc_curthread == thr && CPU_ISSET(pc->pc_cpuid, &stopped_cpus)) return (KDB_STOPPEDPCB(pc)); } #endif return (thr->td_pcb); } struct thread * kdb_thr_first(void) { struct proc *p; struct thread *thr; p = LIST_FIRST(&allproc); while (p != NULL) { if (p->p_flag & P_INMEM) { thr = FIRST_THREAD_IN_PROC(p); if (thr != NULL) return (thr); } p = LIST_NEXT(p, p_list); } return (NULL); } struct thread * kdb_thr_from_pid(pid_t pid) { struct proc *p; p = LIST_FIRST(&allproc); while (p != NULL) { if (p->p_flag & P_INMEM && p->p_pid == pid) return (FIRST_THREAD_IN_PROC(p)); p = LIST_NEXT(p, p_list); } return (NULL); } struct thread * kdb_thr_lookup(lwpid_t tid) { struct thread *thr; thr = kdb_thr_first(); while (thr != NULL && thr->td_tid != tid) thr = kdb_thr_next(thr); return (thr); } struct thread * kdb_thr_next(struct thread *thr) { struct proc *p; p = thr->td_proc; thr = TAILQ_NEXT(thr, td_plist); do { if (thr != NULL) return (thr); p = LIST_NEXT(p, p_list); if (p != NULL && (p->p_flag & P_INMEM)) thr = FIRST_THREAD_IN_PROC(p); } while (p != NULL); return (NULL); } int kdb_thr_select(struct thread *thr) { if (thr == NULL) return (EINVAL); kdb_thread = thr; kdb_thrctx = kdb_thr_ctx(thr); return (0); } /* * Enter the debugger due to a trap. */ int kdb_trap(int type, int code, struct trapframe *tf) { #ifdef SMP cpuset_t other_cpus; #endif struct kdb_dbbe *be; register_t intr; int handled; #ifdef SMP int did_stop_cpus; #endif be = kdb_dbbe; if (be == NULL || be->dbbe_trap == NULL) return (0); /* We reenter the debugger through kdb_reenter(). */ if (kdb_active) return (0); intr = intr_disable(); #ifdef SMP if (!SCHEDULER_STOPPED()) { other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); stop_cpus_hard(other_cpus); did_stop_cpus = 1; } else did_stop_cpus = 0; #endif kdb_active++; kdb_frame = tf; /* Let MD code do its thing first... */ kdb_cpu_trap(type, code); makectx(tf, &kdb_pcb); kdb_thr_select(curthread); cngrab(); for (;;) { handled = be->dbbe_trap(type, code); if (be == kdb_dbbe) break; be = kdb_dbbe; if (be == NULL || be->dbbe_trap == NULL) break; printf("Switching to %s back-end\n", be->dbbe_name); } cnungrab(); kdb_active--; #ifdef SMP if (did_stop_cpus) restart_cpus(stopped_cpus); #endif intr_restore(intr); return (handled); } Index: head/sys/kern/subr_kobj.c =================================================================== --- head/sys/kern/subr_kobj.c (revision 326270) +++ head/sys/kern/subr_kobj.c (revision 326271) @@ -1,340 +1,342 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2000,2003 Doug Rabson * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #ifndef TEST #include #endif #ifdef TEST #include "usertest.h" #endif static MALLOC_DEFINE(M_KOBJ, "kobj", "Kernel object structures"); #ifdef KOBJ_STATS u_int kobj_lookup_hits; u_int kobj_lookup_misses; SYSCTL_UINT(_kern, OID_AUTO, kobj_hits, CTLFLAG_RD, &kobj_lookup_hits, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, kobj_misses, CTLFLAG_RD, &kobj_lookup_misses, 0, ""); #endif static struct mtx kobj_mtx; static int kobj_mutex_inited; static int kobj_next_id = 1; #define KOBJ_LOCK() mtx_lock(&kobj_mtx) #define KOBJ_UNLOCK() mtx_unlock(&kobj_mtx) #define KOBJ_ASSERT(what) mtx_assert(&kobj_mtx, what); SYSCTL_INT(_kern, OID_AUTO, kobj_methodcount, CTLFLAG_RD, &kobj_next_id, 0, ""); static void kobj_init_mutex(void *arg) { if (!kobj_mutex_inited) { mtx_init(&kobj_mtx, "kobj", NULL, MTX_DEF); kobj_mutex_inited = 1; } } SYSINIT(kobj, SI_SUB_LOCK, SI_ORDER_ANY, kobj_init_mutex, NULL); /* * This method structure is used to initialise new caches. Since the * desc pointer is NULL, it is guaranteed never to match any real * descriptors. */ static const struct kobj_method null_method = { 0, 0, }; int kobj_error_method(void) { return ENXIO; } static void kobj_class_compile_common(kobj_class_t cls, kobj_ops_t ops) { kobj_method_t *m; int i; /* * Don't do anything if we are already compiled. */ if (cls->ops) return; /* * First register any methods which need it. */ for (i = 0, m = cls->methods; m->desc; i++, m++) { if (m->desc->id == 0) m->desc->id = kobj_next_id++; } /* * Then initialise the ops table. */ for (i = 0; i < KOBJ_CACHE_SIZE; i++) ops->cache[i] = &null_method; ops->cls = cls; cls->ops = ops; } void kobj_class_compile(kobj_class_t cls) { kobj_ops_t ops; KOBJ_ASSERT(MA_NOTOWNED); /* * Allocate space for the compiled ops table. */ ops = malloc(sizeof(struct kobj_ops), M_KOBJ, M_NOWAIT); if (!ops) panic("%s: out of memory", __func__); KOBJ_LOCK(); /* * We may have lost a race for kobj_class_compile here - check * to make sure someone else hasn't already compiled this * class.
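The slow path behind the per-class cache (kobj_lookup_method() below) is just a linear scan of the class's method table, falling back to the descriptor's default method on a miss. A userland model of that lookup, with invented table entries rather than the kernel's generated descriptors:

#include <stdio.h>
#include <stddef.h>

struct desc { int id; int (*deflt)(void); };
struct method { const struct desc *d; int (*func)(void); };

static int err_method(void) { return (-1); }	/* like kobj_error_method */
static int hello(void) { printf("hello\n"); return (0); }

static const struct desc hello_desc = { 1, err_method };
static const struct desc bye_desc = { 2, err_method };

/* Scan the table; fall back to the descriptor's default on a miss. */
static int (*
lookup(const struct method *m, const struct desc *d))(void)
{
	for (; m->d != NULL; m++)
		if (m->d == d)
			return (m->func);
	return (d->deflt);
}

int main(void)
{
	const struct method cls[] = {
		{ &hello_desc, hello },
		{ NULL, NULL },			/* table terminator */
	};

	(void)lookup(cls, &hello_desc)();	/* implemented: prints hello */
	printf("%d\n", lookup(cls, &bye_desc)());	/* default: -1 */
	return (0);
}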
*/ if (cls->ops) { KOBJ_UNLOCK(); free(ops, M_KOBJ); return; } kobj_class_compile_common(cls, ops); KOBJ_UNLOCK(); } void kobj_class_compile_static(kobj_class_t cls, kobj_ops_t ops) { KASSERT(kobj_mutex_inited == 0, ("%s: only supported during early cycles", __func__)); /* * Increment refs to make sure that the ops table is not freed. */ cls->refs++; kobj_class_compile_common(cls, ops); } static kobj_method_t* kobj_lookup_method_class(kobj_class_t cls, kobjop_desc_t desc) { kobj_method_t *methods = cls->methods; kobj_method_t *ce; for (ce = methods; ce && ce->desc; ce++) { if (ce->desc == desc) { return ce; } } return NULL; } static kobj_method_t* kobj_lookup_method_mi(kobj_class_t cls, kobjop_desc_t desc) { kobj_method_t *ce; kobj_class_t *basep; ce = kobj_lookup_method_class(cls, desc); if (ce) return ce; basep = cls->baseclasses; if (basep) { for (; *basep; basep++) { ce = kobj_lookup_method_mi(*basep, desc); if (ce) return ce; } } return NULL; } kobj_method_t* kobj_lookup_method(kobj_class_t cls, kobj_method_t **cep, kobjop_desc_t desc) { kobj_method_t *ce; ce = kobj_lookup_method_mi(cls, desc); if (!ce) ce = &desc->deflt; if (cep) *cep = ce; return ce; } void kobj_class_free(kobj_class_t cls) { void* ops = NULL; KOBJ_ASSERT(MA_NOTOWNED); KOBJ_LOCK(); /* * Protect against a race between kobj_create and * kobj_delete. */ if (cls->refs == 0) { /* * For now we don't do anything to unregister any methods * which are no longer used. */ /* * Free memory and clean up. */ ops = cls->ops; cls->ops = NULL; } KOBJ_UNLOCK(); if (ops) free(ops, M_KOBJ); } kobj_t kobj_create(kobj_class_t cls, struct malloc_type *mtype, int mflags) { kobj_t obj; /* * Allocate and initialise the new object. */ obj = malloc(cls->size, mtype, mflags | M_ZERO); if (!obj) return NULL; kobj_init(obj, cls); return obj; } static void kobj_init_common(kobj_t obj, kobj_class_t cls) { obj->ops = cls->ops; cls->refs++; } void kobj_init(kobj_t obj, kobj_class_t cls) { KOBJ_ASSERT(MA_NOTOWNED); retry: KOBJ_LOCK(); /* * Consider compiling the class' method table. */ if (!cls->ops) { /* * kobj_class_compile doesn't want the lock held * because of the call to malloc - we drop the lock * and re-try. */ KOBJ_UNLOCK(); kobj_class_compile(cls); goto retry; } kobj_init_common(obj, cls); KOBJ_UNLOCK(); } void kobj_init_static(kobj_t obj, kobj_class_t cls) { KASSERT(kobj_mutex_inited == 0, ("%s: only supported during early cycles", __func__)); kobj_init_common(obj, cls); } void kobj_delete(kobj_t obj, struct malloc_type *mtype) { kobj_class_t cls = obj->ops->cls; int refs; /* * Consider freeing the compiled method table for the class * after its last instance is deleted. As an optimisation, we * should defer this for a short while to avoid thrashing. */ KOBJ_ASSERT(MA_NOTOWNED); KOBJ_LOCK(); cls->refs--; refs = cls->refs; KOBJ_UNLOCK(); if (!refs) kobj_class_free(cls); obj->ops = NULL; if (mtype) free(obj, mtype); } Index: head/sys/kern/subr_lock.c =================================================================== --- head/sys/kern/subr_lock.c (revision 326270) +++ head/sys/kern/subr_lock.c (revision 326271) @@ -1,700 +1,702 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2006 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module holds the global variables and functions used to maintain * lock_object structures. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_mprof.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include SDT_PROVIDER_DEFINE(lock); SDT_PROBE_DEFINE1(lock, , , starvation, "u_int"); CTASSERT(LOCK_CLASS_MAX == 15); struct lock_class *lock_classes[LOCK_CLASS_MAX + 1] = { &lock_class_mtx_spin, &lock_class_mtx_sleep, &lock_class_sx, &lock_class_rm, &lock_class_rm_sleepable, &lock_class_rw, &lock_class_lockmgr, }; void lock_init(struct lock_object *lock, struct lock_class *class, const char *name, const char *type, int flags) { int i; /* Check for double-init and zero object. */ KASSERT(flags & LO_NEW || !lock_initialized(lock), ("lock \"%s\" %p already initialized", name, lock)); /* Look up lock class to find its index. */ for (i = 0; i < LOCK_CLASS_MAX; i++) if (lock_classes[i] == class) { lock->lo_flags = i << LO_CLASSSHIFT; break; } KASSERT(i < LOCK_CLASS_MAX, ("unknown lock class %p", class)); /* Initialize the lock object. */ lock->lo_name = name; lock->lo_flags |= flags | LO_INITIALIZED; LOCK_LOG_INIT(lock, 0); WITNESS_INIT(lock, (type != NULL) ? 
type : name); } void lock_destroy(struct lock_object *lock) { KASSERT(lock_initialized(lock), ("lock %p is not initialized", lock)); WITNESS_DESTROY(lock); LOCK_LOG_DESTROY(lock, 0); lock->lo_flags &= ~LO_INITIALIZED; } static SYSCTL_NODE(_debug, OID_AUTO, lock, CTLFLAG_RD, NULL, "lock debugging"); static SYSCTL_NODE(_debug_lock, OID_AUTO, delay, CTLFLAG_RD, NULL, "lock delay"); static u_int __read_mostly starvation_limit = 131072; SYSCTL_INT(_debug_lock_delay, OID_AUTO, starvation_limit, CTLFLAG_RW, &starvation_limit, 0, ""); static u_int __read_mostly restrict_starvation = 0; SYSCTL_INT(_debug_lock_delay, OID_AUTO, restrict_starvation, CTLFLAG_RW, &restrict_starvation, 0, ""); void lock_delay(struct lock_delay_arg *la) { struct lock_delay_config *lc = la->config; u_int i; la->delay <<= 1; if (__predict_false(la->delay > lc->max)) la->delay = lc->max; for (i = la->delay; i > 0; i--) cpu_spinwait(); la->spin_cnt += la->delay; if (__predict_false(la->spin_cnt > starvation_limit)) { SDT_PROBE1(lock, , , starvation, la->delay); if (restrict_starvation) la->delay = lc->base; } } static u_int lock_roundup_2(u_int val) { u_int res; for (res = 1; res <= val; res <<= 1) continue; return (res); } void lock_delay_default_init(struct lock_delay_config *lc) { lc->base = lock_roundup_2(mp_ncpus) / 4; lc->max = lc->base * 1024; } #ifdef DDB DB_SHOW_COMMAND(lock, db_show_lock) { struct lock_object *lock; struct lock_class *class; if (!have_addr) return; lock = (struct lock_object *)addr; if (LO_CLASSINDEX(lock) > LOCK_CLASS_MAX) { db_printf("Unknown lock class: %d\n", LO_CLASSINDEX(lock)); return; } class = LOCK_CLASS(lock); db_printf(" class: %s\n", class->lc_name); db_printf(" name: %s\n", lock->lo_name); class->lc_ddb_show(lock); } #endif #ifdef LOCK_PROFILING /* * One object per-thread for each lock the thread owns. Tracks individual * lock instances. */ struct lock_profile_object { LIST_ENTRY(lock_profile_object) lpo_link; struct lock_object *lpo_obj; const char *lpo_file; int lpo_line; uint16_t lpo_ref; uint16_t lpo_cnt; uint64_t lpo_acqtime; uint64_t lpo_waittime; u_int lpo_contest_locking; }; /* * One lock_prof for each (file, line, lock object) triple. */ struct lock_prof { SLIST_ENTRY(lock_prof) link; struct lock_class *class; const char *file; const char *name; int line; int ticks; uintmax_t cnt_wait_max; uintmax_t cnt_max; uintmax_t cnt_tot; uintmax_t cnt_wait; uintmax_t cnt_cur; uintmax_t cnt_contest_locking; }; SLIST_HEAD(lphead, lock_prof); #define LPROF_HASH_SIZE 4096 #define LPROF_HASH_MASK (LPROF_HASH_SIZE - 1) #define LPROF_CACHE_SIZE 4096 /* * Array of objects and profs for each type of object for each cpu. Spinlocks * are handled separately because a thread may be preempted and acquire a * spinlock while in the lock profiling code of a non-spinlock. In this way * we only need a critical section to protect the per-cpu lists. */ struct lock_prof_type { struct lphead lpt_lpalloc; struct lpohead lpt_lpoalloc; struct lphead lpt_hash[LPROF_HASH_SIZE]; struct lock_prof lpt_prof[LPROF_CACHE_SIZE]; struct lock_profile_object lpt_objs[LPROF_CACHE_SIZE]; }; struct lock_prof_cpu { struct lock_prof_type lpc_types[2]; /* One for spin one for other. 
*/ }; struct lock_prof_cpu *lp_cpu[MAXCPU]; volatile int __read_mostly lock_prof_enable; static volatile int lock_prof_resetting; #define LPROF_SBUF_SIZE 256 static int lock_prof_rejected; static int lock_prof_skipspin; static int lock_prof_skipcount; #ifndef USE_CPU_NANOSECONDS uint64_t nanoseconds(void) { struct bintime bt; uint64_t ns; binuptime(&bt); /* From bintime2timespec */ ns = bt.sec * (uint64_t)1000000000; ns += ((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32; return (ns); } #endif static void lock_prof_init_type(struct lock_prof_type *type) { int i; SLIST_INIT(&type->lpt_lpalloc); LIST_INIT(&type->lpt_lpoalloc); for (i = 0; i < LPROF_CACHE_SIZE; i++) { SLIST_INSERT_HEAD(&type->lpt_lpalloc, &type->lpt_prof[i], link); LIST_INSERT_HEAD(&type->lpt_lpoalloc, &type->lpt_objs[i], lpo_link); } } static void lock_prof_init(void *arg) { int cpu; for (cpu = 0; cpu <= mp_maxid; cpu++) { lp_cpu[cpu] = malloc(sizeof(*lp_cpu[cpu]), M_DEVBUF, M_WAITOK | M_ZERO); lock_prof_init_type(&lp_cpu[cpu]->lpc_types[0]); lock_prof_init_type(&lp_cpu[cpu]->lpc_types[1]); } } SYSINIT(lockprof, SI_SUB_SMP, SI_ORDER_ANY, lock_prof_init, NULL); static void lock_prof_reset_wait(void) { /* * Spin relinquishing our cpu so that quiesce_all_cpus may * complete. */ while (lock_prof_resetting) sched_relinquish(curthread); } static void lock_prof_reset(void) { struct lock_prof_cpu *lpc; int enabled, i, cpu; /* * We not only race with acquiring and releasing locks but also * thread exit. To be certain that threads exit without valid head * pointers they must see resetting set before enabled is cleared. * Otherwise a lock may not be removed from a per-thread list due * to disabled being set but not wait for reset() to remove it below. */ atomic_store_rel_int(&lock_prof_resetting, 1); enabled = lock_prof_enable; lock_prof_enable = 0; quiesce_all_cpus("profreset", 0); /* * Some objects may have migrated between CPUs. Clear all links * before we zero the structures. Some items may still be linked * into per-thread lists as well. */ for (cpu = 0; cpu <= mp_maxid; cpu++) { lpc = lp_cpu[cpu]; for (i = 0; i < LPROF_CACHE_SIZE; i++) { LIST_REMOVE(&lpc->lpc_types[0].lpt_objs[i], lpo_link); LIST_REMOVE(&lpc->lpc_types[1].lpt_objs[i], lpo_link); } } for (cpu = 0; cpu <= mp_maxid; cpu++) { lpc = lp_cpu[cpu]; bzero(lpc, sizeof(*lpc)); lock_prof_init_type(&lpc->lpc_types[0]); lock_prof_init_type(&lpc->lpc_types[1]); } atomic_store_rel_int(&lock_prof_resetting, 0); lock_prof_enable = enabled; } static void lock_prof_output(struct lock_prof *lp, struct sbuf *sb) { const char *p; for (p = lp->file; p != NULL && strncmp(p, "../", 3) == 0; p += 3); sbuf_printf(sb, "%8ju %9ju %11ju %11ju %11ju %6ju %6ju %2ju %6ju %s:%d (%s:%s)\n", lp->cnt_max / 1000, lp->cnt_wait_max / 1000, lp->cnt_tot / 1000, lp->cnt_wait / 1000, lp->cnt_cur, lp->cnt_cur == 0 ? (uintmax_t)0 : lp->cnt_tot / (lp->cnt_cur * 1000), lp->cnt_cur == 0 ? 
(uintmax_t)0 : lp->cnt_wait / (lp->cnt_cur * 1000), (uintmax_t)0, lp->cnt_contest_locking, p, lp->line, lp->class->lc_name, lp->name); } static void lock_prof_sum(struct lock_prof *match, struct lock_prof *dst, int hash, int spin, int t) { struct lock_prof_type *type; struct lock_prof *l; int cpu; dst->file = match->file; dst->line = match->line; dst->class = match->class; dst->name = match->name; for (cpu = 0; cpu <= mp_maxid; cpu++) { if (lp_cpu[cpu] == NULL) continue; type = &lp_cpu[cpu]->lpc_types[spin]; SLIST_FOREACH(l, &type->lpt_hash[hash], link) { if (l->ticks == t) continue; if (l->file != match->file || l->line != match->line || l->name != match->name) continue; l->ticks = t; if (l->cnt_max > dst->cnt_max) dst->cnt_max = l->cnt_max; if (l->cnt_wait_max > dst->cnt_wait_max) dst->cnt_wait_max = l->cnt_wait_max; dst->cnt_tot += l->cnt_tot; dst->cnt_wait += l->cnt_wait; dst->cnt_cur += l->cnt_cur; dst->cnt_contest_locking += l->cnt_contest_locking; } } } static void lock_prof_type_stats(struct lock_prof_type *type, struct sbuf *sb, int spin, int t) { struct lock_prof *l; int i; for (i = 0; i < LPROF_HASH_SIZE; ++i) { SLIST_FOREACH(l, &type->lpt_hash[i], link) { struct lock_prof lp = {}; if (l->ticks == t) continue; lock_prof_sum(l, &lp, i, spin, t); lock_prof_output(&lp, sb); } } } static int dump_lock_prof_stats(SYSCTL_HANDLER_ARGS) { struct sbuf *sb; int error, cpu, t; int enabled; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, LPROF_SBUF_SIZE, req); sbuf_printf(sb, "\n%8s %9s %11s %11s %11s %6s %6s %2s %6s %s\n", "max", "wait_max", "total", "wait_total", "count", "avg", "wait_avg", "cnt_hold", "cnt_lock", "name"); enabled = lock_prof_enable; lock_prof_enable = 0; quiesce_all_cpus("profstat", 0); t = ticks; for (cpu = 0; cpu <= mp_maxid; cpu++) { if (lp_cpu[cpu] == NULL) continue; lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[0], sb, 0, t); lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[1], sb, 1, t); } lock_prof_enable = enabled; error = sbuf_finish(sb); /* Output a trailing NUL. 
*/ if (error == 0) error = SYSCTL_OUT(req, "", 1); sbuf_delete(sb); return (error); } static int enable_lock_prof(SYSCTL_HANDLER_ARGS) { int error, v; v = lock_prof_enable; error = sysctl_handle_int(oidp, &v, v, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == lock_prof_enable) return (0); if (v == 1) lock_prof_reset(); lock_prof_enable = !!v; return (0); } static int reset_lock_prof_stats(SYSCTL_HANDLER_ARGS) { int error, v; v = 0; error = sysctl_handle_int(oidp, &v, 0, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == 0) return (0); lock_prof_reset(); return (0); } static struct lock_prof * lock_profile_lookup(struct lock_object *lo, int spin, const char *file, int line) { const char *unknown = "(unknown)"; struct lock_prof_type *type; struct lock_prof *lp; struct lphead *head; const char *p; u_int hash; p = file; if (p == NULL || *p == '\0') p = unknown; hash = (uintptr_t)lo->lo_name * 31 + (uintptr_t)p * 31 + line; hash &= LPROF_HASH_MASK; type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; head = &type->lpt_hash[hash]; SLIST_FOREACH(lp, head, link) { if (lp->line == line && lp->file == p && lp->name == lo->lo_name) return (lp); } lp = SLIST_FIRST(&type->lpt_lpalloc); if (lp == NULL) { lock_prof_rejected++; return (lp); } SLIST_REMOVE_HEAD(&type->lpt_lpalloc, link); lp->file = p; lp->line = line; lp->class = LOCK_CLASS(lo); lp->name = lo->lo_name; SLIST_INSERT_HEAD(&type->lpt_hash[hash], lp, link); return (lp); } static struct lock_profile_object * lock_profile_object_lookup(struct lock_object *lo, int spin, const char *file, int line) { struct lock_profile_object *l; struct lock_prof_type *type; struct lpohead *head; head = &curthread->td_lprof[spin]; LIST_FOREACH(l, head, lpo_link) if (l->lpo_obj == lo && l->lpo_file == file && l->lpo_line == line) return (l); type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; l = LIST_FIRST(&type->lpt_lpoalloc); if (l == NULL) { lock_prof_rejected++; return (NULL); } LIST_REMOVE(l, lpo_link); l->lpo_obj = lo; l->lpo_file = file; l->lpo_line = line; l->lpo_cnt = 0; LIST_INSERT_HEAD(head, l, lpo_link); return (l); } void lock_profile_obtain_lock_success(struct lock_object *lo, int contested, uint64_t waittime, const char *file, int line) { static int lock_prof_count; struct lock_profile_object *l; int spin; if (SCHEDULER_STOPPED()) return; /* don't reset the timer when/if recursing */ if (!lock_prof_enable || (lo->lo_flags & LO_NOPROFILE)) return; if (lock_prof_skipcount && (++lock_prof_count % lock_prof_skipcount) != 0) return; spin = (LOCK_CLASS(lo)->lc_flags & LC_SPINLOCK) ? 1 : 0; if (spin && lock_prof_skipspin == 1) return; critical_enter(); /* Recheck enabled now that we're in a critical section. */ if (lock_prof_enable == 0) goto out; l = lock_profile_object_lookup(lo, spin, file, line); if (l == NULL) goto out; l->lpo_cnt++; if (++l->lpo_ref > 1) goto out; l->lpo_contest_locking = contested; l->lpo_acqtime = nanoseconds(); if (waittime && (l->lpo_acqtime > waittime)) l->lpo_waittime = l->lpo_acqtime - waittime; else l->lpo_waittime = 0; out: critical_exit(); } void lock_profile_thread_exit(struct thread *td) { #ifdef INVARIANTS struct lock_profile_object *l; MPASS(curthread->td_critnest == 0); #endif /* * If lock profiling was disabled we have to wait for reset to * clear our pointers before we can exit safely. 
*/ lock_prof_reset_wait(); #ifdef INVARIANTS LIST_FOREACH(l, &td->td_lprof[0], lpo_link) printf("thread still holds lock acquired at %s:%d\n", l->lpo_file, l->lpo_line); LIST_FOREACH(l, &td->td_lprof[1], lpo_link) printf("thread still holds lock acquired at %s:%d\n", l->lpo_file, l->lpo_line); #endif MPASS(LIST_FIRST(&td->td_lprof[0]) == NULL); MPASS(LIST_FIRST(&td->td_lprof[1]) == NULL); } void lock_profile_release_lock(struct lock_object *lo) { struct lock_profile_object *l; struct lock_prof_type *type; struct lock_prof *lp; uint64_t curtime, holdtime; struct lpohead *head; int spin; if (SCHEDULER_STOPPED()) return; if (lo->lo_flags & LO_NOPROFILE) return; spin = (LOCK_CLASS(lo)->lc_flags & LC_SPINLOCK) ? 1 : 0; head = &curthread->td_lprof[spin]; if (LIST_FIRST(head) == NULL) return; critical_enter(); /* Recheck enabled now that we're in a critical section. */ if (lock_prof_enable == 0 && lock_prof_resetting == 1) goto out; /* * If lock profiling is not enabled we still want to remove the * lpo from our queue. */ LIST_FOREACH(l, head, lpo_link) if (l->lpo_obj == lo) break; if (l == NULL) goto out; if (--l->lpo_ref > 0) goto out; lp = lock_profile_lookup(lo, spin, l->lpo_file, l->lpo_line); if (lp == NULL) goto release; curtime = nanoseconds(); if (curtime < l->lpo_acqtime) goto release; holdtime = curtime - l->lpo_acqtime; /* * Record if the lock has been held longer now than ever * before. */ if (holdtime > lp->cnt_max) lp->cnt_max = holdtime; if (l->lpo_waittime > lp->cnt_wait_max) lp->cnt_wait_max = l->lpo_waittime; lp->cnt_tot += holdtime; lp->cnt_wait += l->lpo_waittime; lp->cnt_contest_locking += l->lpo_contest_locking; lp->cnt_cur += l->lpo_cnt; release: LIST_REMOVE(l, lpo_link); type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; LIST_INSERT_HEAD(&type->lpt_lpoalloc, l, lpo_link); out: critical_exit(); } static SYSCTL_NODE(_debug_lock, OID_AUTO, prof, CTLFLAG_RD, NULL, "lock profiling"); SYSCTL_INT(_debug_lock_prof, OID_AUTO, skipspin, CTLFLAG_RW, &lock_prof_skipspin, 0, "Skip profiling on spinlocks."); SYSCTL_INT(_debug_lock_prof, OID_AUTO, skipcount, CTLFLAG_RW, &lock_prof_skipcount, 0, "Sample approximately every N lock acquisitions."); SYSCTL_INT(_debug_lock_prof, OID_AUTO, rejected, CTLFLAG_RD, &lock_prof_rejected, 0, "Number of rejected profiling records"); SYSCTL_PROC(_debug_lock_prof, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, dump_lock_prof_stats, "A", "Lock profiling statistics"); SYSCTL_PROC(_debug_lock_prof, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, reset_lock_prof_stats, "I", "Reset lock profiling statistics"); SYSCTL_PROC(_debug_lock_prof, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, enable_lock_prof, "I", "Enable lock profiling"); #endif Index: head/sys/kern/subr_module.c =================================================================== --- head/sys/kern/subr_module.c (revision 326270) +++ head/sys/kern/subr_module.c (revision 326271) @@ -1,293 +1,295 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1998 Michael Smith * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include /* * Preloaded module support */ vm_offset_t preload_addr_relocate = 0; caddr_t preload_metadata; /* * Search for the preloaded module (name) */ caddr_t preload_search_by_name(const char *name) { caddr_t curp; uint32_t *hdr; int next; if (preload_metadata != NULL) { curp = preload_metadata; for (;;) { hdr = (uint32_t *)curp; if (hdr[0] == 0 && hdr[1] == 0) break; /* Search for a MODINFO_NAME field */ if ((hdr[0] == MODINFO_NAME) && !strcmp(name, curp + sizeof(uint32_t) * 2)) return(curp); /* skip to next field */ next = sizeof(uint32_t) * 2 + hdr[1]; next = roundup(next, sizeof(u_long)); curp += next; } } return(NULL); } /* * Search for the first preloaded module of (type) */ caddr_t preload_search_by_type(const char *type) { caddr_t curp, lname; uint32_t *hdr; int next; if (preload_metadata != NULL) { curp = preload_metadata; lname = NULL; for (;;) { hdr = (uint32_t *)curp; if (hdr[0] == 0 && hdr[1] == 0) break; /* remember the start of each record */ if (hdr[0] == MODINFO_NAME) lname = curp; /* Search for a MODINFO_TYPE field */ if ((hdr[0] == MODINFO_TYPE) && !strcmp(type, curp + sizeof(uint32_t) * 2)) return(lname); /* skip to next field */ next = sizeof(uint32_t) * 2 + hdr[1]; next = roundup(next, sizeof(u_long)); curp += next; } } return(NULL); } /* * Walk through the preloaded module list */ caddr_t preload_search_next_name(caddr_t base) { caddr_t curp; uint32_t *hdr; int next; if (preload_metadata != NULL) { /* Pick up where we left off last time */ if (base) { /* skip to next field */ curp = base; hdr = (uint32_t *)curp; next = sizeof(uint32_t) * 2 + hdr[1]; next = roundup(next, sizeof(u_long)); curp += next; } else curp = preload_metadata; for (;;) { hdr = (uint32_t *)curp; if (hdr[0] == 0 && hdr[1] == 0) break; /* Found a new record? */ if (hdr[0] == MODINFO_NAME) return curp; /* skip to next field */ next = sizeof(uint32_t) * 2 + hdr[1]; next = roundup(next, sizeof(u_long)); curp += next; } } return(NULL); } /* * Given a preloaded module handle (mod), return a pointer * to the data for the attribute (inf). */ caddr_t preload_search_info(caddr_t mod, int inf) { caddr_t curp; uint32_t *hdr; uint32_t type = 0; int next; if (mod == NULL) return (NULL); curp = mod; for (;;) { hdr = (uint32_t *)curp; /* end of module data? */ if (hdr[0] == 0 && hdr[1] == 0) break; /* * We give up once we've looped back to what we were looking at * first - this should normally be a MODINFO_NAME field. 
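All of these walkers share one record layout: a pair of uint32 words (type, payload length) followed by the payload, with the record size rounded up to sizeof(u_long), and the whole list terminated by a record whose two header words are both zero. A self-contained model of the traversal; the record contents and the type value standing in for MODINFO_NAME are made up:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define ROUNDUP(x, y)	(((x) + (y) - 1) / (y) * (y))

int main(void)
{
	unsigned long store[16];		/* aligned backing store */
	unsigned char *base = (unsigned char *)store, *curp = base;
	uint32_t *hdr;

	memset(store, 0, sizeof(store));
	hdr = (uint32_t *)base;
	hdr[0] = 1;				/* stands in for MODINFO_NAME */
	hdr[1] = 6;				/* strlen("mymod") + 1 */
	memcpy(base + 2 * sizeof(uint32_t), "mymod", 6);

	for (;;) {
		hdr = (uint32_t *)curp;
		if (hdr[0] == 0 && hdr[1] == 0)
			break;			/* double zero terminator */
		if (hdr[0] == 1)
			printf("name: %s\n",
			    (char *)curp + 2 * sizeof(uint32_t));
		/* Next record: header plus payload, rounded to u_long. */
		curp += ROUNDUP(2 * sizeof(uint32_t) + hdr[1],
		    sizeof(unsigned long));
	}
	return (0);
}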
*/ if (type == 0) { type = hdr[0]; } else { if (hdr[0] == type) break; } /* * Attribute match? Return pointer to data. * Consumer may safely assume that size value precedes * data. */ if (hdr[0] == inf) return(curp + (sizeof(uint32_t) * 2)); /* skip to next field */ next = sizeof(uint32_t) * 2 + hdr[1]; next = roundup(next, sizeof(u_long)); curp += next; } return(NULL); } /* * Delete a preload record by name. */ void preload_delete_name(const char *name) { caddr_t curp; uint32_t *hdr; int next; int clearing; if (preload_metadata != NULL) { clearing = 0; curp = preload_metadata; for (;;) { hdr = (uint32_t *)curp; if (hdr[0] == 0 && hdr[1] == 0) break; /* Search for a MODINFO_NAME field */ if (hdr[0] == MODINFO_NAME) { if (!strcmp(name, curp + sizeof(uint32_t) * 2)) clearing = 1; /* got it, start clearing */ else if (clearing) clearing = 0; /* at next one now.. better stop */ } if (clearing) hdr[0] = MODINFO_EMPTY; /* skip to next field */ next = sizeof(uint32_t) * 2 + hdr[1]; next = roundup(next, sizeof(u_long)); curp += next; } } } void * preload_fetch_addr(caddr_t mod) { caddr_t *mdp; mdp = (caddr_t *)preload_search_info(mod, MODINFO_ADDR); if (mdp == NULL) return (NULL); return (*mdp + preload_addr_relocate); } size_t preload_fetch_size(caddr_t mod) { size_t *mdp; mdp = (size_t *)preload_search_info(mod, MODINFO_SIZE); if (mdp == NULL) return (0); return (*mdp); } /* Called from locore. Convert physical pointers to kvm. Sigh. */ void preload_bootstrap_relocate(vm_offset_t offset) { caddr_t curp; uint32_t *hdr; vm_offset_t *ptr; int next; if (preload_metadata != NULL) { curp = preload_metadata; for (;;) { hdr = (uint32_t *)curp; if (hdr[0] == 0 && hdr[1] == 0) break; /* Deal with the ones that we know we have to fix */ switch (hdr[0]) { case MODINFO_ADDR: case MODINFO_METADATA|MODINFOMD_SSYM: case MODINFO_METADATA|MODINFOMD_ESYM: ptr = (vm_offset_t *)(curp + (sizeof(uint32_t) * 2)); *ptr += offset; break; } /* The rest is beyond us for now */ /* skip to next field */ next = sizeof(uint32_t) * 2 + hdr[1]; next = roundup(next, sizeof(u_long)); curp += next; } } } Index: head/sys/kern/subr_msgbuf.c =================================================================== --- head/sys/kern/subr_msgbuf.c (revision 326270) +++ head/sys/kern/subr_msgbuf.c (revision 326271) @@ -1,417 +1,419 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003 Ian Dowse. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Generic message buffer support routines. */ #include #include #include #include #include #include #include /* * Maximum number conversion buffer length: uintmax_t in base 2, plus <> * around the priority, and a terminating NUL. */ #define MAXPRIBUF (sizeof(intmax_t) * NBBY + 3) /* Read/write sequence numbers are modulo a multiple of the buffer size. */ #define SEQMOD(size) ((size) * 16) static u_int msgbuf_cksum(struct msgbuf *mbp); /* * Timestamps in msgbuf are useful when trying to diagnose when core dumps * or other actions occurred. */ static int msgbuf_show_timestamp = 0; SYSCTL_INT(_kern, OID_AUTO, msgbuf_show_timestamp, CTLFLAG_RWTUN, &msgbuf_show_timestamp, 0, "Show timestamp in msgbuf"); /* * Initialize a message buffer of the specified size at the specified * location. This also zeros the buffer area. */ void msgbuf_init(struct msgbuf *mbp, void *ptr, int size) { mbp->msg_ptr = ptr; mbp->msg_size = size; mbp->msg_seqmod = SEQMOD(size); msgbuf_clear(mbp); mbp->msg_magic = MSG_MAGIC; mbp->msg_lastpri = -1; mbp->msg_flags = 0; bzero(&mbp->msg_lock, sizeof(mbp->msg_lock)); mtx_init(&mbp->msg_lock, "msgbuf", NULL, MTX_SPIN); } /* * Reinitialize a message buffer, retaining its previous contents if * the size and checksum are correct. If the old contents cannot be * recovered, the message buffer is cleared. */ void msgbuf_reinit(struct msgbuf *mbp, void *ptr, int size) { u_int cksum; if (mbp->msg_magic != MSG_MAGIC || mbp->msg_size != size) { msgbuf_init(mbp, ptr, size); return; } mbp->msg_seqmod = SEQMOD(size); mbp->msg_wseq = MSGBUF_SEQNORM(mbp, mbp->msg_wseq); mbp->msg_rseq = MSGBUF_SEQNORM(mbp, mbp->msg_rseq); mbp->msg_ptr = ptr; cksum = msgbuf_cksum(mbp); if (cksum != mbp->msg_cksum) { if (bootverbose) { printf("msgbuf cksum mismatch (read %x, calc %x)\n", mbp->msg_cksum, cksum); printf("Old msgbuf not recovered\n"); } msgbuf_clear(mbp); } mbp->msg_lastpri = -1; /* Assume that the old message buffer didn't end in a newline. */ mbp->msg_flags |= MSGBUF_NEEDNL; bzero(&mbp->msg_lock, sizeof(mbp->msg_lock)); mtx_init(&mbp->msg_lock, "msgbuf", NULL, MTX_SPIN); } /* * Clear the message buffer. */ void msgbuf_clear(struct msgbuf *mbp) { bzero(mbp->msg_ptr, mbp->msg_size); mbp->msg_wseq = 0; mbp->msg_rseq = 0; mbp->msg_cksum = 0; } /* * Get a count of the number of unread characters in the message buffer. */ int msgbuf_getcount(struct msgbuf *mbp) { u_int len; len = MSGBUF_SEQSUB(mbp, mbp->msg_wseq, mbp->msg_rseq); if (len > mbp->msg_size) len = mbp->msg_size; return (len); } /* * Add a character into the message buffer, and update the checksum and * sequence number. * * The caller should hold the message buffer spinlock. */ static void msgbuf_do_addchar(struct msgbuf * const mbp, u_int * const seq, const int c) { u_int pos; /* Make sure we properly wrap the sequence number. 
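The sequence macros this file leans on (MSGBUF_SEQNORM, MSGBUF_SEQ_TO_POS, MSGBUF_SEQSUB, from sys/msgbuf.h) reduce to modular arithmetic along these lines: sequence numbers live modulo msg_seqmod, a multiple of the buffer size, so that both a buffer position and the distance between two sequences stay well defined across wraps. A self-contained model with a hypothetical 64-byte buffer (a power of two here, so plain unsigned wraparound agrees with the modulo):

    #include <stdio.h>

    #define SIZE    64u             /* hypothetical msg_size */
    #define SEQMOD  (SIZE * 16u)    /* sequences kept modulo SIZE * 16 */

    static unsigned seqnorm(unsigned seq) { return (seq % SEQMOD); }
    static unsigned seq_to_pos(unsigned seq) { return (seq % SIZE); }
    static unsigned seqsub(unsigned s1, unsigned s2) { return (seqnorm(s1 - s2)); }

    int
    main(void)
    {
            unsigned wseq, rseq;

            wseq = SEQMOD - 3;              /* writer just short of a wrap */
            rseq = seqnorm(wseq - 10);      /* reader ten characters behind */
            wseq = seqnorm(wseq + 6);       /* writer wraps past SEQMOD */

            /* Distance survives the wrap: 10 behind + 6 written = 16. */
            printf("unread = %u\n", seqsub(wseq, rseq));
            printf("write position = %u\n", seq_to_pos(wseq));
            return (0);
    }

Keeping the modulus a multiple of the size is the whole point of SEQMOD: a normalized sequence always maps to the same buffer position regardless of how many times it has wrapped.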
*/ pos = MSGBUF_SEQ_TO_POS(mbp, *seq); mbp->msg_cksum += (u_int)(u_char)c - (u_int)(u_char)mbp->msg_ptr[pos]; mbp->msg_ptr[pos] = c; *seq = MSGBUF_SEQNORM(mbp, *seq + 1); } /* * Append a character to a message buffer. */ void msgbuf_addchar(struct msgbuf *mbp, int c) { mtx_lock_spin(&mbp->msg_lock); msgbuf_do_addchar(mbp, &mbp->msg_wseq, c); mtx_unlock_spin(&mbp->msg_lock); } /* * Append a NUL-terminated string with a priority to a message buffer. * Filter carriage returns if the caller requests it. * * XXX The carriage return filtering behavior is present in the * msglogchar() API, however testing has shown that we don't seem to send * carriage returns down this path. So do we still need it? */ void msgbuf_addstr(struct msgbuf *mbp, int pri, char *str, int filter_cr) { u_int seq; size_t len, prefix_len; char prefix[MAXPRIBUF]; char buf[32]; int nl, i, j, needtime; len = strlen(str); prefix_len = 0; nl = 0; /* If we have a zero-length string, no need to do anything. */ if (len == 0) return; mtx_lock_spin(&mbp->msg_lock); /* * If this is true, we may need to insert a new priority sequence, * so prepare the prefix. */ if (pri != -1) prefix_len = sprintf(prefix, "<%d>", pri); /* * Starting write sequence number. */ seq = mbp->msg_wseq; /* * Whenever there is a change in priority, we have to insert a * newline, and a priority prefix if the priority is not -1. Here * we detect whether there was a priority change, and whether we * did not end with a newline. If that is the case, we need to * insert a newline before this string. */ if (mbp->msg_lastpri != pri && (mbp->msg_flags & MSGBUF_NEEDNL) != 0) { msgbuf_do_addchar(mbp, &seq, '\n'); mbp->msg_flags &= ~MSGBUF_NEEDNL; } needtime = 1; for (i = 0; i < len; i++) { /* * If we just had a newline, and the priority is not -1 * (and therefore prefix_len != 0), then we need a priority * prefix for this line. */ if ((mbp->msg_flags & MSGBUF_NEEDNL) == 0 && prefix_len != 0) { int j; for (j = 0; j < prefix_len; j++) msgbuf_do_addchar(mbp, &seq, prefix[j]); } if (msgbuf_show_timestamp && needtime == 1 && (mbp->msg_flags & MSGBUF_NEEDNL) == 0) { snprintf(buf, sizeof(buf), "[%jd] ", (intmax_t)time_uptime); for (j = 0; buf[j] != '\0'; j++) msgbuf_do_addchar(mbp, &seq, buf[j]); needtime = 0; } /* * Don't copy carriage returns if the caller requested * filtering. * * XXX This matches the behavior of msglogchar(), but is it * necessary? Testing has shown that we don't seem to get * carriage returns here. */ if ((filter_cr != 0) && (str[i] == '\r')) continue; /* * Clear this flag if we see a newline. This affects whether * we need to insert a new prefix or insert a newline later. */ if (str[i] == '\n') mbp->msg_flags &= ~MSGBUF_NEEDNL; else mbp->msg_flags |= MSGBUF_NEEDNL; msgbuf_do_addchar(mbp, &seq, str[i]); } /* * Update the write sequence number for the actual number of * characters we put in the message buffer. (Depends on whether * carriage returns are filtered.) */ mbp->msg_wseq = seq; /* * Set the last priority. */ mbp->msg_lastpri = pri; mtx_unlock_spin(&mbp->msg_lock); } /* * Read and mark as read a character from a message buffer. * Returns the character, or -1 if no characters are available. 
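One rule shows up in both msgbuf_getchar() and msgbuf_getbytes() below: if the computed unread length exceeds msg_size, the writer has lapped the reader and only the most recent msg_size characters still exist, so the read sequence is snapped forward before anything is copied out. The rule in isolation (a sketch; modular normalization omitted for brevity):

    #include <stdio.h>

    #define SIZE    64u     /* hypothetical msg_size */

    /* Clamp the unread length and resynchronize a lapped reader. */
    static unsigned
    catch_up(unsigned wseq, unsigned *rseq)
    {
            unsigned len = wseq - *rseq;

            if (len > SIZE) {
                    *rseq = wseq - SIZE;    /* oldest surviving character */
                    len = SIZE;
            }
            return (len);
    }

    int
    main(void)
    {
            unsigned rseq = 0, wseq = 200;  /* reader lapped long ago */

            printf("readable = %u\n", catch_up(wseq, &rseq));
            printf("rseq resynced to %u\n", rseq);
            return (0);
    }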
*/ int msgbuf_getchar(struct msgbuf *mbp) { u_int len, wseq; int c; mtx_lock_spin(&mbp->msg_lock); wseq = mbp->msg_wseq; len = MSGBUF_SEQSUB(mbp, wseq, mbp->msg_rseq); if (len == 0) { mtx_unlock_spin(&mbp->msg_lock); return (-1); } if (len > mbp->msg_size) mbp->msg_rseq = MSGBUF_SEQNORM(mbp, wseq - mbp->msg_size); c = (u_char)mbp->msg_ptr[MSGBUF_SEQ_TO_POS(mbp, mbp->msg_rseq)]; mbp->msg_rseq = MSGBUF_SEQNORM(mbp, mbp->msg_rseq + 1); mtx_unlock_spin(&mbp->msg_lock); return (c); } /* * Read and mark as read a number of characters from a message buffer. * Returns the number of characters that were placed in `buf'. */ int msgbuf_getbytes(struct msgbuf *mbp, char *buf, int buflen) { u_int len, pos, wseq; mtx_lock_spin(&mbp->msg_lock); wseq = mbp->msg_wseq; len = MSGBUF_SEQSUB(mbp, wseq, mbp->msg_rseq); if (len == 0) { mtx_unlock_spin(&mbp->msg_lock); return (0); } if (len > mbp->msg_size) { mbp->msg_rseq = MSGBUF_SEQNORM(mbp, wseq - mbp->msg_size); len = mbp->msg_size; } pos = MSGBUF_SEQ_TO_POS(mbp, mbp->msg_rseq); len = min(len, mbp->msg_size - pos); len = min(len, (u_int)buflen); bcopy(&mbp->msg_ptr[pos], buf, len); mbp->msg_rseq = MSGBUF_SEQNORM(mbp, mbp->msg_rseq + len); mtx_unlock_spin(&mbp->msg_lock); return (len); } /* * Peek at the full contents of a message buffer without marking any * data as read. `seqp' should point to an unsigned integer that * msgbuf_peekbytes() can use to retain state between calls so that * the whole message buffer can be read in multiple short reads. * To initialise this variable to the start of the message buffer, * call msgbuf_peekbytes() with a NULL `buf' parameter. * * Returns the number of characters that were placed in `buf'. */ int msgbuf_peekbytes(struct msgbuf *mbp, char *buf, int buflen, u_int *seqp) { u_int len, pos, wseq; mtx_lock_spin(&mbp->msg_lock); if (buf == NULL) { /* Just initialise *seqp. */ *seqp = MSGBUF_SEQNORM(mbp, mbp->msg_wseq - mbp->msg_size); mtx_unlock_spin(&mbp->msg_lock); return (0); } wseq = mbp->msg_wseq; len = MSGBUF_SEQSUB(mbp, wseq, *seqp); if (len == 0) { mtx_unlock_spin(&mbp->msg_lock); return (0); } if (len > mbp->msg_size) { *seqp = MSGBUF_SEQNORM(mbp, wseq - mbp->msg_size); len = mbp->msg_size; } pos = MSGBUF_SEQ_TO_POS(mbp, *seqp); len = min(len, mbp->msg_size - pos); len = min(len, (u_int)buflen); bcopy(&mbp->msg_ptr[MSGBUF_SEQ_TO_POS(mbp, *seqp)], buf, len); *seqp = MSGBUF_SEQNORM(mbp, *seqp + len); mtx_unlock_spin(&mbp->msg_lock); return (len); } /* * Compute the checksum for the complete message buffer contents. */ static u_int msgbuf_cksum(struct msgbuf *mbp) { u_int i, sum; sum = 0; for (i = 0; i < mbp->msg_size; i++) sum += (u_char)mbp->msg_ptr[i]; return (sum); } /* * Copy from one message buffer to another. */ void msgbuf_copy(struct msgbuf *src, struct msgbuf *dst) { int c; while ((c = msgbuf_getchar(src)) >= 0) msgbuf_addchar(dst, c); } Index: head/sys/kern/subr_pctrie.c =================================================================== --- head/sys/kern/subr_pctrie.c (revision 326270) +++ head/sys/kern/subr_pctrie.c (revision 326271) @@ -1,693 +1,695 @@ -/* +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 EMC Corp. * Copyright (c) 2011 Jeffrey Roberson * Copyright (c) 2008 Mayur Shardul * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * Path-compressed radix trie implementation. * * The implementation takes into account the following rationale: * - Size of the nodes should be as small as possible but still big enough * to avoid a large maximum depth for the trie. This is a balance * between the necessity to not wire too much physical memory for the nodes * and the necessity to avoid too much cache pollution during the trie * operations. * - There is not a huge bias toward the number of lookup operations over * the number of insert and remove operations. This basically implies * that optimizations supposedly helping one operation but hurting the * other might be carefully evaluated. * - On average not many nodes are expected to be fully populated, hence * level compression may just complicate things. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #ifdef DDB #include #endif #define PCTRIE_MASK (PCTRIE_COUNT - 1) #define PCTRIE_LIMIT (howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1) /* Flag bits stored in node pointers. */ #define PCTRIE_ISLEAF 0x1 #define PCTRIE_FLAGS 0x1 #define PCTRIE_PAD PCTRIE_FLAGS /* Returns one unit associated with specified level. */ #define PCTRIE_UNITLEVEL(lev) \ ((uint64_t)1 << ((lev) * PCTRIE_WIDTH)) struct pctrie_node { uint64_t pn_owner; /* Owner of record. */ uint16_t pn_count; /* Valid children. */ uint16_t pn_clev; /* Current level. */ void *pn_child[PCTRIE_COUNT]; /* Child nodes. */ }; /* * Allocate a node. Pre-allocation should ensure that the request * will always be satisfied. */ static __inline struct pctrie_node * pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner, uint16_t count, uint16_t clevel) { struct pctrie_node *node; node = allocfn(ptree); if (node == NULL) return (NULL); node->pn_owner = owner; node->pn_count = count; node->pn_clev = clevel; return (node); } /* * Free radix node. */ static __inline void pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node, pctrie_free_t freefn) { #ifdef INVARIANTS int slot; KASSERT(node->pn_count == 0, ("pctrie_node_put: node %p has %d children", node, node->pn_count)); for (slot = 0; slot < PCTRIE_COUNT; slot++) KASSERT(node->pn_child[slot] == NULL, ("pctrie_node_put: node %p has a child", node)); #endif freefn(ptree, node); } /* * Return the position in the array for a given level. 
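The two helpers that follow are pure bit arithmetic: a key is treated as a string of PCTRIE_WIDTH-bit digits, pctrie_slot() extracts the digit for a given level, and pctrie_trimkey() clears every digit below a level. A standalone sketch, assuming 4-bit digits (so the PCTRIE_COUNT analogue would be 16):

    #include <stdint.h>
    #include <stdio.h>

    #define WIDTH   4               /* hypothetical bits per digit/level */
    #define COUNT   (1 << WIDTH)    /* children per node: 16 */
    #define MASK    (COUNT - 1)

    /* Child slot of `index' at `level', as in pctrie_slot(). */
    static int
    slot(uint64_t index, uint16_t level)
    {
            return ((index >> (level * WIDTH)) & MASK);
    }

    /* Clear the low `level' digits of the key, as in pctrie_trimkey(). */
    static uint64_t
    trimkey(uint64_t index, uint16_t level)
    {
            if (level > 0) {
                    index >>= level * WIDTH;
                    index <<= level * WIDTH;
            }
            return (index);
    }

    int
    main(void)
    {
            uint64_t key = 0x12345;

            /* Digits of the key, least-significant level first: 5,4,3,2,1. */
            for (uint16_t lev = 0; lev < 5; lev++)
                    printf("level %u digit: %d\n", (unsigned)lev, slot(key, lev));
            printf("trimkey(key, 2) = %#jx\n", (uintmax_t)trimkey(key, 2));
            return (0);
    }

A trimmed key is exactly what a node stores in pn_owner: the common prefix shared by every key in its subtree.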
*/ static __inline int pctrie_slot(uint64_t index, uint16_t level) { return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK); } /* Trims the key after the specified level. */ static __inline uint64_t pctrie_trimkey(uint64_t index, uint16_t level) { uint64_t ret; ret = index; if (level > 0) { ret >>= level * PCTRIE_WIDTH; ret <<= level * PCTRIE_WIDTH; } return (ret); } /* * Get the root node for a tree. */ static __inline struct pctrie_node * pctrie_getroot(struct pctrie *ptree) { return ((struct pctrie_node *)ptree->pt_root); } /* * Set the root node for a tree. */ static __inline void pctrie_setroot(struct pctrie *ptree, struct pctrie_node *node) { ptree->pt_root = (uintptr_t)node; } /* * Returns TRUE if the specified node is a leaf and FALSE otherwise. */ static __inline boolean_t pctrie_isleaf(struct pctrie_node *node) { return (((uintptr_t)node & PCTRIE_ISLEAF) != 0); } /* * Returns the associated val extracted from node. */ static __inline uint64_t * pctrie_toval(struct pctrie_node *node) { return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS)); } /* * Adds the val as a child of the provided node. */ static __inline void pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev, uint64_t *val) { int slot; slot = pctrie_slot(index, clev); node->pn_child[slot] = (void *)((uintptr_t)val | PCTRIE_ISLEAF); } /* * Returns the slot where two keys differ. * It cannot accept 2 equal keys. */ static __inline uint16_t pctrie_keydiff(uint64_t index1, uint64_t index2) { uint16_t clev; KASSERT(index1 != index2, ("%s: passing the same key value %jx", __func__, (uintmax_t)index1)); index1 ^= index2; for (clev = PCTRIE_LIMIT;; clev--) if (pctrie_slot(index1, clev) != 0) return (clev); } /* * Returns TRUE if it can be determined that key does not belong to the * specified node. Otherwise, returns FALSE. */ static __inline boolean_t pctrie_keybarr(struct pctrie_node *node, uint64_t idx) { if (node->pn_clev < PCTRIE_LIMIT) { idx = pctrie_trimkey(idx, node->pn_clev + 1); return (idx != node->pn_owner); } return (FALSE); } /* * Internal helper for pctrie_reclaim_allnodes(). * This function is recursive. */ static void pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node, pctrie_free_t freefn) { int slot; KASSERT(node->pn_count <= PCTRIE_COUNT, ("pctrie_reclaim_allnodes_int: bad count in node %p", node)); for (slot = 0; node->pn_count != 0; slot++) { if (node->pn_child[slot] == NULL) continue; if (!pctrie_isleaf(node->pn_child[slot])) pctrie_reclaim_allnodes_int(ptree, node->pn_child[slot], freefn); node->pn_child[slot] = NULL; node->pn_count--; } pctrie_node_put(ptree, node, freefn); } /* * pctrie node zone initializer. */ int pctrie_zone_init(void *mem, int size __unused, int flags __unused) { struct pctrie_node *node; node = mem; memset(node->pn_child, 0, sizeof(node->pn_child)); return (0); } size_t pctrie_node_size(void) { return (sizeof(struct pctrie_node)); } /* * Inserts the key-value pair into the trie. * Panics if the key already exists. */ int pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn) { uint64_t index, newind; void **parentp; struct pctrie_node *node, *tmp; uint64_t *m; int slot; uint16_t clev; index = *val; /* * The owner of record for root is not really important because it * will never be used. 
*/ node = pctrie_getroot(ptree); if (node == NULL) { ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF; return (0); } parentp = (void **)&ptree->pt_root; for (;;) { if (pctrie_isleaf(node)) { m = pctrie_toval(node); if (*m == index) panic("%s: key %jx is already present", __func__, (uintmax_t)index); clev = pctrie_keydiff(*m, index); tmp = pctrie_node_get(ptree, allocfn, pctrie_trimkey(index, clev + 1), 2, clev); if (tmp == NULL) return (ENOMEM); *parentp = tmp; pctrie_addval(tmp, index, clev, val); pctrie_addval(tmp, *m, clev, m); return (0); } else if (pctrie_keybarr(node, index)) break; slot = pctrie_slot(index, node->pn_clev); if (node->pn_child[slot] == NULL) { node->pn_count++; pctrie_addval(node, index, node->pn_clev, val); return (0); } parentp = &node->pn_child[slot]; node = node->pn_child[slot]; } /* * A new node is needed because the right insertion level is reached. * Setup the new intermediate node and add the 2 children: the * new object and the older edge. */ newind = node->pn_owner; clev = pctrie_keydiff(newind, index); tmp = pctrie_node_get(ptree, allocfn, pctrie_trimkey(index, clev + 1), 2, clev); if (tmp == NULL) return (ENOMEM); *parentp = tmp; pctrie_addval(tmp, index, clev, val); slot = pctrie_slot(newind, clev); tmp->pn_child[slot] = node; return (0); } /* * Returns the value stored at the index. If the index is not present, * NULL is returned. */ uint64_t * pctrie_lookup(struct pctrie *ptree, uint64_t index) { struct pctrie_node *node; uint64_t *m; int slot; node = pctrie_getroot(ptree); while (node != NULL) { if (pctrie_isleaf(node)) { m = pctrie_toval(node); if (*m == index) return (m); else break; } else if (pctrie_keybarr(node, index)) break; slot = pctrie_slot(index, node->pn_clev); node = node->pn_child[slot]; } return (NULL); } /* * Look up the nearest entry at a position bigger than or equal to index. */ uint64_t * pctrie_lookup_ge(struct pctrie *ptree, uint64_t index) { struct pctrie_node *stack[PCTRIE_LIMIT]; uint64_t inc; uint64_t *m; struct pctrie_node *child, *node; #ifdef INVARIANTS int loops = 0; #endif int slot, tos; node = pctrie_getroot(ptree); if (node == NULL) return (NULL); else if (pctrie_isleaf(node)) { m = pctrie_toval(node); if (*m >= index) return (m); else return (NULL); } tos = 0; for (;;) { /* * If the keys differ before the current bisection node, * then the search key might rollback to the earliest * available bisection node or to the smallest key * in the current node (if the owner is bigger than the * search key). */ if (pctrie_keybarr(node, index)) { if (index > node->pn_owner) { ascend: KASSERT(++loops < 1000, ("pctrie_lookup_ge: too many loops")); /* * Pop nodes from the stack until either the * stack is empty or a node that could have a * matching descendant is found. */ do { if (tos == 0) return (NULL); node = stack[--tos]; } while (pctrie_slot(index, node->pn_clev) == (PCTRIE_COUNT - 1)); /* * The following computation cannot overflow * because index's slot at the current level * is less than PCTRIE_COUNT - 1. */ index = pctrie_trimkey(index, node->pn_clev); index += PCTRIE_UNITLEVEL(node->pn_clev); } else index = node->pn_owner; KASSERT(!pctrie_keybarr(node, index), ("pctrie_lookup_ge: keybarr failed")); } slot = pctrie_slot(index, node->pn_clev); child = node->pn_child[slot]; if (pctrie_isleaf(child)) { m = pctrie_toval(child); if (*m >= index) return (m); } else if (child != NULL) goto descend; /* * Look for an available edge or val within the current * bisection node. 
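pctrie_insert() above splits a leaf by asking pctrie_keydiff() for the highest level at which two keys disagree; the new intermediate node is created at exactly that level, which is what makes the trie path-compressed. The search is an XOR followed by a scan from the top digit down (same hypothetical 4-bit digits as before):

    #include <stdint.h>
    #include <stdio.h>

    #define WIDTH   4                       /* hypothetical bits per level */
    #define MASK    ((1 << WIDTH) - 1)
    #define LIMIT   (64 / WIDTH - 1)        /* top level of a 64-bit key */

    static int
    slot(uint64_t index, uint16_t level)
    {
            return ((index >> (level * WIDTH)) & MASK);
    }

    /* Highest level whose digit differs; undefined for equal keys. */
    static uint16_t
    keydiff(uint64_t i1, uint64_t i2)
    {
            uint16_t clev;

            i1 ^= i2;       /* differing digits become nonzero */
            for (clev = LIMIT;; clev--)
                    if (slot(i1, clev) != 0)
                            return (clev);
    }

    int
    main(void)
    {
            /* 0x12345 and 0x12f45 first disagree in their level-2 digit. */
            printf("first differing level = %u\n",
                (unsigned)keydiff(0x12345, 0x12f45));
            return (0);
    }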
*/ if (slot < (PCTRIE_COUNT - 1)) { inc = PCTRIE_UNITLEVEL(node->pn_clev); index = pctrie_trimkey(index, node->pn_clev); do { index += inc; slot++; child = node->pn_child[slot]; if (pctrie_isleaf(child)) { m = pctrie_toval(child); if (*m >= index) return (m); } else if (child != NULL) goto descend; } while (slot < (PCTRIE_COUNT - 1)); } KASSERT(child == NULL || pctrie_isleaf(child), ("pctrie_lookup_ge: child is radix node")); /* * If a value or edge bigger than the search slot is not found * in the current node, ascend to the next higher-level node. */ goto ascend; descend: KASSERT(node->pn_clev > 0, ("pctrie_lookup_ge: pushing leaf's parent")); KASSERT(tos < PCTRIE_LIMIT, ("pctrie_lookup_ge: stack overflow")); stack[tos++] = node; node = child; } } /* * Look up the nearest entry at a position less than or equal to index. */ uint64_t * pctrie_lookup_le(struct pctrie *ptree, uint64_t index) { struct pctrie_node *stack[PCTRIE_LIMIT]; uint64_t inc; uint64_t *m; struct pctrie_node *child, *node; #ifdef INVARIANTS int loops = 0; #endif int slot, tos; node = pctrie_getroot(ptree); if (node == NULL) return (NULL); else if (pctrie_isleaf(node)) { m = pctrie_toval(node); if (*m <= index) return (m); else return (NULL); } tos = 0; for (;;) { /* * If the keys differ before the current bisection node, * then the search key might rollback to the earliest * available bisection node or to the largest key * in the current node (if the owner is smaller than the * search key). */ if (pctrie_keybarr(node, index)) { if (index > node->pn_owner) { index = node->pn_owner + PCTRIE_COUNT * PCTRIE_UNITLEVEL(node->pn_clev); } else { ascend: KASSERT(++loops < 1000, ("pctrie_lookup_le: too many loops")); /* * Pop nodes from the stack until either the * stack is empty or a node that could have a * matching descendant is found. */ do { if (tos == 0) return (NULL); node = stack[--tos]; } while (pctrie_slot(index, node->pn_clev) == 0); /* * The following computation cannot overflow * because index's slot at the current level * is greater than 0. */ index = pctrie_trimkey(index, node->pn_clev); } index--; KASSERT(!pctrie_keybarr(node, index), ("pctrie_lookup_le: keybarr failed")); } slot = pctrie_slot(index, node->pn_clev); child = node->pn_child[slot]; if (pctrie_isleaf(child)) { m = pctrie_toval(child); if (*m <= index) return (m); } else if (child != NULL) goto descend; /* * Look for an available edge or value within the current * bisection node. */ if (slot > 0) { inc = PCTRIE_UNITLEVEL(node->pn_clev); index |= inc - 1; do { index -= inc; slot--; child = node->pn_child[slot]; if (pctrie_isleaf(child)) { m = pctrie_toval(child); if (*m <= index) return (m); } else if (child != NULL) goto descend; } while (slot > 0); } KASSERT(child == NULL || pctrie_isleaf(child), ("pctrie_lookup_le: child is radix node")); /* * If a value or edge smaller than the search slot is not found * in the current node, ascend to the next higher-level node. */ goto ascend; descend: KASSERT(node->pn_clev > 0, ("pctrie_lookup_le: pushing leaf's parent")); KASSERT(tos < PCTRIE_LIMIT, ("pctrie_lookup_le: stack overflow")); stack[tos++] = node; node = child; } } /* * Remove the specified index from the tree. * Panics if the key is not present. 
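pctrie_lookup_ge() and pctrie_lookup_le() steer their searches with the same digit arithmetic: when a subtree is exhausted, the _ge variant resumes at the smallest key of the next subtree at that level (the trimmed key plus one level unit), and _le mirrors the step downward by trimming and subtracting one. The _ge step in isolation, under the same hypothetical 4-bit digits:

    #include <stdint.h>
    #include <stdio.h>

    #define WIDTH           4       /* hypothetical bits per level */
    #define UNITLEVEL(lev)  ((uint64_t)1 << ((lev) * WIDTH))

    static uint64_t
    trimkey(uint64_t index, uint16_t level)
    {
            if (level > 0) {
                    index >>= level * WIDTH;
                    index <<= level * WIDTH;
            }
            return (index);
    }

    /* Smallest key of the subtree following `index' at `level'. */
    static uint64_t
    next_subtree(uint64_t index, uint16_t level)
    {
            return (trimkey(index, level) + UNITLEVEL(level));
    }

    int
    main(void)
    {
            /*
             * Searching for >= 0x12f45: once the level-2 subtree rooted
             * at 0x12f00 is exhausted, the scan resumes at 0x13000.
             */
            printf("%#jx\n", (uintmax_t)next_subtree(0x12f45, 2));
            return (0);
    }

This is also why the code can assert the addition never overflows: it is only taken when the current slot is below the last one at that level.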
*/ void pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn) { struct pctrie_node *node, *parent; uint64_t *m; int i, slot; node = pctrie_getroot(ptree); if (pctrie_isleaf(node)) { m = pctrie_toval(node); if (*m != index) panic("%s: invalid key found", __func__); pctrie_setroot(ptree, NULL); return; } parent = NULL; for (;;) { if (node == NULL) panic("pctrie_remove: impossible to locate the key"); slot = pctrie_slot(index, node->pn_clev); if (pctrie_isleaf(node->pn_child[slot])) { m = pctrie_toval(node->pn_child[slot]); if (*m != index) panic("%s: invalid key found", __func__); node->pn_child[slot] = NULL; node->pn_count--; if (node->pn_count > 1) break; for (i = 0; i < PCTRIE_COUNT; i++) if (node->pn_child[i] != NULL) break; KASSERT(i != PCTRIE_COUNT, ("%s: invalid node configuration", __func__)); if (parent == NULL) pctrie_setroot(ptree, node->pn_child[i]); else { slot = pctrie_slot(index, parent->pn_clev); KASSERT(parent->pn_child[slot] == node, ("%s: invalid child value", __func__)); parent->pn_child[slot] = node->pn_child[i]; } node->pn_count--; node->pn_child[i] = NULL; pctrie_node_put(ptree, node, freefn); break; } parent = node; node = node->pn_child[slot]; } } /* * Remove and free all the nodes from the tree. * This function is recursive but there is a tight control on it as the * maximum depth of the tree is fixed. */ void pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn) { struct pctrie_node *root; root = pctrie_getroot(ptree); if (root == NULL) return; pctrie_setroot(ptree, NULL); if (!pctrie_isleaf(root)) pctrie_reclaim_allnodes_int(ptree, root, freefn); } #ifdef DDB /* * Show details about the given node. */ DB_SHOW_COMMAND(pctrienode, db_show_pctrienode) { struct pctrie_node *node; int i; if (!have_addr) return; node = (struct pctrie_node *)addr; db_printf("node %p, owner %jx, children count %u, level %u:\n", (void *)node, (uintmax_t)node->pn_owner, node->pn_count, node->pn_clev); for (i = 0; i < PCTRIE_COUNT; i++) if (node->pn_child[i] != NULL) db_printf("slot: %d, val: %p, value: %p, clev: %d\n", i, (void *)node->pn_child[i], pctrie_isleaf(node->pn_child[i]) ? pctrie_toval(node->pn_child[i]) : NULL, node->pn_clev); } #endif /* DDB */ Index: head/sys/kern/subr_power.c =================================================================== --- head/sys/kern/subr_power.c (revision 326270) +++ head/sys/kern/subr_power.c (revision 326271) @@ -1,122 +1,124 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2001 Mitsuru IWASAKI * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include static u_int power_pm_type = POWER_PM_TYPE_NONE; static power_pm_fn_t power_pm_fn = NULL; static void *power_pm_arg = NULL; static struct task power_pm_task; static void power_pm_deferred_fn(void *arg, int pending) { int state = (intptr_t)arg; power_pm_fn(POWER_CMD_SUSPEND, power_pm_arg, state); } int power_pm_register(u_int pm_type, power_pm_fn_t pm_fn, void *pm_arg) { int error; if (power_pm_type == POWER_PM_TYPE_NONE || power_pm_type == pm_type) { power_pm_type = pm_type; power_pm_fn = pm_fn; power_pm_arg = pm_arg; error = 0; TASK_INIT(&power_pm_task, 0, power_pm_deferred_fn, NULL); } else { error = ENXIO; } return (error); } u_int power_pm_get_type(void) { return (power_pm_type); } void power_pm_suspend(int state) { if (power_pm_fn == NULL) return; if (state != POWER_SLEEP_STATE_STANDBY && state != POWER_SLEEP_STATE_SUSPEND && state != POWER_SLEEP_STATE_HIBERNATE) return; power_pm_task.ta_context = (void *)(intptr_t)state; taskqueue_enqueue(taskqueue_thread, &power_pm_task); } /* * Power profile. */ static int power_profile_state = POWER_PROFILE_PERFORMANCE; int power_profile_get_state(void) { return (power_profile_state); } void power_profile_set_state(int state) { int changed; if (state != power_profile_state) { power_profile_state = state; changed = 1; if (bootverbose) { printf("system power profile changed to '%s'\n", (state == POWER_PROFILE_PERFORMANCE) ? "performance" : "economy"); } } else { changed = 0; } if (changed) EVENTHANDLER_INVOKE(power_profile_change, 0); } Index: head/sys/kern/subr_sbuf.c =================================================================== --- head/sys/kern/subr_sbuf.c (revision 326270) +++ head/sys/kern/subr_sbuf.c (revision 326271) @@ -1,882 +1,884 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2000-2008 Poul-Henning Kamp * Copyright (c) 2000-2008 Dag-Erling Coïdan Smørgrav * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifdef _KERNEL #include #include #include #include #include #include #include #include #else /* _KERNEL */ #include #include #include #include #include #include #include #endif /* _KERNEL */ #include #ifdef _KERNEL static MALLOC_DEFINE(M_SBUF, "sbuf", "string buffers"); #define SBMALLOC(size) malloc(size, M_SBUF, M_WAITOK|M_ZERO) #define SBFREE(buf) free(buf, M_SBUF) #else /* _KERNEL */ #define KASSERT(e, m) #define SBMALLOC(size) calloc(1, size) #define SBFREE(buf) free(buf) #endif /* _KERNEL */ /* * Predicates */ #define SBUF_ISDYNAMIC(s) ((s)->s_flags & SBUF_DYNAMIC) #define SBUF_ISDYNSTRUCT(s) ((s)->s_flags & SBUF_DYNSTRUCT) #define SBUF_ISFINISHED(s) ((s)->s_flags & SBUF_FINISHED) #define SBUF_HASROOM(s) ((s)->s_len < (s)->s_size - 1) #define SBUF_FREESPACE(s) ((s)->s_size - ((s)->s_len + 1)) #define SBUF_CANEXTEND(s) ((s)->s_flags & SBUF_AUTOEXTEND) #define SBUF_ISSECTION(s) ((s)->s_flags & SBUF_INSECTION) #define SBUF_NULINCLUDED(s) ((s)->s_flags & SBUF_INCLUDENUL) #define SBUF_ISDRAINTOEOR(s) ((s)->s_flags & SBUF_DRAINTOEOR) #define SBUF_DODRAINTOEOR(s) (SBUF_ISSECTION(s) && SBUF_ISDRAINTOEOR(s)) /* * Set / clear flags */ #define SBUF_SETFLAG(s, f) do { (s)->s_flags |= (f); } while (0) #define SBUF_CLEARFLAG(s, f) do { (s)->s_flags &= ~(f); } while (0) #define SBUF_MINSIZE 2 /* Min is 1 byte + nulterm. */ #define SBUF_MINEXTENDSIZE 16 /* Should be power of 2. */ #ifdef PAGE_SIZE #define SBUF_MAXEXTENDSIZE PAGE_SIZE #define SBUF_MAXEXTENDINCR PAGE_SIZE #else #define SBUF_MAXEXTENDSIZE 4096 #define SBUF_MAXEXTENDINCR 4096 #endif /* * Debugging support */ #if defined(_KERNEL) && defined(INVARIANTS) static void _assert_sbuf_integrity(const char *fun, struct sbuf *s) { KASSERT(s != NULL, ("%s called with a NULL sbuf pointer", fun)); KASSERT(s->s_buf != NULL, ("%s called with uninitialized or corrupt sbuf", fun)); if (SBUF_ISFINISHED(s) && SBUF_NULINCLUDED(s)) { KASSERT(s->s_len <= s->s_size, ("wrote past end of sbuf (%jd >= %jd)", (intmax_t)s->s_len, (intmax_t)s->s_size)); } else { KASSERT(s->s_len < s->s_size, ("wrote past end of sbuf (%jd >= %jd)", (intmax_t)s->s_len, (intmax_t)s->s_size)); } } static void _assert_sbuf_state(const char *fun, struct sbuf *s, int state) { KASSERT((s->s_flags & SBUF_FINISHED) == state, ("%s called with %sfinished or corrupt sbuf", fun, (state ? 
"un" : ""))); } #define assert_sbuf_integrity(s) _assert_sbuf_integrity(__func__, (s)) #define assert_sbuf_state(s, i) _assert_sbuf_state(__func__, (s), (i)) #else /* _KERNEL && INVARIANTS */ #define assert_sbuf_integrity(s) do { } while (0) #define assert_sbuf_state(s, i) do { } while (0) #endif /* _KERNEL && INVARIANTS */ #ifdef CTASSERT CTASSERT(powerof2(SBUF_MAXEXTENDSIZE)); CTASSERT(powerof2(SBUF_MAXEXTENDINCR)); #endif static int sbuf_extendsize(int size) { int newsize; if (size < (int)SBUF_MAXEXTENDSIZE) { newsize = SBUF_MINEXTENDSIZE; while (newsize < size) newsize *= 2; } else { newsize = roundup2(size, SBUF_MAXEXTENDINCR); } KASSERT(newsize >= size, ("%s: %d < %d\n", __func__, newsize, size)); return (newsize); } /* * Extend an sbuf. */ static int sbuf_extend(struct sbuf *s, int addlen) { char *newbuf; int newsize; if (!SBUF_CANEXTEND(s)) return (-1); newsize = sbuf_extendsize(s->s_size + addlen); newbuf = SBMALLOC(newsize); if (newbuf == NULL) return (-1); memcpy(newbuf, s->s_buf, s->s_size); if (SBUF_ISDYNAMIC(s)) SBFREE(s->s_buf); else SBUF_SETFLAG(s, SBUF_DYNAMIC); s->s_buf = newbuf; s->s_size = newsize; return (0); } /* * Initialize the internals of an sbuf. * If buf is non-NULL, it points to a static or already-allocated string * big enough to hold at least length characters. */ static struct sbuf * sbuf_newbuf(struct sbuf *s, char *buf, int length, int flags) { memset(s, 0, sizeof(*s)); s->s_flags = flags; s->s_size = length; s->s_buf = buf; if ((s->s_flags & SBUF_AUTOEXTEND) == 0) { KASSERT(s->s_size >= SBUF_MINSIZE, ("attempt to create an sbuf smaller than %d bytes", SBUF_MINSIZE)); } if (s->s_buf != NULL) return (s); if ((flags & SBUF_AUTOEXTEND) != 0) s->s_size = sbuf_extendsize(s->s_size); s->s_buf = SBMALLOC(s->s_size); if (s->s_buf == NULL) return (NULL); SBUF_SETFLAG(s, SBUF_DYNAMIC); return (s); } /* * Initialize an sbuf. * If buf is non-NULL, it points to a static or already-allocated string * big enough to hold at least length characters. */ struct sbuf * sbuf_new(struct sbuf *s, char *buf, int length, int flags) { KASSERT(length >= 0, ("attempt to create an sbuf of negative length (%d)", length)); KASSERT((flags & ~SBUF_USRFLAGMSK) == 0, ("%s called with invalid flags", __func__)); flags &= SBUF_USRFLAGMSK; if (s != NULL) return (sbuf_newbuf(s, buf, length, flags)); s = SBMALLOC(sizeof(*s)); if (s == NULL) return (NULL); if (sbuf_newbuf(s, buf, length, flags) == NULL) { SBFREE(s); return (NULL); } SBUF_SETFLAG(s, SBUF_DYNSTRUCT); return (s); } #ifdef _KERNEL /* * Create an sbuf with uio data */ struct sbuf * sbuf_uionew(struct sbuf *s, struct uio *uio, int *error) { KASSERT(uio != NULL, ("%s called with NULL uio pointer", __func__)); KASSERT(error != NULL, ("%s called with NULL error pointer", __func__)); s = sbuf_new(s, NULL, uio->uio_resid + 1, 0); if (s == NULL) { *error = ENOMEM; return (NULL); } *error = uiomove(s->s_buf, uio->uio_resid, uio); if (*error != 0) { sbuf_delete(s); return (NULL); } s->s_len = s->s_size - 1; if (SBUF_ISSECTION(s)) s->s_sect_len = s->s_size - 1; *error = 0; return (s); } #endif int sbuf_get_flags(struct sbuf *s) { return (s->s_flags & SBUF_USRFLAGMSK); } void sbuf_clear_flags(struct sbuf *s, int flags) { s->s_flags &= ~(flags & SBUF_USRFLAGMSK); } void sbuf_set_flags(struct sbuf *s, int flags) { s->s_flags |= (flags & SBUF_USRFLAGMSK); } /* * Clear an sbuf and reset its position. 
*/ void sbuf_clear(struct sbuf *s) { assert_sbuf_integrity(s); /* don't care if it's finished or not */ SBUF_CLEARFLAG(s, SBUF_FINISHED); s->s_error = 0; s->s_len = 0; s->s_rec_off = 0; s->s_sect_len = 0; } /* * Set the sbuf's end position to an arbitrary value. * Effectively truncates the sbuf at the new position. */ int sbuf_setpos(struct sbuf *s, ssize_t pos) { assert_sbuf_integrity(s); assert_sbuf_state(s, 0); KASSERT(pos >= 0, ("attempt to seek to a negative position (%jd)", (intmax_t)pos)); KASSERT(pos < s->s_size, ("attempt to seek past end of sbuf (%jd >= %jd)", (intmax_t)pos, (intmax_t)s->s_size)); KASSERT(!SBUF_ISSECTION(s), ("attempt to seek when in a section")); if (pos < 0 || pos > s->s_len) return (-1); s->s_len = pos; return (0); } /* * Set up a drain function and argument on an sbuf to flush data to * when the sbuf buffer overflows. */ void sbuf_set_drain(struct sbuf *s, sbuf_drain_func *func, void *ctx) { assert_sbuf_state(s, 0); assert_sbuf_integrity(s); KASSERT(func == s->s_drain_func || s->s_len == 0, ("Cannot change drain to %p on non-empty sbuf %p", func, s)); s->s_drain_func = func; s->s_drain_arg = ctx; } /* * Call the drain and process the return. */ static int sbuf_drain(struct sbuf *s) { int len; KASSERT(s->s_len > 0, ("Shouldn't drain empty sbuf %p", s)); KASSERT(s->s_error == 0, ("Called %s with error on %p", __func__, s)); if (SBUF_DODRAINTOEOR(s) && s->s_rec_off == 0) return (s->s_error = EDEADLK); len = s->s_drain_func(s->s_drain_arg, s->s_buf, SBUF_DODRAINTOEOR(s) ? s->s_rec_off : s->s_len); if (len <= 0) { s->s_error = len ? -len : EDEADLK; return (s->s_error); } KASSERT(len > 0 && len <= s->s_len, ("Bad drain amount %d for sbuf %p", len, s)); s->s_len -= len; s->s_rec_off -= len; /* * Fast path for the expected case where all the data was * drained. */ if (s->s_len == 0) return (0); /* * Move the remaining characters to the beginning of the * string. */ memmove(s->s_buf, s->s_buf + len, s->s_len); return (0); } /* * Append bytes to an sbuf. This is the core function for appending * to an sbuf and is the main place that deals with extending the * buffer and marking overflow. */ static void sbuf_put_bytes(struct sbuf *s, const char *buf, size_t len) { size_t n; assert_sbuf_integrity(s); assert_sbuf_state(s, 0); if (s->s_error != 0) return; while (len > 0) { if (SBUF_FREESPACE(s) <= 0) { /* * If there is a drain, use it, otherwise extend the * buffer. */ if (s->s_drain_func != NULL) (void)sbuf_drain(s); else if (sbuf_extend(s, len > INT_MAX ? INT_MAX : len) < 0) s->s_error = ENOMEM; if (s->s_error != 0) return; } n = SBUF_FREESPACE(s); if (len < n) n = len; memcpy(&s->s_buf[s->s_len], buf, n); s->s_len += n; if (SBUF_ISSECTION(s)) s->s_sect_len += n; len -= n; buf += n; } } static void sbuf_put_byte(struct sbuf *s, char c) { sbuf_put_bytes(s, &c, 1); } /* * Append a byte string to an sbuf. */ int sbuf_bcat(struct sbuf *s, const void *buf, size_t len) { sbuf_put_bytes(s, buf, len); if (s->s_error != 0) return (-1); return (0); } #ifdef _KERNEL /* * Copy a byte string from userland into an sbuf. 
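sbuf_extendsize(), defined earlier in this file, implements a two-phase growth policy: double from SBUF_MINEXTENDSIZE while the request is below SBUF_MAXEXTENDSIZE, then round up to whole SBUF_MAXEXTENDINCR units. That bounds the number of reallocations for small buffers and the waste for large ones. The policy in isolation, using the 16/4096 fallback constants from above:

    #include <stdio.h>

    #define MINEXTENDSIZE   16
    #define MAXEXTENDSIZE   4096
    #define MAXEXTENDINCR   4096
    /* Round up to a power-of-two multiple, as roundup2() does. */
    #define ROUNDUP2(x, y)  (((x) + ((y) - 1)) & ~((y) - 1))

    static int
    extendsize(int size)
    {
            int newsize;

            if (size < MAXEXTENDSIZE) {
                    newsize = MINEXTENDSIZE;
                    while (newsize < size)
                            newsize *= 2;   /* geometric growth when small */
            } else {
                    /* Linear growth in page-sized steps when large. */
                    newsize = ROUNDUP2(size, MAXEXTENDINCR);
            }
            return (newsize);
    }

    int
    main(void)
    {
            int sizes[] = { 1, 100, 4096, 5000 };

            for (int i = 0; i < 4; i++)
                    printf("%d -> %d\n", sizes[i], extendsize(sizes[i]));
            return (0);
    }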
*/ int sbuf_bcopyin(struct sbuf *s, const void *uaddr, size_t len) { assert_sbuf_integrity(s); assert_sbuf_state(s, 0); KASSERT(s->s_drain_func == NULL, ("Nonsensical copyin to sbuf %p with a drain", s)); if (s->s_error != 0) return (-1); if (len == 0) return (0); if (len > SBUF_FREESPACE(s)) { sbuf_extend(s, len - SBUF_FREESPACE(s)); if (SBUF_FREESPACE(s) < len) len = SBUF_FREESPACE(s); } if (copyin(uaddr, s->s_buf + s->s_len, len) != 0) return (-1); s->s_len += len; return (0); } #endif /* * Copy a byte string into an sbuf. */ int sbuf_bcpy(struct sbuf *s, const void *buf, size_t len) { assert_sbuf_integrity(s); assert_sbuf_state(s, 0); sbuf_clear(s); return (sbuf_bcat(s, buf, len)); } /* * Append a string to an sbuf. */ int sbuf_cat(struct sbuf *s, const char *str) { size_t n; n = strlen(str); sbuf_put_bytes(s, str, n); if (s->s_error != 0) return (-1); return (0); } #ifdef _KERNEL /* * Append a string from userland to an sbuf. */ int sbuf_copyin(struct sbuf *s, const void *uaddr, size_t len) { size_t done; assert_sbuf_integrity(s); assert_sbuf_state(s, 0); KASSERT(s->s_drain_func == NULL, ("Nonsensical copyin to sbuf %p with a drain", s)); if (s->s_error != 0) return (-1); if (len == 0) len = SBUF_FREESPACE(s); /* XXX return 0? */ if (len > SBUF_FREESPACE(s)) { sbuf_extend(s, len); if (SBUF_FREESPACE(s) < len) len = SBUF_FREESPACE(s); } switch (copyinstr(uaddr, s->s_buf + s->s_len, len + 1, &done)) { case ENAMETOOLONG: s->s_error = ENOMEM; /* fall through */ case 0: s->s_len += done - 1; if (SBUF_ISSECTION(s)) s->s_sect_len += done - 1; break; default: return (-1); /* XXX */ } return (done); } #endif /* * Copy a string into an sbuf. */ int sbuf_cpy(struct sbuf *s, const char *str) { assert_sbuf_integrity(s); assert_sbuf_state(s, 0); sbuf_clear(s); return (sbuf_cat(s, str)); } /* * Format the given argument list and append the resulting string to an sbuf. */ #ifdef _KERNEL /* * Append a non-NUL character to an sbuf. This prototype signature is * suitable for use with kvprintf(9). */ static void sbuf_putc_func(int c, void *arg) { if (c != '\0') sbuf_put_byte(arg, c); } int sbuf_vprintf(struct sbuf *s, const char *fmt, va_list ap) { assert_sbuf_integrity(s); assert_sbuf_state(s, 0); KASSERT(fmt != NULL, ("%s called with a NULL format string", __func__)); (void)kvprintf(fmt, sbuf_putc_func, s, 10, ap); if (s->s_error != 0) return (-1); return (0); } #else /* !_KERNEL */ int sbuf_vprintf(struct sbuf *s, const char *fmt, va_list ap) { va_list ap_copy; int error, len; assert_sbuf_integrity(s); assert_sbuf_state(s, 0); KASSERT(fmt != NULL, ("%s called with a NULL format string", __func__)); if (s->s_error != 0) return (-1); /* * For the moment, there is no way to get vsnprintf(3) to hand * back a character at a time, to push everything into * sbuf_putc_func() as was done for the kernel. * * In userspace, while drains are useful, there's generally * not a problem attempting to malloc(3) on out of space. So * expand a userland sbuf if there is not enough room for the * data produced by sbuf_[v]printf(3). */ error = 0; do { va_copy(ap_copy, ap); len = vsnprintf(&s->s_buf[s->s_len], SBUF_FREESPACE(s) + 1, fmt, ap_copy); if (len < 0) { s->s_error = errno; return (-1); } va_end(ap_copy); if (SBUF_FREESPACE(s) >= len) break; /* Cannot print with the current available space. */ if (s->s_drain_func != NULL && s->s_len > 0) error = sbuf_drain(s); /* sbuf_drain() sets s_error. 
*/ else if (sbuf_extend(s, len - SBUF_FREESPACE(s)) != 0) s->s_error = error = ENOMEM; } while (error == 0); /* * s->s_len is the length of the string, without the terminating nul. * When updating s->s_len, we must subtract 1 from the length that * we passed into vsnprintf() because that length includes the * terminating nul. * * vsnprintf() returns the amount that would have been copied, * given sufficient space, so don't over-increment s_len. */ if (SBUF_FREESPACE(s) < len) len = SBUF_FREESPACE(s); s->s_len += len; if (SBUF_ISSECTION(s)) s->s_sect_len += len; KASSERT(s->s_len < s->s_size, ("wrote past end of sbuf (%d >= %d)", s->s_len, s->s_size)); if (s->s_error != 0) return (-1); return (0); } #endif /* _KERNEL */ /* * Format the given arguments and append the resulting string to an sbuf. */ int sbuf_printf(struct sbuf *s, const char *fmt, ...) { va_list ap; int result; va_start(ap, fmt); result = sbuf_vprintf(s, fmt, ap); va_end(ap); return (result); } /* * Append a character to an sbuf. */ int sbuf_putc(struct sbuf *s, int c) { sbuf_put_byte(s, c); if (s->s_error != 0) return (-1); return (0); } /* * Trim whitespace characters from end of an sbuf. */ int sbuf_trim(struct sbuf *s) { assert_sbuf_integrity(s); assert_sbuf_state(s, 0); KASSERT(s->s_drain_func == NULL, ("%s makes no sense on sbuf %p with drain", __func__, s)); if (s->s_error != 0) return (-1); while (s->s_len > 0 && isspace(s->s_buf[s->s_len-1])) { --s->s_len; if (SBUF_ISSECTION(s)) s->s_sect_len--; } return (0); } /* * Check if an sbuf has an error. */ int sbuf_error(const struct sbuf *s) { return (s->s_error); } /* * Finish off an sbuf. */ int sbuf_finish(struct sbuf *s) { assert_sbuf_integrity(s); assert_sbuf_state(s, 0); s->s_buf[s->s_len] = '\0'; if (SBUF_NULINCLUDED(s)) s->s_len++; if (s->s_drain_func != NULL) { while (s->s_len > 0 && s->s_error == 0) s->s_error = sbuf_drain(s); } SBUF_SETFLAG(s, SBUF_FINISHED); #ifdef _KERNEL return (s->s_error); #else if (s->s_error != 0) { errno = s->s_error; return (-1); } return (0); #endif } /* * Return a pointer to the sbuf data. */ char * sbuf_data(struct sbuf *s) { assert_sbuf_integrity(s); assert_sbuf_state(s, SBUF_FINISHED); KASSERT(s->s_drain_func == NULL, ("%s makes no sense on sbuf %p with drain", __func__, s)); return (s->s_buf); } /* * Return the length of the sbuf data. */ ssize_t sbuf_len(struct sbuf *s) { assert_sbuf_integrity(s); /* don't care if it's finished or not */ KASSERT(s->s_drain_func == NULL, ("%s makes no sense on sbuf %p with drain", __func__, s)); if (s->s_error != 0) return (-1); /* If finished, nulterm is already in len, else add one. */ if (SBUF_NULINCLUDED(s) && !SBUF_ISFINISHED(s)) return (s->s_len + 1); return (s->s_len); } /* * Clear an sbuf, free its buffer if necessary. */ void sbuf_delete(struct sbuf *s) { int isdyn; assert_sbuf_integrity(s); /* don't care if it's finished or not */ if (SBUF_ISDYNAMIC(s)) SBFREE(s->s_buf); isdyn = SBUF_ISDYNSTRUCT(s); memset(s, 0, sizeof(*s)); if (isdyn) SBFREE(s); } /* * Check if an sbuf has been finished. */ int sbuf_done(const struct sbuf *s) { return (SBUF_ISFINISHED(s)); } /* * Start a section. 
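The routines above complete the usual sbuf life cycle: create, append, finish, read, delete. On FreeBSD the same API is also available to userland programs via libsbuf (link with -lsbuf); a minimal consumer might look like the following sketch:

    #include <sys/sbuf.h>   /* userland: link with -lsbuf on FreeBSD */
    #include <stdio.h>

    int
    main(void)
    {
            struct sbuf *s;

            /* Auto-extending sbuf: storage grows per sbuf_extendsize(). */
            s = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND);
            if (s == NULL)
                    return (1);
            sbuf_cat(s, "hello");
            sbuf_printf(s, ", %s #%d", "sbuf", 1);
            if (sbuf_finish(s) != 0) {      /* NUL-terminate; check errors */
                    sbuf_delete(s);
                    return (1);
            }
            printf("%s (%jd bytes)\n", sbuf_data(s),
                (intmax_t)sbuf_len(s));
            sbuf_delete(s);
            return (0);
    }

Note that sbuf_data() is only legal after sbuf_finish(), which the assertions in this file enforce when INVARIANTS is enabled.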
*/ void sbuf_start_section(struct sbuf *s, ssize_t *old_lenp) { assert_sbuf_integrity(s); assert_sbuf_state(s, 0); if (!SBUF_ISSECTION(s)) { KASSERT(s->s_sect_len == 0, ("s_sect_len != 0 when starting a section")); if (old_lenp != NULL) *old_lenp = -1; s->s_rec_off = s->s_len; SBUF_SETFLAG(s, SBUF_INSECTION); } else { KASSERT(old_lenp != NULL, ("s_sect_len should be saved when starting a subsection")); *old_lenp = s->s_sect_len; s->s_sect_len = 0; } } /* * End the section padding to the specified length with the specified * character. */ ssize_t sbuf_end_section(struct sbuf *s, ssize_t old_len, size_t pad, int c) { ssize_t len; assert_sbuf_integrity(s); assert_sbuf_state(s, 0); KASSERT(SBUF_ISSECTION(s), ("attempt to end a section when not in a section")); if (pad > 1) { len = roundup(s->s_sect_len, pad) - s->s_sect_len; for (; s->s_error == 0 && len > 0; len--) sbuf_put_byte(s, c); } len = s->s_sect_len; if (old_len == -1) { s->s_rec_off = s->s_sect_len = 0; SBUF_CLEARFLAG(s, SBUF_INSECTION); } else { s->s_sect_len += old_len; } if (s->s_error != 0) return (-1); return (len); } Index: head/sys/kern/subr_sleepqueue.c =================================================================== --- head/sys/kern/subr_sleepqueue.c (revision 326270) +++ head/sys/kern/subr_sleepqueue.c (revision 326271) @@ -1,1453 +1,1455 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Implementation of sleep queues used to hold queue of threads blocked on * a wait channel. Sleep queues are different from turnstiles in that wait * channels are not owned by anyone, so there is no priority propagation. * Sleep queues can also provide a timeout and can also be interrupted by * signals. That said, there are several similarities between the turnstile * and sleep queue implementations. (Note: turnstiles were implemented * first.) For example, both use a hash table of the same size where each * bucket is referred to as a "chain" that contains both a spin lock and * a linked list of queues. An individual queue is located by using a hash * to pick a chain, locking the chain, and then walking the chain searching * for the queue. 
This means that a wait channel object does not need to * embed its queue head just as locks do not embed their turnstile queue * head. Threads also carry around a sleep queue that they lend to the * wait channel when blocking. Just as in turnstiles, the queue includes * a free list of the sleep queues of other threads blocked on the same * wait channel in the case of multiple waiters. * * Some additional functionality provided by sleep queues include the * ability to set a timeout. The timeout is managed using a per-thread * callout that resumes a thread if it is asleep. A thread may also * catch signals while it is asleep (aka an interruptible sleep). The * signal code uses sleepq_abort() to interrupt a sleeping thread. Finally, * sleep queues also provide some extra assertions. One is not allowed to * mix the sleep/wakeup and cv APIs for a given wait channel. Also, one * must consistently use the same lock to synchronize with a wait channel, * though this check is currently only a warning for sleep/wakeup due to * pre-existing abuse of that API. The same lock must also be held when * awakening threads, though that is currently only enforced for condition * variables. */ #include __FBSDID("$FreeBSD$"); #include "opt_sleepqueue_profiling.h" #include "opt_ddb.h" #include "opt_sched.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* * Constants for the hash table of sleep queue chains. * SC_TABLESIZE must be a power of two for SC_MASK to work properly. */ #ifndef SC_TABLESIZE #define SC_TABLESIZE 256 #endif CTASSERT(powerof2(SC_TABLESIZE)); #define SC_MASK (SC_TABLESIZE - 1) #define SC_SHIFT 8 #define SC_HASH(wc) ((((uintptr_t)(wc) >> SC_SHIFT) ^ (uintptr_t)(wc)) & \ SC_MASK) #define SC_LOOKUP(wc) &sleepq_chains[SC_HASH(wc)] #define NR_SLEEPQS 2 /* * There are two different lists of sleep queues. Both lists are connected * via the sq_hash entries. The first list is the sleep queue chain list * that a sleep queue is on when it is attached to a wait channel. The * second list is the free list hung off of a sleep queue that is attached * to a wait channel. * * Each sleep queue also contains the wait channel it is attached to, the * list of threads blocked on that wait channel, flags specific to the * wait channel, and the lock used to synchronize with a wait channel. * The flags are used to catch mismatches between the various consumers * of the sleep queue API (e.g. sleep/wakeup and condition variables). * The lock pointer is only used when invariants are enabled for various * debugging checks. * * Locking key: * c - sleep queue chain lock */ struct sleepqueue { TAILQ_HEAD(, thread) sq_blocked[NR_SLEEPQS]; /* (c) Blocked threads. */ u_int sq_blockedcnt[NR_SLEEPQS]; /* (c) N. of blocked threads. */ LIST_ENTRY(sleepqueue) sq_hash; /* (c) Chain and free list. */ LIST_HEAD(, sleepqueue) sq_free; /* (c) Free queues. */ void *sq_wchan; /* (c) Wait channel. */ int sq_type; /* (c) Queue type. */ #ifdef INVARIANTS struct lock_object *sq_lock; /* (c) Associated lock. */ #endif }; struct sleepqueue_chain { LIST_HEAD(, sleepqueue) sc_queues; /* List of sleep queues. */ struct mtx sc_lock; /* Spin lock for this chain. */ #ifdef SLEEPQUEUE_PROFILING u_int sc_depth; /* Length of sc_queues. */ u_int sc_max_depth; /* Max length of sc_queues. 
*/ #endif } __aligned(CACHE_LINE_SIZE); #ifdef SLEEPQUEUE_PROFILING u_int sleepq_max_depth; static SYSCTL_NODE(_debug, OID_AUTO, sleepq, CTLFLAG_RD, 0, "sleepq profiling"); static SYSCTL_NODE(_debug_sleepq, OID_AUTO, chains, CTLFLAG_RD, 0, "sleepq chain stats"); SYSCTL_UINT(_debug_sleepq, OID_AUTO, max_depth, CTLFLAG_RD, &sleepq_max_depth, 0, "maximum depth achieved of a single chain"); static void sleepq_profile(const char *wmesg); static int prof_enabled; #endif static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE]; static uma_zone_t sleepq_zone; /* * Prototypes for non-exported routines. */ static int sleepq_catch_signals(void *wchan, int pri); static int sleepq_check_signals(void); static int sleepq_check_timeout(void); #ifdef INVARIANTS static void sleepq_dtor(void *mem, int size, void *arg); #endif static int sleepq_init(void *mem, int size, int flags); static int sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri); static void sleepq_switch(void *wchan, int pri); static void sleepq_timeout(void *arg); SDT_PROBE_DECLARE(sched, , , sleep); SDT_PROBE_DECLARE(sched, , , wakeup); /* * Initialize SLEEPQUEUE_PROFILING specific sysctl nodes. * Note that it must happen after sleepinit() has been fully executed, so * it must happen after SI_SUB_KMEM SYSINIT() subsystem setup. */ #ifdef SLEEPQUEUE_PROFILING static void init_sleepqueue_profiling(void) { char chain_name[10]; struct sysctl_oid *chain_oid; u_int i; for (i = 0; i < SC_TABLESIZE; i++) { snprintf(chain_name, sizeof(chain_name), "%u", i); chain_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_debug_sleepq_chains), OID_AUTO, chain_name, CTLFLAG_RD, NULL, "sleepq chain stats"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "depth", CTLFLAG_RD, &sleepq_chains[i].sc_depth, 0, NULL); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_depth", CTLFLAG_RD, &sleepq_chains[i].sc_max_depth, 0, NULL); } } SYSINIT(sleepqueue_profiling, SI_SUB_LOCK, SI_ORDER_ANY, init_sleepqueue_profiling, NULL); #endif /* * Early initialization of sleep queues that is called from the sleepinit() * SYSINIT. */ void init_sleepqueues(void) { int i; for (i = 0; i < SC_TABLESIZE; i++) { LIST_INIT(&sleepq_chains[i].sc_queues); mtx_init(&sleepq_chains[i].sc_lock, "sleepq chain", NULL, MTX_SPIN | MTX_RECURSE); } sleepq_zone = uma_zcreate("SLEEPQUEUE", sizeof(struct sleepqueue), #ifdef INVARIANTS NULL, sleepq_dtor, sleepq_init, NULL, UMA_ALIGN_CACHE, 0); #else NULL, NULL, sleepq_init, NULL, UMA_ALIGN_CACHE, 0); #endif thread0.td_sleepqueue = sleepq_alloc(); } /* * Get a sleep queue for a new thread. */ struct sleepqueue * sleepq_alloc(void) { return (uma_zalloc(sleepq_zone, M_WAITOK)); } /* * Free a sleep queue when a thread is destroyed. */ void sleepq_free(struct sleepqueue *sq) { uma_zfree(sleepq_zone, sq); } /* * Lock the sleep queue chain associated with the specified wait channel. */ void sleepq_lock(void *wchan) { struct sleepqueue_chain *sc; sc = SC_LOOKUP(wchan); mtx_lock_spin(&sc->sc_lock); } /* * Look up the sleep queue associated with a given wait channel in the hash * table locking the associated sleep queue chain. If no queue is found in * the table, NULL is returned.
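SC_HASH(), defined above, folds a wait channel pointer into a chain index by XORing the pointer with itself shifted right by SC_SHIFT bits before masking. The shift mixes in bits above the low byte, so kernel objects allocated close together do not all collide in one bucket. As a standalone function:

    #include <stdint.h>
    #include <stdio.h>

    #define TABLESIZE   256u            /* power of two, as CTASSERTed */
    #define MASK        (TABLESIZE - 1)
    #define SHIFT       8

    /* Chain index for a wait channel pointer, as in SC_HASH(). */
    static unsigned
    sc_hash(const void *wchan)
    {
            uintptr_t wc = (uintptr_t)wchan;

            return (((wc >> SHIFT) ^ wc) & MASK);
    }

    int
    main(void)
    {
            int a, b;   /* two stack variables as stand-in wait channels */

            printf("&a -> chain %u\n", sc_hash(&a));
            printf("&b -> chain %u\n", sc_hash(&b));
            return (0);
    }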
*/ struct sleepqueue * sleepq_lookup(void *wchan) { struct sleepqueue_chain *sc; struct sleepqueue *sq; KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); LIST_FOREACH(sq, &sc->sc_queues, sq_hash) if (sq->sq_wchan == wchan) return (sq); return (NULL); } /* * Unlock the sleep queue chain associated with a given wait channel. */ void sleepq_release(void *wchan) { struct sleepqueue_chain *sc; sc = SC_LOOKUP(wchan); mtx_unlock_spin(&sc->sc_lock); } /* * Places the current thread on the sleep queue for the specified wait * channel. If INVARIANTS is enabled, then it associates the passed in * lock with the sleepq to make sure it is held when that sleep queue is * woken up. */ void sleepq_add(void *wchan, struct lock_object *lock, const char *wmesg, int flags, int queue) { struct sleepqueue_chain *sc; struct sleepqueue *sq; struct thread *td; td = curthread; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); MPASS(td->td_sleepqueue != NULL); MPASS(wchan != NULL); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); /* If this thread is not allowed to sleep, die a horrible death. */ KASSERT(td->td_no_sleeping == 0, ("%s: td %p to sleep on wchan %p with sleeping prohibited", __func__, td, wchan)); /* Look up the sleep queue associated with the wait channel 'wchan'. */ sq = sleepq_lookup(wchan); /* * If the wait channel does not already have a sleep queue, use * this thread's sleep queue. Otherwise, insert the current thread * into the sleep queue already in use by this wait channel. */ if (sq == NULL) { #ifdef INVARIANTS int i; sq = td->td_sleepqueue; for (i = 0; i < NR_SLEEPQS; i++) { KASSERT(TAILQ_EMPTY(&sq->sq_blocked[i]), ("thread's sleep queue %d is not empty", i)); KASSERT(sq->sq_blockedcnt[i] == 0, ("thread's sleep queue %d count mismatches", i)); } KASSERT(LIST_EMPTY(&sq->sq_free), ("thread's sleep queue has a non-empty free list")); KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer")); sq->sq_lock = lock; #endif #ifdef SLEEPQUEUE_PROFILING sc->sc_depth++; if (sc->sc_depth > sc->sc_max_depth) { sc->sc_max_depth = sc->sc_depth; if (sc->sc_max_depth > sleepq_max_depth) sleepq_max_depth = sc->sc_max_depth; } #endif sq = td->td_sleepqueue; LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash); sq->sq_wchan = wchan; sq->sq_type = flags & SLEEPQ_TYPE; } else { MPASS(wchan == sq->sq_wchan); MPASS(lock == sq->sq_lock); MPASS((flags & SLEEPQ_TYPE) == sq->sq_type); LIST_INSERT_HEAD(&sq->sq_free, td->td_sleepqueue, sq_hash); } thread_lock(td); TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq); sq->sq_blockedcnt[queue]++; td->td_sleepqueue = NULL; td->td_sqqueue = queue; td->td_wchan = wchan; td->td_wmesg = wmesg; if (flags & SLEEPQ_INTERRUPTIBLE) { td->td_flags |= TDF_SINTR; td->td_flags &= ~TDF_SLEEPABORT; } thread_unlock(td); } /* * Sets a timeout that will remove the current thread from the specified * sleep queue after timo ticks if the thread has not already been awakened. 
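 *
 * To show where this call sits in the KPI, a typical timed-sleep consumer
 * drives the functions in this file roughly as below.  This is an
 * illustrative sketch only (the "demo" wmesg, the lock and the sbt value
 * are hypothetical), not a copy of any particular caller:
 */
#if 0
        sleepq_lock(wchan);
        sleepq_add(wchan, &lock->lock_object, "demo", SLEEPQ_SLEEP, 0);
        sleepq_set_timeout_sbt(wchan, sbt, 0, C_HARDCLOCK);
        error = sleepq_timedwait(wchan, 0);     /* 0 or EWOULDBLOCK */
#endif
/*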
*/ void sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr, int flags) { struct sleepqueue_chain *sc; struct thread *td; sbintime_t pr1; td = curthread; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); MPASS(TD_ON_SLEEPQ(td)); MPASS(td->td_sleepqueue == NULL); MPASS(wchan != NULL); if (cold && td == &thread0) panic("timed sleep before timers are working"); KASSERT(td->td_sleeptimo == 0, ("td %d %p td_sleeptimo %jx", td->td_tid, td, (uintmax_t)td->td_sleeptimo)); thread_lock(td); callout_when(sbt, pr, flags, &td->td_sleeptimo, &pr1); thread_unlock(td); callout_reset_sbt_on(&td->td_slpcallout, td->td_sleeptimo, pr1, sleepq_timeout, td, PCPU_GET(cpuid), flags | C_PRECALC | C_DIRECT_EXEC); } /* * Return the number of actual sleepers for the specified queue. */ u_int sleepq_sleepcnt(void *wchan, int queue) { struct sleepqueue *sq; KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); sq = sleepq_lookup(wchan); if (sq == NULL) return (0); return (sq->sq_blockedcnt[queue]); } /* * Marks the pending sleep of the current thread as interruptible and * makes an initial check for pending signals before putting a thread * to sleep. Enters and exits with the thread lock held. Thread lock * may have transitioned from the sleepq lock to a run lock. */ static int sleepq_catch_signals(void *wchan, int pri) { struct sleepqueue_chain *sc; struct sleepqueue *sq; struct thread *td; struct proc *p; struct sigacts *ps; int sig, ret; ret = 0; td = curthread; p = curproc; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); MPASS(wchan != NULL); if ((td->td_pflags & TDP_WAKEUP) != 0) { td->td_pflags &= ~TDP_WAKEUP; ret = EINTR; thread_lock(td); goto out; } /* * See if there are any pending signals or suspension requests for this * thread. If not, we can switch immediately. */ thread_lock(td); if ((td->td_flags & (TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK)) != 0) { thread_unlock(td); mtx_unlock_spin(&sc->sc_lock); CTR3(KTR_PROC, "sleepq catching signals: thread %p (pid %ld, %s)", (void *)td, (long)p->p_pid, td->td_name); PROC_LOCK(p); /* * Check for suspension first. Checking for signals and then * suspending could result in a missed signal, since a signal * can be delivered while this thread is suspended. */ if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) { ret = thread_suspend_check(1); MPASS(ret == 0 || ret == EINTR || ret == ERESTART); if (ret != 0) { PROC_UNLOCK(p); mtx_lock_spin(&sc->sc_lock); thread_lock(td); goto out; } } if ((td->td_flags & TDF_NEEDSIGCHK) != 0) { ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sig = cursig(td); if (sig == -1) { mtx_unlock(&ps->ps_mtx); KASSERT((td->td_flags & TDF_SBDRY) != 0, ("lost TDF_SBDRY")); KASSERT(TD_SBDRY_INTR(td), ("lost TDF_SERESTART or TDF_SEINTR")); KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) != (TDF_SEINTR | TDF_SERESTART), ("both TDF_SEINTR and TDF_SERESTART")); ret = TD_SBDRY_ERRNO(td); } else if (sig != 0) { ret = SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR : ERESTART; mtx_unlock(&ps->ps_mtx); } else { mtx_unlock(&ps->ps_mtx); } } /* * Lock the per-process spinlock prior to dropping the PROC_LOCK * to avoid a signal delivery race. PROC_LOCK, PROC_SLOCK, and * thread_lock() are currently held in tdsendsignal(). */ PROC_SLOCK(p); mtx_lock_spin(&sc->sc_lock); PROC_UNLOCK(p); thread_lock(td); PROC_SUNLOCK(p); } if (ret == 0) { sleepq_switch(wchan, pri); return (0); } out: /* * There were pending signals and this thread is still * on the sleep queue; remove it from the sleep queue.
*/ if (TD_ON_SLEEPQ(td)) { sq = sleepq_lookup(wchan); if (sleepq_resume_thread(sq, td, 0)) { #ifdef INVARIANTS /* * This thread hasn't gone to sleep yet, so it * should not be swapped out. */ panic("not waking up swapper"); #endif } } mtx_unlock_spin(&sc->sc_lock); MPASS(td->td_lock != &sc->sc_lock); return (ret); } /* * Switches to another thread if we are still asleep on a sleep queue. * Returns with thread lock. */ static void sleepq_switch(void *wchan, int pri) { struct sleepqueue_chain *sc; struct sleepqueue *sq; struct thread *td; bool rtc_changed; td = curthread; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * If we have a sleep queue, then we've already been woken up, so * just return. */ if (td->td_sleepqueue != NULL) { mtx_unlock_spin(&sc->sc_lock); return; } /* * If TDF_TIMEOUT is set, then our sleep has been timed out * already but we are still on the sleep queue, so dequeue the * thread and return. * * Do the same if the real-time clock has been adjusted since this * thread calculated its timeout based on that clock. This handles * the following race: * - The Ts thread needs to sleep until an absolute real-clock time. * It copies the global rtc_generation into curthread->td_rtcgen, * reads the RTC, and calculates a sleep duration based on that time. * See umtxq_sleep() for an example. * - The Tc thread adjusts the RTC, bumps rtc_generation, and wakes * threads that are sleeping until an absolute real-clock time. * See tc_setclock() and the POSIX specification of clock_settime(). * - Ts reaches the code below. It holds the sleepqueue chain lock, * so Tc has finished waking, so this thread must test td_rtcgen. * (The declaration of td_rtcgen refers to this comment.) */ rtc_changed = td->td_rtcgen != 0 && td->td_rtcgen != rtc_generation; if ((td->td_flags & TDF_TIMEOUT) || rtc_changed) { if (rtc_changed) { td->td_rtcgen = 0; } MPASS(TD_ON_SLEEPQ(td)); sq = sleepq_lookup(wchan); if (sleepq_resume_thread(sq, td, 0)) { #ifdef INVARIANTS /* * This thread hasn't gone to sleep yet, so it * should not be swapped out. */ panic("not waking up swapper"); #endif } mtx_unlock_spin(&sc->sc_lock); return; } #ifdef SLEEPQUEUE_PROFILING if (prof_enabled) sleepq_profile(td->td_wmesg); #endif MPASS(td->td_sleepqueue == NULL); sched_sleep(td, pri); thread_lock_set(td, &sc->sc_lock); SDT_PROBE0(sched, , , sleep); TD_SET_SLEEPING(td); mi_switch(SW_VOL | SWT_SLEEPQ, NULL); KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING")); CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); } /* * Check to see if we timed out. */ static int sleepq_check_timeout(void) { struct thread *td; int res; td = curthread; THREAD_LOCK_ASSERT(td, MA_OWNED); /* * If TDF_TIMEOUT is set, we timed out. But recheck * td_sleeptimo anyway. */ res = 0; if (td->td_sleeptimo != 0) { if (td->td_sleeptimo <= sbinuptime()) res = EWOULDBLOCK; td->td_sleeptimo = 0; } if (td->td_flags & TDF_TIMEOUT) td->td_flags &= ~TDF_TIMEOUT; else /* * We ignore the situation where timeout subsystem was * unable to stop our callout. The struct thread is * type-stable, the callout will use the correct * memory when running. The checks of the * td_sleeptimo value in this function and in * sleepq_timeout() ensure that the thread does not * get spurious wakeups, even if the callout was reset * or thread reused. */ callout_stop(&td->td_slpcallout); return (res); } /* * Check to see if we were awoken by a signal. 
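 *
 * (This is the consumer half of the sleepq_abort() handshake below:
 * the aborting thread stores an errno value in td_intrval and sets
 * TDF_SLEEPABORT, and the awakened thread reports that value from
 * here once it is running again.)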
*/ static int sleepq_check_signals(void) { struct thread *td; td = curthread; THREAD_LOCK_ASSERT(td, MA_OWNED); /* We are no longer in an interruptible sleep. */ if (td->td_flags & TDF_SINTR) td->td_flags &= ~TDF_SINTR; if (td->td_flags & TDF_SLEEPABORT) { td->td_flags &= ~TDF_SLEEPABORT; return (td->td_intrval); } return (0); } /* * Block the current thread until it is awakened from its sleep queue. */ void sleepq_wait(void *wchan, int pri) { struct thread *td; td = curthread; MPASS(!(td->td_flags & TDF_SINTR)); thread_lock(td); sleepq_switch(wchan, pri); thread_unlock(td); } /* * Block the current thread until it is awakened from its sleep queue * or it is interrupted by a signal. */ int sleepq_wait_sig(void *wchan, int pri) { int rcatch; int rval; rcatch = sleepq_catch_signals(wchan, pri); rval = sleepq_check_signals(); thread_unlock(curthread); if (rcatch) return (rcatch); return (rval); } /* * Block the current thread until it is awakened from its sleep queue * or it times out while waiting. */ int sleepq_timedwait(void *wchan, int pri) { struct thread *td; int rval; td = curthread; MPASS(!(td->td_flags & TDF_SINTR)); thread_lock(td); sleepq_switch(wchan, pri); rval = sleepq_check_timeout(); thread_unlock(td); return (rval); } /* * Block the current thread until it is awakened from its sleep queue, * it is interrupted by a signal, or it times out waiting to be awakened. */ int sleepq_timedwait_sig(void *wchan, int pri) { int rcatch, rvalt, rvals; rcatch = sleepq_catch_signals(wchan, pri); rvalt = sleepq_check_timeout(); rvals = sleepq_check_signals(); thread_unlock(curthread); if (rcatch) return (rcatch); if (rvals) return (rvals); return (rvalt); } /* * Returns the type of sleepqueue given a waitchannel. */ int sleepq_type(void *wchan) { struct sleepqueue *sq; int type; MPASS(wchan != NULL); sleepq_lock(wchan); sq = sleepq_lookup(wchan); if (sq == NULL) { sleepq_release(wchan); return (-1); } type = sq->sq_type; sleepq_release(wchan); return (type); } /* * Removes a thread from a sleep queue and makes it * runnable. */ static int sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri) { struct sleepqueue_chain *sc; MPASS(td != NULL); MPASS(sq->sq_wchan != NULL); MPASS(td->td_wchan == sq->sq_wchan); MPASS(td->td_sqqueue < NR_SLEEPQS && td->td_sqqueue >= 0); THREAD_LOCK_ASSERT(td, MA_OWNED); sc = SC_LOOKUP(sq->sq_wchan); mtx_assert(&sc->sc_lock, MA_OWNED); SDT_PROBE2(sched, , , wakeup, td, td->td_proc); /* Remove the thread from the queue. */ sq->sq_blockedcnt[td->td_sqqueue]--; TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq); /* * Get a sleep queue for this thread. If this is the last waiter, * use the queue itself and take it out of the chain, otherwise, * remove a queue from the free list. */ if (LIST_EMPTY(&sq->sq_free)) { td->td_sleepqueue = sq; #ifdef INVARIANTS sq->sq_wchan = NULL; #endif #ifdef SLEEPQUEUE_PROFILING sc->sc_depth--; #endif } else td->td_sleepqueue = LIST_FIRST(&sq->sq_free); LIST_REMOVE(td->td_sleepqueue, sq_hash); td->td_wmesg = NULL; td->td_wchan = NULL; td->td_flags &= ~TDF_SINTR; CTR3(KTR_PROC, "sleepq_wakeup: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, td->td_name); /* Adjust priority if requested. */ MPASS(pri == 0 || (pri >= PRI_MIN && pri <= PRI_MAX)); if (pri != 0 && td->td_priority > pri && PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_prio(td, pri); /* * Note that thread td might not be sleeping if it is running * sleepq_catch_signals() on another CPU or is blocked on its * proc lock to check signals. 
There's no need to mark the * thread runnable in that case. */ if (TD_IS_SLEEPING(td)) { TD_CLR_SLEEPING(td); return (setrunnable(td)); } return (0); } #ifdef INVARIANTS /* * UMA zone item deallocator. */ static void sleepq_dtor(void *mem, int size, void *arg) { struct sleepqueue *sq; int i; sq = mem; for (i = 0; i < NR_SLEEPQS; i++) { MPASS(TAILQ_EMPTY(&sq->sq_blocked[i])); MPASS(sq->sq_blockedcnt[i] == 0); } } #endif /* * UMA zone item initializer. */ static int sleepq_init(void *mem, int size, int flags) { struct sleepqueue *sq; int i; bzero(mem, size); sq = mem; for (i = 0; i < NR_SLEEPQS; i++) { TAILQ_INIT(&sq->sq_blocked[i]); sq->sq_blockedcnt[i] = 0; } LIST_INIT(&sq->sq_free); return (0); } /* * Find the highest priority thread sleeping on a wait channel and resume it. */ int sleepq_signal(void *wchan, int flags, int pri, int queue) { struct sleepqueue *sq; struct thread *td, *besttd; int wakeup_swapper; CTR2(KTR_PROC, "sleepq_signal(%p, %d)", wchan, flags); KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); sq = sleepq_lookup(wchan); if (sq == NULL) return (0); KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE), ("%s: mismatch between sleep/wakeup and cv_*", __func__)); /* * Find the highest priority thread on the queue. If there is a * tie, use the thread that first appears in the queue as it has * been sleeping the longest since threads are always added to * the tail of sleep queues. */ besttd = TAILQ_FIRST(&sq->sq_blocked[queue]); TAILQ_FOREACH(td, &sq->sq_blocked[queue], td_slpq) { if (td->td_priority < besttd->td_priority) besttd = td; } MPASS(besttd != NULL); thread_lock(besttd); wakeup_swapper = sleepq_resume_thread(sq, besttd, pri); thread_unlock(besttd); return (wakeup_swapper); } static bool match_any(struct thread *td __unused) { return (true); } /* * Resume all threads sleeping on a specified wait channel. */ int sleepq_broadcast(void *wchan, int flags, int pri, int queue) { struct sleepqueue *sq; CTR2(KTR_PROC, "sleepq_broadcast(%p, %d)", wchan, flags); KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); sq = sleepq_lookup(wchan); if (sq == NULL) return (0); KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE), ("%s: mismatch between sleep/wakeup and cv_*", __func__)); return (sleepq_remove_matching(sq, queue, match_any, pri)); } /* * Resume threads on the sleep queue that match the given predicate. */ int sleepq_remove_matching(struct sleepqueue *sq, int queue, bool (*matches)(struct thread *), int pri) { struct thread *td, *tdn; int wakeup_swapper; /* * The last thread will be given ownership of sq and may * re-enqueue itself before sleepq_resume_thread() returns, * so we must cache the "next" queue item at the beginning * of the final iteration. */ wakeup_swapper = 0; TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq, tdn) { thread_lock(td); if (matches(td)) wakeup_swapper |= sleepq_resume_thread(sq, td, pri); thread_unlock(td); } return (wakeup_swapper); } /* * Time sleeping threads out. When the timeout expires, the thread is * removed from the sleep queue and made runnable if it is still asleep. 
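 *
 * (An aside on sleepq_signal() above: it picks its wakeup target with a
 * single head-to-tail scan of the queue.  The fragment below is a
 * userspace sketch of that selection loop, for illustration only; the
 * demo_* names are hypothetical.)
 */
#if 0
#include <stddef.h>

struct demo_waiter {
        int                      prio;  /* numerically lower = more urgent */
        struct demo_waiter      *next;  /* queue order == arrival order */
};

/*
 * Keep the first entry seen at the best priority; because waiters are
 * appended at the tail, a tie goes to the longest sleeper.
 */
static struct demo_waiter *
demo_pick(struct demo_waiter *head)
{
        struct demo_waiter *w, *best;

        best = head;
        for (w = head; w != NULL; w = w->next)
                if (w->prio < best->prio)
                        best = w;
        return (best);
}
#endif
/*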
*/ static void sleepq_timeout(void *arg) { struct sleepqueue_chain *sc; struct sleepqueue *sq; struct thread *td; void *wchan; int wakeup_swapper; td = arg; wakeup_swapper = 0; CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); thread_lock(td); if (td->td_sleeptimo > sbinuptime() || td->td_sleeptimo == 0) { /* * The thread does not want a timeout (yet). */ } else if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) { /* * See if the thread is asleep and get the wait * channel if it is. */ wchan = td->td_wchan; sc = SC_LOOKUP(wchan); THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock); sq = sleepq_lookup(wchan); MPASS(sq != NULL); td->td_flags |= TDF_TIMEOUT; wakeup_swapper = sleepq_resume_thread(sq, td, 0); } else if (TD_ON_SLEEPQ(td)) { /* * If the thread is on the SLEEPQ but isn't sleeping * yet, it can either be on another CPU in between * sleepq_add() and one of the sleepq_*wait*() * routines or it can be in sleepq_catch_signals(). */ td->td_flags |= TDF_TIMEOUT; } thread_unlock(td); if (wakeup_swapper) kick_proc0(); } /* * Resumes a specific thread from the sleep queue associated with a specific * wait channel if it is on that queue. */ void sleepq_remove(struct thread *td, void *wchan) { struct sleepqueue *sq; int wakeup_swapper; /* * Look up the sleep queue for this wait channel, then re-check * that the thread is asleep on that channel, if it is not, then * bail. */ MPASS(wchan != NULL); sleepq_lock(wchan); sq = sleepq_lookup(wchan); /* * We can not lock the thread here as it may be sleeping on a * different sleepq. However, holding the sleepq lock for this * wchan can guarantee that we do not miss a wakeup for this * channel. The asserts below will catch any false positives. */ if (!TD_ON_SLEEPQ(td) || td->td_wchan != wchan) { sleepq_release(wchan); return; } /* Thread is asleep on sleep queue sq, so wake it up. */ thread_lock(td); MPASS(sq != NULL); MPASS(td->td_wchan == wchan); wakeup_swapper = sleepq_resume_thread(sq, td, 0); thread_unlock(td); sleepq_release(wchan); if (wakeup_swapper) kick_proc0(); } /* * Abort a thread as if an interrupt had occurred. Only abort * interruptible waits (unfortunately it isn't safe to abort others). */ int sleepq_abort(struct thread *td, int intrval) { struct sleepqueue *sq; void *wchan; THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(TD_ON_SLEEPQ(td)); MPASS(td->td_flags & TDF_SINTR); MPASS(intrval == EINTR || intrval == ERESTART); /* * If the TDF_TIMEOUT flag is set, just leave. A * timeout is scheduled anyhow. */ if (td->td_flags & TDF_TIMEOUT) return (0); CTR3(KTR_PROC, "sleepq_abort: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); td->td_intrval = intrval; td->td_flags |= TDF_SLEEPABORT; /* * If the thread has not slept yet it will find the signal in * sleepq_catch_signals() and call sleepq_resume_thread. Otherwise * we have to do it here. */ if (!TD_IS_SLEEPING(td)) return (0); wchan = td->td_wchan; MPASS(wchan != NULL); sq = sleepq_lookup(wchan); MPASS(sq != NULL); /* Thread is asleep on sleep queue sq, so wake it up. 
*/ return (sleepq_resume_thread(sq, td, 0)); } void sleepq_chains_remove_matching(bool (*matches)(struct thread *)) { struct sleepqueue_chain *sc; struct sleepqueue *sq; int i, wakeup_swapper; wakeup_swapper = 0; for (sc = &sleepq_chains[0]; sc < sleepq_chains + SC_TABLESIZE; ++sc) { if (LIST_EMPTY(&sc->sc_queues)) { continue; } mtx_lock_spin(&sc->sc_lock); LIST_FOREACH(sq, &sc->sc_queues, sq_hash) { for (i = 0; i < NR_SLEEPQS; ++i) { wakeup_swapper |= sleepq_remove_matching(sq, i, matches, 0); } } mtx_unlock_spin(&sc->sc_lock); } if (wakeup_swapper) { kick_proc0(); } } /* * Prints the stacks of all threads presently sleeping on wchan/queue to * the sbuf sb. Sets count_stacks_printed to the number of stacks actually * printed. Typically, this will equal the number of threads sleeping on the * queue, but may be less if sb overflowed before all stacks were printed. */ #ifdef STACK int sleepq_sbuf_print_stacks(struct sbuf *sb, void *wchan, int queue, int *count_stacks_printed) { struct thread *td, *td_next; struct sleepqueue *sq; struct stack **st; struct sbuf **td_infos; int i, stack_idx, error, stacks_to_allocate; bool finished, partial_print; error = 0; finished = false; partial_print = false; KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); stacks_to_allocate = 10; for (i = 0; i < 3 && !finished ; i++) { /* We cannot malloc while holding the queue's spinlock, so * we do our mallocs now, and hope it is enough. If it * isn't, we will free these, drop the lock, malloc more, * and try again, up to a point. After that point we will * give up and report ENOMEM. We also cannot write to sb * during this time since the client may have set the * SBUF_AUTOEXTEND flag on their sbuf, which could cause a * malloc as we print to it. So we defer actually printing * to sb until after we drop the spinlock. */ /* Where we will store the stacks. */ st = malloc(sizeof(struct stack *) * stacks_to_allocate, M_TEMP, M_WAITOK); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) st[stack_idx] = stack_create(M_WAITOK); /* Where we will store the td name, tid, etc. */ td_infos = malloc(sizeof(struct sbuf *) * stacks_to_allocate, M_TEMP, M_WAITOK); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) td_infos[stack_idx] = sbuf_new(NULL, NULL, MAXCOMLEN + sizeof(struct thread *) * 2 + 40, SBUF_FIXEDLEN); sleepq_lock(wchan); sq = sleepq_lookup(wchan); if (sq == NULL) { /* This sleepq does not exist; exit and return ENOENT. */ error = ENOENT; finished = true; sleepq_release(wchan); goto loop_end; } stack_idx = 0; /* Save thread info */ TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq, td_next) { if (stack_idx >= stacks_to_allocate) goto loop_end; /* Note the td_lock is equal to the sleepq_lock here. 
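 * Holding that single chain lock therefore pins every thread on this
 * queue: none of them can be scheduled back in while we walk their
 * stacks, which is what makes the stack_save_td() call below safe.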
*/ stack_save_td(st[stack_idx], td); sbuf_printf(td_infos[stack_idx], "%d: %s %p", td->td_tid, td->td_name, td); ++stack_idx; } finished = true; sleepq_release(wchan); /* Print the stacks */ for (i = 0; i < stack_idx; i++) { sbuf_finish(td_infos[i]); sbuf_printf(sb, "--- thread %s: ---\n", sbuf_data(td_infos[i])); stack_sbuf_print(sb, st[i]); sbuf_printf(sb, "\n"); error = sbuf_error(sb); if (error == 0) *count_stacks_printed = stack_idx; } loop_end: if (!finished) sleepq_release(wchan); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) stack_destroy(st[stack_idx]); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) sbuf_delete(td_infos[stack_idx]); free(st, M_TEMP); free(td_infos, M_TEMP); stacks_to_allocate *= 10; } if (!finished && error == 0) error = ENOMEM; return (error); } #endif #ifdef SLEEPQUEUE_PROFILING #define SLEEPQ_PROF_LOCATIONS 1024 #define SLEEPQ_SBUFSIZE 512 struct sleepq_prof { LIST_ENTRY(sleepq_prof) sp_link; const char *sp_wmesg; long sp_count; }; LIST_HEAD(sqphead, sleepq_prof); struct sqphead sleepq_prof_free; struct sqphead sleepq_hash[SC_TABLESIZE]; static struct sleepq_prof sleepq_profent[SLEEPQ_PROF_LOCATIONS]; static struct mtx sleepq_prof_lock; MTX_SYSINIT(sleepq_prof_lock, &sleepq_prof_lock, "sleepq_prof", MTX_SPIN); static void sleepq_profile(const char *wmesg) { struct sleepq_prof *sp; mtx_lock_spin(&sleepq_prof_lock); if (prof_enabled == 0) goto unlock; LIST_FOREACH(sp, &sleepq_hash[SC_HASH(wmesg)], sp_link) if (sp->sp_wmesg == wmesg) goto done; sp = LIST_FIRST(&sleepq_prof_free); if (sp == NULL) goto unlock; sp->sp_wmesg = wmesg; LIST_REMOVE(sp, sp_link); LIST_INSERT_HEAD(&sleepq_hash[SC_HASH(wmesg)], sp, sp_link); done: sp->sp_count++; unlock: mtx_unlock_spin(&sleepq_prof_lock); return; } static void sleepq_prof_reset(void) { struct sleepq_prof *sp; int enabled; int i; mtx_lock_spin(&sleepq_prof_lock); enabled = prof_enabled; prof_enabled = 0; for (i = 0; i < SC_TABLESIZE; i++) LIST_INIT(&sleepq_hash[i]); LIST_INIT(&sleepq_prof_free); for (i = 0; i < SLEEPQ_PROF_LOCATIONS; i++) { sp = &sleepq_profent[i]; sp->sp_wmesg = NULL; sp->sp_count = 0; LIST_INSERT_HEAD(&sleepq_prof_free, sp, sp_link); } prof_enabled = enabled; mtx_unlock_spin(&sleepq_prof_lock); } static int enable_sleepq_prof(SYSCTL_HANDLER_ARGS) { int error, v; v = prof_enabled; error = sysctl_handle_int(oidp, &v, v, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == prof_enabled) return (0); if (v == 1) sleepq_prof_reset(); mtx_lock_spin(&sleepq_prof_lock); prof_enabled = !!v; mtx_unlock_spin(&sleepq_prof_lock); return (0); } static int reset_sleepq_prof_stats(SYSCTL_HANDLER_ARGS) { int error, v; v = 0; error = sysctl_handle_int(oidp, &v, 0, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == 0) return (0); sleepq_prof_reset(); return (0); } static int dump_sleepq_prof_stats(SYSCTL_HANDLER_ARGS) { struct sleepq_prof *sp; struct sbuf *sb; int enabled; int error; int i; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, SLEEPQ_SBUFSIZE, req); sbuf_printf(sb, "\nwmesg\tcount\n"); enabled = prof_enabled; mtx_lock_spin(&sleepq_prof_lock); prof_enabled = 0; mtx_unlock_spin(&sleepq_prof_lock); for (i = 0; i < SC_TABLESIZE; i++) { LIST_FOREACH(sp, &sleepq_hash[i], sp_link) { sbuf_printf(sb, "%s\t%ld\n", sp->sp_wmesg, sp->sp_count); } } mtx_lock_spin(&sleepq_prof_lock); prof_enabled = enabled; mtx_unlock_spin(&sleepq_prof_lock); error = sbuf_finish(sb); 
sbuf_delete(sb); return (error); } SYSCTL_PROC(_debug_sleepq, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, dump_sleepq_prof_stats, "A", "Sleepqueue profiling statistics"); SYSCTL_PROC(_debug_sleepq, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, reset_sleepq_prof_stats, "I", "Reset sleepqueue profiling statistics"); SYSCTL_PROC(_debug_sleepq, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, enable_sleepq_prof, "I", "Enable sleepqueue profiling"); #endif #ifdef DDB DB_SHOW_COMMAND(sleepq, db_show_sleepqueue) { struct sleepqueue_chain *sc; struct sleepqueue *sq; #ifdef INVARIANTS struct lock_object *lock; #endif struct thread *td; void *wchan; int i; if (!have_addr) return; /* * First, see if there is an active sleep queue for the wait channel * indicated by the address. */ wchan = (void *)addr; sc = SC_LOOKUP(wchan); LIST_FOREACH(sq, &sc->sc_queues, sq_hash) if (sq->sq_wchan == wchan) goto found; /* * Second, see if there is an active sleep queue at the address * indicated. */ for (i = 0; i < SC_TABLESIZE; i++) LIST_FOREACH(sq, &sleepq_chains[i].sc_queues, sq_hash) { if (sq == (struct sleepqueue *)addr) goto found; } db_printf("Unable to locate a sleep queue via %p\n", (void *)addr); return; found: db_printf("Wait channel: %p\n", sq->sq_wchan); db_printf("Queue type: %d\n", sq->sq_type); #ifdef INVARIANTS if (sq->sq_lock) { lock = sq->sq_lock; db_printf("Associated Interlock: %p - (%s) %s\n", lock, LOCK_CLASS(lock)->lc_name, lock->lo_name); } #endif db_printf("Blocked threads:\n"); for (i = 0; i < NR_SLEEPQS; i++) { db_printf("\nQueue[%d]:\n", i); if (TAILQ_EMPTY(&sq->sq_blocked[i])) db_printf("\tempty\n"); else TAILQ_FOREACH(td, &sq->sq_blocked[i], td_slpq) { db_printf("\t%p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); } db_printf("(expected: %u)\n", sq->sq_blockedcnt[i]); } } /* Alias 'show sleepqueue' to 'show sleepq'. */ DB_SHOW_ALIAS(sleepqueue, db_show_sleepqueue); #endif Index: head/sys/kern/subr_smp.c =================================================================== --- head/sys/kern/subr_smp.c (revision 326270) +++ head/sys/kern/subr_smp.c (revision 326271) @@ -1,1152 +1,1154 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2001, John Baldwin . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ /* * This module holds the global variables and machine independent functions * used for the kernel SMP support. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_sched.h" #ifdef SMP MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data"); volatile cpuset_t stopped_cpus; volatile cpuset_t started_cpus; volatile cpuset_t suspended_cpus; cpuset_t hlt_cpus_mask; cpuset_t logical_cpus_mask; void (*cpustop_restartfunc)(void); #endif static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS); /* This is used in modules that need to work in both SMP and UP. */ cpuset_t all_cpus; int mp_ncpus; /* export this for libkvm consumers. */ int mp_maxcpus = MAXCPU; volatile int smp_started; u_int mp_maxid; static SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD|CTLFLAG_CAPRD, NULL, "Kernel SMP"); SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0, "Max CPU ID."); SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus, 0, "Max number of CPUs that the system was compiled for."); SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD|CTLTYPE_INT|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_smp_active, "I", "Indicates system is running in SMP mode"); int smp_disabled = 0; /* has smp been disabled? */ SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD, &smp_disabled, 0, "SMP has been disabled from the loader"); int smp_cpus = 1; /* how many cpu's running */ SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0, "Number of CPUs online"); int smp_topology = 0; /* Which topology we're using. */ SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RDTUN, &smp_topology, 0, "Topology override setting; 0 is default provided by hardware."); #ifdef SMP /* Enable forwarding of a signal to a process running on a different CPU */ static int forward_signal_enabled = 1; SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW, &forward_signal_enabled, 0, "Forwarding of a signal to a process on a different CPU"); /* Variables needed for SMP rendezvous. */ static volatile int smp_rv_ncpus; static void (*volatile smp_rv_setup_func)(void *arg); static void (*volatile smp_rv_action_func)(void *arg); static void (*volatile smp_rv_teardown_func)(void *arg); static void *volatile smp_rv_func_arg; static volatile int smp_rv_waiters[4]; /* * Shared mutex to restrict busywaits between smp_rendezvous() and * smp(_targeted)_tlb_shootdown(). A deadlock occurs if both of these * functions trigger at once and cause multiple CPUs to busywait with * interrupts disabled. */ struct mtx smp_ipi_mtx; /* * Let the MD SMP code initialize mp_maxid very early if it can. */ static void mp_setmaxid(void *dummy) { cpu_mp_setmaxid(); KASSERT(mp_ncpus >= 1, ("%s: CPU count < 1", __func__)); KASSERT(mp_ncpus > 1 || mp_maxid == 0, ("%s: one CPU but mp_maxid is not zero", __func__)); KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); } SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL); /* * Call the MD SMP initialization code. */ static void mp_start(void *dummy) { mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN); /* Probe for MP hardware. 
*/ if (smp_disabled != 0 || cpu_mp_probe() == 0) { mp_ncpus = 1; CPU_SETOF(PCPU_GET(cpuid), &all_cpus); return; } cpu_mp_start(); printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n", mp_ncpus); cpu_mp_announce(); } SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL); void forward_signal(struct thread *td) { int id; /* * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHK on * this thread, so all we need to do is poke it if it is currently * executing so that it executes ast(). */ THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_RUNNING(td), ("forward_signal: thread is not TDS_RUNNING")); CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc); if (!smp_started || cold || panicstr) return; if (!forward_signal_enabled) return; /* No need to IPI ourself. */ if (td == curthread) return; id = td->td_oncpu; if (id == NOCPU) return; ipi_cpu(id, IPI_AST); } /* * When called, the executing CPU will send an IPI to all other CPUs * requesting that they halt execution. * * Usually (but not necessarily) called with 'other_cpus' as its arg. * * - Signals all CPUs in map to stop. * - Waits for each to stop. * * Returns: * -1: error * 0: NA * 1: ok * */ #if defined(__amd64__) || defined(__i386__) #define X86 1 #else #define X86 0 #endif static int generic_stop_cpus(cpuset_t map, u_int type) { #ifdef KTR char cpusetbuf[CPUSETBUFSIZ]; #endif static volatile u_int stopping_cpu = NOCPU; int i; volatile cpuset_t *cpus; KASSERT( type == IPI_STOP || type == IPI_STOP_HARD #if X86 || type == IPI_SUSPEND #endif , ("%s: invalid stop type", __func__)); if (!smp_started) return (0); CTR2(KTR_SMP, "stop_cpus(%s) with %u type", cpusetobj_strprint(cpusetbuf, &map), type); #if X86 /* * When suspending, ensure there are no IPIs in progress. * IPIs that have been issued, but not yet delivered (e.g. * not pending on a vCPU when running under virtualization) * will be lost, violating FreeBSD's assumption of reliable * IPI delivery. */ if (type == IPI_SUSPEND) mtx_lock_spin(&smp_ipi_mtx); #endif #if X86 if (!nmi_is_broadcast || nmi_kdb_lock == 0) { #endif if (stopping_cpu != PCPU_GET(cpuid)) while (atomic_cmpset_int(&stopping_cpu, NOCPU, PCPU_GET(cpuid)) == 0) while (stopping_cpu != NOCPU) cpu_spinwait(); /* spin */ /* send the stop IPI to all CPUs in map */ ipi_selected(map, type); #if X86 } #endif #if X86 if (type == IPI_SUSPEND) cpus = &suspended_cpus; else #endif cpus = &stopped_cpus; i = 0; while (!CPU_SUBSET(cpus, &map)) { /* spin */ cpu_spinwait(); i++; if (i == 100000000) { printf("timeout stopping cpus\n"); break; } } #if X86 if (type == IPI_SUSPEND) mtx_unlock_spin(&smp_ipi_mtx); #endif stopping_cpu = NOCPU; return (1); } int stop_cpus(cpuset_t map) { return (generic_stop_cpus(map, IPI_STOP)); } int stop_cpus_hard(cpuset_t map) { return (generic_stop_cpus(map, IPI_STOP_HARD)); } #if X86 int suspend_cpus(cpuset_t map) { return (generic_stop_cpus(map, IPI_SUSPEND)); } #endif /* * Called by a CPU to restart stopped CPUs. * * Usually (but not necessarily) called with 'stopped_cpus' as its arg. * * - Signals all CPUs in map to restart. * - Waits for each to restart.
* * Returns: * -1: error * 0: NA * 1: ok */ static int generic_restart_cpus(cpuset_t map, u_int type) { #ifdef KTR char cpusetbuf[CPUSETBUFSIZ]; #endif volatile cpuset_t *cpus; KASSERT(type == IPI_STOP || type == IPI_STOP_HARD #if X86 || type == IPI_SUSPEND #endif , ("%s: invalid stop type", __func__)); if (!smp_started) return (0); CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map)); #if X86 if (type == IPI_SUSPEND) cpus = &suspended_cpus; else #endif cpus = &stopped_cpus; /* signal other cpus to restart */ CPU_COPY_STORE_REL(&map, &started_cpus); #if X86 if (!nmi_is_broadcast || nmi_kdb_lock == 0) { #endif /* wait for each to clear its bit */ while (CPU_OVERLAP(cpus, &map)) cpu_spinwait(); #if X86 } #endif return (1); } int restart_cpus(cpuset_t map) { return (generic_restart_cpus(map, IPI_STOP)); } #if X86 int resume_cpus(cpuset_t map) { return (generic_restart_cpus(map, IPI_SUSPEND)); } #endif #undef X86 /* * All-CPU rendezvous. CPUs are signalled, all execute the setup function * (if specified), rendezvous, execute the action function (if specified), * rendezvous again, execute the teardown function (if specified), and then * resume. * * Note that the supplied external functions _must_ be reentrant and aware * that they are running in parallel and in an unknown lock context. */ void smp_rendezvous_action(void) { struct thread *td; void *local_func_arg; void (*local_setup_func)(void*); void (*local_action_func)(void*); void (*local_teardown_func)(void*); #ifdef INVARIANTS int owepreempt; #endif /* Ensure we have up-to-date values. */ atomic_add_acq_int(&smp_rv_waiters[0], 1); while (smp_rv_waiters[0] < smp_rv_ncpus) cpu_spinwait(); /* Fetch rendezvous parameters after acquire barrier. */ local_func_arg = smp_rv_func_arg; local_setup_func = smp_rv_setup_func; local_action_func = smp_rv_action_func; local_teardown_func = smp_rv_teardown_func; /* * Use a nested critical section to prevent any preemptions * from occurring during a rendezvous action routine. * Specifically, if a rendezvous handler is invoked via an IPI * and the interrupted thread was in the critical_exit() * function after setting td_critnest to 0 but before * performing a deferred preemption, this routine can be * invoked with td_critnest set to 0 and td_owepreempt true. * In that case, a critical_exit() during the rendezvous * action would trigger a preemption which is not permitted in * a rendezvous action. To fix this, wrap all of the * rendezvous action handlers in a critical section. We * cannot use a regular critical section however as having * critical_exit() preempt from this routine would also be * problematic (the preemption must not occur before the IPI * has been acknowledged via an EOI). Instead, we * intentionally ignore td_owepreempt when leaving the * critical section. This should be harmless because we do * not permit rendezvous action routines to schedule threads, * and thus td_owepreempt should never transition from 0 to 1 * during this routine. */ td = curthread; td->td_critnest++; #ifdef INVARIANTS owepreempt = td->td_owepreempt; #endif /* * If requested, run a setup function before the main action * function. Ensure all CPUs have completed the setup * function before moving on to the action function. 
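 *
 * (Each barrier in this function has the same shape: every
 * participating CPU atomically increments smp_rv_waiters[n] and then
 * spins until the counter reaches smp_rv_ncpus, so no CPU can begin
 * stage n+1 until all CPUs have finished stage n.)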
*/ if (local_setup_func != smp_no_rendezvous_barrier) { if (smp_rv_setup_func != NULL) smp_rv_setup_func(smp_rv_func_arg); atomic_add_int(&smp_rv_waiters[1], 1); while (smp_rv_waiters[1] < smp_rv_ncpus) cpu_spinwait(); } if (local_action_func != NULL) local_action_func(local_func_arg); if (local_teardown_func != smp_no_rendezvous_barrier) { /* * Signal that the main action has been completed. If a * full exit rendezvous is requested, then all CPUs will * wait here until all CPUs have finished the main action. */ atomic_add_int(&smp_rv_waiters[2], 1); while (smp_rv_waiters[2] < smp_rv_ncpus) cpu_spinwait(); if (local_teardown_func != NULL) local_teardown_func(local_func_arg); } /* * Signal that the rendezvous is fully completed by this CPU. * This means that no member of smp_rv_* pseudo-structure will be * accessed by this target CPU after this point; in particular, * memory pointed to by smp_rv_func_arg. * * The release semantic ensures that all accesses performed by * the current CPU are visible when smp_rendezvous_cpus() * returns, by synchronizing with the * atomic_load_acq_int(&smp_rv_waiters[3]). */ atomic_add_rel_int(&smp_rv_waiters[3], 1); td->td_critnest--; KASSERT(owepreempt == td->td_owepreempt, ("rendezvous action changed td_owepreempt")); } void smp_rendezvous_cpus(cpuset_t map, void (* setup_func)(void *), void (* action_func)(void *), void (* teardown_func)(void *), void *arg) { int curcpumap, i, ncpus = 0; /* See the comments in the !SMP case. */ if (!smp_started) { spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); return; } CPU_FOREACH(i) { if (CPU_ISSET(i, &map)) ncpus++; } if (ncpus == 0) panic("ncpus is 0 with non-zero map"); mtx_lock_spin(&smp_ipi_mtx); /* Pass rendezvous parameters via global variables. */ smp_rv_ncpus = ncpus; smp_rv_setup_func = setup_func; smp_rv_action_func = action_func; smp_rv_teardown_func = teardown_func; smp_rv_func_arg = arg; smp_rv_waiters[1] = 0; smp_rv_waiters[2] = 0; smp_rv_waiters[3] = 0; atomic_store_rel_int(&smp_rv_waiters[0], 0); /* * Signal other processors, which will enter the IPI with * interrupts off. */ curcpumap = CPU_ISSET(curcpu, &map); CPU_CLR(curcpu, &map); ipi_selected(map, IPI_RENDEZVOUS); /* Check if the current CPU is in the map */ if (curcpumap != 0) smp_rendezvous_action(); /* * Ensure that the master CPU waits for all the other * CPUs to finish the rendezvous, so that smp_rv_* * pseudo-structure and the arg are guaranteed to not * be in use. * * Load acquire synchronizes with the release add in * smp_rendezvous_action(), which ensures that our caller sees * all memory actions done by the called functions on other * CPUs. */ while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus) cpu_spinwait(); mtx_unlock_spin(&smp_ipi_mtx); } void smp_rendezvous(void (* setup_func)(void *), void (* action_func)(void *), void (* teardown_func)(void *), void *arg) { smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg); } static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1]; struct cpu_group * smp_topo(void) { char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ]; struct cpu_group *top; /* * Check for a fake topology request for debugging purposes. */ switch (smp_topology) { case 1: /* Dual core with no sharing. */ top = smp_topo_1level(CG_SHARE_NONE, 2, 0); break; case 2: /* No topology, all cpus are equal. */ top = smp_topo_none(); break; case 3: /* Dual core with shared L2.
*/ top = smp_topo_1level(CG_SHARE_L2, 2, 0); break; case 4: /* quad core, shared l3 among each package, private l2. */ top = smp_topo_1level(CG_SHARE_L3, 4, 0); break; case 5: /* quad core, 2 dualcore parts on each package share l2. */ top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0); break; case 6: /* Single-core 2xHTT */ top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT); break; case 7: /* quad core with a shared l3, 8 threads sharing L2. */ top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8, CG_FLAG_SMT); break; default: /* Default, ask the system what it wants. */ top = cpu_topo(); break; } /* * Verify the returned topology. */ if (top->cg_count != mp_ncpus) panic("Built bad topology at %p. CPU count %d != %d", top, top->cg_count, mp_ncpus); if (CPU_CMP(&top->cg_mask, &all_cpus)) panic("Built bad topology at %p. CPU mask (%s) != (%s)", top, cpusetobj_strprint(cpusetbuf, &top->cg_mask), cpusetobj_strprint(cpusetbuf2, &all_cpus)); /* * Collapse nonsense levels that may be created out of convenience by * the MD layers. They cause extra work in the search functions. */ while (top->cg_children == 1) { top = &top->cg_child[0]; top->cg_parent = NULL; } return (top); } struct cpu_group * smp_topo_alloc(u_int count) { static u_int index; u_int curr; curr = index; index += count; return (&group[curr]); } struct cpu_group * smp_topo_none(void) { struct cpu_group *top; top = &group[0]; top->cg_parent = NULL; top->cg_child = NULL; top->cg_mask = all_cpus; top->cg_count = mp_ncpus; top->cg_children = 0; top->cg_level = CG_SHARE_NONE; top->cg_flags = 0; return (top); } static int smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share, int count, int flags, int start) { char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ]; cpuset_t mask; int i; CPU_ZERO(&mask); for (i = 0; i < count; i++, start++) CPU_SET(start, &mask); child->cg_parent = parent; child->cg_child = NULL; child->cg_children = 0; child->cg_level = share; child->cg_count = count; child->cg_flags = flags; child->cg_mask = mask; parent->cg_children++; for (; parent != NULL; parent = parent->cg_parent) { if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask)) panic("Duplicate children in %p. 
mask (%s) child (%s)", parent, cpusetobj_strprint(cpusetbuf, &parent->cg_mask), cpusetobj_strprint(cpusetbuf2, &child->cg_mask)); CPU_OR(&parent->cg_mask, &child->cg_mask); parent->cg_count += child->cg_count; } return (start); } struct cpu_group * smp_topo_1level(int share, int count, int flags) { struct cpu_group *child; struct cpu_group *top; int packages; int cpu; int i; cpu = 0; top = &group[0]; packages = mp_ncpus / count; top->cg_child = child = &group[1]; top->cg_level = CG_SHARE_NONE; for (i = 0; i < packages; i++, child++) cpu = smp_topo_addleaf(top, child, share, count, flags, cpu); return (top); } struct cpu_group * smp_topo_2level(int l2share, int l2count, int l1share, int l1count, int l1flags) { struct cpu_group *top; struct cpu_group *l1g; struct cpu_group *l2g; int cpu; int i; int j; cpu = 0; top = &group[0]; l2g = &group[1]; top->cg_child = l2g; top->cg_level = CG_SHARE_NONE; top->cg_children = mp_ncpus / (l2count * l1count); l1g = l2g + top->cg_children; for (i = 0; i < top->cg_children; i++, l2g++) { l2g->cg_parent = top; l2g->cg_child = l1g; l2g->cg_level = l2share; for (j = 0; j < l2count; j++, l1g++) cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count, l1flags, cpu); } return (top); } struct cpu_group * smp_topo_find(struct cpu_group *top, int cpu) { struct cpu_group *cg; cpuset_t mask; int children; int i; CPU_SETOF(cpu, &mask); cg = top; for (;;) { if (!CPU_OVERLAP(&cg->cg_mask, &mask)) return (NULL); if (cg->cg_children == 0) return (cg); children = cg->cg_children; for (i = 0, cg = cg->cg_child; i < children; cg++, i++) if (CPU_OVERLAP(&cg->cg_mask, &mask)) break; } return (NULL); } #else /* !SMP */ void smp_rendezvous_cpus(cpuset_t map, void (*setup_func)(void *), void (*action_func)(void *), void (*teardown_func)(void *), void *arg) { /* * In the !SMP case we just need to ensure the same initial conditions * as the SMP case. */ spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); } void smp_rendezvous(void (*setup_func)(void *), void (*action_func)(void *), void (*teardown_func)(void *), void *arg) { smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg); } /* * Provide dummy SMP support for UP kernels. Modules that need to use SMP * APIs will still work using this dummy support. */ static void mp_setvariables_for_up(void *dummy) { mp_ncpus = 1; mp_maxid = PCPU_GET(cpuid); CPU_SETOF(mp_maxid, &all_cpus); KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero")); } SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setvariables_for_up, NULL); #endif /* SMP */ void smp_no_rendezvous_barrier(void *dummy) { #ifdef SMP KASSERT((!smp_started),("smp_no_rendezvous called and smp is started")); #endif } /* * Wait for specified idle threads to switch once. This ensures that even * preempted threads have cycled through the switch function once, * exiting their codepaths. This allows us to change global pointers * with no other synchronization. 
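 *
 * The mechanism is a generation count: the td_generation of every idle
 * thread in the map is snapshotted first, and the caller then binds
 * itself to each CPU in turn until the observed generation advances.
 * A minimal userspace sketch of the same snapshot-then-wait idea, for
 * illustration only (NWORKERS, gen[] and observer_wait() are
 * hypothetical):
 */
#if 0
#define NWORKERS        4

/* Each worker bumps its slot every time it passes its "switch" point. */
static volatile unsigned int gen[NWORKERS];

static void
observer_wait(void)
{
        unsigned int snap[NWORKERS];
        int i;

        for (i = 0; i < NWORKERS; i++)
                snap[i] = gen[i];
        for (i = 0; i < NWORKERS; i++)
                while (gen[i] == snap[i])
                        ;       /* spin; quiesce_cpus() tsleep()s instead */
}
#endif
/*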
*/ int quiesce_cpus(cpuset_t map, const char *wmesg, int prio) { struct pcpu *pcpu; u_int gen[MAXCPU]; int error; int cpu; error = 0; for (cpu = 0; cpu <= mp_maxid; cpu++) { if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu)) continue; pcpu = pcpu_find(cpu); gen[cpu] = pcpu->pc_idlethread->td_generation; } for (cpu = 0; cpu <= mp_maxid; cpu++) { if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu)) continue; pcpu = pcpu_find(cpu); thread_lock(curthread); sched_bind(curthread, cpu); thread_unlock(curthread); while (gen[cpu] == pcpu->pc_idlethread->td_generation) { error = tsleep(quiesce_cpus, prio, wmesg, 1); if (error != EWOULDBLOCK) goto out; error = 0; } } out: thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); return (error); } int quiesce_all_cpus(const char *wmesg, int prio) { return quiesce_cpus(all_cpus, wmesg, prio); } /* Extra care is taken with this sysctl because the data type is volatile */ static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS) { int error, active; active = smp_started; error = SYSCTL_OUT(req, &active, sizeof(active)); return (error); } #ifdef SMP void topo_init_node(struct topo_node *node) { bzero(node, sizeof(*node)); TAILQ_INIT(&node->children); } void topo_init_root(struct topo_node *root) { topo_init_node(root); root->type = TOPO_TYPE_SYSTEM; } /* * Add a child node with the given ID under the given parent. * If there is already a matching child, return it instead of * creating a new one. */ struct topo_node * topo_add_node_by_hwid(struct topo_node *parent, int hwid, topo_node_type type, uintptr_t subtype) { struct topo_node *node; TAILQ_FOREACH_REVERSE(node, &parent->children, topo_children, siblings) { if (node->hwid == hwid && node->type == type && node->subtype == subtype) { return (node); } } node = malloc(sizeof(*node), M_TOPO, M_WAITOK); topo_init_node(node); node->parent = parent; node->hwid = hwid; node->type = type; node->subtype = subtype; TAILQ_INSERT_TAIL(&parent->children, node, siblings); parent->nchildren++; return (node); } /* * Find a child node with the given ID under the given parent. */ struct topo_node * topo_find_node_by_hwid(struct topo_node *parent, int hwid, topo_node_type type, uintptr_t subtype) { struct topo_node *node; TAILQ_FOREACH(node, &parent->children, siblings) { if (node->hwid == hwid && node->type == type && node->subtype == subtype) { return (node); } } return (NULL); } /* * Given a node, change the order of its parent's child nodes such * that the node becomes the first child while preserving the cyclic * order of the children. In other words, the given node is promoted * by rotation. */ void topo_promote_child(struct topo_node *child) { struct topo_node *next; struct topo_node *node; struct topo_node *parent; parent = child->parent; next = TAILQ_NEXT(child, siblings); TAILQ_REMOVE(&parent->children, child, siblings); TAILQ_INSERT_HEAD(&parent->children, child, siblings); while (next != NULL) { node = next; next = TAILQ_NEXT(node, siblings); TAILQ_REMOVE(&parent->children, node, siblings); TAILQ_INSERT_AFTER(&parent->children, child, node, siblings); child = node; } } /* * Iterate to the next node in the depth-first search (traversal) of * the topology tree.
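 *
 * For example, given the tree
 *
 *      root
 *       +- a
 *       |   +- a1
 *       |   +- a2
 *       +- b
 *
 * successive calls starting from topo_next_node(root, root) visit
 * a, a1, a2 and b, and then return NULL.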
*/ struct topo_node * topo_next_node(struct topo_node *top, struct topo_node *node) { struct topo_node *next; if ((next = TAILQ_FIRST(&node->children)) != NULL) return (next); if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); while (node != top && (node = node->parent) != top) if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); return (NULL); } /* * Iterate to the next node in the depth-first search of the topology tree, * but without descending below the current node. */ struct topo_node * topo_next_nonchild_node(struct topo_node *top, struct topo_node *node) { struct topo_node *next; if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); while (node != top && (node = node->parent) != top) if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); return (NULL); } /* * Assign the given ID to the given topology node that represents a logical * processor. */ void topo_set_pu_id(struct topo_node *node, cpuid_t id) { KASSERT(node->type == TOPO_TYPE_PU, ("topo_set_pu_id: wrong node type: %u", node->type)); KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0, ("topo_set_pu_id: cpuset already not empty")); node->id = id; CPU_SET(id, &node->cpuset); node->cpu_count = 1; node->subtype = 1; while ((node = node->parent) != NULL) { KASSERT(!CPU_ISSET(id, &node->cpuset), ("logical ID %u is already set in node %p", id, node)); CPU_SET(id, &node->cpuset); node->cpu_count++; } } static struct topology_spec { topo_node_type type; bool match_subtype; uintptr_t subtype; } topology_level_table[TOPO_LEVEL_COUNT] = { [TOPO_LEVEL_PKG] = { .type = TOPO_TYPE_PKG, }, [TOPO_LEVEL_GROUP] = { .type = TOPO_TYPE_GROUP, }, [TOPO_LEVEL_CACHEGROUP] = { .type = TOPO_TYPE_CACHE, .match_subtype = true, .subtype = CG_SHARE_L3, }, [TOPO_LEVEL_CORE] = { .type = TOPO_TYPE_CORE, }, [TOPO_LEVEL_THREAD] = { .type = TOPO_TYPE_PU, }, }; static bool topo_analyze_table(struct topo_node *root, int all, enum topo_level level, struct topo_analysis *results) { struct topology_spec *spec; struct topo_node *node; int count; if (level >= TOPO_LEVEL_COUNT) return (true); spec = &topology_level_table[level]; count = 0; node = topo_next_node(root, root); while (node != NULL) { if (node->type != spec->type || (spec->match_subtype && node->subtype != spec->subtype)) { node = topo_next_node(root, node); continue; } if (!all && CPU_EMPTY(&node->cpuset)) { node = topo_next_nonchild_node(root, node); continue; } count++; if (!topo_analyze_table(node, all, level + 1, results)) return (false); node = topo_next_nonchild_node(root, node); } /* No explicit subgroups is essentially one subgroup. */ if (count == 0) { count = 1; if (!topo_analyze_table(root, all, level + 1, results)) return (false); } if (results->entities[level] == -1) results->entities[level] = count; else if (results->entities[level] != count) return (false); return (true); } /* * Check if the topology is uniform, that is, each package has the same number * of cores in it and each core has the same number of threads (logical * processors) in it. If so, calculate the number of packages, the number of * groups per package, the number of cachegroups per group, and the number of * logical processors per cachegroup. 'all' parameter tells whether to include * administratively disabled logical processors into the analysis. 
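 *
 * For instance, a uniform machine with two packages, eight cores per
 * package and two SMT threads per core yields
 * entities[TOPO_LEVEL_PKG] = 2, entities[TOPO_LEVEL_CORE] = 8 and
 * entities[TOPO_LEVEL_THREAD] = 2 (levels with no explicit nodes, such
 * as GROUP, count as one subgroup), while packages with differing core
 * counts make the analysis fail.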
*/ int topo_analyze(struct topo_node *topo_root, int all, struct topo_analysis *results) { results->entities[TOPO_LEVEL_PKG] = -1; results->entities[TOPO_LEVEL_CORE] = -1; results->entities[TOPO_LEVEL_THREAD] = -1; results->entities[TOPO_LEVEL_GROUP] = -1; results->entities[TOPO_LEVEL_CACHEGROUP] = -1; if (!topo_analyze_table(topo_root, all, TOPO_LEVEL_PKG, results)) return (0); KASSERT(results->entities[TOPO_LEVEL_PKG] > 0, ("bug in topology or analysis")); return (1); } #endif /* SMP */ Index: head/sys/kern/subr_stack.c =================================================================== --- head/sys/kern/subr_stack.c (revision 326270) +++ head/sys/kern/subr_stack.c (revision 326271) @@ -1,277 +1,279 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 Antoine Brodin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include "opt_ddb.h" #include __FBSDID("$FreeBSD$"); #include #include #ifdef KTR #include #endif #include #include #include #include #include #include FEATURE(stack, "Support for capturing kernel stack"); static MALLOC_DEFINE(M_STACK, "stack", "Stack Traces"); static int stack_symbol(vm_offset_t pc, char *namebuf, u_int buflen, long *offset); static int stack_symbol_ddb(vm_offset_t pc, const char **name, long *offset); struct stack * stack_create(int flags) { struct stack *st; st = malloc(sizeof(*st), M_STACK, flags | M_ZERO); return (st); } void stack_destroy(struct stack *st) { free(st, M_STACK); } int stack_put(struct stack *st, vm_offset_t pc) { if (st->depth < STACK_MAX) { st->pcs[st->depth++] = pc; return (0); } else return (-1); } void stack_copy(const struct stack *src, struct stack *dst) { *dst = *src; } void stack_zero(struct stack *st) { bzero(st, sizeof *st); } void stack_print(const struct stack *st) { char namebuf[64]; long offset; int i; KASSERT(st->depth <= STACK_MAX, ("bogus stack")); for (i = 0; i < st->depth; i++) { (void)stack_symbol(st->pcs[i], namebuf, sizeof(namebuf), &offset); printf("#%d %p at %s+%#lx\n", i, (void *)st->pcs[i], namebuf, offset); } } void stack_print_short(const struct stack *st) { char namebuf[64]; long offset; int i; KASSERT(st->depth <= STACK_MAX, ("bogus stack")); for (i = 0; i < st->depth; i++) { if (i > 0) printf(" "); if (stack_symbol(st->pcs[i], namebuf, sizeof(namebuf), &offset) == 0) printf("%s+%#lx", namebuf, offset); else printf("%p", (void *)st->pcs[i]); } printf("\n"); } void stack_print_ddb(const struct stack *st) { const char *name; long offset; int i; KASSERT(st->depth <= STACK_MAX, ("bogus stack")); for (i = 0; i < st->depth; i++) { stack_symbol_ddb(st->pcs[i], &name, &offset); printf("#%d %p at %s+%#lx\n", i, (void *)st->pcs[i], name, offset); } } #if defined(DDB) || defined(WITNESS) void stack_print_short_ddb(const struct stack *st) { const char *name; long offset; int i; KASSERT(st->depth <= STACK_MAX, ("bogus stack")); for (i = 0; i < st->depth; i++) { if (i > 0) printf(" "); if (stack_symbol_ddb(st->pcs[i], &name, &offset) == 0) printf("%s+%#lx", name, offset); else printf("%p", (void *)st->pcs[i]); } printf("\n"); } #endif /* * Two print routines -- one for use from DDB and DDB-like contexts, the * other for use in the live kernel. 
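stack_put() is the whole growth policy of this facility: PCs are appended to a fixed array and overflow is reported with -1 rather than reallocating, which keeps capture safe in any context. A self-contained userland sketch of that contract; the struct is a stand-in, and STACK_MAX here matches the 18 slots the KTR path below records, not necessarily the kernel header's value:

#include <stdio.h>

#define STACK_MAX 18    /* assumed bound; mirrors the 3 x 6 KTR groups */

struct stack {
    int depth;
    unsigned long pcs[STACK_MAX];
};

/* Append one PC; refuse, and report, once the buffer is full. */
static int
stack_put(struct stack *st, unsigned long pc)
{
    if (st->depth < STACK_MAX) {
        st->pcs[st->depth++] = pc;
        return (0);
    }
    return (-1);
}

int
main(void)
{
    struct stack st = { 0 };

    for (unsigned long pc = 0x1000; pc < 0x1000 + 20 * 8; pc += 8)
        if (stack_put(&st, pc) != 0)
            printf("dropped %#lx\n", pc);   /* the last two PCs */
    for (int i = 0; i < st.depth; i++)
        printf("#%d %#lx\n", i, st.pcs[i]);
    return (0);
}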
*/ void stack_sbuf_print(struct sbuf *sb, const struct stack *st) { char namebuf[64]; long offset; int i; KASSERT(st->depth <= STACK_MAX, ("bogus stack")); for (i = 0; i < st->depth; i++) { (void)stack_symbol(st->pcs[i], namebuf, sizeof(namebuf), &offset); sbuf_printf(sb, "#%d %p at %s+%#lx\n", i, (void *)st->pcs[i], namebuf, offset); } } #if defined(DDB) || defined(WITNESS) void stack_sbuf_print_ddb(struct sbuf *sb, const struct stack *st) { const char *name; long offset; int i; KASSERT(st->depth <= STACK_MAX, ("bogus stack")); for (i = 0; i < st->depth; i++) { (void)stack_symbol_ddb(st->pcs[i], &name, &offset); sbuf_printf(sb, "#%d %p at %s+%#lx\n", i, (void *)st->pcs[i], name, offset); } } #endif #ifdef KTR void stack_ktr(u_int mask, const char *file, int line, const struct stack *st, u_int depth, int cheap) { #ifdef DDB const char *name; long offset; int i; #endif KASSERT(st->depth <= STACK_MAX, ("bogus stack")); if (cheap) { ktr_tracepoint(mask, file, line, "#0 %p %p %p %p %p %p", st->pcs[0], st->pcs[1], st->pcs[2], st->pcs[3], st->pcs[4], st->pcs[5]); if (st->depth <= 6) return; ktr_tracepoint(mask, file, line, "#1 %p %p %p %p %p %p", st->pcs[6], st->pcs[7], st->pcs[8], st->pcs[9], st->pcs[10], st->pcs[11]); if (st->depth <= 12) return; ktr_tracepoint(mask, file, line, "#2 %p %p %p %p %p %p", st->pcs[12], st->pcs[13], st->pcs[14], st->pcs[15], st->pcs[16], st->pcs[17]); #ifdef DDB } else { if (depth == 0 || st->depth < depth) depth = st->depth; for (i = 0; i < depth; i++) { (void)stack_symbol_ddb(st->pcs[i], &name, &offset); ktr_tracepoint(mask, file, line, "#%d %p at %s+%#lx", i, st->pcs[i], (u_long)name, offset, 0, 0); } #endif } } #endif /* * Two variants of stack symbol lookup -- one that uses the DDB interfaces * and bypasses linker locking, and the other that doesn't. */ static int stack_symbol(vm_offset_t pc, char *namebuf, u_int buflen, long *offset) { if (linker_search_symbol_name((caddr_t)pc, namebuf, buflen, offset) != 0) { *offset = 0; strlcpy(namebuf, "??", buflen); return (ENOENT); } else return (0); } static int stack_symbol_ddb(vm_offset_t pc, const char **name, long *offset) { linker_symval_t symval; c_linker_sym_t sym; if (linker_ddb_search_symbol((caddr_t)pc, &sym, offset) != 0) goto out; if (linker_ddb_symbol_values(sym, &symval) != 0) goto out; if (symval.name != NULL) { *name = symval.name; return (0); } out: *offset = 0; *name = "??"; return (ENOENT); } Index: head/sys/kern/subr_taskqueue.c =================================================================== --- head/sys/kern/subr_taskqueue.c (revision 326270) +++ head/sys/kern/subr_taskqueue.c (revision 326271) @@ -1,844 +1,846 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2000 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_TASKQUEUE, "taskqueue", "Task Queues"); static void *taskqueue_giant_ih; static void *taskqueue_ih; static void taskqueue_fast_enqueue(void *); static void taskqueue_swi_enqueue(void *); static void taskqueue_swi_giant_enqueue(void *); struct taskqueue_busy { struct task *tb_running; TAILQ_ENTRY(taskqueue_busy) tb_link; }; struct task * const TB_DRAIN_WAITER = (struct task *)0x1; struct taskqueue { STAILQ_HEAD(, task) tq_queue; taskqueue_enqueue_fn tq_enqueue; void *tq_context; char *tq_name; TAILQ_HEAD(, taskqueue_busy) tq_active; struct mtx tq_mutex; struct thread **tq_threads; int tq_tcount; int tq_spin; int tq_flags; int tq_callouts; taskqueue_callback_fn tq_callbacks[TASKQUEUE_NUM_CALLBACKS]; void *tq_cb_contexts[TASKQUEUE_NUM_CALLBACKS]; }; #define TQ_FLAGS_ACTIVE (1 << 0) #define TQ_FLAGS_BLOCKED (1 << 1) #define TQ_FLAGS_UNLOCKED_ENQUEUE (1 << 2) #define DT_CALLOUT_ARMED (1 << 0) #define DT_DRAIN_IN_PROGRESS (1 << 1) #define TQ_LOCK(tq) \ do { \ if ((tq)->tq_spin) \ mtx_lock_spin(&(tq)->tq_mutex); \ else \ mtx_lock(&(tq)->tq_mutex); \ } while (0) #define TQ_ASSERT_LOCKED(tq) mtx_assert(&(tq)->tq_mutex, MA_OWNED) #define TQ_UNLOCK(tq) \ do { \ if ((tq)->tq_spin) \ mtx_unlock_spin(&(tq)->tq_mutex); \ else \ mtx_unlock(&(tq)->tq_mutex); \ } while (0) #define TQ_ASSERT_UNLOCKED(tq) mtx_assert(&(tq)->tq_mutex, MA_NOTOWNED) void _timeout_task_init(struct taskqueue *queue, struct timeout_task *timeout_task, int priority, task_fn_t func, void *context) { TASK_INIT(&timeout_task->t, priority, func, context); callout_init_mtx(&timeout_task->c, &queue->tq_mutex, CALLOUT_RETURNUNLOCKED); timeout_task->q = queue; timeout_task->f = 0; } static __inline int TQ_SLEEP(struct taskqueue *tq, void *p, struct mtx *m, int pri, const char *wm, int t) { if (tq->tq_spin) return (msleep_spin(p, m, wm, t)); return (msleep(p, m, pri, wm, t)); } static struct taskqueue * _taskqueue_create(const char *name, int mflags, taskqueue_enqueue_fn enqueue, void *context, int mtxflags, const char *mtxname __unused) { struct taskqueue *queue; char *tq_name; tq_name = malloc(TASKQUEUE_NAMELEN, M_TASKQUEUE, mflags | M_ZERO); if (tq_name == NULL) return (NULL); queue = malloc(sizeof(struct taskqueue), M_TASKQUEUE, mflags | M_ZERO); if (queue == NULL) { free(tq_name, M_TASKQUEUE); return (NULL); } snprintf(tq_name, TASKQUEUE_NAMELEN, "%s", (name) ? 
name : "taskqueue"); STAILQ_INIT(&queue->tq_queue); TAILQ_INIT(&queue->tq_active); queue->tq_enqueue = enqueue; queue->tq_context = context; queue->tq_name = tq_name; queue->tq_spin = (mtxflags & MTX_SPIN) != 0; queue->tq_flags |= TQ_FLAGS_ACTIVE; if (enqueue == taskqueue_fast_enqueue || enqueue == taskqueue_swi_enqueue || enqueue == taskqueue_swi_giant_enqueue || enqueue == taskqueue_thread_enqueue) queue->tq_flags |= TQ_FLAGS_UNLOCKED_ENQUEUE; mtx_init(&queue->tq_mutex, tq_name, NULL, mtxflags); return (queue); } struct taskqueue * taskqueue_create(const char *name, int mflags, taskqueue_enqueue_fn enqueue, void *context) { return _taskqueue_create(name, mflags, enqueue, context, MTX_DEF, name); } void taskqueue_set_callback(struct taskqueue *queue, enum taskqueue_callback_type cb_type, taskqueue_callback_fn callback, void *context) { KASSERT(((cb_type >= TASKQUEUE_CALLBACK_TYPE_MIN) && (cb_type <= TASKQUEUE_CALLBACK_TYPE_MAX)), ("Callback type %d not valid, must be %d-%d", cb_type, TASKQUEUE_CALLBACK_TYPE_MIN, TASKQUEUE_CALLBACK_TYPE_MAX)); KASSERT((queue->tq_callbacks[cb_type] == NULL), ("Re-initialization of taskqueue callback?")); queue->tq_callbacks[cb_type] = callback; queue->tq_cb_contexts[cb_type] = context; } /* * Signal a taskqueue thread to terminate. */ static void taskqueue_terminate(struct thread **pp, struct taskqueue *tq) { while (tq->tq_tcount > 0 || tq->tq_callouts > 0) { wakeup(tq); TQ_SLEEP(tq, pp, &tq->tq_mutex, PWAIT, "taskqueue_destroy", 0); } } void taskqueue_free(struct taskqueue *queue) { TQ_LOCK(queue); queue->tq_flags &= ~TQ_FLAGS_ACTIVE; taskqueue_terminate(queue->tq_threads, queue); KASSERT(TAILQ_EMPTY(&queue->tq_active), ("Tasks still running?")); KASSERT(queue->tq_callouts == 0, ("Armed timeout tasks")); mtx_destroy(&queue->tq_mutex); free(queue->tq_threads, M_TASKQUEUE); free(queue->tq_name, M_TASKQUEUE); free(queue, M_TASKQUEUE); } static int taskqueue_enqueue_locked(struct taskqueue *queue, struct task *task) { struct task *ins; struct task *prev; KASSERT(task->ta_func != NULL, ("enqueueing task with NULL func")); /* * Count multiple enqueues. */ if (task->ta_pending) { if (task->ta_pending < USHRT_MAX) task->ta_pending++; TQ_UNLOCK(queue); return (0); } /* * Optimise the case when all tasks have the same priority. */ prev = STAILQ_LAST(&queue->tq_queue, task, ta_link); if (!prev || prev->ta_priority >= task->ta_priority) { STAILQ_INSERT_TAIL(&queue->tq_queue, task, ta_link); } else { prev = NULL; for (ins = STAILQ_FIRST(&queue->tq_queue); ins; prev = ins, ins = STAILQ_NEXT(ins, ta_link)) if (ins->ta_priority < task->ta_priority) break; if (prev) STAILQ_INSERT_AFTER(&queue->tq_queue, prev, task, ta_link); else STAILQ_INSERT_HEAD(&queue->tq_queue, task, ta_link); } task->ta_pending = 1; if ((queue->tq_flags & TQ_FLAGS_UNLOCKED_ENQUEUE) != 0) TQ_UNLOCK(queue); if ((queue->tq_flags & TQ_FLAGS_BLOCKED) == 0) queue->tq_enqueue(queue->tq_context); if ((queue->tq_flags & TQ_FLAGS_UNLOCKED_ENQUEUE) == 0) TQ_UNLOCK(queue); /* Return with lock released. */ return (0); } int taskqueue_enqueue(struct taskqueue *queue, struct task *task) { int res; TQ_LOCK(queue); res = taskqueue_enqueue_locked(queue, task); /* The lock is released inside. 
*/ return (res); } static void taskqueue_timeout_func(void *arg) { struct taskqueue *queue; struct timeout_task *timeout_task; timeout_task = arg; queue = timeout_task->q; KASSERT((timeout_task->f & DT_CALLOUT_ARMED) != 0, ("Stray timeout")); timeout_task->f &= ~DT_CALLOUT_ARMED; queue->tq_callouts--; taskqueue_enqueue_locked(timeout_task->q, &timeout_task->t); /* The lock is released inside. */ } int taskqueue_enqueue_timeout_sbt(struct taskqueue *queue, struct timeout_task *timeout_task, sbintime_t sbt, sbintime_t pr, int flags) { int res; TQ_LOCK(queue); KASSERT(timeout_task->q == NULL || timeout_task->q == queue, ("Migrated queue")); KASSERT(!queue->tq_spin, ("Timeout for spin-queue")); timeout_task->q = queue; res = timeout_task->t.ta_pending; if (timeout_task->f & DT_DRAIN_IN_PROGRESS) { /* Do nothing */ TQ_UNLOCK(queue); res = -1; } else if (sbt == 0) { taskqueue_enqueue_locked(queue, &timeout_task->t); /* The lock is released inside. */ } else { if ((timeout_task->f & DT_CALLOUT_ARMED) != 0) { res++; } else { queue->tq_callouts++; timeout_task->f |= DT_CALLOUT_ARMED; if (sbt < 0) sbt = -sbt; /* Ignore overflow. */ } if (sbt > 0) { callout_reset_sbt(&timeout_task->c, sbt, pr, taskqueue_timeout_func, timeout_task, flags); } TQ_UNLOCK(queue); } return (res); } int taskqueue_enqueue_timeout(struct taskqueue *queue, struct timeout_task *ttask, int ticks) { return (taskqueue_enqueue_timeout_sbt(queue, ttask, ticks * tick_sbt, 0, 0)); } static void taskqueue_task_nop_fn(void *context, int pending) { } /* * Block until all currently queued tasks in this taskqueue * have begun execution. Tasks queued during execution of * this function are ignored. */ static void taskqueue_drain_tq_queue(struct taskqueue *queue) { struct task t_barrier; if (STAILQ_EMPTY(&queue->tq_queue)) return; /* * Enqueue our barrier after all current tasks, but with * the highest priority so that newly queued tasks cannot * pass it. Because of the high priority, we can not use * taskqueue_enqueue_locked directly (which drops the lock * anyway) so just insert it at tail while we have the * queue lock. */ TASK_INIT(&t_barrier, USHRT_MAX, taskqueue_task_nop_fn, &t_barrier); STAILQ_INSERT_TAIL(&queue->tq_queue, &t_barrier, ta_link); t_barrier.ta_pending = 1; /* * Once the barrier has executed, all previously queued tasks * have completed or are currently executing. */ while (t_barrier.ta_pending != 0) TQ_SLEEP(queue, &t_barrier, &queue->tq_mutex, PWAIT, "-", 0); } /* * Block until all currently executing tasks for this taskqueue * complete. Tasks that begin execution during the execution * of this function are ignored. */ static void taskqueue_drain_tq_active(struct taskqueue *queue) { struct taskqueue_busy tb_marker, *tb_first; if (TAILQ_EMPTY(&queue->tq_active)) return; /* Block taskq_terminate().*/ queue->tq_callouts++; /* * Wait for all currently executing taskqueue threads * to go idle. */ tb_marker.tb_running = TB_DRAIN_WAITER; TAILQ_INSERT_TAIL(&queue->tq_active, &tb_marker, tb_link); while (TAILQ_FIRST(&queue->tq_active) != &tb_marker) TQ_SLEEP(queue, &tb_marker, &queue->tq_mutex, PWAIT, "-", 0); TAILQ_REMOVE(&queue->tq_active, &tb_marker, tb_link); /* * Wakeup any other drain waiter that happened to queue up * without any intervening active thread. */ tb_first = TAILQ_FIRST(&queue->tq_active); if (tb_first != NULL && tb_first->tb_running == TB_DRAIN_WAITER) wakeup(tb_first); /* Release taskqueue_terminate(). 
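taskqueue_drain_tq_queue() waits for all currently queued tasks by appending a nop barrier task at USHRT_MAX priority: nothing enqueued later can sort ahead of it, so once the barrier has run, everything queued before the drain has run too. A single-threaded toy of that ordering argument; the mini queue below is hypothetical and plain FIFO, with the priority sorting and the sleep/wakeup handshake elided:

#include <stdbool.h>
#include <stdio.h>

#define QLEN 8

struct task {
    void (*fn)(void *);
    void *arg;
};

static struct task queue[QLEN];
static int qhead, qtail;

static void
enqueue(void (*fn)(void *), void *arg)
{
    queue[qtail++] = (struct task){ fn, arg };
}

static void work(void *arg)    { printf("task %s\n", (const char *)arg); }
static void barrier(void *arg) { *(bool *)arg = true; }  /* nop plus flag */

int
main(void)
{
    bool drained = false;

    enqueue(work, "a");
    enqueue(work, "b");
    /*
     * The drain places its barrier after all current tasks; in the
     * real queue its USHRT_MAX priority keeps later work behind it.
     */
    enqueue(barrier, &drained);
    enqueue(work, "late");      /* queued after the drain started */

    /* Run until the barrier fires: "a" and "b" are guaranteed done. */
    while (!drained) {
        queue[qhead].fn(queue[qhead].arg);
        qhead++;
    }
    printf("drain complete before 'late' ran\n");
    return (0);
}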
*/ queue->tq_callouts--; if ((queue->tq_flags & TQ_FLAGS_ACTIVE) == 0) wakeup_one(queue->tq_threads); } void taskqueue_block(struct taskqueue *queue) { TQ_LOCK(queue); queue->tq_flags |= TQ_FLAGS_BLOCKED; TQ_UNLOCK(queue); } void taskqueue_unblock(struct taskqueue *queue) { TQ_LOCK(queue); queue->tq_flags &= ~TQ_FLAGS_BLOCKED; if (!STAILQ_EMPTY(&queue->tq_queue)) queue->tq_enqueue(queue->tq_context); TQ_UNLOCK(queue); } static void taskqueue_run_locked(struct taskqueue *queue) { struct taskqueue_busy tb; struct taskqueue_busy *tb_first; struct task *task; int pending; KASSERT(queue != NULL, ("tq is NULL")); TQ_ASSERT_LOCKED(queue); tb.tb_running = NULL; while (STAILQ_FIRST(&queue->tq_queue)) { TAILQ_INSERT_TAIL(&queue->tq_active, &tb, tb_link); /* * Carefully remove the first task from the queue and * zero its pending count. */ task = STAILQ_FIRST(&queue->tq_queue); KASSERT(task != NULL, ("task is NULL")); STAILQ_REMOVE_HEAD(&queue->tq_queue, ta_link); pending = task->ta_pending; task->ta_pending = 0; tb.tb_running = task; TQ_UNLOCK(queue); KASSERT(task->ta_func != NULL, ("task->ta_func is NULL")); task->ta_func(task->ta_context, pending); TQ_LOCK(queue); tb.tb_running = NULL; wakeup(task); TAILQ_REMOVE(&queue->tq_active, &tb, tb_link); tb_first = TAILQ_FIRST(&queue->tq_active); if (tb_first != NULL && tb_first->tb_running == TB_DRAIN_WAITER) wakeup(tb_first); } } void taskqueue_run(struct taskqueue *queue) { TQ_LOCK(queue); taskqueue_run_locked(queue); TQ_UNLOCK(queue); } static int task_is_running(struct taskqueue *queue, struct task *task) { struct taskqueue_busy *tb; TQ_ASSERT_LOCKED(queue); TAILQ_FOREACH(tb, &queue->tq_active, tb_link) { if (tb->tb_running == task) return (1); } return (0); } /* * Only use this function in single threaded contexts. It returns * non-zero if the given task is either pending or running. Else the * task is idle and can be queued again or freed. */ int taskqueue_poll_is_busy(struct taskqueue *queue, struct task *task) { int retval; TQ_LOCK(queue); retval = task->ta_pending > 0 || task_is_running(queue, task); TQ_UNLOCK(queue); return (retval); } static int taskqueue_cancel_locked(struct taskqueue *queue, struct task *task, u_int *pendp) { if (task->ta_pending > 0) STAILQ_REMOVE(&queue->tq_queue, task, task, ta_link); if (pendp != NULL) *pendp = task->ta_pending; task->ta_pending = 0; return (task_is_running(queue, task) ? 
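taskqueue_block() does not stop execution of already-queued work; it only suppresses the tq_enqueue callback, the "kick" that schedules a worker, so submissions accumulate silently until taskqueue_unblock() issues a single kick if anything is pending. A small sketch of that deferred-kick flag; all names here are stand-ins:

#include <stdbool.h>
#include <stdio.h>

struct miniq {
    int pending;            /* tasks waiting to run */
    bool blocked;           /* TQ_FLAGS_BLOCKED analogue */
    void (*kick)(void);     /* tq_enqueue analogue: wake a worker */
};

static void kick(void) { printf("kick: worker scheduled\n"); }

static void
enqueue(struct miniq *q)
{
    q->pending++;
    if (!q->blocked)        /* blocked queues accept work silently */
        q->kick();
}

static void
unblock(struct miniq *q)
{
    q->blocked = false;
    if (q->pending > 0)     /* one kick covers the whole backlog */
        q->kick();
}

int
main(void)
{
    struct miniq q = { .kick = kick };

    enqueue(&q);            /* kicks immediately */
    q.blocked = true;       /* taskqueue_block() analogue */
    enqueue(&q);            /* no kick */
    enqueue(&q);            /* no kick */
    unblock(&q);            /* single kick for the backlog */
    return (0);
}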
EBUSY : 0); } int taskqueue_cancel(struct taskqueue *queue, struct task *task, u_int *pendp) { int error; TQ_LOCK(queue); error = taskqueue_cancel_locked(queue, task, pendp); TQ_UNLOCK(queue); return (error); } int taskqueue_cancel_timeout(struct taskqueue *queue, struct timeout_task *timeout_task, u_int *pendp) { u_int pending, pending1; int error; TQ_LOCK(queue); pending = !!(callout_stop(&timeout_task->c) > 0); error = taskqueue_cancel_locked(queue, &timeout_task->t, &pending1); if ((timeout_task->f & DT_CALLOUT_ARMED) != 0) { timeout_task->f &= ~DT_CALLOUT_ARMED; queue->tq_callouts--; } TQ_UNLOCK(queue); if (pendp != NULL) *pendp = pending + pending1; return (error); } void taskqueue_drain(struct taskqueue *queue, struct task *task) { if (!queue->tq_spin) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__); TQ_LOCK(queue); while (task->ta_pending != 0 || task_is_running(queue, task)) TQ_SLEEP(queue, task, &queue->tq_mutex, PWAIT, "-", 0); TQ_UNLOCK(queue); } void taskqueue_drain_all(struct taskqueue *queue) { if (!queue->tq_spin) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__); TQ_LOCK(queue); taskqueue_drain_tq_queue(queue); taskqueue_drain_tq_active(queue); TQ_UNLOCK(queue); } void taskqueue_drain_timeout(struct taskqueue *queue, struct timeout_task *timeout_task) { /* * Set flag to prevent timer from re-starting during drain: */ TQ_LOCK(queue); KASSERT((timeout_task->f & DT_DRAIN_IN_PROGRESS) == 0, ("Drain already in progress")); timeout_task->f |= DT_DRAIN_IN_PROGRESS; TQ_UNLOCK(queue); callout_drain(&timeout_task->c); taskqueue_drain(queue, &timeout_task->t); /* * Clear flag to allow timer to re-start: */ TQ_LOCK(queue); timeout_task->f &= ~DT_DRAIN_IN_PROGRESS; TQ_UNLOCK(queue); } static void taskqueue_swi_enqueue(void *context) { swi_sched(taskqueue_ih, 0); } static void taskqueue_swi_run(void *dummy) { taskqueue_run(taskqueue_swi); } static void taskqueue_swi_giant_enqueue(void *context) { swi_sched(taskqueue_giant_ih, 0); } static void taskqueue_swi_giant_run(void *dummy) { taskqueue_run(taskqueue_swi_giant); } static int _taskqueue_start_threads(struct taskqueue **tqp, int count, int pri, cpuset_t *mask, const char *name, va_list ap) { char ktname[MAXCOMLEN + 1]; struct thread *td; struct taskqueue *tq; int i, error; if (count <= 0) return (EINVAL); vsnprintf(ktname, sizeof(ktname), name, ap); tq = *tqp; tq->tq_threads = malloc(sizeof(struct thread *) * count, M_TASKQUEUE, M_NOWAIT | M_ZERO); if (tq->tq_threads == NULL) { printf("%s: no memory for %s threads\n", __func__, ktname); return (ENOMEM); } for (i = 0; i < count; i++) { if (count == 1) error = kthread_add(taskqueue_thread_loop, tqp, NULL, &tq->tq_threads[i], RFSTOPPED, 0, "%s", ktname); else error = kthread_add(taskqueue_thread_loop, tqp, NULL, &tq->tq_threads[i], RFSTOPPED, 0, "%s_%d", ktname, i); if (error) { /* should be ok to continue, taskqueue_free will dtrt */ printf("%s: kthread_add(%s): error %d", __func__, ktname, error); tq->tq_threads[i] = NULL; /* paranoid */ } else tq->tq_tcount++; } if (tq->tq_tcount == 0) { free(tq->tq_threads, M_TASKQUEUE); tq->tq_threads = NULL; return (ENOMEM); } for (i = 0; i < count; i++) { if (tq->tq_threads[i] == NULL) continue; td = tq->tq_threads[i]; if (mask) { error = cpuset_setthread(td->td_tid, mask); /* * Failing to pin is rarely an actual fatal error; * it'll just affect performance. 
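Put together, the KPI in this file is consumed in a stock pattern: create a queue whose enqueue hook wakes a private worker thread, start that thread, then enqueue, drain, and free. A kernel-side sketch using only functions defined above; the my_* names are hypothetical and error handling is trimmed:

static struct taskqueue *my_tq;
static struct task my_task;

static void
my_task_fn(void *arg, int pending)
{
    /* 'pending' counts how many enqueues were coalesced into this run. */
    printf("my_task ran, pending=%d\n", pending);
}

static void
my_tq_setup(void)
{
    /*
     * With taskqueue_thread_enqueue the context must be the address of
     * the queue pointer: the hook dereferences it to find the queue
     * whose worker it should wake.
     */
    my_tq = taskqueue_create("my_tq", M_WAITOK,
        taskqueue_thread_enqueue, &my_tq);
    taskqueue_start_threads(&my_tq, 1, PWAIT, "my_tq");
    TASK_INIT(&my_task, 0, my_task_fn, NULL);
    taskqueue_enqueue(my_tq, &my_task);
}

static void
my_tq_teardown(void)
{
    /* Drain waits out a pending or running instance before freeing. */
    taskqueue_drain(my_tq, &my_task);
    taskqueue_free(my_tq);
}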
*/ if (error) printf("%s: curthread=%llu: can't pin; " "error=%d\n", __func__, (unsigned long long) td->td_tid, error); } thread_lock(td); sched_prio(td, pri); sched_add(td, SRQ_BORING); thread_unlock(td); } return (0); } int taskqueue_start_threads(struct taskqueue **tqp, int count, int pri, const char *name, ...) { va_list ap; int error; va_start(ap, name); error = _taskqueue_start_threads(tqp, count, pri, NULL, name, ap); va_end(ap); return (error); } int taskqueue_start_threads_cpuset(struct taskqueue **tqp, int count, int pri, cpuset_t *mask, const char *name, ...) { va_list ap; int error; va_start(ap, name); error = _taskqueue_start_threads(tqp, count, pri, mask, name, ap); va_end(ap); return (error); } static inline void taskqueue_run_callback(struct taskqueue *tq, enum taskqueue_callback_type cb_type) { taskqueue_callback_fn tq_callback; TQ_ASSERT_UNLOCKED(tq); tq_callback = tq->tq_callbacks[cb_type]; if (tq_callback != NULL) tq_callback(tq->tq_cb_contexts[cb_type]); } void taskqueue_thread_loop(void *arg) { struct taskqueue **tqp, *tq; tqp = arg; tq = *tqp; taskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_INIT); TQ_LOCK(tq); while ((tq->tq_flags & TQ_FLAGS_ACTIVE) != 0) { /* XXX ? */ taskqueue_run_locked(tq); /* * Because taskqueue_run() can drop tq_mutex, we need to * check if the TQ_FLAGS_ACTIVE flag wasn't removed in the * meantime, which means we missed a wakeup. */ if ((tq->tq_flags & TQ_FLAGS_ACTIVE) == 0) break; TQ_SLEEP(tq, tq, &tq->tq_mutex, 0, "-", 0); } taskqueue_run_locked(tq); /* * This thread is on its way out, so just drop the lock temporarily * in order to call the shutdown callback. This allows the callback * to look at the taskqueue, even just before it dies. */ TQ_UNLOCK(tq); taskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN); TQ_LOCK(tq); /* rendezvous with thread that asked us to terminate */ tq->tq_tcount--; wakeup_one(tq->tq_threads); TQ_UNLOCK(tq); kthread_exit(); } void taskqueue_thread_enqueue(void *context) { struct taskqueue **tqp, *tq; tqp = context; tq = *tqp; wakeup_one(tq); } TASKQUEUE_DEFINE(swi, taskqueue_swi_enqueue, NULL, swi_add(NULL, "task queue", taskqueue_swi_run, NULL, SWI_TQ, INTR_MPSAFE, &taskqueue_ih)); TASKQUEUE_DEFINE(swi_giant, taskqueue_swi_giant_enqueue, NULL, swi_add(NULL, "Giant taskq", taskqueue_swi_giant_run, NULL, SWI_TQ_GIANT, 0, &taskqueue_giant_ih)); TASKQUEUE_DEFINE_THREAD(thread); struct taskqueue * taskqueue_create_fast(const char *name, int mflags, taskqueue_enqueue_fn enqueue, void *context) { return _taskqueue_create(name, mflags, enqueue, context, MTX_SPIN, "fast_taskqueue"); } static void *taskqueue_fast_ih; static void taskqueue_fast_enqueue(void *context) { swi_sched(taskqueue_fast_ih, 0); } static void taskqueue_fast_run(void *dummy) { taskqueue_run(taskqueue_fast); } TASKQUEUE_FAST_DEFINE(fast, taskqueue_fast_enqueue, NULL, swi_add(NULL, "fast taskq", taskqueue_fast_run, NULL, SWI_TQ_FAST, INTR_MPSAFE, &taskqueue_fast_ih)); int taskqueue_member(struct taskqueue *queue, struct thread *td) { int i, j, ret = 0; for (i = 0, j = 0; ; i++) { if (queue->tq_threads[i] == NULL) continue; if (queue->tq_threads[i] == td) { ret = 1; break; } if (++j >= queue->tq_tcount) break; } return (ret); } Index: head/sys/kern/subr_terminal.c =================================================================== --- head/sys/kern/subr_terminal.c (revision 326270) +++ head/sys/kern/subr_terminal.c (revision 326271) @@ -1,658 +1,660 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2009 The FreeBSD 
Foundation * All rights reserved. * * This software was developed by Ed Schouten under sponsorship from the * FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_TERMINAL, "terminal", "terminal device"); /* * Locking. * * Normally we don't need to lock down the terminal emulator, because * the TTY lock is already held when calling teken_input(). * Unfortunately this is not the case when the terminal acts as a * console device, because cnputc() can be called at the same time. * This means terminals may need to be locked down using a spin lock. */ #define TERMINAL_LOCK(tm) do { \ if ((tm)->tm_flags & TF_CONS) \ mtx_lock_spin(&(tm)->tm_mtx); \ else if ((tm)->tm_tty != NULL) \ tty_lock((tm)->tm_tty); \ } while (0) #define TERMINAL_UNLOCK(tm) do { \ if ((tm)->tm_flags & TF_CONS) \ mtx_unlock_spin(&(tm)->tm_mtx); \ else if ((tm)->tm_tty != NULL) \ tty_unlock((tm)->tm_tty); \ } while (0) #define TERMINAL_LOCK_TTY(tm) do { \ if ((tm)->tm_flags & TF_CONS) \ mtx_lock_spin(&(tm)->tm_mtx); \ } while (0) #define TERMINAL_UNLOCK_TTY(tm) do { \ if ((tm)->tm_flags & TF_CONS) \ mtx_unlock_spin(&(tm)->tm_mtx); \ } while (0) #define TERMINAL_LOCK_CONS(tm) mtx_lock_spin(&(tm)->tm_mtx) #define TERMINAL_UNLOCK_CONS(tm) mtx_unlock_spin(&(tm)->tm_mtx) /* * TTY routines. */ static tsw_open_t termtty_open; static tsw_close_t termtty_close; static tsw_outwakeup_t termtty_outwakeup; static tsw_ioctl_t termtty_ioctl; static tsw_mmap_t termtty_mmap; static struct ttydevsw terminal_tty_class = { .tsw_open = termtty_open, .tsw_close = termtty_close, .tsw_outwakeup = termtty_outwakeup, .tsw_ioctl = termtty_ioctl, .tsw_mmap = termtty_mmap, }; /* * Terminal emulator routines. 
*/ static tf_bell_t termteken_bell; static tf_cursor_t termteken_cursor; static tf_putchar_t termteken_putchar; static tf_fill_t termteken_fill; static tf_copy_t termteken_copy; static tf_param_t termteken_param; static tf_respond_t termteken_respond; static teken_funcs_t terminal_drawmethods = { .tf_bell = termteken_bell, .tf_cursor = termteken_cursor, .tf_putchar = termteken_putchar, .tf_fill = termteken_fill, .tf_copy = termteken_copy, .tf_param = termteken_param, .tf_respond = termteken_respond, }; /* Kernel message formatting. */ static const teken_attr_t kernel_message = { .ta_fgcolor = TCHAR_FGCOLOR(TERMINAL_KERN_ATTR), .ta_bgcolor = TCHAR_BGCOLOR(TERMINAL_KERN_ATTR), .ta_format = TCHAR_FORMAT(TERMINAL_KERN_ATTR) }; static const teken_attr_t default_message = { .ta_fgcolor = TCHAR_FGCOLOR(TERMINAL_NORM_ATTR), .ta_bgcolor = TCHAR_BGCOLOR(TERMINAL_NORM_ATTR), .ta_format = TCHAR_FORMAT(TERMINAL_NORM_ATTR) }; /* Fudge fg brightness as TF_BOLD (shifted). */ #define TCOLOR_FG_FUDGED(color) __extension__ ({ \ teken_color_t _c; \ \ _c = (color); \ TCOLOR_FG(_c & 7) | ((_c & 8) << 18); \ }) /* Fudge bg brightness as TF_BLINK (shifted). */ #define TCOLOR_BG_FUDGED(color) __extension__ ({ \ teken_color_t _c; \ \ _c = (color); \ TCOLOR_BG(_c & 7) | ((_c & 8) << 20); \ }) #define TCOLOR_256TO16(color) __extension__ ({ \ teken_color_t _c; \ \ _c = (color); \ if (_c >= 16) \ _c = teken_256to16(_c); \ _c; \ }) #define TCHAR_CREATE(c, a) ((c) | TFORMAT((a)->ta_format) | \ TCOLOR_FG_FUDGED(TCOLOR_256TO16((a)->ta_fgcolor)) | \ TCOLOR_BG_FUDGED(TCOLOR_256TO16((a)->ta_bgcolor))) static void terminal_init(struct terminal *tm) { if (tm->tm_flags & TF_CONS) mtx_init(&tm->tm_mtx, "trmlck", NULL, MTX_SPIN); teken_init(&tm->tm_emulator, &terminal_drawmethods, tm); teken_set_defattr(&tm->tm_emulator, &default_message); } struct terminal * terminal_alloc(const struct terminal_class *tc, void *softc) { struct terminal *tm; tm = malloc(sizeof(struct terminal), M_TERMINAL, M_WAITOK|M_ZERO); terminal_init(tm); tm->tm_class = tc; tm->tm_softc = softc; return (tm); } static void terminal_sync_ttysize(struct terminal *tm) { struct tty *tp; tp = tm->tm_tty; if (tp == NULL) return; tty_lock(tp); tty_set_winsize(tp, &tm->tm_winsize); tty_unlock(tp); } void terminal_maketty(struct terminal *tm, const char *fmt, ...) { struct tty *tp; char name[8]; va_list ap; va_start(ap, fmt); vsnrprintf(name, sizeof name, 32, fmt, ap); va_end(ap); tp = tty_alloc(&terminal_tty_class, tm); tty_makedev(tp, NULL, "%s", name); tm->tm_tty = tp; terminal_sync_ttysize(tm); } void terminal_set_cursor(struct terminal *tm, const term_pos_t *pos) { teken_set_cursor(&tm->tm_emulator, pos); } void terminal_set_winsize_blank(struct terminal *tm, const struct winsize *size, int blank, const term_attr_t *attr) { term_rect_t r; tm->tm_winsize = *size; r.tr_begin.tp_row = r.tr_begin.tp_col = 0; r.tr_end.tp_row = size->ws_row; r.tr_end.tp_col = size->ws_col; TERMINAL_LOCK(tm); if (blank == 0) teken_set_winsize_noreset(&tm->tm_emulator, &r.tr_end); else teken_set_winsize(&tm->tm_emulator, &r.tr_end); TERMINAL_UNLOCK(tm); if ((blank != 0) && !(tm->tm_flags & TF_MUTE)) tm->tm_class->tc_fill(tm, &r, TCHAR_CREATE((teken_char_t)' ', attr)); terminal_sync_ttysize(tm); } void terminal_set_winsize(struct terminal *tm, const struct winsize *size) { terminal_set_winsize_blank(tm, size, 1, (const term_attr_t *)&default_message); } /* * XXX: This function is a kludge. Drivers like vt(4) need to * temporarily stop input when resizing, etc. 
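The fudge macros above squeeze a 16-color code into a cell that stores only 3 color bits per plane: the brightness bit (bit 3) is relocated into the format bits, reusing the TF_BOLD position for the foreground and TF_BLINK for the background. The same bit surgery as standalone functions; the shift counts 18 and 20 are copied from the macros, while the TCOLOR_FG()/TCOLOR_BG() field positioning is omitted for clarity:

#include <stdio.h>
#include <stdint.h>

/* Fold a 4-bit (0-15) fg color into 3 color bits plus a bold-like bit. */
static uint32_t
fg_fudged(uint32_t color)
{
    /* Low 3 bits select the base color; bit 3 becomes "bold". */
    return ((color & 7) | ((color & 8) << 18));
}

/* Same trick for the background, parking brightness in the blink bit. */
static uint32_t
bg_fudged(uint32_t color)
{
    return ((color & 7) | ((color & 8) << 20));
}

int
main(void)
{
    /* Bright red (color 12 = 8 + 4): base color 4, brightness flagged. */
    printf("fg: %#x\n", fg_fudged(12));     /* 0x200004 */
    printf("bg: %#x\n", bg_fudged(12));     /* 0x800004 */
    return (0);
}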
This should ideally be * handled within the driver. */ void terminal_mute(struct terminal *tm, int yes) { TERMINAL_LOCK(tm); if (yes) tm->tm_flags |= TF_MUTE; else tm->tm_flags &= ~TF_MUTE; TERMINAL_UNLOCK(tm); } void terminal_input_char(struct terminal *tm, term_char_t c) { struct tty *tp; tp = tm->tm_tty; if (tp == NULL) return; /* * Strip off any attributes. Also ignore input of second part of * CJK fullwidth characters, as we don't want to return these * characters twice. */ if (TCHAR_FORMAT(c) & TF_CJK_RIGHT) return; c = TCHAR_CHARACTER(c); tty_lock(tp); /* * Conversion to UTF-8. */ if (c < 0x80) { ttydisc_rint(tp, c, 0); } else if (c < 0x800) { char str[2] = { 0xc0 | (c >> 6), 0x80 | (c & 0x3f) }; ttydisc_rint_simple(tp, str, sizeof str); } else if (c < 0x10000) { char str[3] = { 0xe0 | (c >> 12), 0x80 | ((c >> 6) & 0x3f), 0x80 | (c & 0x3f) }; ttydisc_rint_simple(tp, str, sizeof str); } else { char str[4] = { 0xf0 | (c >> 18), 0x80 | ((c >> 12) & 0x3f), 0x80 | ((c >> 6) & 0x3f), 0x80 | (c & 0x3f) }; ttydisc_rint_simple(tp, str, sizeof str); } ttydisc_rint_done(tp); tty_unlock(tp); } void terminal_input_raw(struct terminal *tm, char c) { struct tty *tp; tp = tm->tm_tty; if (tp == NULL) return; tty_lock(tp); ttydisc_rint(tp, c, 0); ttydisc_rint_done(tp); tty_unlock(tp); } void terminal_input_special(struct terminal *tm, unsigned int k) { struct tty *tp; const char *str; tp = tm->tm_tty; if (tp == NULL) return; str = teken_get_sequence(&tm->tm_emulator, k); if (str == NULL) return; tty_lock(tp); ttydisc_rint_simple(tp, str, strlen(str)); ttydisc_rint_done(tp); tty_unlock(tp); } /* * Binding with the TTY layer. */ static int termtty_open(struct tty *tp) { struct terminal *tm = tty_softc(tp); tm->tm_class->tc_opened(tm, 1); return (0); } static void termtty_close(struct tty *tp) { struct terminal *tm = tty_softc(tp); tm->tm_class->tc_opened(tm, 0); } static void termtty_outwakeup(struct tty *tp) { struct terminal *tm = tty_softc(tp); char obuf[128]; size_t olen; unsigned int flags = 0; while ((olen = ttydisc_getc(tp, obuf, sizeof obuf)) > 0) { TERMINAL_LOCK_TTY(tm); if (!(tm->tm_flags & TF_MUTE)) { tm->tm_flags &= ~TF_BELL; teken_input(&tm->tm_emulator, obuf, olen); flags |= tm->tm_flags; } TERMINAL_UNLOCK_TTY(tm); } TERMINAL_LOCK_TTY(tm); if (!(tm->tm_flags & TF_MUTE)) tm->tm_class->tc_done(tm); TERMINAL_UNLOCK_TTY(tm); if (flags & TF_BELL) tm->tm_class->tc_bell(tm); } static int termtty_ioctl(struct tty *tp, u_long cmd, caddr_t data, struct thread *td) { struct terminal *tm = tty_softc(tp); int error; switch (cmd) { case CONS_GETINFO: { vid_info_t *vi = (vid_info_t *)data; const teken_pos_t *p; int fg, bg; if (vi->size != sizeof(vid_info_t)) return (EINVAL); /* Already help the console driver by filling in some data. */ p = teken_get_cursor(&tm->tm_emulator); vi->mv_row = p->tp_row; vi->mv_col = p->tp_col; p = teken_get_winsize(&tm->tm_emulator); vi->mv_rsz = p->tp_row; vi->mv_csz = p->tp_col; teken_get_defattr_cons25(&tm->tm_emulator, &fg, &bg); vi->mv_norm.fore = fg; vi->mv_norm.back = bg; /* XXX: keep vidcontrol happy; bold backgrounds. */ vi->mv_rev.fore = bg; vi->mv_rev.back = fg & 0x7; break; } } /* * Unlike various other drivers, this driver will never * deallocate TTYs. This means it's safe to temporarily unlock * the TTY when handling ioctls. 
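terminal_input_char() performs its UTF-8 conversion inline, choosing one to four bytes by code-point range. The identical encoder as a standalone function; like the original, it does not validate the code point (no surrogate or range rejection):

#include <stdio.h>
#include <stdint.h>

/* Encode 'c' as UTF-8 into buf (at least 4 bytes); returns byte count. */
static int
utf8_encode(uint32_t c, unsigned char *buf)
{
    if (c < 0x80) {
        buf[0] = c;
        return (1);
    } else if (c < 0x800) {
        buf[0] = 0xc0 | (c >> 6);
        buf[1] = 0x80 | (c & 0x3f);
        return (2);
    } else if (c < 0x10000) {
        buf[0] = 0xe0 | (c >> 12);
        buf[1] = 0x80 | ((c >> 6) & 0x3f);
        buf[2] = 0x80 | (c & 0x3f);
        return (3);
    }
    buf[0] = 0xf0 | (c >> 18);
    buf[1] = 0x80 | ((c >> 12) & 0x3f);
    buf[2] = 0x80 | ((c >> 6) & 0x3f);
    buf[3] = 0x80 | (c & 0x3f);
    return (4);
}

int
main(void)
{
    unsigned char buf[4];
    int n = utf8_encode(0x20ac, buf);   /* U+20AC EURO SIGN */

    for (int i = 0; i < n; i++)
        printf("%02x ", buf[i]);        /* e2 82 ac */
    printf("\n");
    return (0);
}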
*/ tty_unlock(tp); error = tm->tm_class->tc_ioctl(tm, cmd, data, td); tty_lock(tp); return (error); } static int termtty_mmap(struct tty *tp, vm_ooffset_t offset, vm_paddr_t * paddr, int nprot, vm_memattr_t *memattr) { struct terminal *tm = tty_softc(tp); return (tm->tm_class->tc_mmap(tm, offset, paddr, nprot, memattr)); } /* * Binding with the kernel and debug console. */ static cn_probe_t termcn_cnprobe; static cn_init_t termcn_cninit; static cn_term_t termcn_cnterm; static cn_getc_t termcn_cngetc; static cn_putc_t termcn_cnputc; static cn_grab_t termcn_cngrab; static cn_ungrab_t termcn_cnungrab; const struct consdev_ops termcn_cnops = { .cn_probe = termcn_cnprobe, .cn_init = termcn_cninit, .cn_term = termcn_cnterm, .cn_getc = termcn_cngetc, .cn_putc = termcn_cnputc, .cn_grab = termcn_cngrab, .cn_ungrab = termcn_cnungrab, }; void termcn_cnregister(struct terminal *tm) { struct consdev *cp; cp = tm->consdev; if (cp == NULL) { cp = malloc(sizeof(struct consdev), M_TERMINAL, M_WAITOK|M_ZERO); cp->cn_ops = &termcn_cnops; cp->cn_arg = tm; cp->cn_pri = CN_INTERNAL; sprintf(cp->cn_name, "ttyv0"); tm->tm_flags = TF_CONS; tm->consdev = cp; terminal_init(tm); } /* Attach terminal as console. */ cnadd(cp); } static void termcn_cngrab(struct consdev *cp) { struct terminal *tm = cp->cn_arg; tm->tm_class->tc_cngrab(tm); } static void termcn_cnungrab(struct consdev *cp) { struct terminal *tm = cp->cn_arg; tm->tm_class->tc_cnungrab(tm); } static void termcn_cnprobe(struct consdev *cp) { struct terminal *tm = cp->cn_arg; if (tm == NULL) { cp->cn_pri = CN_DEAD; return; } tm->consdev = cp; terminal_init(tm); tm->tm_class->tc_cnprobe(tm, cp); } static void termcn_cninit(struct consdev *cp) { } static void termcn_cnterm(struct consdev *cp) { } static int termcn_cngetc(struct consdev *cp) { struct terminal *tm = cp->cn_arg; return (tm->tm_class->tc_cngetc(tm)); } static void termcn_cnputc(struct consdev *cp, int c) { struct terminal *tm = cp->cn_arg; teken_attr_t backup; char cv = c; TERMINAL_LOCK_CONS(tm); if (!(tm->tm_flags & TF_MUTE)) { backup = *teken_get_curattr(&tm->tm_emulator); teken_set_curattr(&tm->tm_emulator, &kernel_message); teken_input(&tm->tm_emulator, &cv, 1); teken_set_curattr(&tm->tm_emulator, &backup); tm->tm_class->tc_done(tm); } TERMINAL_UNLOCK_CONS(tm); } /* * Binding with the terminal emulator. */ static void termteken_bell(void *softc) { struct terminal *tm = softc; tm->tm_flags |= TF_BELL; } static void termteken_cursor(void *softc, const teken_pos_t *p) { struct terminal *tm = softc; tm->tm_class->tc_cursor(tm, p); } static void termteken_putchar(void *softc, const teken_pos_t *p, teken_char_t c, const teken_attr_t *a) { struct terminal *tm = softc; tm->tm_class->tc_putchar(tm, p, TCHAR_CREATE(c, a)); } static void termteken_fill(void *softc, const teken_rect_t *r, teken_char_t c, const teken_attr_t *a) { struct terminal *tm = softc; tm->tm_class->tc_fill(tm, r, TCHAR_CREATE(c, a)); } static void termteken_copy(void *softc, const teken_rect_t *r, const teken_pos_t *p) { struct terminal *tm = softc; tm->tm_class->tc_copy(tm, r, p); } static void termteken_param(void *softc, int cmd, unsigned int arg) { struct terminal *tm = softc; tm->tm_class->tc_param(tm, cmd, arg); } static void termteken_respond(void *softc, const void *buf, size_t len) { #if 0 struct terminal *tm = softc; struct tty *tp; /* * Only inject a response into the TTY if the data actually * originated from the TTY. * * XXX: This cannot be done right now. The TTY could pick up * other locks. 
It could also in theory cause loops, when the * TTY performs echoing of a command that generates even more * input. */ tp = tm->tm_tty; if (tp == NULL) return; ttydisc_rint_simple(tp, buf, len); ttydisc_rint_done(tp); #endif } Index: head/sys/kern/subr_turnstile.c =================================================================== --- head/sys/kern/subr_turnstile.c (revision 326270) +++ head/sys/kern/subr_turnstile.c (revision 326271) @@ -1,1254 +1,1256 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Implementation of turnstiles used to hold queue of threads blocked on * non-sleepable locks. Sleepable locks use condition variables to * implement their queues. Turnstiles differ from a sleep queue in that * turnstile queue's are assigned to a lock held by an owning thread. Thus, * when one thread is enqueued onto a turnstile, it can lend its priority * to the owning thread. * * We wish to avoid bloating locks with an embedded turnstile and we do not * want to use back-pointers in the locks for the same reason. Thus, we * use a similar approach to that of Solaris 7 as described in Solaris * Internals by Jim Mauro and Richard McDougall. Turnstiles are looked up * in a hash table based on the address of the lock. Each entry in the * hash table is a linked-lists of turnstiles and is called a turnstile * chain. Each chain contains a spin mutex that protects all of the * turnstiles in the chain. * * Each time a thread is created, a turnstile is allocated from a UMA zone * and attached to that thread. When a thread blocks on a lock, if it is the * first thread to block, it lends its turnstile to the lock. If the lock * already has a turnstile, then it gives its turnstile to the lock's * turnstile's free list. When a thread is woken up, it takes a turnstile from * the free list if there are any other waiters. 
If it is the only thread * blocked on the lock, then it reclaims the turnstile associated with the lock * and removes it from the hash table. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_turnstile_profiling.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #include #endif /* * Constants for the hash table of turnstile chains. TC_SHIFT is a magic * number chosen because the sleep queue's use the same value for the * shift. Basically, we ignore the lower 8 bits of the address. * TC_TABLESIZE must be a power of two for TC_MASK to work properly. */ #define TC_TABLESIZE 128 /* Must be power of 2. */ #define TC_MASK (TC_TABLESIZE - 1) #define TC_SHIFT 8 #define TC_HASH(lock) (((uintptr_t)(lock) >> TC_SHIFT) & TC_MASK) #define TC_LOOKUP(lock) &turnstile_chains[TC_HASH(lock)] /* * There are three different lists of turnstiles as follows. The list * connected by ts_link entries is a per-thread list of all the turnstiles * attached to locks that we own. This is used to fixup our priority when * a lock is released. The other two lists use the ts_hash entries. The * first of these two is the turnstile chain list that a turnstile is on * when it is attached to a lock. The second list to use ts_hash is the * free list hung off of a turnstile that is attached to a lock. * * Each turnstile contains three lists of threads. The two ts_blocked lists * are linked list of threads blocked on the turnstile's lock. One list is * for exclusive waiters, and the other is for shared waiters. The * ts_pending list is a linked list of threads previously awakened by * turnstile_signal() or turnstile_wait() that are waiting to be put on * the run queue. * * Locking key: * c - turnstile chain lock * q - td_contested lock */ struct turnstile { struct mtx ts_lock; /* Spin lock for self. */ struct threadqueue ts_blocked[2]; /* (c + q) Blocked threads. */ struct threadqueue ts_pending; /* (c) Pending threads. */ LIST_ENTRY(turnstile) ts_hash; /* (c) Chain and free list. */ LIST_ENTRY(turnstile) ts_link; /* (q) Contested locks. */ LIST_HEAD(, turnstile) ts_free; /* (c) Free turnstiles. */ struct lock_object *ts_lockobj; /* (c) Lock we reference. */ struct thread *ts_owner; /* (c + q) Who owns the lock. */ }; struct turnstile_chain { LIST_HEAD(, turnstile) tc_turnstiles; /* List of turnstiles. */ struct mtx tc_lock; /* Spin lock for this chain. */ #ifdef TURNSTILE_PROFILING u_int tc_depth; /* Length of tc_queues. */ u_int tc_max_depth; /* Max length of tc_queues. */ #endif }; #ifdef TURNSTILE_PROFILING u_int turnstile_max_depth; static SYSCTL_NODE(_debug, OID_AUTO, turnstile, CTLFLAG_RD, 0, "turnstile profiling"); static SYSCTL_NODE(_debug_turnstile, OID_AUTO, chains, CTLFLAG_RD, 0, "turnstile chain stats"); SYSCTL_UINT(_debug_turnstile, OID_AUTO, max_depth, CTLFLAG_RD, &turnstile_max_depth, 0, "maximum depth achieved of a single chain"); #endif static struct mtx td_contested_lock; static struct turnstile_chain turnstile_chains[TC_TABLESIZE]; static uma_zone_t turnstile_zone; /* * Prototypes for non-exported routines. 
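TC_HASH() buckets a lock by its address: the low 8 bits are discarded and the remainder is masked into a power-of-two table. The same computation in userland, with the constants copied from above:

#include <stdio.h>
#include <stdint.h>

#define TC_TABLESIZE    128             /* must be a power of 2 */
#define TC_MASK         (TC_TABLESIZE - 1)
#define TC_SHIFT        8

static unsigned
tc_hash(const void *lock)
{
    return (((uintptr_t)lock >> TC_SHIFT) & TC_MASK);
}

int
main(void)
{
    int locks[4];

    /* Nearby addresses within 256 bytes tend to share a bucket... */
    printf("%u %u\n", tc_hash(&locks[0]), tc_hash(&locks[1]));
    /* ...while an address 256 bytes away lands in the next bucket. */
    printf("%u\n", tc_hash((char *)&locks[0] + 256));
    return (0);
}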
*/ static void init_turnstile0(void *dummy); #ifdef TURNSTILE_PROFILING static void init_turnstile_profiling(void *arg); #endif static void propagate_priority(struct thread *td); static int turnstile_adjust_thread(struct turnstile *ts, struct thread *td); static struct thread *turnstile_first_waiter(struct turnstile *ts); static void turnstile_setowner(struct turnstile *ts, struct thread *owner); #ifdef INVARIANTS static void turnstile_dtor(void *mem, int size, void *arg); #endif static int turnstile_init(void *mem, int size, int flags); static void turnstile_fini(void *mem, int size); SDT_PROVIDER_DECLARE(sched); SDT_PROBE_DEFINE(sched, , , sleep); SDT_PROBE_DEFINE2(sched, , , wakeup, "struct thread *", "struct proc *"); /* * Walks the chain of turnstiles and their owners to propagate the priority * of the thread being blocked to all the threads holding locks that have to * release their locks before this thread can run again. */ static void propagate_priority(struct thread *td) { struct turnstile *ts; int pri; THREAD_LOCK_ASSERT(td, MA_OWNED); pri = td->td_priority; ts = td->td_blocked; THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); /* * Grab a recursive lock on this turnstile chain so it stays locked * for the whole operation. The caller expects us to return with * the original lock held. We only ever lock down the chain so * the lock order is constant. */ mtx_lock_spin(&ts->ts_lock); for (;;) { td = ts->ts_owner; if (td == NULL) { /* * This might be a read lock with no owner. There's * not much we can do, so just bail. */ mtx_unlock_spin(&ts->ts_lock); return; } thread_lock_flags(td, MTX_DUPOK); mtx_unlock_spin(&ts->ts_lock); MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); /* * If the thread is asleep, then we are probably about * to deadlock. To make debugging this easier, show * backtrace of misbehaving thread and panic to not * leave the kernel deadlocked. */ if (TD_IS_SLEEPING(td)) { printf( "Sleeping thread (tid %d, pid %d) owns a non-sleepable lock\n", td->td_tid, td->td_proc->p_pid); kdb_backtrace_thread(td); panic("sleeping thread"); } /* * If this thread already has higher priority than the * thread that is being blocked, we are finished. */ if (td->td_priority <= pri) { thread_unlock(td); return; } /* * Bump this thread's priority. */ sched_lend_prio(td, pri); /* * If lock holder is actually running or on the run queue * then we are done. */ if (TD_IS_RUNNING(td) || TD_ON_RUNQ(td)) { MPASS(td->td_blocked == NULL); thread_unlock(td); return; } #ifndef SMP /* * For UP, we check to see if td is curthread (this shouldn't * ever happen however as it would mean we are in a deadlock.) */ KASSERT(td != curthread, ("Deadlock detected")); #endif /* * If we aren't blocked on a lock, we should be. */ KASSERT(TD_ON_LOCK(td), ( "thread %d(%s):%d holds %s but isn't blocked on a lock\n", td->td_tid, td->td_name, td->td_state, ts->ts_lockobj->lo_name)); /* * Pick up the lock that td is blocked on. */ ts = td->td_blocked; MPASS(ts != NULL); THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); /* Resort td on the list if needed. */ if (!turnstile_adjust_thread(ts, td)) { mtx_unlock_spin(&ts->ts_lock); return; } /* The thread lock is released as ts lock above. */ } } /* * Adjust the thread's position on a turnstile after its priority has been * changed. 
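propagate_priority() walks the ownership chain, waiter to lock owner to the lock that owner is itself blocked on, and so on, lending the waiter's priority to each owner until it reaches a runnable thread or one already at least as important. A single-threaded model of the walk; locking, the blocked queues, and the scheduler are all elided, and as in the kernel a lower number means higher priority:

#include <stdio.h>

struct thread {
    const char *name;
    int priority;                    /* lower is more important */
    struct thread *blocked_on_owner; /* owner of the lock we wait for */
};

static void
propagate(struct thread *waiter)
{
    int pri = waiter->priority;

    for (struct thread *td = waiter->blocked_on_owner; td != NULL;
        td = td->blocked_on_owner) {
        if (td->priority <= pri)
            return;                 /* already important enough */
        printf("lend pri %d to %s (was %d)\n", pri, td->name,
            td->priority);
        td->priority = pri;         /* sched_lend_prio() analogue */
    }
}

int
main(void)
{
    struct thread low = { "low", 200, NULL };
    struct thread mid = { "mid", 150, &low };    /* blocked on low */
    struct thread high = { "high", 100, &mid };  /* blocked on mid */

    propagate(&high);   /* lends priority 100 to both mid and low */
    return (0);
}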
*/ static int turnstile_adjust_thread(struct turnstile *ts, struct thread *td) { struct thread *td1, *td2; int queue; THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(TD_ON_LOCK(td)); /* * This thread may not be blocked on this turnstile anymore * but instead might already be woken up on another CPU * that is waiting on the thread lock in turnstile_unpend() to * finish waking this thread up. We can detect this case * by checking to see if this thread has been given a * turnstile by either turnstile_signal() or * turnstile_broadcast(). In this case, treat the thread as * if it was already running. */ if (td->td_turnstile != NULL) return (0); /* * Check if the thread needs to be moved on the blocked chain. * It needs to be moved if either its priority is lower than * the previous thread or higher than the next thread. */ THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); td1 = TAILQ_PREV(td, threadqueue, td_lockq); td2 = TAILQ_NEXT(td, td_lockq); if ((td1 != NULL && td->td_priority < td1->td_priority) || (td2 != NULL && td->td_priority > td2->td_priority)) { /* * Remove thread from blocked chain and determine where * it should be moved to. */ queue = td->td_tsqueue; MPASS(queue == TS_EXCLUSIVE_QUEUE || queue == TS_SHARED_QUEUE); mtx_lock_spin(&td_contested_lock); TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq); TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq) { MPASS(td1->td_proc->p_magic == P_MAGIC); if (td1->td_priority > td->td_priority) break; } if (td1 == NULL) TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq); else TAILQ_INSERT_BEFORE(td1, td, td_lockq); mtx_unlock_spin(&td_contested_lock); if (td1 == NULL) CTR3(KTR_LOCK, "turnstile_adjust_thread: td %d put at tail on [%p] %s", td->td_tid, ts->ts_lockobj, ts->ts_lockobj->lo_name); else CTR4(KTR_LOCK, "turnstile_adjust_thread: td %d moved before %d on [%p] %s", td->td_tid, td1->td_tid, ts->ts_lockobj, ts->ts_lockobj->lo_name); } return (1); } /* * Early initialization of turnstiles. This is not done via a SYSINIT() * since this needs to be initialized very early when mutexes are first * initialized. 
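turnstile_adjust_thread() re-links a thread only when its new priority violates ordering against its immediate neighbors, and the re-insert then places it before the first strictly lower-priority thread so that equal priorities keep FIFO order. A userland sketch of the re-insert over the BSD <sys/queue.h> TAILQ, with a stand-in waiter struct:

#include <stdio.h>
#include <sys/queue.h>

struct waiter {
    TAILQ_ENTRY(waiter) link;
    const char *name;
    int priority;               /* lower is more important */
};

TAILQ_HEAD(waitq, waiter);

/* Re-insert 'w' so the queue stays sorted; FIFO among equal priorities. */
static void
requeue(struct waitq *q, struct waiter *w)
{
    struct waiter *it;

    TAILQ_REMOVE(q, w, link);
    TAILQ_FOREACH(it, q, link)
        if (it->priority > w->priority)
            break;
    if (it == NULL)
        TAILQ_INSERT_TAIL(q, w, link);
    else
        TAILQ_INSERT_BEFORE(it, w, link);
}

int
main(void)
{
    struct waitq q = TAILQ_HEAD_INITIALIZER(q);
    struct waiter a = { .name = "a", .priority = 100 };
    struct waiter b = { .name = "b", .priority = 120 };
    struct waiter *it;

    TAILQ_INSERT_TAIL(&q, &a, link);
    TAILQ_INSERT_TAIL(&q, &b, link);
    a.priority = 140;           /* priority lowered: must move */
    requeue(&q, &a);
    TAILQ_FOREACH(it, &q, link)
        printf("%s(%d)\n", it->name, it->priority);  /* b, then a */
    return (0);
}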
*/ void init_turnstiles(void) { int i; for (i = 0; i < TC_TABLESIZE; i++) { LIST_INIT(&turnstile_chains[i].tc_turnstiles); mtx_init(&turnstile_chains[i].tc_lock, "turnstile chain", NULL, MTX_SPIN); } mtx_init(&td_contested_lock, "td_contested", NULL, MTX_SPIN); LIST_INIT(&thread0.td_contested); thread0.td_turnstile = NULL; } #ifdef TURNSTILE_PROFILING static void init_turnstile_profiling(void *arg) { struct sysctl_oid *chain_oid; char chain_name[10]; int i; for (i = 0; i < TC_TABLESIZE; i++) { snprintf(chain_name, sizeof(chain_name), "%d", i); chain_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_debug_turnstile_chains), OID_AUTO, chain_name, CTLFLAG_RD, NULL, "turnstile chain stats"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "depth", CTLFLAG_RD, &turnstile_chains[i].tc_depth, 0, NULL); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_depth", CTLFLAG_RD, &turnstile_chains[i].tc_max_depth, 0, NULL); } } SYSINIT(turnstile_profiling, SI_SUB_LOCK, SI_ORDER_ANY, init_turnstile_profiling, NULL); #endif static void init_turnstile0(void *dummy) { turnstile_zone = uma_zcreate("TURNSTILE", sizeof(struct turnstile), NULL, #ifdef INVARIANTS turnstile_dtor, #else NULL, #endif turnstile_init, turnstile_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE); thread0.td_turnstile = turnstile_alloc(); } SYSINIT(turnstile0, SI_SUB_LOCK, SI_ORDER_ANY, init_turnstile0, NULL); /* * Update a thread on the turnstile list after its priority has been changed. * The old priority is passed in as an argument. */ void turnstile_adjust(struct thread *td, u_char oldpri) { struct turnstile *ts; MPASS(TD_ON_LOCK(td)); /* * Pick up the lock that td is blocked on. */ ts = td->td_blocked; MPASS(ts != NULL); THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); mtx_assert(&ts->ts_lock, MA_OWNED); /* Resort the turnstile on the list. */ if (!turnstile_adjust_thread(ts, td)) return; /* * If our priority was lowered and we are at the head of the * turnstile, then propagate our new priority up the chain. * Note that we currently don't try to revoke lent priorities * when our priority goes up. */ MPASS(td->td_tsqueue == TS_EXCLUSIVE_QUEUE || td->td_tsqueue == TS_SHARED_QUEUE); if (td == TAILQ_FIRST(&ts->ts_blocked[td->td_tsqueue]) && td->td_priority < oldpri) { propagate_priority(td); } } /* * Set the owner of the lock this turnstile is attached to. */ static void turnstile_setowner(struct turnstile *ts, struct thread *owner) { mtx_assert(&td_contested_lock, MA_OWNED); MPASS(ts->ts_owner == NULL); /* A shared lock might not have an owner. */ if (owner == NULL) return; MPASS(owner->td_proc->p_magic == P_MAGIC); ts->ts_owner = owner; LIST_INSERT_HEAD(&owner->td_contested, ts, ts_link); } #ifdef INVARIANTS /* * UMA zone item deallocator. */ static void turnstile_dtor(void *mem, int size, void *arg) { struct turnstile *ts; ts = mem; MPASS(TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE])); MPASS(TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE])); MPASS(TAILQ_EMPTY(&ts->ts_pending)); } #endif /* * UMA zone item initializer. */ static int turnstile_init(void *mem, int size, int flags) { struct turnstile *ts; bzero(mem, size); ts = mem; TAILQ_INIT(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]); TAILQ_INIT(&ts->ts_blocked[TS_SHARED_QUEUE]); TAILQ_INIT(&ts->ts_pending); LIST_INIT(&ts->ts_free); mtx_init(&ts->ts_lock, "turnstile lock", NULL, MTX_SPIN | MTX_RECURSE); return (0); } static void turnstile_fini(void *mem, int size) { struct turnstile *ts; ts = mem; mtx_destroy(&ts->ts_lock); } /* * Get a turnstile for a new thread.
*/ struct turnstile * turnstile_alloc(void) { return (uma_zalloc(turnstile_zone, M_WAITOK)); } /* * Free a turnstile when a thread is destroyed. */ void turnstile_free(struct turnstile *ts) { uma_zfree(turnstile_zone, ts); } /* * Lock the turnstile chain associated with the specified lock. */ void turnstile_chain_lock(struct lock_object *lock) { struct turnstile_chain *tc; tc = TC_LOOKUP(lock); mtx_lock_spin(&tc->tc_lock); } struct turnstile * turnstile_trywait(struct lock_object *lock) { struct turnstile_chain *tc; struct turnstile *ts; tc = TC_LOOKUP(lock); mtx_lock_spin(&tc->tc_lock); LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) if (ts->ts_lockobj == lock) { mtx_lock_spin(&ts->ts_lock); return (ts); } ts = curthread->td_turnstile; MPASS(ts != NULL); mtx_lock_spin(&ts->ts_lock); KASSERT(ts->ts_lockobj == NULL, ("stale ts_lockobj pointer")); ts->ts_lockobj = lock; return (ts); } void turnstile_cancel(struct turnstile *ts) { struct turnstile_chain *tc; struct lock_object *lock; mtx_assert(&ts->ts_lock, MA_OWNED); mtx_unlock_spin(&ts->ts_lock); lock = ts->ts_lockobj; if (ts == curthread->td_turnstile) ts->ts_lockobj = NULL; tc = TC_LOOKUP(lock); mtx_unlock_spin(&tc->tc_lock); } /* * Look up the turnstile for a lock in the hash table locking the associated * turnstile chain along the way. If no turnstile is found in the hash * table, NULL is returned. */ struct turnstile * turnstile_lookup(struct lock_object *lock) { struct turnstile_chain *tc; struct turnstile *ts; tc = TC_LOOKUP(lock); mtx_assert(&tc->tc_lock, MA_OWNED); LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) if (ts->ts_lockobj == lock) { mtx_lock_spin(&ts->ts_lock); return (ts); } return (NULL); } /* * Unlock the turnstile chain associated with a given lock. */ void turnstile_chain_unlock(struct lock_object *lock) { struct turnstile_chain *tc; tc = TC_LOOKUP(lock); mtx_unlock_spin(&tc->tc_lock); } /* * Return a pointer to the thread waiting on this turnstile with the * most important priority or NULL if the turnstile has no waiters. */ static struct thread * turnstile_first_waiter(struct turnstile *ts) { struct thread *std, *xtd; std = TAILQ_FIRST(&ts->ts_blocked[TS_SHARED_QUEUE]); xtd = TAILQ_FIRST(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]); if (xtd == NULL || (std != NULL && std->td_priority < xtd->td_priority)) return (std); return (xtd); } /* * Take ownership of a turnstile and adjust the priority of the new * owner appropriately. */ void turnstile_claim(struct turnstile *ts) { struct thread *td, *owner; struct turnstile_chain *tc; mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(ts != curthread->td_turnstile); owner = curthread; mtx_lock_spin(&td_contested_lock); turnstile_setowner(ts, owner); mtx_unlock_spin(&td_contested_lock); td = turnstile_first_waiter(ts); MPASS(td != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); /* * Update the priority of the new owner if needed. */ thread_lock(owner); if (td->td_priority < owner->td_priority) sched_lend_prio(owner, td->td_priority); thread_unlock(owner); tc = TC_LOOKUP(ts->ts_lockobj); mtx_unlock_spin(&ts->ts_lock); mtx_unlock_spin(&tc->tc_lock); } /* * Block the current thread on the turnstile associated with 'lock'. This * function will context switch and not return until this thread has been * woken back up. This function must be called with the appropriate * turnstile chain locked and will return with it unlocked.
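turnstile_first_waiter() needs to compare only the heads of the shared and exclusive queues, because each queue is already kept sorted, and it resolves priority ties in favor of exclusive waiters. In miniature, with stand-in types and lower numbers meaning higher priority:

#include <stdio.h>

struct waiter {
    int priority;           /* lower is more important */
};

/* Pick the more important of two queue heads; NULL means an empty queue. */
static struct waiter *
first_waiter(struct waiter *shared_head, struct waiter *excl_head)
{
    if (excl_head == NULL || (shared_head != NULL &&
        shared_head->priority < excl_head->priority))
        return (shared_head);
    return (excl_head);     /* exclusive waiters win ties */
}

int
main(void)
{
    struct waiter s = { 100 }, x = { 100 };

    printf("tie -> %s\n",
        first_waiter(&s, &x) == &x ? "exclusive" : "shared");
    printf("empty shared -> %s\n",
        first_waiter(NULL, &x) == &x ? "exclusive" : "none");
    return (0);
}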
*/ void turnstile_wait(struct turnstile *ts, struct thread *owner, int queue) { struct turnstile_chain *tc; struct thread *td, *td1; struct lock_object *lock; td = curthread; mtx_assert(&ts->ts_lock, MA_OWNED); if (owner) MPASS(owner->td_proc->p_magic == P_MAGIC); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); /* * If the lock does not already have a turnstile, use this thread's * turnstile. Otherwise insert the current thread into the * turnstile already in use by this lock. */ tc = TC_LOOKUP(ts->ts_lockobj); mtx_assert(&tc->tc_lock, MA_OWNED); if (ts == td->td_turnstile) { #ifdef TURNSTILE_PROFILING tc->tc_depth++; if (tc->tc_depth > tc->tc_max_depth) { tc->tc_max_depth = tc->tc_depth; if (tc->tc_max_depth > turnstile_max_depth) turnstile_max_depth = tc->tc_max_depth; } #endif LIST_INSERT_HEAD(&tc->tc_turnstiles, ts, ts_hash); KASSERT(TAILQ_EMPTY(&ts->ts_pending), ("thread's turnstile has pending threads")); KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]), ("thread's turnstile has exclusive waiters")); KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]), ("thread's turnstile has shared waiters")); KASSERT(LIST_EMPTY(&ts->ts_free), ("thread's turnstile has a non-empty free list")); MPASS(ts->ts_lockobj != NULL); mtx_lock_spin(&td_contested_lock); TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq); turnstile_setowner(ts, owner); mtx_unlock_spin(&td_contested_lock); } else { TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq) if (td1->td_priority > td->td_priority) break; mtx_lock_spin(&td_contested_lock); if (td1 != NULL) TAILQ_INSERT_BEFORE(td1, td, td_lockq); else TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq); MPASS(owner == ts->ts_owner); mtx_unlock_spin(&td_contested_lock); MPASS(td->td_turnstile != NULL); LIST_INSERT_HEAD(&ts->ts_free, td->td_turnstile, ts_hash); } thread_lock(td); thread_lock_set(td, &ts->ts_lock); td->td_turnstile = NULL; /* Save who we are blocked on and switch. */ lock = ts->ts_lockobj; td->td_tsqueue = queue; td->td_blocked = ts; td->td_lockname = lock->lo_name; td->td_blktick = ticks; TD_SET_LOCK(td); mtx_unlock_spin(&tc->tc_lock); propagate_priority(td); if (LOCK_LOG_TEST(lock, 0)) CTR4(KTR_LOCK, "%s: td %d blocked on [%p] %s", __func__, td->td_tid, lock, lock->lo_name); SDT_PROBE0(sched, , , sleep); THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); mi_switch(SW_VOL | SWT_TURNSTILE, NULL); if (LOCK_LOG_TEST(lock, 0)) CTR4(KTR_LOCK, "%s: td %d free from blocked on [%p] %s", __func__, td->td_tid, lock, lock->lo_name); thread_unlock(td); } /* * Pick the highest priority thread on this turnstile and put it on the * pending list. This must be called with the turnstile chain locked. */ int turnstile_signal(struct turnstile *ts, int queue) { struct turnstile_chain *tc; struct thread *td; int empty; MPASS(ts != NULL); mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(curthread->td_proc->p_magic == P_MAGIC); MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); /* * Pick the highest priority thread blocked on this lock and * move it to the pending list. */ td = TAILQ_FIRST(&ts->ts_blocked[queue]); MPASS(td->td_proc->p_magic == P_MAGIC); mtx_lock_spin(&td_contested_lock); TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq); mtx_unlock_spin(&td_contested_lock); TAILQ_INSERT_TAIL(&ts->ts_pending, td, td_lockq); /* * If the turnstile is now empty, remove it from its chain and * give it to the about-to-be-woken thread. 
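 * (Each thread that blocks on an already-claimed turnstile donates its own
 * spare turnstile to the ts_free list, so every thread removed from the
 * turnstile must be handed one back before it can block again.)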
Otherwise take a * turnstile from the free list and give it to the thread. */ empty = TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) && TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]); if (empty) { tc = TC_LOOKUP(ts->ts_lockobj); mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(LIST_EMPTY(&ts->ts_free)); #ifdef TURNSTILE_PROFILING tc->tc_depth--; #endif } else ts = LIST_FIRST(&ts->ts_free); MPASS(ts != NULL); LIST_REMOVE(ts, ts_hash); td->td_turnstile = ts; return (empty); } /* * Put all blocked threads on the pending list. This must be called with * the turnstile chain locked. */ void turnstile_broadcast(struct turnstile *ts, int queue) { struct turnstile_chain *tc; struct turnstile *ts1; struct thread *td; MPASS(ts != NULL); mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(curthread->td_proc->p_magic == P_MAGIC); MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL); /* * We must have the chain locked so that we can remove the empty * turnstile from the hash queue. */ tc = TC_LOOKUP(ts->ts_lockobj); mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); /* * Transfer the blocked list to the pending list. */ mtx_lock_spin(&td_contested_lock); TAILQ_CONCAT(&ts->ts_pending, &ts->ts_blocked[queue], td_lockq); mtx_unlock_spin(&td_contested_lock); /* * Give a turnstile to each thread. The last thread gets * this turnstile if the turnstile is empty. */ TAILQ_FOREACH(td, &ts->ts_pending, td_lockq) { if (LIST_EMPTY(&ts->ts_free)) { MPASS(TAILQ_NEXT(td, td_lockq) == NULL); ts1 = ts; #ifdef TURNSTILE_PROFILING tc->tc_depth--; #endif } else ts1 = LIST_FIRST(&ts->ts_free); MPASS(ts1 != NULL); LIST_REMOVE(ts1, ts_hash); td->td_turnstile = ts1; } } /* * Wakeup all threads on the pending list and adjust the priority of the * current thread appropriately. This must be called with the turnstile * chain locked. */ void turnstile_unpend(struct turnstile *ts, int owner_type) { TAILQ_HEAD( ,thread) pending_threads; struct turnstile *nts; struct thread *td; u_char cp, pri; MPASS(ts != NULL); mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL); MPASS(!TAILQ_EMPTY(&ts->ts_pending)); /* * Move the list of pending threads out of the turnstile and * into a local variable. */ TAILQ_INIT(&pending_threads); TAILQ_CONCAT(&pending_threads, &ts->ts_pending, td_lockq); #ifdef INVARIANTS if (TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) && TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE])) ts->ts_lockobj = NULL; #endif /* * Adjust the priority of curthread based on other contested * locks it owns. Don't lower the priority below the base * priority however. */ td = curthread; pri = PRI_MAX; thread_lock(td); mtx_lock_spin(&td_contested_lock); /* * Remove the turnstile from this thread's list of contested locks * since this thread doesn't own it anymore. New threads will * not be blocking on the turnstile until it is claimed by a new * owner. There might not be a current owner if this is a shared * lock. */ if (ts->ts_owner != NULL) { ts->ts_owner = NULL; LIST_REMOVE(ts, ts_link); } LIST_FOREACH(nts, &td->td_contested, ts_link) { cp = turnstile_first_waiter(nts)->td_priority; if (cp < pri) pri = cp; } mtx_unlock_spin(&td_contested_lock); sched_unlend_prio(td, pri); thread_unlock(td); /* * Wake up all the pending threads. If a thread is not blocked * on a lock, then it is currently executing on another CPU in * turnstile_wait() or sitting on a run queue waiting to resume * in turnstile_wait(). 
Set a flag to force it to try to acquire * the lock again instead of blocking. */ while (!TAILQ_EMPTY(&pending_threads)) { td = TAILQ_FIRST(&pending_threads); TAILQ_REMOVE(&pending_threads, td, td_lockq); SDT_PROBE2(sched, , , wakeup, td, td->td_proc); thread_lock(td); THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); MPASS(td->td_proc->p_magic == P_MAGIC); MPASS(TD_ON_LOCK(td)); TD_CLR_LOCK(td); MPASS(TD_CAN_RUN(td)); td->td_blocked = NULL; td->td_lockname = NULL; td->td_blktick = 0; #ifdef INVARIANTS td->td_tsqueue = 0xff; #endif sched_add(td, SRQ_BORING); thread_unlock(td); } mtx_unlock_spin(&ts->ts_lock); } /* * Give up ownership of a turnstile. This must be called with the * turnstile chain locked. */ void turnstile_disown(struct turnstile *ts) { struct thread *td; u_char cp, pri; MPASS(ts != NULL); mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(ts->ts_owner == curthread); MPASS(TAILQ_EMPTY(&ts->ts_pending)); MPASS(!TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) || !TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE])); /* * Remove the turnstile from this thread's list of contested locks * since this thread doesn't own it anymore. New threads will * not be blocking on the turnstile until it is claimed by a new * owner. */ mtx_lock_spin(&td_contested_lock); ts->ts_owner = NULL; LIST_REMOVE(ts, ts_link); mtx_unlock_spin(&td_contested_lock); /* * Adjust the priority of curthread based on other contested * locks it owns. Don't lower the priority below the base * priority however. */ td = curthread; pri = PRI_MAX; thread_lock(td); mtx_unlock_spin(&ts->ts_lock); mtx_lock_spin(&td_contested_lock); LIST_FOREACH(ts, &td->td_contested, ts_link) { cp = turnstile_first_waiter(ts)->td_priority; if (cp < pri) pri = cp; } mtx_unlock_spin(&td_contested_lock); sched_unlend_prio(td, pri); thread_unlock(td); } /* * Return the first thread in a turnstile. */ struct thread * turnstile_head(struct turnstile *ts, int queue) { #ifdef INVARIANTS MPASS(ts != NULL); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); mtx_assert(&ts->ts_lock, MA_OWNED); #endif return (TAILQ_FIRST(&ts->ts_blocked[queue])); } /* * Returns true if a sub-queue of a turnstile is empty. */ int turnstile_empty(struct turnstile *ts, int queue) { #ifdef INVARIANTS MPASS(ts != NULL); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); mtx_assert(&ts->ts_lock, MA_OWNED); #endif return (TAILQ_EMPTY(&ts->ts_blocked[queue])); } #ifdef DDB static void print_thread(struct thread *td, const char *prefix) { db_printf("%s%p (tid %d, pid %d, \"%s\")\n", prefix, td, td->td_tid, td->td_proc->p_pid, td->td_name); } static void print_queue(struct threadqueue *queue, const char *header, const char *prefix) { struct thread *td; db_printf("%s:\n", header); if (TAILQ_EMPTY(queue)) { db_printf("%sempty\n", prefix); return; } TAILQ_FOREACH(td, queue, td_lockq) { print_thread(td, prefix); } } DB_SHOW_COMMAND(turnstile, db_show_turnstile) { struct turnstile_chain *tc; struct turnstile *ts; struct lock_object *lock; int i; if (!have_addr) return; /* * First, see if there is an active turnstile for the lock indicated * by the address. */ lock = (struct lock_object *)addr; tc = TC_LOOKUP(lock); LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) if (ts->ts_lockobj == lock) goto found; /* * Second, see if there is an active turnstile at the address * indicated. 
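 * (That is, treat the address as a pointer to the turnstile structure
 * itself rather than to a lock.)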
*/ for (i = 0; i < TC_TABLESIZE; i++) LIST_FOREACH(ts, &turnstile_chains[i].tc_turnstiles, ts_hash) { if (ts == (struct turnstile *)addr) goto found; } db_printf("Unable to locate a turnstile via %p\n", (void *)addr); return; found: lock = ts->ts_lockobj; db_printf("Lock: %p - (%s) %s\n", lock, LOCK_CLASS(lock)->lc_name, lock->lo_name); if (ts->ts_owner) print_thread(ts->ts_owner, "Lock Owner: "); else db_printf("Lock Owner: none\n"); print_queue(&ts->ts_blocked[TS_SHARED_QUEUE], "Shared Waiters", "\t"); print_queue(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE], "Exclusive Waiters", "\t"); print_queue(&ts->ts_pending, "Pending Threads", "\t"); } /* * Show all the threads a particular thread is waiting on based on * non-spin locks. */ static void print_lockchain(struct thread *td, const char *prefix) { struct lock_object *lock; struct lock_class *class; struct turnstile *ts; struct thread *owner; /* * Follow the chain. We keep walking as long as the thread is * blocked on a lock that has an owner. */ while (!db_pager_quit) { db_printf("%sthread %d (pid %d, %s) ", prefix, td->td_tid, td->td_proc->p_pid, td->td_name); switch (td->td_state) { case TDS_INACTIVE: db_printf("is inactive\n"); return; case TDS_CAN_RUN: db_printf("can run\n"); return; case TDS_RUNQ: db_printf("is on a run queue\n"); return; case TDS_RUNNING: db_printf("running on CPU %d\n", td->td_oncpu); return; case TDS_INHIBITED: if (TD_ON_LOCK(td)) { ts = td->td_blocked; lock = ts->ts_lockobj; class = LOCK_CLASS(lock); db_printf("blocked on lock %p (%s) \"%s\"\n", lock, class->lc_name, lock->lo_name); if (ts->ts_owner == NULL) return; td = ts->ts_owner; break; } else if (TD_ON_SLEEPQ(td)) { if (!lockmgr_chain(td, &owner) && !sx_chain(td, &owner)) { db_printf("sleeping on %p \"%s\"\n", td->td_wchan, td->td_wmesg); return; } if (owner == NULL) return; td = owner; break; } db_printf("inhibited\n"); return; default: db_printf("??? (%#x)\n", td->td_state); return; } } } DB_SHOW_COMMAND(lockchain, db_show_lockchain) { struct thread *td; /* Figure out which thread to start with. 
*/ if (have_addr) td = db_lookup_thread(addr, true); else td = kdb_thread; print_lockchain(td, ""); } DB_SHOW_ALIAS(sleepchain, db_show_lockchain); DB_SHOW_ALL_COMMAND(chains, db_show_allchains) { struct thread *td; struct proc *p; int i; i = 1; FOREACH_PROC_IN_SYSTEM(p) { FOREACH_THREAD_IN_PROC(p, td) { if ((TD_ON_LOCK(td) && LIST_EMPTY(&td->td_contested)) || (TD_IS_INHIBITED(td) && TD_ON_SLEEPQ(td))) { db_printf("chain %d:\n", i++); print_lockchain(td, " "); } if (db_pager_quit) return; } } } DB_SHOW_ALIAS(allchains, db_show_allchains) static void print_waiters(struct turnstile *ts, int indent); static void print_waiter(struct thread *td, int indent) { struct turnstile *ts; int i; if (db_pager_quit) return; for (i = 0; i < indent; i++) db_printf(" "); print_thread(td, "thread "); LIST_FOREACH(ts, &td->td_contested, ts_link) print_waiters(ts, indent + 1); } static void print_waiters(struct turnstile *ts, int indent) { struct lock_object *lock; struct lock_class *class; struct thread *td; int i; if (db_pager_quit) return; lock = ts->ts_lockobj; class = LOCK_CLASS(lock); for (i = 0; i < indent; i++) db_printf(" "); db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name, lock->lo_name); TAILQ_FOREACH(td, &ts->ts_blocked[TS_EXCLUSIVE_QUEUE], td_lockq) print_waiter(td, indent + 1); TAILQ_FOREACH(td, &ts->ts_blocked[TS_SHARED_QUEUE], td_lockq) print_waiter(td, indent + 1); TAILQ_FOREACH(td, &ts->ts_pending, td_lockq) print_waiter(td, indent + 1); } DB_SHOW_COMMAND(locktree, db_show_locktree) { struct lock_object *lock; struct lock_class *class; struct turnstile_chain *tc; struct turnstile *ts; if (!have_addr) return; lock = (struct lock_object *)addr; tc = TC_LOOKUP(lock); LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) if (ts->ts_lockobj == lock) break; if (ts == NULL) { class = LOCK_CLASS(lock); db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name, lock->lo_name); } else print_waiters(ts, 0); } #endif Index: head/sys/kern/subr_unit.c =================================================================== --- head/sys/kern/subr_unit.c (revision 326270) +++ head/sys/kern/subr_unit.c (revision 326271) @@ -1,1079 +1,1081 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ * * * Unit number allocation functions. * * These functions implement a mixed run-length/bitmap management of unit * number spaces in a very memory efficient manner. * * Allocation policy is always lowest free number first. * * A return value of -1 signals that no more unit numbers are available. * * There is no cost associated with the range of unitnumbers, so unless * the resource really is finite, specify INT_MAX to new_unrhdr() and * forget about checking the return value. * * If a mutex is not provided when the unit number space is created, a * default global mutex is used. The advantage to passing a mutex in, is * that the alloc_unrl() function can be called with the mutex already * held (it will not be released by alloc_unrl()). * * The allocation function alloc_unr{l}() never sleeps (but it may block on * the mutex of course). * * Freeing a unit number may require allocating memory, and can therefore * sleep so the free_unr() function does not come in a pre-locked variant. * * A userland test program is included. * * Memory usage is a very complex function of the exact allocation * pattern, but always very compact: * * For the very typical case where a single unbroken run of unit * numbers are allocated 44 bytes are used on i386. * * For a unit number space of 1000 units and the random pattern * in the usermode test program included, the worst case usage * was 252 bytes on i386 for 500 allocated and 500 free units. * * For a unit number space of 10000 units and the random pattern * in the usermode test program included, the worst case usage * was 798 bytes on i386 for 5000 allocated and 5000 free units. * * The worst case is where every other unit number is allocated and * the rest are free. In that case 44 + N/4 bytes are used where * N is the number of the highest unit allocated. */ #include #include #include #ifdef _KERNEL #include #include #include #include #include #include #include /* * In theory it would be smarter to allocate the individual blocks * with the zone allocator, but at this time the expectation is that * there will typically not even be enough allocations to fill a single * page, so we stick with malloc for now. */ static MALLOC_DEFINE(M_UNIT, "Unitno", "Unit number allocation"); #define Malloc(foo) malloc(foo, M_UNIT, M_WAITOK | M_ZERO) #define Free(foo) free(foo, M_UNIT) static struct mtx unitmtx; MTX_SYSINIT(unit, &unitmtx, "unit# allocation", MTX_DEF); #else /* ...USERLAND */ #include #include #include #include #include #include #include #include #define KASSERT(cond, arg) \ do { \ if (!(cond)) { \ printf arg; \ abort(); \ } \ } while (0) static int no_alloc; #define Malloc(foo) _Malloc(foo, __LINE__) static void * _Malloc(size_t foo, int line) { KASSERT(no_alloc == 0, ("malloc in wrong place() line %d", line)); return (calloc(foo, 1)); } #define Free(foo) free(foo) struct unrhdr; struct mtx { int state; } unitmtx; static void mtx_lock(struct mtx *mp) { KASSERT(mp->state == 0, ("mutex already locked")); mp->state = 1; } static void mtx_unlock(struct mtx *mp) { KASSERT(mp->state == 1, ("mutex not locked")); mp->state = 0; } #define MA_OWNED 9 static void mtx_assert(struct mtx *mp, int flag) { if (flag == MA_OWNED) { KASSERT(mp->state == 1, ("mtx_assert(MA_OWNED) not true")); } } #define CTASSERT(foo) #define WITNESS_WARN(flags, lock, fmt, ...) (void)0 #endif /* USERLAND */ /* * This is our basic building block. 
* * It can be used in three different ways depending on the value of the ptr * element: * If ptr is NULL, it represents a run of free items. * If ptr points to the unrhdr it represents a run of allocated items. * Otherwise it points to a bitstring of allocated items. * * For runs the len field is the length of the run. * For bitmaps the len field represents the number of allocated items. * * The bitmap is the same size as struct unr to optimize memory management. */ struct unr { TAILQ_ENTRY(unr) list; u_int len; void *ptr; }; struct unrb { bitstr_t map[sizeof(struct unr) / sizeof(bitstr_t)]; }; CTASSERT((sizeof(struct unr) % sizeof(bitstr_t)) == 0); /* Number of bits we can store in the bitmap */ #define NBITS (8 * sizeof(((struct unrb*)NULL)->map)) /* Is the unrb empty in at least the first len bits? */ static inline bool ub_empty(struct unrb *ub, int len) { int first_set; bit_ffs(ub->map, len, &first_set); return (first_set == -1); } /* Is the unrb full? That is, is the number of set elements equal to len? */ static inline bool ub_full(struct unrb *ub, int len) { int first_clear; bit_ffc(ub->map, len, &first_clear); return (first_clear == -1); } #if defined(DIAGNOSTIC) || !defined(_KERNEL) /* * Consistency check function. * * Checks the internal consistency as well as we can. * * Called at all boundaries of this API. */ static void check_unrhdr(struct unrhdr *uh, int line) { struct unr *up; struct unrb *ub; int w; u_int y, z; y = uh->first; z = 0; TAILQ_FOREACH(up, &uh->head, list) { z++; if (up->ptr != uh && up->ptr != NULL) { ub = up->ptr; KASSERT (up->len <= NBITS, ("UNR inconsistency: len %u max %zd (line %d)\n", up->len, NBITS, line)); z++; w = 0; bit_count(ub->map, 0, up->len, &w); y += w; } else if (up->ptr != NULL) y += up->len; } KASSERT (y == uh->busy, ("UNR inconsistency: items %u found %u (line %d)\n", uh->busy, y, line)); KASSERT (z == uh->alloc, ("UNR inconsistency: chunks %u found %u (line %d)\n", uh->alloc, z, line)); } #else static __inline void check_unrhdr(struct unrhdr *uh __unused, int line __unused) { } #endif /* * Userland memory management. Just use calloc and keep track of how * many elements we have allocated for check_unrhdr(). */ static __inline void * new_unr(struct unrhdr *uh, void **p1, void **p2) { void *p; uh->alloc++; KASSERT(*p1 != NULL || *p2 != NULL, ("Out of cached memory")); if (*p1 != NULL) { p = *p1; *p1 = NULL; return (p); } else { p = *p2; *p2 = NULL; return (p); } } static __inline void delete_unr(struct unrhdr *uh, void *ptr) { struct unr *up; uh->alloc--; up = ptr; TAILQ_INSERT_TAIL(&uh->ppfree, up, list); } void clean_unrhdrl(struct unrhdr *uh) { struct unr *up; mtx_assert(uh->mtx, MA_OWNED); while ((up = TAILQ_FIRST(&uh->ppfree)) != NULL) { TAILQ_REMOVE(&uh->ppfree, up, list); mtx_unlock(uh->mtx); Free(up); mtx_lock(uh->mtx); } } void clean_unrhdr(struct unrhdr *uh) { mtx_lock(uh->mtx); clean_unrhdrl(uh); mtx_unlock(uh->mtx); } void init_unrhdr(struct unrhdr *uh, int low, int high, struct mtx *mutex) { KASSERT(low >= 0 && low <= high, ("UNR: use error: new_unrhdr(%d, %d)", low, high)); if (mutex != NULL) uh->mtx = mutex; else uh->mtx = &unitmtx; TAILQ_INIT(&uh->head); TAILQ_INIT(&uh->ppfree); uh->low = low; uh->high = high; uh->first = 0; uh->last = 1 + (high - low); check_unrhdr(uh, __LINE__); } /* * Allocate a new unrheader set. * * Highest and lowest valid values given as parameters. 
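 */

/*
 * (Hypothetical usage sketch of the API described at the top of this file;
 * "example" is an invented consumer, not real kernel code.)
 */
#if 0	/* illustrative sketch only */
static struct unrhdr *example_units;

static void
example_init(void)
{

	/* Effectively unbounded space; allocation is lowest-free-first. */
	example_units = new_unrhdr(0, INT_MAX, NULL);
}

static int
example_attach(void)
{
	int unit;

	unit = alloc_unr(example_units);	/* never sleeps */
	if (unit == -1)
		return (ENOSPC);		/* space exhausted */
	/* ... name a device or resource after 'unit' ... */
	return (0);
}

static void
example_detach(int unit)
{

	free_unr(example_units, unit);		/* may sleep to allocate */
}
#endif

/*
 * new_unrhdr(), described above, follows: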
*/ struct unrhdr * new_unrhdr(int low, int high, struct mtx *mutex) { struct unrhdr *uh; uh = Malloc(sizeof *uh); init_unrhdr(uh, low, high, mutex); return (uh); } void delete_unrhdr(struct unrhdr *uh) { check_unrhdr(uh, __LINE__); KASSERT(uh->busy == 0, ("unrhdr has %u allocations", uh->busy)); KASSERT(uh->alloc == 0, ("UNR memory leak in delete_unrhdr")); KASSERT(TAILQ_FIRST(&uh->ppfree) == NULL, ("unrhdr has postponed item for free")); Free(uh); } void clear_unrhdr(struct unrhdr *uh) { struct unr *up, *uq; KASSERT(TAILQ_EMPTY(&uh->ppfree), ("unrhdr has postponed item for free")); TAILQ_FOREACH_SAFE(up, &uh->head, list, uq) { if (up->ptr != uh) { Free(up->ptr); } Free(up); } uh->busy = 0; uh->alloc = 0; init_unrhdr(uh, uh->low, uh->high, uh->mtx); check_unrhdr(uh, __LINE__); } static __inline int is_bitmap(struct unrhdr *uh, struct unr *up) { return (up->ptr != uh && up->ptr != NULL); } /* * Look for sequence of items which can be combined into a bitmap, if * multiple are present, take the one which saves most memory. * * Return (1) if a sequence was found to indicate that another call * might be able to do more. Return (0) if we found no suitable sequence. * * NB: called from alloc_unr(), no new memory allocation allowed. */ static int optimize_unr(struct unrhdr *uh) { struct unr *up, *uf, *us; struct unrb *ub, *ubf; u_int a, l, ba; /* * Look for the run of items (if any) which when collapsed into * a bitmap would save most memory. */ us = NULL; ba = 0; TAILQ_FOREACH(uf, &uh->head, list) { if (uf->len >= NBITS) continue; a = 1; if (is_bitmap(uh, uf)) a++; l = uf->len; up = uf; while (1) { up = TAILQ_NEXT(up, list); if (up == NULL) break; if ((up->len + l) > NBITS) break; a++; if (is_bitmap(uh, up)) a++; l += up->len; } if (a > ba) { ba = a; us = uf; } } if (ba < 3) return (0); /* * If the first element is not a bitmap, make it one. * Trying to do so without allocating more memory complicates things * a bit */ if (!is_bitmap(uh, us)) { uf = TAILQ_NEXT(us, list); TAILQ_REMOVE(&uh->head, us, list); a = us->len; l = us->ptr == uh ? 1 : 0; ub = (void *)us; bit_nclear(ub->map, 0, NBITS - 1); if (l) bit_nset(ub->map, 0, a); if (!is_bitmap(uh, uf)) { if (uf->ptr == NULL) bit_nclear(ub->map, a, a + uf->len - 1); else bit_nset(ub->map, a, a + uf->len - 1); uf->ptr = ub; uf->len += a; us = uf; } else { ubf = uf->ptr; for (l = 0; l < uf->len; l++, a++) { if (bit_test(ubf->map, l)) bit_set(ub->map, a); else bit_clear(ub->map, a); } uf->len = a; delete_unr(uh, uf->ptr); uf->ptr = ub; us = uf; } } ub = us->ptr; while (1) { uf = TAILQ_NEXT(us, list); if (uf == NULL) return (1); if (uf->len + us->len > NBITS) return (1); if (uf->ptr == NULL) { bit_nclear(ub->map, us->len, us->len + uf->len - 1); us->len += uf->len; TAILQ_REMOVE(&uh->head, uf, list); delete_unr(uh, uf); } else if (uf->ptr == uh) { bit_nset(ub->map, us->len, us->len + uf->len - 1); us->len += uf->len; TAILQ_REMOVE(&uh->head, uf, list); delete_unr(uh, uf); } else { ubf = uf->ptr; for (l = 0; l < uf->len; l++, us->len++) { if (bit_test(ubf->map, l)) bit_set(ub->map, us->len); else bit_clear(ub->map, us->len); } TAILQ_REMOVE(&uh->head, uf, list); delete_unr(uh, ubf); delete_unr(uh, uf); } } } /* * See if a given unr should be collapsed with a neighbor. * * NB: called from alloc_unr(), no new memory allocation allowed. 
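 */

/*
 * (collapse_unr() below leans on the ub_full()/ub_empty() predicates defined
 * earlier.  A self-contained userland illustration of the same <bitstring.h>
 * primitives:)
 */
#if 0	/* illustrative sketch only */
#include <bitstring.h>
#include <stdio.h>

int
main(void)
{
	bitstr_t bit_decl(map, 64);
	int first_clear, first_set;

	bit_nclear(map, 0, 63);
	bit_nset(map, 0, 9);			/* units 0..9 allocated */
	bit_ffc(map, 10, &first_clear);		/* ub_full(): no clear bit? */
	bit_ffs(map, 10, &first_set);		/* ub_empty(): no set bit? */
	printf("full=%d empty=%d\n", first_clear == -1, first_set == -1);
	/* prints "full=1 empty=0" */
	return (0);
}
#endif

/*
 * collapse_unr(), described above, follows: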
*/ static void collapse_unr(struct unrhdr *uh, struct unr *up) { struct unr *upp; struct unrb *ub; /* If bitmap is all set or clear, change it to runlength */ if (is_bitmap(uh, up)) { ub = up->ptr; if (ub_full(ub, up->len)) { delete_unr(uh, up->ptr); up->ptr = uh; } else if (ub_empty(ub, up->len)) { delete_unr(uh, up->ptr); up->ptr = NULL; } } /* If nothing left in runlength, delete it */ if (up->len == 0) { upp = TAILQ_PREV(up, unrhd, list); if (upp == NULL) upp = TAILQ_NEXT(up, list); TAILQ_REMOVE(&uh->head, up, list); delete_unr(uh, up); up = upp; } /* If we have "hot-spot" still, merge with neighbor if possible */ if (up != NULL) { upp = TAILQ_PREV(up, unrhd, list); if (upp != NULL && up->ptr == upp->ptr) { up->len += upp->len; TAILQ_REMOVE(&uh->head, upp, list); delete_unr(uh, upp); } upp = TAILQ_NEXT(up, list); if (upp != NULL && up->ptr == upp->ptr) { up->len += upp->len; TAILQ_REMOVE(&uh->head, upp, list); delete_unr(uh, upp); } } /* Merge into ->first if possible */ upp = TAILQ_FIRST(&uh->head); if (upp != NULL && upp->ptr == uh) { uh->first += upp->len; TAILQ_REMOVE(&uh->head, upp, list); delete_unr(uh, upp); if (up == upp) up = NULL; } /* Merge into ->last if possible */ upp = TAILQ_LAST(&uh->head, unrhd); if (upp != NULL && upp->ptr == NULL) { uh->last += upp->len; TAILQ_REMOVE(&uh->head, upp, list); delete_unr(uh, upp); if (up == upp) up = NULL; } /* Try to make bitmaps */ while (optimize_unr(uh)) continue; } /* * Allocate a free unr. */ int alloc_unrl(struct unrhdr *uh) { struct unr *up; struct unrb *ub; u_int x; int y; mtx_assert(uh->mtx, MA_OWNED); check_unrhdr(uh, __LINE__); x = uh->low + uh->first; up = TAILQ_FIRST(&uh->head); /* * If we have an ideal split, just adjust the first+last */ if (up == NULL && uh->last > 0) { uh->first++; uh->last--; uh->busy++; return (x); } /* * We can always allocate from the first list element, so if we have * nothing on the list, we must have run out of unit numbers. */ if (up == NULL) return (-1); KASSERT(up->ptr != uh, ("UNR first element is allocated")); if (up->ptr == NULL) { /* free run */ uh->first++; up->len--; } else { /* bitmap */ ub = up->ptr; bit_ffc(ub->map, up->len, &y); KASSERT(y != -1, ("UNR corruption: No clear bit in bitmap.")); bit_set(ub->map, y); x += y; } uh->busy++; collapse_unr(uh, up); return (x); } int alloc_unr(struct unrhdr *uh) { int i; mtx_lock(uh->mtx); i = alloc_unrl(uh); clean_unrhdrl(uh); mtx_unlock(uh->mtx); return (i); } static int alloc_unr_specificl(struct unrhdr *uh, u_int item, void **p1, void **p2) { struct unr *up, *upn; struct unrb *ub; u_int i, last, tl; mtx_assert(uh->mtx, MA_OWNED); if (item < uh->low + uh->first || item > uh->high) return (-1); up = TAILQ_FIRST(&uh->head); /* Ideal split. */ if (up == NULL && item - uh->low == uh->first) { uh->first++; uh->last--; uh->busy++; check_unrhdr(uh, __LINE__); return (item); } i = item - uh->low - uh->first; if (up == NULL) { up = new_unr(uh, p1, p2); up->ptr = NULL; up->len = i; TAILQ_INSERT_TAIL(&uh->head, up, list); up = new_unr(uh, p1, p2); up->ptr = uh; up->len = 1; TAILQ_INSERT_TAIL(&uh->head, up, list); uh->last = uh->high - uh->low - i; uh->busy++; check_unrhdr(uh, __LINE__); return (item); } else { /* Find the item which contains the unit we want to allocate. 
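 * ('i' is the offset of the wanted unit past uh->first; each list entry
 * covers up->len units, so whole chunks are skipped until the offset
 * falls within one.)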
*/ TAILQ_FOREACH(up, &uh->head, list) { if (up->len > i) break; i -= up->len; } } if (up == NULL) { if (i > 0) { up = new_unr(uh, p1, p2); up->ptr = NULL; up->len = i; TAILQ_INSERT_TAIL(&uh->head, up, list); } up = new_unr(uh, p1, p2); up->ptr = uh; up->len = 1; TAILQ_INSERT_TAIL(&uh->head, up, list); goto done; } if (is_bitmap(uh, up)) { ub = up->ptr; if (bit_test(ub->map, i) == 0) { bit_set(ub->map, i); goto done; } else return (-1); } else if (up->ptr == uh) return (-1); KASSERT(up->ptr == NULL, ("alloc_unr_specificl: up->ptr != NULL (up=%p)", up)); /* Split off the tail end, if any. */ tl = up->len - (1 + i); if (tl > 0) { upn = new_unr(uh, p1, p2); upn->ptr = NULL; upn->len = tl; TAILQ_INSERT_AFTER(&uh->head, up, upn, list); } /* Split off head end, if any */ if (i > 0) { upn = new_unr(uh, p1, p2); upn->len = i; upn->ptr = NULL; TAILQ_INSERT_BEFORE(up, upn, list); } up->len = 1; up->ptr = uh; done: last = uh->high - uh->low - (item - uh->low); if (uh->last > last) uh->last = last; uh->busy++; collapse_unr(uh, up); check_unrhdr(uh, __LINE__); return (item); } int alloc_unr_specific(struct unrhdr *uh, u_int item) { void *p1, *p2; int i; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "alloc_unr_specific"); p1 = Malloc(sizeof(struct unr)); p2 = Malloc(sizeof(struct unr)); mtx_lock(uh->mtx); i = alloc_unr_specificl(uh, item, &p1, &p2); mtx_unlock(uh->mtx); if (p1 != NULL) Free(p1); if (p2 != NULL) Free(p2); return (i); } /* * Free a unr. * * If we can save unrs by using a bitmap, do so. */ static void free_unrl(struct unrhdr *uh, u_int item, void **p1, void **p2) { struct unr *up, *upp, *upn; struct unrb *ub; u_int pl; KASSERT(item >= uh->low && item <= uh->high, ("UNR: free_unr(%u) out of range [%u...%u]", item, uh->low, uh->high)); check_unrhdr(uh, __LINE__); item -= uh->low; upp = TAILQ_FIRST(&uh->head); /* * Freeing in the ideal split case */ if (item + 1 == uh->first && upp == NULL) { uh->last++; uh->first--; uh->busy--; check_unrhdr(uh, __LINE__); return; } /* * Freeing in the ->first section. Create a run starting at the * freed item. The code below will subdivide it. */ if (item < uh->first) { up = new_unr(uh, p1, p2); up->ptr = uh; up->len = uh->first - item; TAILQ_INSERT_HEAD(&uh->head, up, list); uh->first -= up->len; } item -= uh->first; /* Find the item which contains the unit we want to free */ TAILQ_FOREACH(up, &uh->head, list) { if (up->len > item) break; item -= up->len; } /* Handle bitmap items */ if (is_bitmap(uh, up)) { ub = up->ptr; KASSERT(bit_test(ub->map, item) != 0, ("UNR: Freeing free item %d (bitmap)\n", item)); bit_clear(ub->map, item); uh->busy--; collapse_unr(uh, up); return; } KASSERT(up->ptr == uh, ("UNR Freeing free item %d (run))\n", item)); /* Just this one left, reap it */ if (up->len == 1) { up->ptr = NULL; uh->busy--; collapse_unr(uh, up); return; } /* Check if we can shift the item into the previous 'free' run */ upp = TAILQ_PREV(up, unrhd, list); if (item == 0 && upp != NULL && upp->ptr == NULL) { upp->len++; up->len--; uh->busy--; collapse_unr(uh, up); return; } /* Check if we can shift the item to the next 'free' run */ upn = TAILQ_NEXT(up, list); if (item == up->len - 1 && upn != NULL && upn->ptr == NULL) { upn->len++; up->len--; uh->busy--; collapse_unr(uh, up); return; } /* Split off the tail end, if any. 
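 * (For example, freeing the unit at offset 2 of a 7-unit allocated run
 * splits off a 4-unit allocated tail here and a 2-unit allocated head
 * below, leaving 'up' as the single, now free, unit in between.)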
*/ pl = up->len - (1 + item); if (pl > 0) { upp = new_unr(uh, p1, p2); upp->ptr = uh; upp->len = pl; TAILQ_INSERT_AFTER(&uh->head, up, upp, list); } /* Split off head end, if any */ if (item > 0) { upp = new_unr(uh, p1, p2); upp->len = item; upp->ptr = uh; TAILQ_INSERT_BEFORE(up, upp, list); } up->len = 1; up->ptr = NULL; uh->busy--; collapse_unr(uh, up); } void free_unr(struct unrhdr *uh, u_int item) { void *p1, *p2; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "free_unr"); p1 = Malloc(sizeof(struct unr)); p2 = Malloc(sizeof(struct unr)); mtx_lock(uh->mtx); free_unrl(uh, item, &p1, &p2); clean_unrhdrl(uh); mtx_unlock(uh->mtx); if (p1 != NULL) Free(p1); if (p2 != NULL) Free(p2); } #ifndef _KERNEL /* USERLAND test driver */ /* * Simple stochastic test driver for the above functions. The code resides * here so that it can access static functions and structures. */ static bool verbose; #define VPRINTF(...) {if (verbose) printf(__VA_ARGS__);} static void print_unr(struct unrhdr *uh, struct unr *up) { u_int x; struct unrb *ub; printf(" %p len = %5u ", up, up->len); if (up->ptr == NULL) printf("free\n"); else if (up->ptr == uh) printf("alloc\n"); else { ub = up->ptr; printf("bitmap ["); for (x = 0; x < up->len; x++) { if (bit_test(ub->map, x)) printf("#"); else printf(" "); } printf("]\n"); } } static void print_unrhdr(struct unrhdr *uh) { struct unr *up; u_int x; printf( "%p low = %u high = %u first = %u last = %u busy %u chunks = %u\n", uh, uh->low, uh->high, uh->first, uh->last, uh->busy, uh->alloc); x = uh->low + uh->first; TAILQ_FOREACH(up, &uh->head, list) { printf(" from = %5u", x); print_unr(uh, up); if (up->ptr == NULL || up->ptr == uh) x += up->len; else x += NBITS; } } static void test_alloc_unr(struct unrhdr *uh, u_int i, char a[]) { int j; if (a[i]) { VPRINTF("F %u\n", i); free_unr(uh, i); a[i] = 0; } else { no_alloc = 1; j = alloc_unr(uh); if (j != -1) { a[j] = 1; VPRINTF("A %d\n", j); } no_alloc = 0; } } static void test_alloc_unr_specific(struct unrhdr *uh, u_int i, char a[]) { int j; j = alloc_unr_specific(uh, i); if (j == -1) { VPRINTF("F %u\n", i); a[i] = 0; free_unr(uh, i); } else { a[i] = 1; VPRINTF("A %d\n", j); } } static void usage(char** argv) { printf("%s [-h] [-r REPETITIONS] [-v]\n", argv[0]); } int main(int argc, char **argv) { struct unrhdr *uh; char *a; long count = 10000; /* Number of unrs to test */ long reps = 1, m; int ch; u_int i, j; verbose = false; while ((ch = getopt(argc, argv, "hr:v")) != -1) { switch (ch) { case 'r': errno = 0; reps = strtol(optarg, NULL, 0); if (errno == ERANGE || errno == EINVAL) { usage(argv); exit(2); } break; case 'v': verbose = true; break; case 'h': default: usage(argv); exit(2); } } setbuf(stdout, NULL); uh = new_unrhdr(0, count - 1, NULL); print_unrhdr(uh); a = calloc(count, sizeof(char)); if (a == NULL) err(1, "calloc failed"); srandomdev(); printf("sizeof(struct unr) %zu\n", sizeof(struct unr)); printf("sizeof(struct unrb) %zu\n", sizeof(struct unrb)); printf("sizeof(struct unrhdr) %zu\n", sizeof(struct unrhdr)); printf("NBITS %lu\n", (unsigned long)NBITS); for (m = 0; m < count * reps; m++) { j = random(); i = (j >> 1) % count; #if 0 if (a[i] && (j & 1)) continue; #endif if ((random() & 1) != 0) test_alloc_unr(uh, i, a); else test_alloc_unr_specific(uh, i, a); if (verbose) print_unrhdr(uh); check_unrhdr(uh, __LINE__); } for (i = 0; i < (u_int)count; i++) { if (a[i]) { if (verbose) { printf("C %u\n", i); print_unrhdr(uh); } free_unr(uh, i); } } print_unrhdr(uh); delete_unrhdr(uh); free(a); return (0); } #endif Index: 
head/sys/kern/subr_vmem.c =================================================================== --- head/sys/kern/subr_vmem.c (revision 326270) +++ head/sys/kern/subr_vmem.c (revision 326271) @@ -1,1586 +1,1588 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c)2006,2007,2008,2009 YAMAMOTO Takashi, * Copyright (c) 2013 EMC Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * From: * $NetBSD: vmem_impl.h,v 1.2 2013/01/29 21:26:24 para Exp $ * $NetBSD: subr_vmem.c,v 1.83 2013/03/06 11:20:10 yamt Exp $ */ /* * reference: * - Magazines and Vmem: Extending the Slab Allocator * to Many CPUs and Arbitrary Resources * http://www.usenix.org/event/usenix01/bonwick.html */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #define VMEM_OPTORDER 5 #define VMEM_OPTVALUE (1 << VMEM_OPTORDER) #define VMEM_MAXORDER \ (VMEM_OPTVALUE - 1 + sizeof(vmem_size_t) * NBBY - VMEM_OPTORDER) #define VMEM_HASHSIZE_MIN 16 #define VMEM_HASHSIZE_MAX 131072 #define VMEM_QCACHE_IDX_MAX 16 #define VMEM_FITMASK (M_BESTFIT | M_FIRSTFIT) #define VMEM_FLAGS \ (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM | M_BESTFIT | M_FIRSTFIT) #define BT_FLAGS (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM) #define QC_NAME_MAX 16 /* * Data structures private to vmem. 
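 */

/*
 * (Before the internals, a hypothetical usage-level sketch of the arena API
 * implemented later in this file; the "example" names are invented.)
 */
#if 0	/* illustrative sketch only */
static vmem_t *example_arena;

static void
example_arena_init(vmem_addr_t base, vmem_size_t size)
{

	/* quantum = PAGE_SIZE; 0 disables the per-CPU quantum cache. */
	example_arena = vmem_create("example", base, size, PAGE_SIZE, 0,
	    M_WAITOK);
}

static int
example_alloc(vmem_size_t size, vmem_addr_t *addrp)
{

	/* No alignment, phase, boundary or address-range constraints. */
	return (vmem_xalloc(example_arena, size, 0, 0, 0, VMEM_ADDR_MIN,
	    VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, addrp));
}

static void
example_free(vmem_addr_t addr, vmem_size_t size)
{

	vmem_xfree(example_arena, addr, size);
}
#endif

/*
 * Data structures private to vmem.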
*/ MALLOC_DEFINE(M_VMEM, "vmem", "vmem internal structures"); typedef struct vmem_btag bt_t; TAILQ_HEAD(vmem_seglist, vmem_btag); LIST_HEAD(vmem_freelist, vmem_btag); LIST_HEAD(vmem_hashlist, vmem_btag); struct qcache { uma_zone_t qc_cache; vmem_t *qc_vmem; vmem_size_t qc_size; char qc_name[QC_NAME_MAX]; }; typedef struct qcache qcache_t; #define QC_POOL_TO_QCACHE(pool) ((qcache_t *)(pool->pr_qcache)) #define VMEM_NAME_MAX 16 /* vmem arena */ struct vmem { struct mtx_padalign vm_lock; struct cv vm_cv; char vm_name[VMEM_NAME_MAX+1]; LIST_ENTRY(vmem) vm_alllist; struct vmem_hashlist vm_hash0[VMEM_HASHSIZE_MIN]; struct vmem_freelist vm_freelist[VMEM_MAXORDER]; struct vmem_seglist vm_seglist; struct vmem_hashlist *vm_hashlist; vmem_size_t vm_hashsize; /* Constant after init */ vmem_size_t vm_qcache_max; vmem_size_t vm_quantum_mask; vmem_size_t vm_import_quantum; int vm_quantum_shift; /* Written on alloc/free */ LIST_HEAD(, vmem_btag) vm_freetags; int vm_nfreetags; int vm_nbusytag; vmem_size_t vm_inuse; vmem_size_t vm_size; /* Used on import. */ vmem_import_t *vm_importfn; vmem_release_t *vm_releasefn; void *vm_arg; /* Space exhaustion callback. */ vmem_reclaim_t *vm_reclaimfn; /* quantum cache */ qcache_t vm_qcache[VMEM_QCACHE_IDX_MAX]; }; /* boundary tag */ struct vmem_btag { TAILQ_ENTRY(vmem_btag) bt_seglist; union { LIST_ENTRY(vmem_btag) u_freelist; /* BT_TYPE_FREE */ LIST_ENTRY(vmem_btag) u_hashlist; /* BT_TYPE_BUSY */ } bt_u; #define bt_hashlist bt_u.u_hashlist #define bt_freelist bt_u.u_freelist vmem_addr_t bt_start; vmem_size_t bt_size; int bt_type; }; #define BT_TYPE_SPAN 1 /* Allocated from importfn */ #define BT_TYPE_SPAN_STATIC 2 /* vmem_add() or create. */ #define BT_TYPE_FREE 3 /* Available space. */ #define BT_TYPE_BUSY 4 /* Used space. */ #define BT_ISSPAN_P(bt) ((bt)->bt_type <= BT_TYPE_SPAN_STATIC) #define BT_END(bt) ((bt)->bt_start + (bt)->bt_size - 1) #if defined(DIAGNOSTIC) static int enable_vmem_check = 1; SYSCTL_INT(_debug, OID_AUTO, vmem_check, CTLFLAG_RWTUN, &enable_vmem_check, 0, "Enable vmem check"); static void vmem_check(vmem_t *); #endif static struct callout vmem_periodic_ch; static int vmem_periodic_interval; static struct task vmem_periodic_wk; static struct mtx_padalign __exclusive_cache_line vmem_list_lock; static LIST_HEAD(, vmem) vmem_list = LIST_HEAD_INITIALIZER(vmem_list); /* ---- misc */ #define VMEM_CONDVAR_INIT(vm, wchan) cv_init(&vm->vm_cv, wchan) #define VMEM_CONDVAR_DESTROY(vm) cv_destroy(&vm->vm_cv) #define VMEM_CONDVAR_WAIT(vm) cv_wait(&vm->vm_cv, &vm->vm_lock) #define VMEM_CONDVAR_BROADCAST(vm) cv_broadcast(&vm->vm_cv) #define VMEM_LOCK(vm) mtx_lock(&vm->vm_lock) #define VMEM_TRYLOCK(vm) mtx_trylock(&vm->vm_lock) #define VMEM_UNLOCK(vm) mtx_unlock(&vm->vm_lock) #define VMEM_LOCK_INIT(vm, name) mtx_init(&vm->vm_lock, (name), NULL, MTX_DEF) #define VMEM_LOCK_DESTROY(vm) mtx_destroy(&vm->vm_lock) #define VMEM_ASSERT_LOCKED(vm) mtx_assert(&vm->vm_lock, MA_OWNED); #define VMEM_ALIGNUP(addr, align) (-(-(addr) & -(align))) #define VMEM_CROSS_P(addr1, addr2, boundary) \ ((((addr1) ^ (addr2)) & -(boundary)) != 0) #define ORDER2SIZE(order) ((order) < VMEM_OPTVALUE ? ((order) + 1) : \ (vmem_size_t)1 << ((order) - (VMEM_OPTVALUE - VMEM_OPTORDER - 1))) #define SIZE2ORDER(size) ((size) <= VMEM_OPTVALUE ? ((size) - 1) : \ (flsl(size) + (VMEM_OPTVALUE - VMEM_OPTORDER - 2))) /* * Maximum number of boundary tags that may be required to satisfy an * allocation. Two may be required to import. Another two may be * required to clip edges. 
*/ #define BT_MAXALLOC 4 /* * Max free limits the number of locally cached boundary tags. We * just want to avoid hitting the zone allocator for every call. */ #define BT_MAXFREE (BT_MAXALLOC * 8) /* Allocator for boundary tags. */ static uma_zone_t vmem_bt_zone; /* boot time arena storage. */ static struct vmem kernel_arena_storage; static struct vmem kmem_arena_storage; static struct vmem buffer_arena_storage; static struct vmem transient_arena_storage; vmem_t *kernel_arena = &kernel_arena_storage; vmem_t *kmem_arena = &kmem_arena_storage; vmem_t *buffer_arena = &buffer_arena_storage; vmem_t *transient_arena = &transient_arena_storage; #ifdef DEBUG_MEMGUARD static struct vmem memguard_arena_storage; vmem_t *memguard_arena = &memguard_arena_storage; #endif /* * Fill the vmem's boundary tag cache. We guarantee that boundary tag * allocation will not fail once bt_fill() passes. To do so we cache * at least the maximum possible tag allocations in the arena. */ static int bt_fill(vmem_t *vm, int flags) { bt_t *bt; VMEM_ASSERT_LOCKED(vm); /* * Only allow the kmem arena to dip into reserve tags. It is the * vmem where new tags come from. */ flags &= BT_FLAGS; if (vm != kmem_arena) flags &= ~M_USE_RESERVE; /* * Loop until we meet the reserve. To minimize the lock shuffle * and prevent simultaneous fills we first try a NOWAIT regardless * of the caller's flags. Specify M_NOVM so we don't recurse while * holding a vmem lock. */ while (vm->vm_nfreetags < BT_MAXALLOC) { bt = uma_zalloc(vmem_bt_zone, (flags & M_USE_RESERVE) | M_NOWAIT | M_NOVM); if (bt == NULL) { VMEM_UNLOCK(vm); bt = uma_zalloc(vmem_bt_zone, flags); VMEM_LOCK(vm); if (bt == NULL && (flags & M_NOWAIT) != 0) break; } LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist); vm->vm_nfreetags++; } if (vm->vm_nfreetags < BT_MAXALLOC) return ENOMEM; return 0; } /* * Pop a tag off of the freetag stack. */ static bt_t * bt_alloc(vmem_t *vm) { bt_t *bt; VMEM_ASSERT_LOCKED(vm); bt = LIST_FIRST(&vm->vm_freetags); MPASS(bt != NULL); LIST_REMOVE(bt, bt_freelist); vm->vm_nfreetags--; return bt; } /* * Trim the per-vmem free list. Returns with the lock released to * avoid allocator recursions. */ static void bt_freetrim(vmem_t *vm, int freelimit) { LIST_HEAD(, vmem_btag) freetags; bt_t *bt; LIST_INIT(&freetags); VMEM_ASSERT_LOCKED(vm); while (vm->vm_nfreetags > freelimit) { bt = LIST_FIRST(&vm->vm_freetags); LIST_REMOVE(bt, bt_freelist); vm->vm_nfreetags--; LIST_INSERT_HEAD(&freetags, bt, bt_freelist); } VMEM_UNLOCK(vm); while ((bt = LIST_FIRST(&freetags)) != NULL) { LIST_REMOVE(bt, bt_freelist); uma_zfree(vmem_bt_zone, bt); } } static inline void bt_free(vmem_t *vm, bt_t *bt) { VMEM_ASSERT_LOCKED(vm); MPASS(LIST_FIRST(&vm->vm_freetags) != bt); LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist); vm->vm_nfreetags++; } /* * freelist[0] ... [1, 1] * freelist[1] ... [2, 2] * : * freelist[29] ... [30, 30] * freelist[30] ... [31, 31] * freelist[31] ... [32, 63] * freelist[33] ... [64, 127] * : * freelist[n] ... [(1 << (n - 26)), (1 << (n - 25)) - 1] * : */ static struct vmem_freelist * bt_freehead_tofree(vmem_t *vm, vmem_size_t size) { const vmem_size_t qsize = size >> vm->vm_quantum_shift; const int idx = SIZE2ORDER(qsize); MPASS(size != 0 && qsize != 0); MPASS((size & vm->vm_quantum_mask) == 0); MPASS(idx >= 0); MPASS(idx < VMEM_MAXORDER); return &vm->vm_freelist[idx]; } /* * bt_freehead_toalloc: return the freelist for the given size and allocation * strategy. 
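 */

/*
 * (A userland model of the SIZE2ORDER() bucketing shown in the freelist
 * table above, using the same OPTORDER/OPTVALUE constants.  Note that the
 * computation places [64, 127] on freelist[32], which is what the general
 * freelist[n] formula gives.)
 */
#if 0	/* illustrative sketch only */
#include <stdio.h>
#include <strings.h>			/* flsl() */

#define	MODEL_OPTORDER	5
#define	MODEL_OPTVALUE	(1 << MODEL_OPTORDER)

/* Same shape as SIZE2ORDER(): one list per size up to 32, log2 beyond. */
static int
model_size2order(long qsize)
{

	return (qsize <= MODEL_OPTVALUE ? (int)(qsize - 1) :
	    flsl(qsize) + (MODEL_OPTVALUE - MODEL_OPTORDER - 2));
}

int
main(void)
{
	long sizes[] = { 1, 31, 32, 33, 63, 64, 127, 128 };
	unsigned i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("qsize %ld -> freelist[%d]\n", sizes[i],
		    model_size2order(sizes[i]));
	/* 1->0, 31->30, 32->31, 33->31, 63->31, 64->32, 127->32, 128->33 */
	return (0);
}
#endif

/*
 * bt_freehead_toalloc, continued: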
* * For M_FIRSTFIT, return the list in which any blocks are large enough * for the requested size. otherwise, return the list which can have blocks * large enough for the requested size. */ static struct vmem_freelist * bt_freehead_toalloc(vmem_t *vm, vmem_size_t size, int strat) { const vmem_size_t qsize = size >> vm->vm_quantum_shift; int idx = SIZE2ORDER(qsize); MPASS(size != 0 && qsize != 0); MPASS((size & vm->vm_quantum_mask) == 0); if (strat == M_FIRSTFIT && ORDER2SIZE(idx) != qsize) { idx++; /* check too large request? */ } MPASS(idx >= 0); MPASS(idx < VMEM_MAXORDER); return &vm->vm_freelist[idx]; } /* ---- boundary tag hash */ static struct vmem_hashlist * bt_hashhead(vmem_t *vm, vmem_addr_t addr) { struct vmem_hashlist *list; unsigned int hash; hash = hash32_buf(&addr, sizeof(addr), 0); list = &vm->vm_hashlist[hash % vm->vm_hashsize]; return list; } static bt_t * bt_lookupbusy(vmem_t *vm, vmem_addr_t addr) { struct vmem_hashlist *list; bt_t *bt; VMEM_ASSERT_LOCKED(vm); list = bt_hashhead(vm, addr); LIST_FOREACH(bt, list, bt_hashlist) { if (bt->bt_start == addr) { break; } } return bt; } static void bt_rembusy(vmem_t *vm, bt_t *bt) { VMEM_ASSERT_LOCKED(vm); MPASS(vm->vm_nbusytag > 0); vm->vm_inuse -= bt->bt_size; vm->vm_nbusytag--; LIST_REMOVE(bt, bt_hashlist); } static void bt_insbusy(vmem_t *vm, bt_t *bt) { struct vmem_hashlist *list; VMEM_ASSERT_LOCKED(vm); MPASS(bt->bt_type == BT_TYPE_BUSY); list = bt_hashhead(vm, bt->bt_start); LIST_INSERT_HEAD(list, bt, bt_hashlist); vm->vm_nbusytag++; vm->vm_inuse += bt->bt_size; } /* ---- boundary tag list */ static void bt_remseg(vmem_t *vm, bt_t *bt) { TAILQ_REMOVE(&vm->vm_seglist, bt, bt_seglist); bt_free(vm, bt); } static void bt_insseg(vmem_t *vm, bt_t *bt, bt_t *prev) { TAILQ_INSERT_AFTER(&vm->vm_seglist, prev, bt, bt_seglist); } static void bt_insseg_tail(vmem_t *vm, bt_t *bt) { TAILQ_INSERT_TAIL(&vm->vm_seglist, bt, bt_seglist); } static void bt_remfree(vmem_t *vm, bt_t *bt) { MPASS(bt->bt_type == BT_TYPE_FREE); LIST_REMOVE(bt, bt_freelist); } static void bt_insfree(vmem_t *vm, bt_t *bt) { struct vmem_freelist *list; list = bt_freehead_tofree(vm, bt->bt_size); LIST_INSERT_HEAD(list, bt, bt_freelist); } /* ---- vmem internal functions */ /* * Import from the arena into the quantum cache in UMA. */ static int qc_import(void *arg, void **store, int cnt, int flags) { qcache_t *qc; vmem_addr_t addr; int i; qc = arg; if ((flags & VMEM_FITMASK) == 0) flags |= M_BESTFIT; for (i = 0; i < cnt; i++) { if (vmem_xalloc(qc->qc_vmem, qc->qc_size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, &addr) != 0) break; store[i] = (void *)addr; /* Only guarantee one allocation. */ flags &= ~M_WAITOK; flags |= M_NOWAIT; } return i; } /* * Release memory from the UMA cache to the arena. 
*/ static void qc_release(void *arg, void **store, int cnt) { qcache_t *qc; int i; qc = arg; for (i = 0; i < cnt; i++) vmem_xfree(qc->qc_vmem, (vmem_addr_t)store[i], qc->qc_size); } static void qc_init(vmem_t *vm, vmem_size_t qcache_max) { qcache_t *qc; vmem_size_t size; int qcache_idx_max; int i; MPASS((qcache_max & vm->vm_quantum_mask) == 0); qcache_idx_max = MIN(qcache_max >> vm->vm_quantum_shift, VMEM_QCACHE_IDX_MAX); vm->vm_qcache_max = qcache_idx_max << vm->vm_quantum_shift; for (i = 0; i < qcache_idx_max; i++) { qc = &vm->vm_qcache[i]; size = (i + 1) << vm->vm_quantum_shift; snprintf(qc->qc_name, sizeof(qc->qc_name), "%s-%zu", vm->vm_name, size); qc->qc_vmem = vm; qc->qc_size = size; qc->qc_cache = uma_zcache_create(qc->qc_name, size, NULL, NULL, NULL, NULL, qc_import, qc_release, qc, UMA_ZONE_VM); MPASS(qc->qc_cache); } } static void qc_destroy(vmem_t *vm) { int qcache_idx_max; int i; qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift; for (i = 0; i < qcache_idx_max; i++) uma_zdestroy(vm->vm_qcache[i].qc_cache); } static void qc_drain(vmem_t *vm) { int qcache_idx_max; int i; qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift; for (i = 0; i < qcache_idx_max; i++) zone_drain(vm->vm_qcache[i].qc_cache); } #ifndef UMA_MD_SMALL_ALLOC static struct mtx_padalign __exclusive_cache_line vmem_bt_lock; /* * vmem_bt_alloc: Allocate a new page of boundary tags. * * On architectures with uma_small_alloc there is no recursion; no address * space need be allocated to allocate boundary tags. For the others, we * must handle recursion. Boundary tags are necessary to allocate new * boundary tags. * * UMA guarantees that enough tags are held in reserve to allocate a new * page of kva. We dip into this reserve by specifying M_USE_RESERVE only * when allocating the page to hold new boundary tags. In this way the * reserve is automatically filled by the allocation that uses the reserve. * * We still have to guarantee that the new tags are allocated atomically since * many threads may try concurrently. The bt_lock provides this guarantee. * We convert WAITOK allocations to NOWAIT and then handle the blocking here * on failure. It's ok to return NULL for a WAITOK allocation as UMA will * loop again after checking to see if we lost the race to allocate. * * There is a small race between vmem_bt_alloc() returning the page and the * zone lock being acquired to add the page to the zone. For WAITOK * allocations we just pause briefly. NOWAIT may experience a transient * failure. To alleviate this we permit a small number of simultaneous * fills to proceed concurrently so NOWAIT is less likely to fail unless * we are really out of KVA. */ static void * vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) { vmem_addr_t addr; *pflag = UMA_SLAB_KMEM; /* * Single thread boundary tag allocation so that the address space * and memory are added in one atomic operation. */ mtx_lock(&vmem_bt_lock); if (vmem_xalloc(kmem_arena, bytes, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_NOWAIT | M_NOVM | M_USE_RESERVE | M_BESTFIT, &addr) == 0) { if (kmem_back(kmem_object, addr, bytes, M_NOWAIT | M_USE_RESERVE) == 0) { mtx_unlock(&vmem_bt_lock); return ((void *)addr); } vmem_xfree(kmem_arena, addr, bytes); mtx_unlock(&vmem_bt_lock); /* * Out of memory, not address space. This may not even be * possible due to M_USE_RESERVE page allocation. */ if (wait & M_WAITOK) VM_WAIT; return (NULL); } mtx_unlock(&vmem_bt_lock); /* * We're either out of address space or lost a fill race. 
*/ if (wait & M_WAITOK) pause("btalloc", 1); return (NULL); } #endif void vmem_startup(void) { mtx_init(&vmem_list_lock, "vmem list lock", NULL, MTX_DEF); vmem_bt_zone = uma_zcreate("vmem btag", sizeof(struct vmem_btag), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM); #ifndef UMA_MD_SMALL_ALLOC mtx_init(&vmem_bt_lock, "btag lock", NULL, MTX_DEF); uma_prealloc(vmem_bt_zone, BT_MAXALLOC); /* * Reserve enough tags to allocate new tags. We allow multiple * CPUs to attempt to allocate new tags concurrently to limit * false restarts in UMA. */ uma_zone_reserve(vmem_bt_zone, BT_MAXALLOC * (mp_ncpus + 1) / 2); uma_zone_set_allocf(vmem_bt_zone, vmem_bt_alloc); #endif } /* ---- rehash */ static int vmem_rehash(vmem_t *vm, vmem_size_t newhashsize) { bt_t *bt; int i; struct vmem_hashlist *newhashlist; struct vmem_hashlist *oldhashlist; vmem_size_t oldhashsize; MPASS(newhashsize > 0); newhashlist = malloc(sizeof(struct vmem_hashlist) * newhashsize, M_VMEM, M_NOWAIT); if (newhashlist == NULL) return ENOMEM; for (i = 0; i < newhashsize; i++) { LIST_INIT(&newhashlist[i]); } VMEM_LOCK(vm); oldhashlist = vm->vm_hashlist; oldhashsize = vm->vm_hashsize; vm->vm_hashlist = newhashlist; vm->vm_hashsize = newhashsize; if (oldhashlist == NULL) { VMEM_UNLOCK(vm); return 0; } for (i = 0; i < oldhashsize; i++) { while ((bt = LIST_FIRST(&oldhashlist[i])) != NULL) { bt_rembusy(vm, bt); bt_insbusy(vm, bt); } } VMEM_UNLOCK(vm); if (oldhashlist != vm->vm_hash0) { free(oldhashlist, M_VMEM); } return 0; } static void vmem_periodic_kick(void *dummy) { taskqueue_enqueue(taskqueue_thread, &vmem_periodic_wk); } static void vmem_periodic(void *unused, int pending) { vmem_t *vm; vmem_size_t desired; vmem_size_t current; mtx_lock(&vmem_list_lock); LIST_FOREACH(vm, &vmem_list, vm_alllist) { #ifdef DIAGNOSTIC /* Convenient time to verify vmem state. */ if (enable_vmem_check == 1) { VMEM_LOCK(vm); vmem_check(vm); VMEM_UNLOCK(vm); } #endif desired = 1 << flsl(vm->vm_nbusytag); desired = MIN(MAX(desired, VMEM_HASHSIZE_MIN), VMEM_HASHSIZE_MAX); current = vm->vm_hashsize; /* Grow in powers of two. Shrink less aggressively. */ if (desired >= current * 2 || desired * 4 <= current) vmem_rehash(vm, desired); /* * Periodically wake up threads waiting for resources, * so they could ask for reclamation again. */ VMEM_CONDVAR_BROADCAST(vm); } mtx_unlock(&vmem_list_lock); callout_reset(&vmem_periodic_ch, vmem_periodic_interval, vmem_periodic_kick, NULL); } static void vmem_start_callout(void *unused) { TASK_INIT(&vmem_periodic_wk, 0, vmem_periodic, NULL); vmem_periodic_interval = hz * 10; callout_init(&vmem_periodic_ch, 1); callout_reset(&vmem_periodic_ch, vmem_periodic_interval, vmem_periodic_kick, NULL); } SYSINIT(vfs, SI_SUB_CONFIGURE, SI_ORDER_ANY, vmem_start_callout, NULL); static void vmem_add1(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int type) { bt_t *btspan; bt_t *btfree; MPASS(type == BT_TYPE_SPAN || type == BT_TYPE_SPAN_STATIC); MPASS((size & vm->vm_quantum_mask) == 0); btspan = bt_alloc(vm); btspan->bt_type = type; btspan->bt_start = addr; btspan->bt_size = size; bt_insseg_tail(vm, btspan); btfree = bt_alloc(vm); btfree->bt_type = BT_TYPE_FREE; btfree->bt_start = addr; btfree->bt_size = size; bt_insseg(vm, btfree, btspan); bt_insfree(vm, btfree); vm->vm_size += size; } static void vmem_destroy1(vmem_t *vm) { bt_t *bt; /* * Drain per-cpu quantum caches. */ qc_destroy(vm); /* * The vmem should now only contain empty segments. 
*/ VMEM_LOCK(vm); MPASS(vm->vm_nbusytag == 0); while ((bt = TAILQ_FIRST(&vm->vm_seglist)) != NULL) bt_remseg(vm, bt); if (vm->vm_hashlist != NULL && vm->vm_hashlist != vm->vm_hash0) free(vm->vm_hashlist, M_VMEM); bt_freetrim(vm, 0); VMEM_CONDVAR_DESTROY(vm); VMEM_LOCK_DESTROY(vm); free(vm, M_VMEM); } static int vmem_import(vmem_t *vm, vmem_size_t size, vmem_size_t align, int flags) { vmem_addr_t addr; int error; if (vm->vm_importfn == NULL) return EINVAL; /* * To make sure we get a span that meets the alignment we double it * and add the size to the tail. This slightly overestimates. */ if (align != vm->vm_quantum_mask + 1) size = (align * 2) + size; size = roundup(size, vm->vm_import_quantum); /* * Hide MAXALLOC tags so we're guaranteed to be able to add this * span and the tag we want to allocate from it. */ MPASS(vm->vm_nfreetags >= BT_MAXALLOC); vm->vm_nfreetags -= BT_MAXALLOC; VMEM_UNLOCK(vm); error = (vm->vm_importfn)(vm->vm_arg, size, flags, &addr); VMEM_LOCK(vm); vm->vm_nfreetags += BT_MAXALLOC; if (error) return ENOMEM; vmem_add1(vm, addr, size, BT_TYPE_SPAN); return 0; } /* * vmem_fit: check if a bt can satisfy the given restrictions. * * it's a caller's responsibility to ensure the region is big enough * before calling us. */ static int vmem_fit(const bt_t *bt, vmem_size_t size, vmem_size_t align, vmem_size_t phase, vmem_size_t nocross, vmem_addr_t minaddr, vmem_addr_t maxaddr, vmem_addr_t *addrp) { vmem_addr_t start; vmem_addr_t end; MPASS(size > 0); MPASS(bt->bt_size >= size); /* caller's responsibility */ /* * XXX assumption: vmem_addr_t and vmem_size_t are * unsigned integer of the same size. */ start = bt->bt_start; if (start < minaddr) { start = minaddr; } end = BT_END(bt); if (end > maxaddr) end = maxaddr; if (start > end) return (ENOMEM); start = VMEM_ALIGNUP(start - phase, align) + phase; if (start < bt->bt_start) start += align; if (VMEM_CROSS_P(start, start + size - 1, nocross)) { MPASS(align < nocross); start = VMEM_ALIGNUP(start - phase, nocross) + phase; } if (start <= end && end - start >= size - 1) { MPASS((start & (align - 1)) == phase); MPASS(!VMEM_CROSS_P(start, start + size - 1, nocross)); MPASS(minaddr <= start); MPASS(maxaddr == 0 || start + size - 1 <= maxaddr); MPASS(bt->bt_start <= start); MPASS(BT_END(bt) - start >= size - 1); *addrp = start; return (0); } return (ENOMEM); } /* * vmem_clip: Trim the boundary tag edges to the requested start and size. 
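 */

/*
 * (A userland model of the alignment arithmetic vmem_fit() uses above;
 * VMEM_ALIGNUP() rounds up and VMEM_CROSS_P() detects boundary crossings,
 * both assuming power-of-two align/boundary values.)
 */
#if 0	/* illustrative sketch only */
#include <stdio.h>

typedef unsigned long maddr_t;

#define	MODEL_ALIGNUP(addr, align)	(-(-(addr) & -(align)))
#define	MODEL_CROSS_P(a1, a2, b)	((((a1) ^ (a2)) & -(maddr_t)(b)) != 0)

int
main(void)
{
	maddr_t start = 0x1234, align = 0x100, phase = 0x10, a;

	/* Align with a phase offset, exactly as vmem_fit() does. */
	a = MODEL_ALIGNUP(start - phase, align) + phase;
	printf("start %#lx -> %#lx (offset within align = %#lx)\n",
	    start, a, a & (align - 1));		/* 0x1234 -> 0x1310, 0x10 */
	printf("[%#lx, %#lx] crosses 4K: %d\n", a, a + 0x200 - 1,
	    (int)MODEL_CROSS_P(a, a + 0x200 - 1, 0x1000));	/* 0 */
	printf("[%#lx, %#lx] crosses 4K: %d\n", a, a + 0x1000 - 1,
	    (int)MODEL_CROSS_P(a, a + 0x1000 - 1, 0x1000));	/* 1 */
	return (0);
}
#endif

/*
 * vmem_clip: Trim the boundary tag edges to the requested start and size.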
*/ static void vmem_clip(vmem_t *vm, bt_t *bt, vmem_addr_t start, vmem_size_t size) { bt_t *btnew; bt_t *btprev; VMEM_ASSERT_LOCKED(vm); MPASS(bt->bt_type == BT_TYPE_FREE); MPASS(bt->bt_size >= size); bt_remfree(vm, bt); if (bt->bt_start != start) { btprev = bt_alloc(vm); btprev->bt_type = BT_TYPE_FREE; btprev->bt_start = bt->bt_start; btprev->bt_size = start - bt->bt_start; bt->bt_start = start; bt->bt_size -= btprev->bt_size; bt_insfree(vm, btprev); bt_insseg(vm, btprev, TAILQ_PREV(bt, vmem_seglist, bt_seglist)); } MPASS(bt->bt_start == start); if (bt->bt_size != size && bt->bt_size - size > vm->vm_quantum_mask) { /* split */ btnew = bt_alloc(vm); btnew->bt_type = BT_TYPE_BUSY; btnew->bt_start = bt->bt_start; btnew->bt_size = size; bt->bt_start = bt->bt_start + size; bt->bt_size -= size; bt_insfree(vm, bt); bt_insseg(vm, btnew, TAILQ_PREV(bt, vmem_seglist, bt_seglist)); bt_insbusy(vm, btnew); bt = btnew; } else { bt->bt_type = BT_TYPE_BUSY; bt_insbusy(vm, bt); } MPASS(bt->bt_size >= size); bt->bt_type = BT_TYPE_BUSY; } /* ---- vmem API */ void vmem_set_import(vmem_t *vm, vmem_import_t *importfn, vmem_release_t *releasefn, void *arg, vmem_size_t import_quantum) { VMEM_LOCK(vm); vm->vm_importfn = importfn; vm->vm_releasefn = releasefn; vm->vm_arg = arg; vm->vm_import_quantum = import_quantum; VMEM_UNLOCK(vm); } void vmem_set_reclaim(vmem_t *vm, vmem_reclaim_t *reclaimfn) { VMEM_LOCK(vm); vm->vm_reclaimfn = reclaimfn; VMEM_UNLOCK(vm); } /* * vmem_init: Initializes vmem arena. */ vmem_t * vmem_init(vmem_t *vm, const char *name, vmem_addr_t base, vmem_size_t size, vmem_size_t quantum, vmem_size_t qcache_max, int flags) { int i; MPASS(quantum > 0); MPASS((quantum & (quantum - 1)) == 0); bzero(vm, sizeof(*vm)); VMEM_CONDVAR_INIT(vm, name); VMEM_LOCK_INIT(vm, name); vm->vm_nfreetags = 0; LIST_INIT(&vm->vm_freetags); strlcpy(vm->vm_name, name, sizeof(vm->vm_name)); vm->vm_quantum_mask = quantum - 1; vm->vm_quantum_shift = flsl(quantum) - 1; vm->vm_nbusytag = 0; vm->vm_size = 0; vm->vm_inuse = 0; qc_init(vm, qcache_max); TAILQ_INIT(&vm->vm_seglist); for (i = 0; i < VMEM_MAXORDER; i++) { LIST_INIT(&vm->vm_freelist[i]); } memset(&vm->vm_hash0, 0, sizeof(vm->vm_hash0)); vm->vm_hashsize = VMEM_HASHSIZE_MIN; vm->vm_hashlist = vm->vm_hash0; if (size != 0) { if (vmem_add(vm, base, size, flags) != 0) { vmem_destroy1(vm); return NULL; } } mtx_lock(&vmem_list_lock); LIST_INSERT_HEAD(&vmem_list, vm, vm_alllist); mtx_unlock(&vmem_list_lock); return vm; } /* * vmem_create: create an arena. */ vmem_t * vmem_create(const char *name, vmem_addr_t base, vmem_size_t size, vmem_size_t quantum, vmem_size_t qcache_max, int flags) { vmem_t *vm; vm = malloc(sizeof(*vm), M_VMEM, flags & (M_WAITOK|M_NOWAIT)); if (vm == NULL) return (NULL); if (vmem_init(vm, name, base, size, quantum, qcache_max, flags) == NULL) return (NULL); return (vm); } void vmem_destroy(vmem_t *vm) { mtx_lock(&vmem_list_lock); LIST_REMOVE(vm, vm_alllist); mtx_unlock(&vmem_list_lock); vmem_destroy1(vm); } vmem_size_t vmem_roundup_size(vmem_t *vm, vmem_size_t size) { return (size + vm->vm_quantum_mask) & ~vm->vm_quantum_mask; } /* * vmem_alloc: allocate resource from the arena. 
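* A minimal usage sketch (illustrative only; "foo_arena", the base address * and the sizes are hypothetical, not part of this file): * *	vmem_t *foo_arena; *	vmem_addr_t addr; * *	foo_arena = vmem_create("foo", 0x1000, 0x100000, PAGE_SIZE, *	    0, M_WAITOK); *	if (vmem_alloc(foo_arena, 2 * PAGE_SIZE, *	    M_BESTFIT | M_WAITOK, &addr) == 0) *		vmem_free(foo_arena, addr, 2 * PAGE_SIZE);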
*/ int vmem_alloc(vmem_t *vm, vmem_size_t size, int flags, vmem_addr_t *addrp) { const int strat __unused = flags & VMEM_FITMASK; qcache_t *qc; flags &= VMEM_FLAGS; MPASS(size > 0); MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT); if ((flags & M_NOWAIT) == 0) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_alloc"); if (size <= vm->vm_qcache_max) { qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift]; *addrp = (vmem_addr_t)uma_zalloc(qc->qc_cache, flags); if (*addrp == 0) return (ENOMEM); return (0); } return vmem_xalloc(vm, size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, addrp); } int vmem_xalloc(vmem_t *vm, const vmem_size_t size0, vmem_size_t align, const vmem_size_t phase, const vmem_size_t nocross, const vmem_addr_t minaddr, const vmem_addr_t maxaddr, int flags, vmem_addr_t *addrp) { const vmem_size_t size = vmem_roundup_size(vm, size0); struct vmem_freelist *list; struct vmem_freelist *first; struct vmem_freelist *end; vmem_size_t avail; bt_t *bt; int error; int strat; flags &= VMEM_FLAGS; strat = flags & VMEM_FITMASK; MPASS(size0 > 0); MPASS(size > 0); MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT); MPASS((flags & (M_NOWAIT|M_WAITOK)) != (M_NOWAIT|M_WAITOK)); if ((flags & M_NOWAIT) == 0) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_xalloc"); MPASS((align & vm->vm_quantum_mask) == 0); MPASS((align & (align - 1)) == 0); MPASS((phase & vm->vm_quantum_mask) == 0); MPASS((nocross & vm->vm_quantum_mask) == 0); MPASS((nocross & (nocross - 1)) == 0); MPASS((align == 0 && phase == 0) || phase < align); MPASS(nocross == 0 || nocross >= size); MPASS(minaddr <= maxaddr); MPASS(!VMEM_CROSS_P(phase, phase + size - 1, nocross)); if (align == 0) align = vm->vm_quantum_mask + 1; *addrp = 0; end = &vm->vm_freelist[VMEM_MAXORDER]; /* * choose a free block from which we allocate. */ first = bt_freehead_toalloc(vm, size, strat); VMEM_LOCK(vm); for (;;) { /* * Make sure we have enough tags to complete the * operation. */ if (vm->vm_nfreetags < BT_MAXALLOC && bt_fill(vm, flags) != 0) { error = ENOMEM; break; } /* * Scan freelists looking for a tag that satisfies the * allocation. If we're doing BESTFIT we may encounter * sizes below the request. If we're doing FIRSTFIT we * inspect only the first element from each list. */ for (list = first; list < end; list++) { LIST_FOREACH(bt, list, bt_freelist) { if (bt->bt_size >= size) { error = vmem_fit(bt, size, align, phase, nocross, minaddr, maxaddr, addrp); if (error == 0) { vmem_clip(vm, bt, *addrp, size); goto out; } } /* FIRST skips to the next list. */ if (strat == M_FIRSTFIT) break; } } /* * Retry if the fast algorithm failed. */ if (strat == M_FIRSTFIT) { strat = M_BESTFIT; first = bt_freehead_toalloc(vm, size, strat); continue; } /* * XXX it is possible to fail to meet restrictions with the * imported region. It is up to the user to specify the * import quantum such that it can satisfy any allocation. */ if (vmem_import(vm, size, align, flags) == 0) continue; /* * Try to free some space from the quantum cache or reclaim * functions if available. */ if (vm->vm_qcache_max != 0 || vm->vm_reclaimfn != NULL) { avail = vm->vm_size - vm->vm_inuse; VMEM_UNLOCK(vm); if (vm->vm_qcache_max != 0) qc_drain(vm); if (vm->vm_reclaimfn != NULL) vm->vm_reclaimfn(vm, flags); VMEM_LOCK(vm); /* If we were successful retry even NOWAIT. 
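* ("Successful" means vm_size - vm_inuse grew while the lock was dropped, * i.e. qc_drain() or the reclaim callback returned space to the arena.)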
*/ if (vm->vm_size - vm->vm_inuse > avail) continue; } if ((flags & M_NOWAIT) != 0) { error = ENOMEM; break; } VMEM_CONDVAR_WAIT(vm); } out: VMEM_UNLOCK(vm); if (error != 0 && (flags & M_NOWAIT) == 0) panic("failed to allocate waiting allocation\n"); return (error); } /* * vmem_free: free the resource to the arena. */ void vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size) { qcache_t *qc; MPASS(size > 0); if (size <= vm->vm_qcache_max) { qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift]; uma_zfree(qc->qc_cache, (void *)addr); } else vmem_xfree(vm, addr, size); } void vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size) { bt_t *bt; bt_t *t; MPASS(size > 0); VMEM_LOCK(vm); bt = bt_lookupbusy(vm, addr); MPASS(bt != NULL); MPASS(bt->bt_start == addr); MPASS(bt->bt_size == vmem_roundup_size(vm, size) || bt->bt_size - vmem_roundup_size(vm, size) <= vm->vm_quantum_mask); MPASS(bt->bt_type == BT_TYPE_BUSY); bt_rembusy(vm, bt); bt->bt_type = BT_TYPE_FREE; /* coalesce */ t = TAILQ_NEXT(bt, bt_seglist); if (t != NULL && t->bt_type == BT_TYPE_FREE) { MPASS(BT_END(bt) < t->bt_start); /* YYY */ bt->bt_size += t->bt_size; bt_remfree(vm, t); bt_remseg(vm, t); } t = TAILQ_PREV(bt, vmem_seglist, bt_seglist); if (t != NULL && t->bt_type == BT_TYPE_FREE) { MPASS(BT_END(t) < bt->bt_start); /* YYY */ bt->bt_size += t->bt_size; bt->bt_start = t->bt_start; bt_remfree(vm, t); bt_remseg(vm, t); } t = TAILQ_PREV(bt, vmem_seglist, bt_seglist); MPASS(t != NULL); MPASS(BT_ISSPAN_P(t) || t->bt_type == BT_TYPE_BUSY); if (vm->vm_releasefn != NULL && t->bt_type == BT_TYPE_SPAN && t->bt_size == bt->bt_size) { vmem_addr_t spanaddr; vmem_size_t spansize; MPASS(t->bt_start == bt->bt_start); spanaddr = bt->bt_start; spansize = bt->bt_size; bt_remseg(vm, bt); bt_remseg(vm, t); vm->vm_size -= spansize; VMEM_CONDVAR_BROADCAST(vm); bt_freetrim(vm, BT_MAXFREE); (*vm->vm_releasefn)(vm->vm_arg, spanaddr, spansize); } else { bt_insfree(vm, bt); VMEM_CONDVAR_BROADCAST(vm); bt_freetrim(vm, BT_MAXFREE); } } /* * vmem_add: add a static span to the arena. */ int vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int flags) { int error; error = 0; flags &= VMEM_FLAGS; VMEM_LOCK(vm); if (vm->vm_nfreetags >= BT_MAXALLOC || bt_fill(vm, flags) == 0) vmem_add1(vm, addr, size, BT_TYPE_SPAN_STATIC); else error = ENOMEM; VMEM_UNLOCK(vm); return (error); } /* * vmem_size: report information about the arena's size. */ vmem_size_t vmem_size(vmem_t *vm, int typemask) { int i; switch (typemask) { case VMEM_ALLOC: return vm->vm_inuse; case VMEM_FREE: return vm->vm_size - vm->vm_inuse; case VMEM_FREE|VMEM_ALLOC: return vm->vm_size; case VMEM_MAXFREE: VMEM_LOCK(vm); for (i = VMEM_MAXORDER - 1; i >= 0; i--) { if (LIST_EMPTY(&vm->vm_freelist[i])) continue; VMEM_UNLOCK(vm); return ((vmem_size_t)ORDER2SIZE(i) << vm->vm_quantum_shift); } VMEM_UNLOCK(vm); return (0); default: panic("vmem_size"); } } /* ---- debug */ #if defined(DDB) || defined(DIAGNOSTIC) static void bt_dump(const bt_t *, int (*)(const char *, ...) __printflike(1, 2)); static const char * bt_type_string(int type) { switch (type) { case BT_TYPE_BUSY: return "busy"; case BT_TYPE_FREE: return "free"; case BT_TYPE_SPAN: return "span"; case BT_TYPE_SPAN_STATIC: return "static span"; default: break; } return "BOGUS"; } static void bt_dump(const bt_t *bt, int (*pr)(const char *, ...)) { (*pr)("\t%p: %jx %jx, %d(%s)\n", bt, (intmax_t)bt->bt_start, (intmax_t)bt->bt_size, bt->bt_type, bt_type_string(bt->bt_type)); } static void vmem_dump(const vmem_t *vm, int (*pr)(const char *, ...)
__printflike(1, 2)) { const bt_t *bt; int i; (*pr)("vmem %p '%s'\n", vm, vm->vm_name); TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { bt_dump(bt, pr); } for (i = 0; i < VMEM_MAXORDER; i++) { const struct vmem_freelist *fl = &vm->vm_freelist[i]; if (LIST_EMPTY(fl)) { continue; } (*pr)("freelist[%d]\n", i); LIST_FOREACH(bt, fl, bt_freelist) { bt_dump(bt, pr); } } } #endif /* defined(DDB) || defined(DIAGNOSTIC) */ #if defined(DDB) #include <ddb/ddb.h> static bt_t * vmem_whatis_lookup(vmem_t *vm, vmem_addr_t addr) { bt_t *bt; TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { if (BT_ISSPAN_P(bt)) { continue; } if (bt->bt_start <= addr && addr <= BT_END(bt)) { return bt; } } return NULL; } void vmem_whatis(vmem_addr_t addr, int (*pr)(const char *, ...)) { vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) { bt_t *bt; bt = vmem_whatis_lookup(vm, addr); if (bt == NULL) { continue; } (*pr)("%p is %p+%zu in VMEM '%s' (%s)\n", (void *)addr, (void *)bt->bt_start, (vmem_size_t)(addr - bt->bt_start), vm->vm_name, (bt->bt_type == BT_TYPE_BUSY) ? "allocated" : "free"); } } void vmem_printall(const char *modif, int (*pr)(const char *, ...)) { const vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) { vmem_dump(vm, pr); } } void vmem_print(vmem_addr_t addr, const char *modif, int (*pr)(const char *, ...)) { const vmem_t *vm = (const void *)addr; vmem_dump(vm, pr); } DB_SHOW_COMMAND(vmemdump, vmemdump) { if (!have_addr) { db_printf("usage: show vmemdump <addr>\n"); return; } vmem_dump((const vmem_t *)addr, db_printf); } DB_SHOW_ALL_COMMAND(vmemdump, vmemdumpall) { const vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) vmem_dump(vm, db_printf); } DB_SHOW_COMMAND(vmem, vmem_summ) { const vmem_t *vm = (const void *)addr; const bt_t *bt; size_t ft[VMEM_MAXORDER], ut[VMEM_MAXORDER]; size_t fs[VMEM_MAXORDER], us[VMEM_MAXORDER]; int ord; if (!have_addr) { db_printf("usage: show vmem <addr>\n"); return; } db_printf("vmem %p '%s'\n", vm, vm->vm_name); db_printf("\tquantum:\t%zu\n", vm->vm_quantum_mask + 1); db_printf("\tsize:\t%zu\n", vm->vm_size); db_printf("\tinuse:\t%zu\n", vm->vm_inuse); db_printf("\tfree:\t%zu\n", vm->vm_size - vm->vm_inuse); db_printf("\tbusy tags:\t%d\n", vm->vm_nbusytag); db_printf("\tfree tags:\t%d\n", vm->vm_nfreetags); memset(&ft, 0, sizeof(ft)); memset(&ut, 0, sizeof(ut)); memset(&fs, 0, sizeof(fs)); memset(&us, 0, sizeof(us)); TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { ord = SIZE2ORDER(bt->bt_size >> vm->vm_quantum_shift); if (bt->bt_type == BT_TYPE_BUSY) { ut[ord]++; us[ord] += bt->bt_size; } else if (bt->bt_type == BT_TYPE_FREE) { ft[ord]++; fs[ord] += bt->bt_size; } } db_printf("\t\t\tinuse\tsize\t\tfree\tsize\n"); for (ord = 0; ord < VMEM_MAXORDER; ord++) { if (ut[ord] == 0 && ft[ord] == 0) continue; db_printf("\t%-15zu %zu\t%-15zu %zu\t%-16zu\n", ORDER2SIZE(ord) << vm->vm_quantum_shift, ut[ord], us[ord], ft[ord], fs[ord]); } } DB_SHOW_ALL_COMMAND(vmem, vmem_summall) { const vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) vmem_summ((db_expr_t)vm, TRUE, count, modif); } #endif /* defined(DDB) */ #define vmem_printf printf #if defined(DIAGNOSTIC) static bool vmem_check_sanity(vmem_t *vm) { const bt_t *bt, *bt2; MPASS(vm != NULL); TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { if (bt->bt_start > BT_END(bt)) { printf("corrupted tag\n"); bt_dump(bt, vmem_printf); return false; } } TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { TAILQ_FOREACH(bt2, &vm->vm_seglist, bt_seglist) { if (bt == bt2) { continue; } if (BT_ISSPAN_P(bt) != BT_ISSPAN_P(bt2)) { continue; } if (bt->bt_start <=
BT_END(bt2) && bt2->bt_start <= BT_END(bt)) { printf("overwrapped tags\n"); bt_dump(bt, vmem_printf); bt_dump(bt2, vmem_printf); return false; } } } return true; } static void vmem_check(vmem_t *vm) { if (!vmem_check_sanity(vm)) { panic("insanity vmem %p", vm); } } #endif /* defined(DIAGNOSTIC) */ Index: head/sys/kern/subr_witness.c =================================================================== --- head/sys/kern/subr_witness.c (revision 326270) +++ head/sys/kern/subr_witness.c (revision 326271) @@ -1,3056 +1,3058 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2008 Isilon Systems, Inc. * Copyright (c) 2008 Ilya Maykov * Copyright (c) 1998 Berkeley Software Design, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Implementation of the `witness' lock verifier. Originally implemented for * mutexes in BSD/OS. Extended to handle generic lock objects and lock * classes in FreeBSD. */ /* * Main Entry: witness * Pronunciation: 'wit-n&s * Function: noun * Etymology: Middle English witnesse, from Old English witnes knowledge, * testimony, witness, from 2wit * Date: before 12th century * 1 : attestation of a fact or event : TESTIMONY * 2 : one that gives evidence; specifically : one who testifies in * a cause or before a judicial tribunal * 3 : one asked to be present at a transaction so as to be able to * testify to its having taken place * 4 : one who has personal knowledge of something * 5 a : something serving as evidence or proof : SIGN * b : public affirmation by word or example of usually * religious faith or conviction * 6 capitalized : a member of the Jehovah's Witnesses */ /* * Special rules concerning Giant and lock orders: * * 1) Giant must be acquired before any other mutexes. Stated another way, * no other mutex may be held when Giant is acquired. * * 2) Giant must be released when blocking on a sleepable lock. * * This rule is less obvious, but is a result of Giant providing the same * semantics as spl(). 
Basically, when a thread sleeps, it must release * Giant. When a thread blocks on a sleepable lock, it sleeps. Hence rule * 2). * * 3) Giant may be acquired before or after sleepable locks. * * This rule is also not quite as obvious. Giant may be acquired after * a sleepable lock because it is a non-sleepable lock and non-sleepable * locks may always be acquired while holding a sleepable lock. The second * case, Giant before a sleepable lock, follows from rule 2) above. Suppose * you have two threads T1 and T2 and a sleepable lock X. Suppose that T1 * acquires X and blocks on Giant. Then suppose that T2 acquires Giant and * blocks on X. When T2 blocks on X, T2 will release Giant allowing T1 to * execute. Thus, acquiring Giant both before and after a sleepable lock * will not result in a lock order reversal. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_stack.h" #include "opt_witness.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #if !defined(DDB) && !defined(STACK) #error "DDB or STACK options are required for WITNESS" #endif /* Note that these traces do not work with KTR_ALQ. */ #if 0 #define KTR_WITNESS KTR_SUBSYS #else #define KTR_WITNESS 0 #endif #define LI_RECURSEMASK 0x0000ffff /* Recursion depth of lock instance. */ #define LI_EXCLUSIVE 0x00010000 /* Exclusive lock instance. */ #define LI_NORELEASE 0x00020000 /* Lock not allowed to be released. */ /* Define this to check for blessed mutexes */ #undef BLESSING #ifndef WITNESS_COUNT #define WITNESS_COUNT 1536 #endif #define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */ #define WITNESS_PENDLIST (2048 + MAXCPU) /* Allocate 256 KB of stack data space */ #define WITNESS_LO_DATA_COUNT 2048 /* Prime, gives load factor of ~2 at full load */ #define WITNESS_LO_HASH_SIZE 1021 /* * XXX: This is somewhat bogus, as we assume here that at most 2048 threads * will hold LOCK_NCHILDREN locks. We handle failure ok, and we should * probably be safe for the most part, but it's still a SWAG. */ #define LOCK_NCHILDREN 5 #define LOCK_CHILDCOUNT 2048 #define MAX_W_NAME 64 #define FULLGRAPH_SBUF_SIZE 512 /* * These flags go in the witness relationship matrix and describe the * relationship between any two struct witness objects. */ #define WITNESS_UNRELATED 0x00 /* No lock order relation. */ #define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */ #define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */ #define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */ #define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */ #define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR) #define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT) #define WITNESS_RELATED_MASK \ (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK) #define WITNESS_REVERSAL 0x10 /* A lock order reversal has been * observed. */ #define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */ #define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */ #define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. 
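* (i.e. an entry for this (from, to) pair has been recorded in w_rmatrix * and a stack trace stored in the lock order hash).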
*/ /* Descendant to ancestor flags */ #define WITNESS_DTOA(x) (((x) & WITNESS_RELATED_MASK) >> 2) /* Ancestor to descendant flags */ #define WITNESS_ATOD(x) (((x) & WITNESS_RELATED_MASK) << 2) #define WITNESS_INDEX_ASSERT(i) \ MPASS((i) > 0 && (i) <= w_max_used_index && (i) < witness_count) static MALLOC_DEFINE(M_WITNESS, "Witness", "Witness"); /* * Lock instances. A lock instance is the data associated with a lock while * it is held by witness. For example, a lock instance will hold the * recursion count of a lock. Lock instances are held in lists. Spin locks * are held in a per-cpu list while sleep locks are held in per-thread list. */ struct lock_instance { struct lock_object *li_lock; const char *li_file; int li_line; u_int li_flags; }; /* * A simple list type used to build the list of locks held by a thread * or CPU. We can't simply embed the list in struct lock_object since a * lock may be held by more than one thread if it is a shared lock. Locks * are added to the head of the list, so we fill up each list entry from * "the back" logically. To ease some of the arithmetic, we actually fill * in each list entry the normal way (children[0] then children[1], etc.) but * when we traverse the list we read children[count-1] as the first entry * down to children[0] as the final entry. */ struct lock_list_entry { struct lock_list_entry *ll_next; struct lock_instance ll_children[LOCK_NCHILDREN]; u_int ll_count; }; /* * The main witness structure. One of these per named lock type in the system * (for example, "vnode interlock"). */ struct witness { char w_name[MAX_W_NAME]; uint32_t w_index; /* Index in the relationship matrix */ struct lock_class *w_class; STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */ STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */ struct witness *w_hash_next; /* Linked list in hash buckets. */ const char *w_file; /* File where last acquired */ uint32_t w_line; /* Line where last acquired */ uint32_t w_refcount; uint16_t w_num_ancestors; /* direct/indirect * ancestor count */ uint16_t w_num_descendants; /* direct/indirect * descendant count */ int16_t w_ddb_level; unsigned w_displayed:1; unsigned w_reversed:1; }; STAILQ_HEAD(witness_list, witness); /* * The witness hash table. Keys are witness names (const char *), elements are * witness objects (struct witness *). */ struct witness_hash { struct witness *wh_array[WITNESS_HASH_SIZE]; uint32_t wh_size; uint32_t wh_count; }; /* * Key type for the lock order data hash table. */ struct witness_lock_order_key { uint16_t from; uint16_t to; }; struct witness_lock_order_data { struct stack wlod_stack; struct witness_lock_order_key wlod_key; struct witness_lock_order_data *wlod_next; }; /* * The witness lock order data hash table. Keys are witness index tuples * (struct witness_lock_order_key), elements are lock order data objects * (struct witness_lock_order_data). */ struct witness_lock_order_hash { struct witness_lock_order_data *wloh_array[WITNESS_LO_HASH_SIZE]; u_int wloh_size; u_int wloh_count; }; #ifdef BLESSING struct witness_blessed { const char *b_lock1; const char *b_lock2; }; #endif struct witness_pendhelp { const char *wh_type; struct lock_object *wh_lock; }; struct witness_order_list_entry { const char *w_name; struct lock_class *w_class; }; /* * Returns 0 if one of the locks is a spin lock and the other is not. * Returns 1 otherwise. 
*/ static __inline int witness_lock_type_equal(struct witness *w1, struct witness *w2) { return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) == (w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK))); } static __inline int witness_lock_order_key_equal(const struct witness_lock_order_key *a, const struct witness_lock_order_key *b) { return (a->from == b->from && a->to == b->to); } static int _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname); static void adopt(struct witness *parent, struct witness *child); #ifdef BLESSING static int blessed(struct witness *, struct witness *); #endif static void depart(struct witness *w); static struct witness *enroll(const char *description, struct lock_class *lock_class); static struct lock_instance *find_instance(struct lock_list_entry *list, const struct lock_object *lock); static int isitmychild(struct witness *parent, struct witness *child); static int isitmydescendant(struct witness *parent, struct witness *child); static void itismychild(struct witness *parent, struct witness *child); static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS); static void witness_add_fullgraph(struct sbuf *sb, struct witness *parent); #ifdef DDB static void witness_ddb_compute_levels(void); static void witness_ddb_display(int(*)(const char *fmt, ...)); static void witness_ddb_display_descendants(int(*)(const char *fmt, ...), struct witness *, int indent); static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...), struct witness_list *list); static void witness_ddb_level_descendants(struct witness *parent, int l); static void witness_ddb_list(struct thread *td); #endif static void witness_debugger(int cond, const char *msg); static void witness_free(struct witness *m); static struct witness *witness_get(void); static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size); static struct witness *witness_hash_get(const char *key); static void witness_hash_put(struct witness *w); static void witness_init_hash_tables(void); static void witness_increment_graph_generation(void); static void witness_lock_list_free(struct lock_list_entry *lle); static struct lock_list_entry *witness_lock_list_get(void); static int witness_lock_order_add(struct witness *parent, struct witness *child); static int witness_lock_order_check(struct witness *parent, struct witness *child); static struct witness_lock_order_data *witness_lock_order_get( struct witness *parent, struct witness *child); static void witness_list_lock(struct lock_instance *instance, int (*prnt)(const char *fmt, ...)); static int witness_output(const char *fmt, ...) __printflike(1, 2); static int witness_voutput(const char *fmt, va_list ap) __printflike(1, 0); static void witness_setflag(struct lock_object *lock, int flag, int set); static SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW, NULL, "Witness Locking"); /* * If set to 0, lock order checking is disabled. If set to -1, * witness is completely disabled. Otherwise witness performs full * lock order checking for all locks. At runtime, lock order checking * may be toggled. However, witness cannot be reenabled once it is * completely disabled. 
*/ static int witness_watch = 1; SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RWTUN | CTLTYPE_INT, NULL, 0, sysctl_debug_witness_watch, "I", "witness is watching lock operations"); #ifdef KDB /* * When KDB is enabled and witness_kdb is 1, it will cause the system * to drop into kdebug() when: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ #ifdef WITNESS_KDB int witness_kdb = 1; #else int witness_kdb = 0; #endif SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RWTUN, &witness_kdb, 0, ""); #endif /* KDB */ #if defined(DDB) || defined(KDB) /* * When DDB or KDB is enabled and witness_trace is 1, it will cause the system * to print a stack trace: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ int witness_trace = 1; SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RWTUN, &witness_trace, 0, ""); #endif /* DDB || KDB */ #ifdef WITNESS_SKIPSPIN int witness_skipspin = 1; #else int witness_skipspin = 0; #endif SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin, 0, ""); int badstack_sbuf_size; int witness_count = WITNESS_COUNT; SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN, &witness_count, 0, ""); /* * Output channel for witness messages. By default we print to the console. */ enum witness_channel { WITNESS_CONSOLE, WITNESS_LOG, WITNESS_NONE, }; static enum witness_channel witness_channel = WITNESS_CONSOLE; SYSCTL_PROC(_debug_witness, OID_AUTO, output_channel, CTLTYPE_STRING | CTLFLAG_RWTUN, NULL, 0, sysctl_debug_witness_channel, "A", "Output channel for warnings"); /* * Call this to print out the relations between locks. */ SYSCTL_PROC(_debug_witness, OID_AUTO, fullgraph, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_debug_witness_fullgraph, "A", "Show locks relation graphs"); /* * Call this to print out the witness faulty stacks. */ SYSCTL_PROC(_debug_witness, OID_AUTO, badstacks, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_debug_witness_badstacks, "A", "Show bad witness stacks"); static struct mtx w_mtx; /* w_list */ static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free); static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all); /* w_typelist */ static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin); static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep); /* lock list */ static struct lock_list_entry *w_lock_list_free = NULL; static struct witness_pendhelp pending_locks[WITNESS_PENDLIST]; static u_int pending_cnt; static int w_free_cnt, w_spin_cnt, w_sleep_cnt; SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0, ""); static struct witness *w_data; static uint8_t **w_rmatrix; static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT]; static struct witness_hash w_hash; /* The witness hash table. 
*/ /* The lock order data hash */ static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT]; static struct witness_lock_order_data *w_lofree = NULL; static struct witness_lock_order_hash w_lohash; static int w_max_used_index = 0; static unsigned int w_generation = 0; static const char w_notrunning[] = "Witness not running\n"; static const char w_stillcold[] = "Witness is still cold\n"; static struct witness_order_list_entry order_lists[] = { /* * sx locks */ { "proctree", &lock_class_sx }, { "allproc", &lock_class_sx }, { "allprison", &lock_class_sx }, { NULL, NULL }, /* * Various mutexes */ { "Giant", &lock_class_mtx_sleep }, { "pipe mutex", &lock_class_mtx_sleep }, { "sigio lock", &lock_class_mtx_sleep }, { "process group", &lock_class_mtx_sleep }, { "process lock", &lock_class_mtx_sleep }, { "session", &lock_class_mtx_sleep }, { "uidinfo hash", &lock_class_rw }, #ifdef HWPMC_HOOKS { "pmc-sleep", &lock_class_mtx_sleep }, #endif { "time lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * umtx */ { "umtx lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Sockets */ { "accept", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { "so_rcv", &lock_class_mtx_sleep }, { "sellck", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Routing */ { "so_rcv", &lock_class_mtx_sleep }, { "radix node head", &lock_class_rw }, { "rtentry", &lock_class_mtx_sleep }, { "ifaddr", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IPv4 multicast: * protocol locks before interface locks, after UDP locks. */ { "udpinp", &lock_class_rw }, { "in_multi_mtx", &lock_class_mtx_sleep }, { "igmp_mtx", &lock_class_mtx_sleep }, { "if_addr_lock", &lock_class_rw }, { NULL, NULL }, /* * IPv6 multicast: * protocol locks before interface locks, after UDP locks. */ { "udpinp", &lock_class_rw }, { "in6_multi_mtx", &lock_class_mtx_sleep }, { "mld_mtx", &lock_class_mtx_sleep }, { "if_addr_lock", &lock_class_rw }, { NULL, NULL }, /* * UNIX Domain Sockets */ { "unp_link_rwlock", &lock_class_rw }, { "unp_list_lock", &lock_class_mtx_sleep }, { "unp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * UDP/IP */ { "udp", &lock_class_rw }, { "udpinp", &lock_class_rw }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * TCP/IP */ { "tcp", &lock_class_rw }, { "tcpinp", &lock_class_rw }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * BPF */ { "bpf global lock", &lock_class_mtx_sleep }, { "bpf interface lock", &lock_class_rw }, { "bpf cdev lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * NFS server */ { "nfsd_mtx", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IEEE 802.11 */ { "802.11 com lock", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Network drivers */ { "network driver", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Netgraph */ { "ng_node", &lock_class_mtx_sleep }, { "ng_worklist", &lock_class_mtx_sleep }, { NULL, NULL }, /* * CDEV */ { "vm map (system)", &lock_class_mtx_sleep }, { "vm pagequeue", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { "cdev", &lock_class_mtx_sleep }, { NULL, NULL }, /* * VM */ { "vm map (user)", &lock_class_sx }, { "vm object", &lock_class_rw }, { "vm page", &lock_class_mtx_sleep }, { "vm pagequeue", &lock_class_mtx_sleep }, { "pmap pv global", &lock_class_rw }, { "pmap", &lock_class_mtx_sleep }, { "pmap pv list", &lock_class_rw }, { "vm page free queue", &lock_class_mtx_sleep }, { NULL, NULL }, /* * kqueue/VFS interaction */ { "kqueue", &lock_class_mtx_sleep }, { "struct mount 
mtx", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * VFS namecache */ { "ncvn", &lock_class_mtx_sleep }, { "ncbuc", &lock_class_rw }, { "vnode interlock", &lock_class_mtx_sleep }, { "ncneg", &lock_class_mtx_sleep }, { NULL, NULL }, /* * ZFS locking */ { "dn->dn_mtx", &lock_class_sx }, { "dr->dt.di.dr_mtx", &lock_class_sx }, { "db->db_mtx", &lock_class_sx }, { NULL, NULL }, /* * spin locks */ #ifdef SMP { "ap boot", &lock_class_mtx_spin }, #endif { "rm.mutex_mtx", &lock_class_mtx_spin }, { "sio", &lock_class_mtx_spin }, #ifdef __i386__ { "cy", &lock_class_mtx_spin }, #endif #ifdef __sparc64__ { "pcib_mtx", &lock_class_mtx_spin }, { "rtc_mtx", &lock_class_mtx_spin }, #endif { "scc_hwmtx", &lock_class_mtx_spin }, { "uart_hwmtx", &lock_class_mtx_spin }, { "fast_taskqueue", &lock_class_mtx_spin }, { "intr table", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-per-proc", &lock_class_mtx_spin }, #endif { "process slock", &lock_class_mtx_spin }, { "syscons video lock", &lock_class_mtx_spin }, { "sleepq chain", &lock_class_mtx_spin }, { "rm_spinlock", &lock_class_mtx_spin }, { "turnstile chain", &lock_class_mtx_spin }, { "turnstile lock", &lock_class_mtx_spin }, { "sched lock", &lock_class_mtx_spin }, { "td_contested", &lock_class_mtx_spin }, { "callout", &lock_class_mtx_spin }, { "entropy harvest mutex", &lock_class_mtx_spin }, #ifdef SMP { "smp rendezvous", &lock_class_mtx_spin }, #endif #ifdef __powerpc__ { "tlb0", &lock_class_mtx_spin }, #endif /* * leaf locks */ { "intrcnt", &lock_class_mtx_spin }, { "icu", &lock_class_mtx_spin }, #if defined(SMP) && defined(__sparc64__) { "ipi", &lock_class_mtx_spin }, #endif #ifdef __i386__ { "allpmaps", &lock_class_mtx_spin }, { "descriptor tables", &lock_class_mtx_spin }, #endif { "clk", &lock_class_mtx_spin }, { "cpuset", &lock_class_mtx_spin }, { "mprof lock", &lock_class_mtx_spin }, { "zombie lock", &lock_class_mtx_spin }, { "ALD Queue", &lock_class_mtx_spin }, #if defined(__i386__) || defined(__amd64__) { "pcicfg", &lock_class_mtx_spin }, { "NDIS thread lock", &lock_class_mtx_spin }, #endif { "tw_osl_io_lock", &lock_class_mtx_spin }, { "tw_osl_q_lock", &lock_class_mtx_spin }, { "tw_cl_io_lock", &lock_class_mtx_spin }, { "tw_cl_intr_lock", &lock_class_mtx_spin }, { "tw_cl_gen_lock", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-leaf", &lock_class_mtx_spin }, #endif { "blocked lock", &lock_class_mtx_spin }, { NULL, NULL }, { NULL, NULL } }; #ifdef BLESSING /* * Pairs of locks which have been blessed * Don't complain about order problems with blessed locks */ static struct witness_blessed blessed_list[] = { }; #endif /* * This global is set to 0 once it becomes safe to use the witness code. */ static int witness_cold = 1; /* * This global is set to 1 once the static lock orders have been enrolled * so that a warning can be issued for any spin locks enrolled later. */ static int witness_spin_warn = 0; /* Trim useless garbage from filenames. */ static const char * fixup_filename(const char *file) { if (file == NULL) return (NULL); while (strncmp(file, "../", 3) == 0) file += 3; return (file); } /* * The WITNESS-enabled diagnostic code. Note that the witness code does * assume that the early boot is single-threaded at least until after this * routine is completed. 
*/ static void witness_initialize(void *dummy __unused) { struct lock_object *lock; struct witness_order_list_entry *order; struct witness *w, *w1; int i; w_data = malloc(sizeof (struct witness) * witness_count, M_WITNESS, M_WAITOK | M_ZERO); w_rmatrix = malloc(sizeof(*w_rmatrix) * (witness_count + 1), M_WITNESS, M_WAITOK | M_ZERO); for (i = 0; i < witness_count + 1; i++) { w_rmatrix[i] = malloc(sizeof(*w_rmatrix[i]) * (witness_count + 1), M_WITNESS, M_WAITOK | M_ZERO); } badstack_sbuf_size = witness_count * 256; /* * We have to release Giant before initializing its witness * structure so that WITNESS doesn't get confused. */ mtx_unlock(&Giant); mtx_assert(&Giant, MA_NOTOWNED); CTR1(KTR_WITNESS, "%s: initializing witness", __func__); mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET | MTX_NOWITNESS | MTX_NOPROFILE); for (i = witness_count - 1; i >= 0; i--) { w = &w_data[i]; memset(w, 0, sizeof(*w)); w_data[i].w_index = i; /* Witness index never changes. */ witness_free(w); } KASSERT(STAILQ_FIRST(&w_free)->w_index == 0, ("%s: Invalid list of free witness objects", __func__)); /* Witness with index 0 is not used to aid in debugging. */ STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; for (i = 0; i < witness_count; i++) { memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) * (witness_count + 1)); } for (i = 0; i < LOCK_CHILDCOUNT; i++) witness_lock_list_free(&w_locklistdata[i]); witness_init_hash_tables(); /* First add in all the specified order lists. */ for (order = order_lists; order->w_name != NULL; order++) { w = enroll(order->w_name, order->w_class); if (w == NULL) continue; w->w_file = "order list"; for (order++; order->w_name != NULL; order++) { w1 = enroll(order->w_name, order->w_class); if (w1 == NULL) continue; w1->w_file = "order list"; itismychild(w, w1); w = w1; } } witness_spin_warn = 1; /* Iterate through all locks and add them to witness. */ for (i = 0; pending_locks[i].wh_lock != NULL; i++) { lock = pending_locks[i].wh_lock; KASSERT(lock->lo_flags & LO_WITNESS, ("%s: lock %s is on pending list but not LO_WITNESS", __func__, lock->lo_name)); lock->lo_witness = enroll(pending_locks[i].wh_type, LOCK_CLASS(lock)); } /* Mark the witness code as being ready for use. */ witness_cold = 0; mtx_lock(&Giant); } SYSINIT(witness_init, SI_SUB_WITNESS, SI_ORDER_FIRST, witness_initialize, NULL); void witness_init(struct lock_object *lock, const char *type) { struct lock_class *class; /* Various sanity checks. */ class = LOCK_CLASS(lock); if ((lock->lo_flags & LO_RECURSABLE) != 0 && (class->lc_flags & LC_RECURSABLE) == 0) kassert_panic("%s: lock (%s) %s can not be recursable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (class->lc_flags & LC_SLEEPABLE) == 0) kassert_panic("%s: lock (%s) %s can not be sleepable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_UPGRADABLE) != 0 && (class->lc_flags & LC_UPGRADABLE) == 0) kassert_panic("%s: lock (%s) %s can not be upgradable", __func__, class->lc_name, lock->lo_name); /* * If we shouldn't watch this lock, then just clear lo_witness. * Otherwise, if witness_cold is set, then it is too early to * enroll this lock, so defer it to witness_initialize() by adding * it to the pending_locks list. If it is not too early, then enroll * the lock now. 
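* For example (hypothetical driver code), a mutex created before * SI_SUB_WITNESS via mtx_init(&sc->sc_mtx, "examplemtx", NULL, MTX_DEF) * arrives here with witness_cold still set and is parked on pending_locks * until witness_initialize() enrolls it.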
*/ if (witness_watch < 1 || panicstr != NULL || (lock->lo_flags & LO_WITNESS) == 0) lock->lo_witness = NULL; else if (witness_cold) { pending_locks[pending_cnt].wh_lock = lock; pending_locks[pending_cnt++].wh_type = type; if (pending_cnt > WITNESS_PENDLIST) panic("%s: pending locks list is too small, " "increase WITNESS_PENDLIST\n", __func__); } else lock->lo_witness = enroll(type, class); } void witness_destroy(struct lock_object *lock) { struct lock_class *class; struct witness *w; class = LOCK_CLASS(lock); if (witness_cold) panic("lock (%s) %s destroyed while witness_cold", class->lc_name, lock->lo_name); /* XXX: need to verify that no one holds the lock */ if ((lock->lo_flags & LO_WITNESS) == 0 || lock->lo_witness == NULL) return; w = lock->lo_witness; mtx_lock_spin(&w_mtx); MPASS(w->w_refcount > 0); w->w_refcount--; if (w->w_refcount == 0) depart(w); mtx_unlock_spin(&w_mtx); } #ifdef DDB static void witness_ddb_compute_levels(void) { struct witness *w; /* * First clear all levels. */ STAILQ_FOREACH(w, &w_all, w_list) w->w_ddb_level = -1; /* * Look for locks with no parents and level all their descendants. */ STAILQ_FOREACH(w, &w_all, w_list) { /* If the witness has ancestors (is not a root), skip it. */ if (w->w_num_ancestors > 0) continue; witness_ddb_level_descendants(w, 0); } } static void witness_ddb_level_descendants(struct witness *w, int l) { int i; if (w->w_ddb_level >= l) return; w->w_ddb_level = l; l++; for (i = 1; i <= w_max_used_index; i++) { if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) witness_ddb_level_descendants(&w_data[i], l); } } static void witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...), struct witness *w, int indent) { int i; for (i = 0; i < indent; i++) prnt(" "); prnt("%s (type: %s, depth: %d, active refs: %d)", w->w_name, w->w_class->lc_name, w->w_ddb_level, w->w_refcount); if (w->w_displayed) { prnt(" -- (already displayed)\n"); return; } w->w_displayed = 1; if (w->w_file != NULL && w->w_line != 0) prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file), w->w_line); else prnt(" -- never acquired\n"); indent++; WITNESS_INDEX_ASSERT(w->w_index); for (i = 1; i <= w_max_used_index; i++) { if (db_pager_quit) return; if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) witness_ddb_display_descendants(prnt, &w_data[i], indent); } } static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...), struct witness_list *list) { struct witness *w; STAILQ_FOREACH(w, list, w_typelist) { if (w->w_file == NULL || w->w_ddb_level > 0) continue; /* This lock has no ancestors - display its descendants. */ witness_ddb_display_descendants(prnt, w, 0); if (db_pager_quit) return; } } static void witness_ddb_display(int(*prnt)(const char *fmt, ...)) { struct witness *w; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); witness_ddb_compute_levels(); /* Clear all the displayed flags. */ STAILQ_FOREACH(w, &w_all, w_list) w->w_displayed = 0; /* * First, handle sleep locks which have been acquired at least * once. */ prnt("Sleep locks:\n"); witness_ddb_display_list(prnt, &w_sleep); if (db_pager_quit) return; /* * Now do spin locks which have been acquired at least once. */ prnt("\nSpin locks:\n"); witness_ddb_display_list(prnt, &w_spin); if (db_pager_quit) return; /* * Finally, any locks which have not been acquired yet.
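* (a live witness that was never acquired has w_file == NULL and a nonzero * w_refcount; stale entries with w_refcount == 0 are skipped).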
*/ prnt("\nLocks which were never acquired:\n"); STAILQ_FOREACH(w, &w_all, w_list) { if (w->w_file != NULL || w->w_refcount == 0) continue; prnt("%s (type: %s, depth: %d)\n", w->w_name, w->w_class->lc_name, w->w_ddb_level); if (db_pager_quit) return; } } #endif /* DDB */ int witness_defineorder(struct lock_object *lock1, struct lock_object *lock2) { if (witness_watch == -1 || panicstr != NULL) return (0); /* Require locks that witness knows about. */ if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL || lock2->lo_witness == NULL) return (EINVAL); mtx_assert(&w_mtx, MA_NOTOWNED); mtx_lock_spin(&w_mtx); /* * If we already have either an explicit or implied lock order that * is the other way around, then return an error. */ if (witness_watch && isitmydescendant(lock2->lo_witness, lock1->lo_witness)) { mtx_unlock_spin(&w_mtx); return (EDOOFUS); } /* Try to add the new order. */ CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, lock2->lo_witness->w_name, lock1->lo_witness->w_name); itismychild(lock1->lo_witness, lock2->lo_witness); mtx_unlock_spin(&w_mtx); return (0); } void witness_checkorder(struct lock_object *lock, int flags, const char *file, int line, struct lock_object *interlock) { struct lock_list_entry *lock_list, *lle; struct lock_instance *lock1, *lock2, *plock; struct lock_class *class, *iclass; struct witness *w, *w1; struct thread *td; int i, j; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL || panicstr != NULL) return; w = lock->lo_witness; class = LOCK_CLASS(lock); td = curthread; if (class->lc_flags & LC_SLEEPLOCK) { /* * Since spin locks include a critical section, this check * implicitly enforces a lock order of all sleep locks before * all spin locks. */ if (td->td_critnest != 0 && !kdb_active) kassert_panic("acquiring blockable sleep lock with " "spinlock or critical section held (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); /* * If this is the first lock acquired then just return as * no order checking is needed. */ lock_list = td->td_sleeplocks; if (lock_list == NULL || lock_list->ll_count == 0) return; } else { /* * If this is the first lock, just return as no order * checking is needed. Avoid problems with thread * migration pinning the thread while checking if * spinlocks are held. If at least one spinlock is held * the thread is in a safe path and it is allowed to * unpin it. */ sched_pin(); lock_list = PCPU_GET(spinlocks); if (lock_list == NULL || lock_list->ll_count == 0) { sched_unpin(); return; } sched_unpin(); } /* * Check to see if we are recursing on a lock we already own. If * so, make sure that we don't mismatch exclusive and shared lock * acquires. */ lock1 = find_instance(lock_list, lock); if (lock1 != NULL) { if ((lock1->li_flags & LI_EXCLUSIVE) != 0 && (flags & LOP_EXCLUSIVE) == 0) { witness_output("shared lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while exclusively locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); kassert_panic("excl->share"); } if ((lock1->li_flags & LI_EXCLUSIVE) == 0 && (flags & LOP_EXCLUSIVE) != 0) { witness_output("exclusive lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while share locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); kassert_panic("share->excl"); } return; } /* Warn if the interlock is not locked exactly once. 
*/ if (interlock != NULL) { iclass = LOCK_CLASS(interlock); lock1 = find_instance(lock_list, interlock); if (lock1 == NULL) kassert_panic("interlock (%s) %s not locked @ %s:%d", iclass->lc_name, interlock->lo_name, fixup_filename(file), line); else if ((lock1->li_flags & LI_RECURSEMASK) != 0) kassert_panic("interlock (%s) %s recursed @ %s:%d", iclass->lc_name, interlock->lo_name, fixup_filename(file), line); } /* * Find the previously acquired lock, but ignore interlocks. */ plock = &lock_list->ll_children[lock_list->ll_count - 1]; if (interlock != NULL && plock->li_lock == interlock) { if (lock_list->ll_count > 1) plock = &lock_list->ll_children[lock_list->ll_count - 2]; else { lle = lock_list->ll_next; /* * The interlock is the only lock we hold, so * simply return. */ if (lle == NULL) return; plock = &lle->ll_children[lle->ll_count - 1]; } } /* * Try to perform most checks without a lock. If this succeeds we * can skip acquiring the lock and return success. Otherwise we redo * the check with the lock held to handle races with concurrent updates. */ w1 = plock->li_lock->lo_witness; if (witness_lock_order_check(w1, w)) return; mtx_lock_spin(&w_mtx); if (witness_lock_order_check(w1, w)) { mtx_unlock_spin(&w_mtx); return; } witness_lock_order_add(w1, w); /* * Check for duplicate locks of the same type. Note that we only * have to check for this on the last lock we just acquired. Any * other cases will be caught as lock order violations. */ if (w1 == w) { i = w->w_index; if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) && !(w_rmatrix[i][i] & WITNESS_REVERSAL)) { w_rmatrix[i][i] |= WITNESS_REVERSAL; w->w_reversed = 1; mtx_unlock_spin(&w_mtx); witness_output( "acquiring duplicate lock of same type: \"%s\"\n", w->w_name); witness_output(" 1st %s @ %s:%d\n", plock->li_lock->lo_name, fixup_filename(plock->li_file), plock->li_line); witness_output(" 2nd %s @ %s:%d\n", lock->lo_name, fixup_filename(file), line); witness_debugger(1, __func__); } else mtx_unlock_spin(&w_mtx); return; } mtx_assert(&w_mtx, MA_OWNED); /* * If we know that the lock we are acquiring comes after * the lock we most recently acquired in the lock order tree, * then there is no need for any further checks. */ if (isitmychild(w1, w)) goto out; for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) { for (i = lle->ll_count - 1; i >= 0; i--, j++) { MPASS(j < LOCK_CHILDCOUNT * LOCK_NCHILDREN); lock1 = &lle->ll_children[i]; /* * Ignore the interlock. */ if (interlock == lock1->li_lock) continue; /* * If this lock doesn't undergo witness checking, * then skip it. */ w1 = lock1->li_lock->lo_witness; if (w1 == NULL) { KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0, ("lock missing witness structure")); continue; } /* * If we are locking Giant and this is a sleepable * lock, then skip it. */ if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 && lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * is Giant, then skip it. */ if ((lock->lo_flags & LO_SLEEPABLE) != 0 && lock1->li_lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * isn't sleepable, we want to treat it as a lock * order violation to enforce a general lock order of * sleepable locks before non-sleepable locks. */ if (((lock->lo_flags & LO_SLEEPABLE) != 0 && (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0)) goto reversal; /* * If we are locking Giant and this is a non-sleepable * lock, then treat it as a reversal.
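* A concrete (illustrative) reversal the scan below would catch: if one * thread acquires mutex "foo" and then mutex "bar" while another acquires * "bar" and then "foo", the second ordering contradicts the recorded one * and control transfers to the "reversal" label.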
*/ if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 && lock == &Giant.lock_object) goto reversal; /* * Check the lock order hierarchy for a reversal. */ if (!isitmydescendant(w, w1)) continue; reversal: /* * We have a lock order violation, check to see if it * is allowed or has already been yelled about. */ #ifdef BLESSING /* * If the lock order is blessed, just bail. We don't * look for other lock order violations though, which * may be a bug. */ if (blessed(w, w1)) goto out; #endif /* Bail if this violation is known */ if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL) goto out; /* Record this as a violation */ w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL; w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL; w->w_reversed = w1->w_reversed = 1; witness_increment_graph_generation(); mtx_unlock_spin(&w_mtx); #ifdef WITNESS_NO_VNODE /* * There are known LORs between VNODE locks. They are * not an indication of a bug. VNODE locks are flagged * as such (LO_IS_VNODE) and we don't yell if the LOR * is between 2 VNODE locks. */ if ((lock->lo_flags & LO_IS_VNODE) != 0 && (lock1->li_lock->lo_flags & LO_IS_VNODE) != 0) return; #endif /* * Ok, yell about it. */ if (((lock->lo_flags & LO_SLEEPABLE) != 0 && (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0)) witness_output( "lock order reversal: (sleepable after non-sleepable)\n"); else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 && lock == &Giant.lock_object) witness_output( "lock order reversal: (Giant after non-sleepable)\n"); else witness_output("lock order reversal:\n"); /* * Try to locate an earlier lock with * witness w in our list. */ do { lock2 = &lle->ll_children[i]; MPASS(lock2->li_lock != NULL); if (lock2->li_lock->lo_witness == w) break; if (i == 0 && lle->ll_next != NULL) { lle = lle->ll_next; i = lle->ll_count - 1; MPASS(i >= 0 && i < LOCK_NCHILDREN); } else i--; } while (i >= 0); if (i < 0) { witness_output(" 1st %p %s (%s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, w1->w_name, fixup_filename(lock1->li_file), lock1->li_line); witness_output(" 2nd %p %s (%s) @ %s:%d\n", lock, lock->lo_name, w->w_name, fixup_filename(file), line); } else { witness_output(" 1st %p %s (%s) @ %s:%d\n", lock2->li_lock, lock2->li_lock->lo_name, lock2->li_lock->lo_witness->w_name, fixup_filename(lock2->li_file), lock2->li_line); witness_output(" 2nd %p %s (%s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, w1->w_name, fixup_filename(lock1->li_file), lock1->li_line); witness_output(" 3rd %p %s (%s) @ %s:%d\n", lock, lock->lo_name, w->w_name, fixup_filename(file), line); } witness_debugger(1, __func__); return; } } /* * If requested, build a new lock order. However, don't build a new * relationship between a sleepable lock and Giant if it is in the * wrong direction. The correct lock order is that sleepable locks * always come before Giant. */ if (flags & LOP_NEWORDER && !(plock->li_lock == &Giant.lock_object && (lock->lo_flags & LO_SLEEPABLE) != 0)) { CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, w->w_name, plock->li_lock->lo_witness->w_name); itismychild(plock->li_lock->lo_witness, w); } out: mtx_unlock_spin(&w_mtx); } void witness_lock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct witness *w; struct thread *td; if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL || panicstr != NULL) return; w = lock->lo_witness; td = curthread; /* Determine lock list for this lock.
*/ if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); /* Check to see if we are recursing on a lock we already own. */ instance = find_instance(*lock_list, lock); if (instance != NULL) { instance->li_flags++; CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__, td->td_proc->p_pid, lock->lo_name, instance->li_flags & LI_RECURSEMASK); instance->li_file = file; instance->li_line = line; return; } /* Update per-witness last file and line acquire. */ w->w_file = file; w->w_line = line; /* Find the next open lock instance in the list and fill it. */ lle = *lock_list; if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) { lle = witness_lock_list_get(); if (lle == NULL) return; lle->ll_next = *lock_list; CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__, td->td_proc->p_pid, lle); *lock_list = lle; } instance = &lle->ll_children[lle->ll_count++]; instance->li_lock = lock; instance->li_line = line; instance->li_file = file; if ((flags & LOP_EXCLUSIVE) != 0) instance->li_flags = LI_EXCLUSIVE; else instance->li_flags = 0; CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__, td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1); } void witness_upgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) kassert_panic( "upgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) kassert_panic( "upgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) { kassert_panic("upgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) != 0) kassert_panic( "upgrade of exclusive lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic( "upgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); } instance->li_flags |= LI_EXCLUSIVE; } void witness_downgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) kassert_panic( "downgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) kassert_panic( "downgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) { kassert_panic("downgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) == 0) kassert_panic( "downgrade of shared lock (%s) %s @ %s:%d", 
class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic( "downgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); } instance->li_flags &= ~LI_EXCLUSIVE; } void witness_unlock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct lock_class *class; struct thread *td; register_t s; int i, j; if (witness_cold || lock->lo_witness == NULL || panicstr != NULL) return; td = curthread; class = LOCK_CLASS(lock); /* Find lock instance associated with this lock. */ if (class->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); lle = *lock_list; for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next) for (i = 0; i < (*lock_list)->ll_count; i++) { instance = &(*lock_list)->ll_children[i]; if (instance->li_lock == lock) goto found; } /* * When WITNESS is disabled via witness_watch, locks may remain * registered in the td_sleeplocks queue. We have to make sure these * queues are flushed, so just search for any leftover registered * locks and remove them. */ if (witness_watch > 0) { kassert_panic("lock (%s) %s not locked @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } else { return; } found: /* First, check for shared/exclusive mismatches. */ if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 && (flags & LOP_EXCLUSIVE) == 0) { witness_output("shared unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while exclusively locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); kassert_panic("excl->ushare"); } if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 && (flags & LOP_EXCLUSIVE) != 0) { witness_output("exclusive unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while share locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); kassert_panic("share->uexcl"); } /* If we are recursed, unrecurse. */ if ((instance->li_flags & LI_RECURSEMASK) > 0) { CTR4(KTR_WITNESS, "%s: pid %d unrecursed on %s r=%d", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, instance->li_flags); instance->li_flags--; return; } /* The lock is now being dropped, check for NORELEASE flag */ if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) { witness_output("forbidden unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); kassert_panic("lock marked norelease"); } /* Otherwise, remove this item from the list. */ s = intr_disable(); CTR4(KTR_WITNESS, "%s: pid %d removed %s from lle[%d]", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, (*lock_list)->ll_count - 1); for (j = i; j < (*lock_list)->ll_count - 1; j++) (*lock_list)->ll_children[j] = (*lock_list)->ll_children[j + 1]; (*lock_list)->ll_count--; intr_restore(s); /* * To reduce contention on w_mtx, we always want to keep a head * object in each list so that frequent allocation from the free * witness pool (and the locking it entails) is avoided.
* To keep the code simple, an empty head object also means that * there are no further objects in the list, so list ownership needs * to be handed over to another object when the current head is * freed. */ if ((*lock_list)->ll_count == 0) { if (*lock_list == lle) { if (lle->ll_next == NULL) return; } else lle = *lock_list; *lock_list = lle->ll_next; CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__, td->td_proc->p_pid, lle); witness_lock_list_free(lle); } } void witness_thread_exit(struct thread *td) { struct lock_list_entry *lle; int i, n; lle = td->td_sleeplocks; if (lle == NULL || panicstr != NULL) return; if (lle->ll_count != 0) { for (n = 0; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { if (n == 0) witness_output( "Thread %p exiting with the following locks held:\n", td); n++; witness_list_lock(&lle->ll_children[i], witness_output); } kassert_panic( "Thread %p cannot exit while holding sleeplocks\n", td); } witness_lock_list_free(lle); } /* * Warn if any locks other than 'lock' are held. Flags can be passed in to * exempt Giant and sleepable locks from the checks as well. If any * non-exempt locks are held, then a supplied message is printed to the * output channel along with a list of the offending locks. If indicated in the * flags then a failure results in a panic as well. */ int witness_warn(int flags, struct lock_object *lock, const char *fmt, ...) { struct lock_list_entry *lock_list, *lle; struct lock_instance *lock1; struct thread *td; va_list ap; int i, n; if (witness_cold || witness_watch < 1 || panicstr != NULL) return (0); n = 0; td = curthread; for (lle = td->td_sleeplocks; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { lock1 = &lle->ll_children[i]; if (lock1->li_lock == lock) continue; if (flags & WARN_GIANTOK && lock1->li_lock == &Giant.lock_object) continue; if (flags & WARN_SLEEPOK && (lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0) continue; if (n == 0) { va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf(" with the following %slocks held:\n", (flags & WARN_SLEEPOK) != 0 ? "non-sleepable " : ""); } n++; witness_list_lock(lock1, printf); } /* * Pin the thread in order to avoid problems with thread migration. * Once all the checks on spin lock ownership have passed, the * thread is on a safe path and can be unpinned. */ sched_pin(); lock_list = PCPU_GET(spinlocks); if (lock_list != NULL && lock_list->ll_count != 0) { sched_unpin(); /* * We should only have one spin lock and, as the * exemption flags cannot apply to this lock class, * check whether the first spin lock is the one * curthread should hold. */ lock1 = &lock_list->ll_children[lock_list->ll_count - 1]; if (lock_list->ll_count == 1 && lock_list->ll_next == NULL && lock1->li_lock == lock && n == 0) return (0); va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf(" with the following %slocks held:\n", (flags & WARN_SLEEPOK) != 0 ?
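witness_warn() is normally reached through the WITNESS_WARN() macro from <sys/lock.h>; a sketch of the typical call made just before a sleepable operation (hypothetical function, kernel context assumed):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>

static void
prepare_to_sleep(const char *wmesg)
{
	/*
	 * Giant and sleepable locks are exempted; any other held lock is
	 * listed on the witness output channel, and WARN_PANIC would turn
	 * the warning into a panic.
	 */
	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "preparing to sleep on \"%s\"", wmesg);
}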
"non-sleepable " : ""); n += witness_list_locks(&lock_list, printf); } else sched_unpin(); if (flags & WARN_PANIC && n) kassert_panic("%s", __func__); else witness_debugger(n, __func__); return (n); } const char * witness_file(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL) return ("?"); w = lock->lo_witness; return (w->w_file); } int witness_line(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL) return (0); w = lock->lo_witness; return (w->w_line); } static struct witness * enroll(const char *description, struct lock_class *lock_class) { struct witness *w; struct witness_list *typelist; MPASS(description != NULL); if (witness_watch == -1 || panicstr != NULL) return (NULL); if ((lock_class->lc_flags & LC_SPINLOCK)) { if (witness_skipspin) return (NULL); else typelist = &w_spin; } else if ((lock_class->lc_flags & LC_SLEEPLOCK)) { typelist = &w_sleep; } else { kassert_panic("lock class %s is not sleep or spin", lock_class->lc_name); return (NULL); } mtx_lock_spin(&w_mtx); w = witness_hash_get(description); if (w) goto found; if ((w = witness_get()) == NULL) return (NULL); MPASS(strlen(description) < MAX_W_NAME); strcpy(w->w_name, description); w->w_class = lock_class; w->w_refcount = 1; STAILQ_INSERT_HEAD(&w_all, w, w_list); if (lock_class->lc_flags & LC_SPINLOCK) { STAILQ_INSERT_HEAD(&w_spin, w, w_typelist); w_spin_cnt++; } else if (lock_class->lc_flags & LC_SLEEPLOCK) { STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist); w_sleep_cnt++; } /* Insert new witness into the hash */ witness_hash_put(w); witness_increment_graph_generation(); mtx_unlock_spin(&w_mtx); return (w); found: w->w_refcount++; if (w->w_refcount == 1) w->w_class = lock_class; mtx_unlock_spin(&w_mtx); if (lock_class != w->w_class) kassert_panic( "lock (%s) %s does not match earlier (%s) lock", description, lock_class->lc_name, w->w_class->lc_name); return (w); } static void depart(struct witness *w) { struct witness_list *list; MPASS(w->w_refcount == 0); if (w->w_class->lc_flags & LC_SLEEPLOCK) { list = &w_sleep; w_sleep_cnt--; } else { list = &w_spin; w_spin_cnt--; } /* * Set file to NULL as it may point into a loadable module. */ w->w_file = NULL; w->w_line = 0; witness_increment_graph_generation(); } static void adopt(struct witness *parent, struct witness *child) { int pi, ci, i, j; if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); /* If the relationship is already known, there's no work to be done. */ if (isitmychild(parent, child)) return; /* When the structure of the graph changes, bump up the generation. */ witness_increment_graph_generation(); /* * The hard part ... create the direct relationship, then propagate all * indirect relationships. */ pi = parent->w_index; ci = child->w_index; WITNESS_INDEX_ASSERT(pi); WITNESS_INDEX_ASSERT(ci); MPASS(pi != ci); w_rmatrix[pi][ci] |= WITNESS_PARENT; w_rmatrix[ci][pi] |= WITNESS_CHILD; /* * If parent was not already an ancestor of child, * then we increment the descendant and ancestor counters. */ if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) { parent->w_num_descendants++; child->w_num_ancestors++; } /* * Find each ancestor of 'pi'. Note that 'pi' itself is counted as * an ancestor of 'pi' during this loop. */ for (i = 1; i <= w_max_used_index; i++) { if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 && (i != pi)) continue; /* Find each descendant of 'i' and mark it as a descendant. 
*/ for (j = 1; j <= w_max_used_index; j++) { /* * Skip children that are already marked as * descendants of 'i'. */ if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) continue; /* * We are only interested in descendants of 'ci'. Note * that 'ci' itself is counted as a descendant of 'ci'. */ if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 && (j != ci)) continue; w_rmatrix[i][j] |= WITNESS_ANCESTOR; w_rmatrix[j][i] |= WITNESS_DESCENDANT; w_data[i].w_num_descendants++; w_data[j].w_num_ancestors++; /* * Make sure we aren't marking a node as both an * ancestor and descendant. We should have caught * this as a lock order reversal earlier. */ if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", i, j, w_rmatrix[i][j]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", j, i, w_rmatrix[j][i]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } } } } static void itismychild(struct witness *parent, struct witness *child) { int unlocked; MPASS(child != NULL && parent != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); if (!witness_lock_type_equal(parent, child)) { if (witness_cold == 0) { unlocked = 1; mtx_unlock_spin(&w_mtx); } else { unlocked = 0; } kassert_panic( "%s: parent \"%s\" (%s) and child \"%s\" (%s) are not " "the same lock type", __func__, parent->w_name, parent->w_class->lc_name, child->w_name, child->w_class->lc_name); if (unlocked) mtx_lock_spin(&w_mtx); } adopt(parent, child); } /* * Generic code for the isitmy*() functions. The rmask parameter is the * expected relationship of w1 to w2. */ static int _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname) { unsigned char r1, r2; int i1, i2; i1 = w1->w_index; i2 = w2->w_index; WITNESS_INDEX_ASSERT(i1); WITNESS_INDEX_ASSERT(i2); r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK; r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK; /* The flags on one better be the inverse of the flags on the other */ if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) || (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) { /* Don't squawk if we're potentially racing with an update. */ if (!mtx_owned(&w_mtx)) return (0); printf("%s: rmatrix mismatch between %s (index %d) and %s " "(index %d): w_rmatrix[%d][%d] == %hhx but " "w_rmatrix[%d][%d] == %hhx\n", fname, w1->w_name, i1, w2->w_name, i2, i1, i2, r1, i2, i1, r2); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } return (r1 & rmask); } /* * Checks if @child is a direct child of @parent. */ static int isitmychild(struct witness *parent, struct witness *child) { return (_isitmyx(parent, child, WITNESS_PARENT, __func__)); } /* * Checks if @descendant is a direct or indirect descendant of @ancestor.
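The matrix bookkeeping in adopt() amounts to a transitive closure over the ancestor relation; a simplified, single-direction userspace sketch (illustrative only, without the paradox checks or the WITNESS_DESCENDANT mirror):

#include <stdio.h>

#define N		5
#define ANCESTOR	0x1	/* rm[i][j]: i must be locked before j */

static unsigned char rm[N][N];

/* Add the edge p -> c, then mark every ancestor of p (p included) as an
 * ancestor of every descendant of c (c included), as adopt() does. */
static void
add_order(int p, int c)
{
	int i, j;

	for (i = 0; i < N; i++) {
		if (i != p && (rm[i][p] & ANCESTOR) == 0)
			continue;
		for (j = 0; j < N; j++) {
			if (j != c && (rm[c][j] & ANCESTOR) == 0)
				continue;
			rm[i][j] |= ANCESTOR;
		}
	}
}

int
main(void)
{
	add_order(0, 1);
	add_order(1, 2);
	/* The indirect relation 0 -> 2 was propagated automatically. */
	printf("0 before 2: %s\n", (rm[0][2] & ANCESTOR) ? "yes" : "no");
	return (0);
}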
*/ static int isitmydescendant(struct witness *ancestor, struct witness *descendant) { return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK, __func__)); } #ifdef BLESSING static int blessed(struct witness *w1, struct witness *w2) { int i; struct witness_blessed *b; for (i = 0; i < nitems(blessed_list); i++) { b = &blessed_list[i]; if (strcmp(w1->w_name, b->b_lock1) == 0) { if (strcmp(w2->w_name, b->b_lock2) == 0) return (1); continue; } if (strcmp(w1->w_name, b->b_lock2) == 0) if (strcmp(w2->w_name, b->b_lock1) == 0) return (1); } return (0); } #endif static struct witness * witness_get(void) { struct witness *w; int index; if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); if (witness_watch == -1) { mtx_unlock_spin(&w_mtx); return (NULL); } if (STAILQ_EMPTY(&w_free)) { witness_watch = -1; mtx_unlock_spin(&w_mtx); printf("WITNESS: unable to allocate a new witness object\n"); return (NULL); } w = STAILQ_FIRST(&w_free); STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; index = w->w_index; MPASS(index > 0 && index == w_max_used_index+1 && index < witness_count); bzero(w, sizeof(*w)); w->w_index = index; if (index > w_max_used_index) w_max_used_index = index; return (w); } static void witness_free(struct witness *w) { STAILQ_INSERT_HEAD(&w_free, w, w_list); w_free_cnt++; } static struct lock_list_entry * witness_lock_list_get(void) { struct lock_list_entry *lle; if (witness_watch == -1) return (NULL); mtx_lock_spin(&w_mtx); lle = w_lock_list_free; if (lle == NULL) { witness_watch = -1; mtx_unlock_spin(&w_mtx); printf("%s: witness exhausted\n", __func__); return (NULL); } w_lock_list_free = lle->ll_next; mtx_unlock_spin(&w_mtx); bzero(lle, sizeof(*lle)); return (lle); } static void witness_lock_list_free(struct lock_list_entry *lle) { mtx_lock_spin(&w_mtx); lle->ll_next = w_lock_list_free; w_lock_list_free = lle; mtx_unlock_spin(&w_mtx); } static struct lock_instance * find_instance(struct lock_list_entry *list, const struct lock_object *lock) { struct lock_list_entry *lle; struct lock_instance *instance; int i; for (lle = list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { instance = &lle->ll_children[i]; if (instance->li_lock == lock) return (instance); } return (NULL); } static void witness_list_lock(struct lock_instance *instance, int (*prnt)(const char *fmt, ...)) { struct lock_object *lock; lock = instance->li_lock; prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ? "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name); if (lock->lo_witness->w_name != lock->lo_name) prnt(" (%s)", lock->lo_witness->w_name); prnt(" r = %d (%p) locked @ %s:%d\n", instance->li_flags & LI_RECURSEMASK, lock, fixup_filename(instance->li_file), instance->li_line); } static int witness_output(const char *fmt, ...) 
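For kernels built with "options BLESSING", blessed() above walks a static table of lock-name pairs; its shape is as below (the entries here are illustrative placeholders, not pairs blessed in the real tree):

static struct witness_blessed blessed_list[] = {
	{ "example lock A", "example lock B" },
	{ "example lock C", "example lock D" },
};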
{ va_list ap; int ret; va_start(ap, fmt); ret = witness_voutput(fmt, ap); va_end(ap); return (ret); } static int witness_voutput(const char *fmt, va_list ap) { int ret; ret = 0; switch (witness_channel) { case WITNESS_CONSOLE: ret = vprintf(fmt, ap); break; case WITNESS_LOG: vlog(LOG_NOTICE, fmt, ap); break; case WITNESS_NONE: break; } return (ret); } #ifdef DDB static int witness_thread_has_locks(struct thread *td) { if (td->td_sleeplocks == NULL) return (0); return (td->td_sleeplocks->ll_count != 0); } static int witness_proc_has_locks(struct proc *p) { struct thread *td; FOREACH_THREAD_IN_PROC(p, td) { if (witness_thread_has_locks(td)) return (1); } return (0); } #endif int witness_list_locks(struct lock_list_entry **lock_list, int (*prnt)(const char *fmt, ...)) { struct lock_list_entry *lle; int i, nheld; nheld = 0; for (lle = *lock_list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { witness_list_lock(&lle->ll_children[i], prnt); nheld++; } return (nheld); } /* * This is a bit risky at best. We call this function when we have timed * out acquiring a spin lock, and we assume that the other CPU is stuck * with this lock held. So, we go groveling around in the other CPU's * per-cpu data to try to find the lock instance for this spin lock to * see when it was last acquired. */ void witness_display_spinlock(struct lock_object *lock, struct thread *owner, int (*prnt)(const char *fmt, ...)) { struct lock_instance *instance; struct pcpu *pc; if (owner->td_critnest == 0 || owner->td_oncpu == NOCPU) return; pc = pcpu_find(owner->td_oncpu); instance = find_instance(pc->pc_spinlocks, lock); if (instance != NULL) witness_list_lock(instance, prnt); } void witness_save(struct lock_object *lock, const char **filep, int *linep) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; /* * This function is used independently in locking code to deal with * Giant, SCHEDULER_STOPPED() check can be removed here after Giant * is gone. */ if (SCHEDULER_STOPPED()) return; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) { kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); return; } *filep = instance->li_file; *linep = instance->li_line; } void witness_restore(struct lock_object *lock, const char *file, int line) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; /* * This function is used independently in locking code to deal with * Giant, SCHEDULER_STOPPED() check can be removed here after Giant * is gone. 
*/ if (SCHEDULER_STOPPED()) return; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); lock->lo_witness->w_file = file; lock->lo_witness->w_line = line; if (instance == NULL) return; instance->li_file = file; instance->li_line = line; } void witness_assert(const struct lock_object *lock, int flags, const char *file, int line) { #ifdef INVARIANT_SUPPORT struct lock_instance *instance; struct lock_class *class; if (lock->lo_witness == NULL || witness_watch < 1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if ((class->lc_flags & LC_SLEEPLOCK) != 0) instance = find_instance(curthread->td_sleeplocks, lock); else if ((class->lc_flags & LC_SPINLOCK) != 0) instance = find_instance(PCPU_GET(spinlocks), lock); else { kassert_panic("Lock (%s) %s is not sleep or spin!", class->lc_name, lock->lo_name); return; } switch (flags) { case LA_UNLOCKED: if (instance != NULL) kassert_panic("Lock (%s) %s locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; case LA_LOCKED: case LA_LOCKED | LA_RECURSED: case LA_LOCKED | LA_NOTRECURSED: case LA_SLOCKED: case LA_SLOCKED | LA_RECURSED: case LA_SLOCKED | LA_NOTRECURSED: case LA_XLOCKED: case LA_XLOCKED | LA_RECURSED: case LA_XLOCKED | LA_NOTRECURSED: if (instance == NULL) { kassert_panic("Lock (%s) %s not locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; } if ((flags & LA_XLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) == 0) kassert_panic( "Lock (%s) %s not exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_SLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) != 0) kassert_panic( "Lock (%s) %s exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_RECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) == 0) kassert_panic("Lock (%s) %s not recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_NOTRECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic("Lock (%s) %s recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; default: kassert_panic("Invalid lock assertion at %s:%d.", fixup_filename(file), line); } #endif /* INVARIANT_SUPPORT */ } static void witness_setflag(struct lock_object *lock, int flag, int set) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) { kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); return; } if (set) instance->li_flags |= flag; else instance->li_flags &= ~flag; } void witness_norelease(struct lock_object *lock) { witness_setflag(lock, LI_NORELEASE, 1); } void witness_releaseok(struct lock_object *lock) { witness_setflag(lock, LI_NORELEASE, 0); } #ifdef DDB static 
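witness_assert() backs the lock assertions that cannot be answered from the lock word alone, shared ownership in particular; a sketch (hypothetical lock, kernel-module context assumed):

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/sx.h>

static struct sx conf_sx;	/* assume sx_init(&conf_sx, "example conf") ran */

static void
conf_read(void)
{
	/*
	 * Shared owners are not recorded in the sx itself, so with
	 * WITNESS compiled in this check is forwarded to witness_assert()
	 * as an LA_SLOCKED assertion against curthread's lock list.
	 */
	sx_assert(&conf_sx, SA_SLOCKED);
	/* ... read state protected by conf_sx ... */
}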
void witness_ddb_list(struct thread *td) { KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); KASSERT(kdb_active, ("%s: not in the debugger", __func__)); if (witness_watch < 1) return; witness_list_locks(&td->td_sleeplocks, db_printf); /* * We only handle spinlocks if td == curthread. This is somewhat broken * if td is currently executing on some other CPU and holds spin locks * as we won't display those locks. If we had a MI way of getting * the per-cpu data for a given cpu then we could use * td->td_oncpu to get the list of spinlocks for this thread * and "fix" this. * * That still wouldn't really fix this unless we locked the scheduler * lock or stopped the other CPU to make sure it wasn't changing the * list out from under us. It is probably best to just not try to * handle threads on other CPU's for now. */ if (td == curthread && PCPU_GET(spinlocks) != NULL) witness_list_locks(PCPU_PTR(spinlocks), db_printf); } DB_SHOW_COMMAND(locks, db_witness_list) { struct thread *td; if (have_addr) td = db_lookup_thread(addr, true); else td = kdb_thread; witness_ddb_list(td); } DB_SHOW_ALL_COMMAND(locks, db_witness_list_all) { struct thread *td; struct proc *p; /* * It would be nice to list only threads and processes that actually * held sleep locks, but that information is currently not exported * by WITNESS. */ FOREACH_PROC_IN_SYSTEM(p) { if (!witness_proc_has_locks(p)) continue; FOREACH_THREAD_IN_PROC(p, td) { if (!witness_thread_has_locks(td)) continue; db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid, p->p_comm, td, td->td_tid); witness_ddb_list(td); if (db_pager_quit) return; } } } DB_SHOW_ALIAS(alllocks, db_witness_list_all) DB_SHOW_COMMAND(witness, db_witness_display) { witness_ddb_display(db_printf); } #endif static void sbuf_print_witness_badstacks(struct sbuf *sb, size_t *oldidx) { struct witness_lock_order_data *data1, *data2, *tmp_data1, *tmp_data2; struct witness *tmp_w1, *tmp_w2, *w1, *w2; u_int w_rmatrix1, w_rmatrix2; int generation, i, j; tmp_data1 = NULL; tmp_data2 = NULL; tmp_w1 = NULL; tmp_w2 = NULL; /* Allocate and init temporary storage space. */ tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); stack_zero(&tmp_data1->wlod_stack); stack_zero(&tmp_data2->wlod_stack); restart: mtx_lock_spin(&w_mtx); generation = w_generation; mtx_unlock_spin(&w_mtx); sbuf_printf(sb, "Number of known direct relationships is %d\n", w_lohash.wloh_count); for (i = 1; i < w_max_used_index; i++) { mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } w1 = &w_data[i]; if (w1->w_reversed == 0) { mtx_unlock_spin(&w_mtx); continue; } /* Copy w1 locally so we can release the spin lock. */ *tmp_w1 = *w1; mtx_unlock_spin(&w_mtx); if (tmp_w1->w_reversed == 0) continue; for (j = 1; j < w_max_used_index; j++) { if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j) continue; mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } w2 = &w_data[j]; data1 = witness_lock_order_get(w1, w2); data2 = witness_lock_order_get(w2, w1); /* * Copy information locally so we can release the * spin lock. 
*/ *tmp_w2 = *w2; w_rmatrix1 = (unsigned int)w_rmatrix[i][j]; w_rmatrix2 = (unsigned int)w_rmatrix[j][i]; if (data1) { stack_zero(&tmp_data1->wlod_stack); stack_copy(&data1->wlod_stack, &tmp_data1->wlod_stack); } if (data2 && data2 != data1) { stack_zero(&tmp_data2->wlod_stack); stack_copy(&data2->wlod_stack, &tmp_data2->wlod_stack); } mtx_unlock_spin(&w_mtx); sbuf_printf(sb, "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); if (data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); stack_sbuf_print(sb, &tmp_data1->wlod_stack); sbuf_printf(sb, "\n"); } if (data2 && data2 != data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", tmp_w2->w_name, tmp_w2->w_class->lc_name, tmp_w1->w_name, tmp_w1->w_class->lc_name); stack_sbuf_print(sb, &tmp_data2->wlod_stack); sbuf_printf(sb, "\n"); } } } mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* * The graph changed while we were printing stack data, * try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } mtx_unlock_spin(&w_mtx); /* Free temporary storage space. */ free(tmp_data1, M_TEMP); free(tmp_data2, M_TEMP); free(tmp_w1, M_TEMP); free(tmp_w2, M_TEMP); } static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS) { struct sbuf *sb; int error; if (witness_watch < 1) { error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); return (error); } if (witness_cold) { error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); return (error); } error = 0; sb = sbuf_new(NULL, NULL, badstack_sbuf_size, SBUF_AUTOEXTEND); if (sb == NULL) return (ENOMEM); sbuf_print_witness_badstacks(sb, &req->oldidx); sbuf_finish(sb); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return (error); } #ifdef DDB static int sbuf_db_printf_drain(void *arg __unused, const char *data, int len) { return (db_printf("%.*s", len, data)); } DB_SHOW_COMMAND(badstacks, db_witness_badstacks) { struct sbuf sb; char buffer[128]; size_t dummy; sbuf_new(&sb, buffer, sizeof(buffer), SBUF_FIXEDLEN); sbuf_set_drain(&sb, sbuf_db_printf_drain, NULL); sbuf_print_witness_badstacks(&sb, &dummy); sbuf_finish(&sb); } #endif static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS) { static const struct { enum witness_channel channel; const char *name; } channels[] = { { WITNESS_CONSOLE, "console" }, { WITNESS_LOG, "log" }, { WITNESS_NONE, "none" }, }; char buf[16]; u_int i; int error; buf[0] = '\0'; for (i = 0; i < nitems(channels); i++) if (witness_channel == channels[i].channel) { snprintf(buf, sizeof(buf), "%s", channels[i].name); break; } error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); error = EINVAL; for (i = 0; i < nitems(channels); i++) if (strcmp(channels[i].name, buf) == 0) { witness_channel = channels[i].channel; error = 0; break; } return (error); } static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS) { struct witness *w; struct sbuf *sb; int error; if (witness_watch < 1) { error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); return (error); } if (witness_cold) { error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); return (error); } error = 0; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, FULLGRAPH_SBUF_SIZE, req); if (sb == NULL) return 
(ENOMEM); sbuf_printf(sb, "\n"); mtx_lock_spin(&w_mtx); STAILQ_FOREACH(w, &w_all, w_list) w->w_displayed = 0; STAILQ_FOREACH(w, &w_all, w_list) witness_add_fullgraph(sb, w); mtx_unlock_spin(&w_mtx); /* * Close the sbuf and return to userland. */ error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS) { int error, value; value = witness_watch; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (value > 1 || value < -1 || (witness_watch == -1 && value != witness_watch)) return (EINVAL); witness_watch = value; return (0); } static void witness_add_fullgraph(struct sbuf *sb, struct witness *w) { int i; if (w->w_displayed != 0 || (w->w_file == NULL && w->w_line == 0)) return; w->w_displayed = 1; WITNESS_INDEX_ASSERT(w->w_index); for (i = 1; i <= w_max_used_index; i++) { if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) { sbuf_printf(sb, "\"%s\",\"%s\"\n", w->w_name, w_data[i].w_name); witness_add_fullgraph(sb, &w_data[i]); } } } /* * A simple hash function. Takes a key pointer and a key size. If size == 0, * interprets the key as a string and reads until the null * terminator. Otherwise, reads the first size bytes. Returns an unsigned 32-bit * hash value computed from the key. */ static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size) { unsigned int hash = 5381; int i; /* hash = hash * 33 + key[i] */ if (size) for (i = 0; i < size; i++) hash = ((hash << 5) + hash) + (unsigned int)key[i]; else for (i = 0; key[i] != 0; i++) hash = ((hash << 5) + hash) + (unsigned int)key[i]; return (hash); } /* * Initializes the two witness hash tables. Called exactly once from * witness_initialize(). */ static void witness_init_hash_tables(void) { int i; MPASS(witness_cold); /* Initialize the hash tables. */ for (i = 0; i < WITNESS_HASH_SIZE; i++) w_hash.wh_array[i] = NULL; w_hash.wh_size = WITNESS_HASH_SIZE; w_hash.wh_count = 0; /* Initialize the lock order data hash. 
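The recurrence in witness_hash_djb2() is the classic djb2 string hash; a standalone sketch of the same computation and the bucket selection done by the name hash (the table size below is a stand-in, not the kernel's constant):

#include <stdio.h>
#include <stdint.h>

static uint32_t
djb2(const char *s)
{
	uint32_t hash = 5381;

	/* hash = hash * 33 + byte, exactly as in witness_hash_djb2(). */
	while (*s != '\0')
		hash = ((hash << 5) + hash) + (uint8_t)*s++;
	return (hash);
}

int
main(void)
{
	/* Bucket index, as in witness_hash_get(); 251 is a sample size. */
	printf("bucket %u\n", djb2("Giant") % 251);
	return (0);
}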
*/ w_lofree = NULL; for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) { memset(&w_lodata[i], 0, sizeof(w_lodata[i])); w_lodata[i].wlod_next = w_lofree; w_lofree = &w_lodata[i]; } w_lohash.wloh_size = WITNESS_LO_HASH_SIZE; w_lohash.wloh_count = 0; for (i = 0; i < WITNESS_LO_HASH_SIZE; i++) w_lohash.wloh_array[i] = NULL; } static struct witness * witness_hash_get(const char *key) { struct witness *w; uint32_t hash; MPASS(key != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); hash = witness_hash_djb2(key, 0) % w_hash.wh_size; w = w_hash.wh_array[hash]; while (w != NULL) { if (strcmp(w->w_name, key) == 0) goto out; w = w->w_hash_next; } out: return (w); } static void witness_hash_put(struct witness *w) { uint32_t hash; MPASS(w != NULL); MPASS(w->w_name != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); KASSERT(witness_hash_get(w->w_name) == NULL, ("%s: trying to add a hash entry that already exists!", __func__)); KASSERT(w->w_hash_next == NULL, ("%s: w->w_hash_next != NULL", __func__)); hash = witness_hash_djb2(w->w_name, 0) % w_hash.wh_size; w->w_hash_next = w_hash.wh_array[hash]; w_hash.wh_array[hash] = w; w_hash.wh_count++; } static struct witness_lock_order_data * witness_lock_order_get(struct witness *parent, struct witness *child) { struct witness_lock_order_data *data = NULL; struct witness_lock_order_key key; unsigned int hash; MPASS(parent != NULL && child != NULL); key.from = parent->w_index; key.to = child->w_index; WITNESS_INDEX_ASSERT(key.from); WITNESS_INDEX_ASSERT(key.to); if ((w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN) == 0) goto out; hash = witness_hash_djb2((const char*)&key, sizeof(key)) % w_lohash.wloh_size; data = w_lohash.wloh_array[hash]; while (data != NULL) { if (witness_lock_order_key_equal(&data->wlod_key, &key)) break; data = data->wlod_next; } out: return (data); } /* * Verify that parent and child have a known relationship, are not the same, * and child is actually a child of parent. This is done without w_mtx * to avoid contention in the common case. */ static int witness_lock_order_check(struct witness *parent, struct witness *child) { if (parent != child && w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN && isitmychild(parent, child)) return (1); return (0); } static int witness_lock_order_add(struct witness *parent, struct witness *child) { struct witness_lock_order_data *data = NULL; struct witness_lock_order_key key; unsigned int hash; MPASS(parent != NULL && child != NULL); key.from = parent->w_index; key.to = child->w_index; WITNESS_INDEX_ASSERT(key.from); WITNESS_INDEX_ASSERT(key.to); if (w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN) return (1); hash = witness_hash_djb2((const char*)&key, sizeof(key)) % w_lohash.wloh_size; w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN; data = w_lofree; if (data == NULL) return (0); w_lofree = data->wlod_next; data->wlod_next = w_lohash.wloh_array[hash]; data->wlod_key = key; w_lohash.wloh_array[hash] = data; w_lohash.wloh_count++; stack_zero(&data->wlod_stack); stack_save(&data->wlod_stack); return (1); } /* Call this whenever the structure of the witness graph changes. 
*/ static void witness_increment_graph_generation(void) { if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); w_generation++; } static int witness_output_drain(void *arg __unused, const char *data, int len) { witness_output("%.*s", len, data); return (len); } static void witness_debugger(int cond, const char *msg) { char buf[32]; struct sbuf sb; struct stack st; if (!cond) return; if (witness_trace) { sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); sbuf_set_drain(&sb, witness_output_drain, NULL); stack_zero(&st); stack_save(&st); witness_output("stack backtrace:\n"); stack_sbuf_print_ddb(&sb, &st); sbuf_finish(&sb); } #ifdef KDB if (witness_kdb) kdb_enter(KDB_WHY_WITNESS, msg); #endif } Index: head/sys/kern/sys_capability.c =================================================================== --- head/sys/kern/sys_capability.c (revision 326270) +++ head/sys/kern/sys_capability.c (revision 326271) @@ -1,655 +1,657 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008-2011 Robert N. M. Watson * Copyright (c) 2010-2011 Jonathan Anderson * Copyright (c) 2012 FreeBSD Foundation * All rights reserved. * * This software was developed at the University of Cambridge Computer * Laboratory with support from a grant from Google, Inc. * * Portions of this software were developed by Pawel Jakub Dawidek under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * FreeBSD kernel capability facility. * * Two kernel features are implemented here: capability mode, a sandboxed mode * of execution for processes, and capabilities, a refinement on file * descriptors that allows fine-grained control over operations on the file * descriptor. Collectively, these allow processes to run in the style of a * historic "capability system" in which they can use only resources * explicitly delegated to them. This model is enforced by restricting access * to global namespaces in capability mode. * * Capabilities wrap other file descriptor types, binding them to a constant * rights mask set when the capability is created. New capabilities may be * derived from existing capabilities, but only if they have the same or a * strict subset of the rights on the original capability. 
* * System calls permitted in capability mode are defined in capabilities.conf; * calls must be carefully audited for safety to ensure that they don't allow * escape from a sandbox. Some calls permit only a subset of operations in * capability mode -- for example, shm_open(2) is limited to creating * anonymous, rather than named, POSIX shared memory objects. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int trap_enotcap; SYSCTL_INT(_kern, OID_AUTO, trap_enotcap, CTLFLAG_RW, &trap_enotcap, 0, "Deliver SIGTRAP on ENOTCAPABLE"); #ifdef CAPABILITY_MODE #define IOCTLS_MAX_COUNT 256 /* XXX: Is 256 sane? */ FEATURE(security_capability_mode, "Capsicum Capability Mode"); /* * System call to enter capability mode for the process. */ int sys_cap_enter(struct thread *td, struct cap_enter_args *uap) { struct ucred *newcred, *oldcred; struct proc *p; if (IN_CAPABILITY_MODE(td)) return (0); newcred = crget(); p = td->td_proc; PROC_LOCK(p); oldcred = crcopysafe(p, newcred); newcred->cr_flags |= CRED_FLAG_CAPMODE; proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); } /* * System call to query whether the process is in capability mode. */ int sys_cap_getmode(struct thread *td, struct cap_getmode_args *uap) { u_int i; i = IN_CAPABILITY_MODE(td) ? 1 : 0; return (copyout(&i, uap->modep, sizeof(i))); } #else /* !CAPABILITY_MODE */ int sys_cap_enter(struct thread *td, struct cap_enter_args *uap) { return (ENOSYS); } int sys_cap_getmode(struct thread *td, struct cap_getmode_args *uap) { return (ENOSYS); } #endif /* CAPABILITY_MODE */ #ifdef CAPABILITIES FEATURE(security_capabilities, "Capsicum Capabilities"); MALLOC_DECLARE(M_FILECAPS); static inline int _cap_check(const cap_rights_t *havep, const cap_rights_t *needp, enum ktr_cap_fail_type type) { if (!cap_rights_contains(havep, needp)) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_CAPFAIL)) ktrcapfail(type, needp, havep); #endif return (ENOTCAPABLE); } return (0); } /* * Test whether a capability grants the requested rights. */ int cap_check(const cap_rights_t *havep, const cap_rights_t *needp) { return (_cap_check(havep, needp, CAPFAIL_NOTCAPABLE)); } /* * Convert capability rights into VM access flags. */ u_char cap_rights_to_vmprot(cap_rights_t *havep) { u_char maxprot; maxprot = VM_PROT_NONE; if (cap_rights_is_set(havep, CAP_MMAP_R)) maxprot |= VM_PROT_READ; if (cap_rights_is_set(havep, CAP_MMAP_W)) maxprot |= VM_PROT_WRITE; if (cap_rights_is_set(havep, CAP_MMAP_X)) maxprot |= VM_PROT_EXECUTE; return (maxprot); } /* * Extract rights from a capability for monitoring purposes -- not for use in * any other way, as we want to keep all capability permission evaluation in * this one file. 
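From userland the capability-mode machinery above is reached via cap_enter(2); a minimal demonstration (path chosen for illustration):

#include <sys/capsicum.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
	if (cap_enter() == -1)
		err(1, "cap_enter");
	/* Global namespaces are now closed: opening a path must fail. */
	if (open("/etc/passwd", O_RDONLY) == -1 && errno == ECAPMODE)
		printf("open(2) rejected with ECAPMODE\n");
	return (0);
}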
*/ cap_rights_t * cap_rights_fde(struct filedescent *fde) { return (&fde->fde_rights); } cap_rights_t * cap_rights(struct filedesc *fdp, int fd) { return (cap_rights_fde(&fdp->fd_ofiles[fd])); } int kern_cap_rights_limit(struct thread *td, int fd, cap_rights_t *rights) { struct filedesc *fdp; int error; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } error = _cap_check(cap_rights(fdp, fd), rights, CAPFAIL_INCREASE); if (error == 0) { fdp->fd_ofiles[fd].fde_rights = *rights; if (!cap_rights_is_set(rights, CAP_IOCTL)) { free(fdp->fd_ofiles[fd].fde_ioctls, M_FILECAPS); fdp->fd_ofiles[fd].fde_ioctls = NULL; fdp->fd_ofiles[fd].fde_nioctls = 0; } if (!cap_rights_is_set(rights, CAP_FCNTL)) fdp->fd_ofiles[fd].fde_fcntls = 0; } FILEDESC_XUNLOCK(fdp); return (error); } /* * System call to limit rights of the given capability. */ int sys_cap_rights_limit(struct thread *td, struct cap_rights_limit_args *uap) { cap_rights_t rights; int error, version; cap_rights_init(&rights); error = copyin(uap->rightsp, &rights, sizeof(rights.cr_rights[0])); if (error != 0) return (error); version = CAPVER(&rights); if (version != CAP_RIGHTS_VERSION_00) return (EINVAL); error = copyin(uap->rightsp, &rights, sizeof(rights.cr_rights[0]) * CAPARSIZE(&rights)); if (error != 0) return (error); /* Check for race. */ if (CAPVER(&rights) != version) return (EINVAL); if (!cap_rights_is_valid(&rights)) return (EINVAL); if (version != CAP_RIGHTS_VERSION) { rights.cr_rights[0] &= ~(0x3ULL << 62); rights.cr_rights[0] |= ((uint64_t)CAP_RIGHTS_VERSION << 62); } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrcaprights(&rights); #endif AUDIT_ARG_FD(uap->fd); AUDIT_ARG_RIGHTS(&rights); return (kern_cap_rights_limit(td, uap->fd, &rights)); } /* * System call to query the rights mask associated with a capability. */ int sys___cap_rights_get(struct thread *td, struct __cap_rights_get_args *uap) { struct filedesc *fdp; cap_rights_t rights; int error, fd, i, n; if (uap->version != CAP_RIGHTS_VERSION_00) return (EINVAL); fd = uap->fd; AUDIT_ARG_FD(fd); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { FILEDESC_SUNLOCK(fdp); return (EBADF); } rights = *cap_rights(fdp, fd); FILEDESC_SUNLOCK(fdp); n = uap->version + 2; if (uap->version != CAPVER(&rights)) { /* * For older versions we need to check if the descriptor * doesn't contain rights not understood by the caller. * If it does, we have to return an error. */ for (i = n; i < CAPARSIZE(&rights); i++) { if ((rights.cr_rights[i] & ~(0x7FULL << 57)) != 0) return (EINVAL); } } error = copyout(&rights, uap->rightsp, sizeof(rights.cr_rights[0]) * n); #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT)) ktrcaprights(&rights); #endif return (error); } /* * Test whether a capability grants the given ioctl command. * If descriptor doesn't have CAP_IOCTL, then ioctls list is empty and * ENOTCAPABLE will be returned. */ int cap_ioctl_check(struct filedesc *fdp, int fd, u_long cmd) { u_long *cmds; ssize_t ncmds; long i; FILEDESC_LOCK_ASSERT(fdp); KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("%s: invalid fd=%d", __func__, fd)); ncmds = fdp->fd_ofiles[fd].fde_nioctls; if (ncmds == -1) return (0); cmds = fdp->fd_ofiles[fd].fde_ioctls; for (i = 0; i < ncmds; i++) { if (cmds[i] == cmd) return (0); } return (ENOTCAPABLE); } /* * Check if the current ioctls list can be replaced by the new one. 
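The kern_cap_rights_limit() path above is driven from userland by cap_rights_limit(2); a sketch (file name illustrative):

#include <sys/capsicum.h>
#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	cap_rights_t rights;
	int fd;

	fd = open("/tmp/example", O_RDWR | O_CREAT, 0600);
	if (fd == -1)
		err(1, "open");
	/* Shrink the descriptor to read-only; a later write(2) then
	 * fails with ENOTCAPABLE. */
	cap_rights_init(&rights, CAP_READ, CAP_SEEK);
	if (cap_rights_limit(fd, &rights) == -1)
		err(1, "cap_rights_limit");
	if (write(fd, "x", 1) == -1)
		warn("write rejected as expected");
	return (0);
}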
*/ static int cap_ioctl_limit_check(struct filedesc *fdp, int fd, const u_long *cmds, size_t ncmds) { u_long *ocmds; ssize_t oncmds; u_long i; long j; oncmds = fdp->fd_ofiles[fd].fde_nioctls; if (oncmds == -1) return (0); if (oncmds < (ssize_t)ncmds) return (ENOTCAPABLE); ocmds = fdp->fd_ofiles[fd].fde_ioctls; for (i = 0; i < ncmds; i++) { for (j = 0; j < oncmds; j++) { if (cmds[i] == ocmds[j]) break; } if (j == oncmds) return (ENOTCAPABLE); } return (0); } int kern_cap_ioctls_limit(struct thread *td, int fd, u_long *cmds, size_t ncmds) { struct filedesc *fdp; u_long *ocmds; int error; AUDIT_ARG_FD(fd); if (ncmds > IOCTLS_MAX_COUNT) { error = EINVAL; goto out_free; } fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { error = EBADF; goto out; } error = cap_ioctl_limit_check(fdp, fd, cmds, ncmds); if (error != 0) goto out; ocmds = fdp->fd_ofiles[fd].fde_ioctls; fdp->fd_ofiles[fd].fde_ioctls = cmds; fdp->fd_ofiles[fd].fde_nioctls = ncmds; cmds = ocmds; error = 0; out: FILEDESC_XUNLOCK(fdp); out_free: free(cmds, M_FILECAPS); return (error); } int sys_cap_ioctls_limit(struct thread *td, struct cap_ioctls_limit_args *uap) { u_long *cmds; size_t ncmds; int error; ncmds = uap->ncmds; if (ncmds > IOCTLS_MAX_COUNT) return (EINVAL); if (ncmds == 0) { cmds = NULL; } else { cmds = malloc(sizeof(cmds[0]) * ncmds, M_FILECAPS, M_WAITOK); error = copyin(uap->cmds, cmds, sizeof(cmds[0]) * ncmds); if (error != 0) { free(cmds, M_FILECAPS); return (error); } } return (kern_cap_ioctls_limit(td, uap->fd, cmds, ncmds)); } int sys_cap_ioctls_get(struct thread *td, struct cap_ioctls_get_args *uap) { struct filedesc *fdp; struct filedescent *fdep; u_long *cmdsp, *dstcmds; size_t maxcmds, ncmds; int16_t count; int error, fd; fd = uap->fd; dstcmds = uap->cmds; maxcmds = uap->maxcmds; AUDIT_ARG_FD(fd); fdp = td->td_proc->p_fd; cmdsp = NULL; if (dstcmds != NULL) { cmdsp = malloc(sizeof(cmdsp[0]) * IOCTLS_MAX_COUNT, M_FILECAPS, M_WAITOK | M_ZERO); } FILEDESC_SLOCK(fdp); fdep = fdeget_locked(fdp, fd); if (fdep == NULL) { error = EBADF; FILEDESC_SUNLOCK(fdp); goto out; } count = fdep->fde_nioctls; if (count != -1 && cmdsp != NULL) { ncmds = MIN(count, maxcmds); memcpy(cmdsp, fdep->fde_ioctls, sizeof(cmdsp[0]) * ncmds); } FILEDESC_SUNLOCK(fdp); /* * If all ioctls are allowed (fde_nioctls == -1 && fde_ioctls == NULL) * the only sane thing we can do is to not populate the given array and * return CAP_IOCTLS_ALL. */ if (count != -1) { if (cmdsp != NULL) { error = copyout(cmdsp, dstcmds, sizeof(cmdsp[0]) * ncmds); if (error != 0) goto out; } td->td_retval[0] = count; } else { td->td_retval[0] = CAP_IOCTLS_ALL; } error = 0; out: free(cmdsp, M_FILECAPS); return (error); } /* * Test whether a capability grants the given fcntl command. 
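The corresponding userland entry point for the ioctl whitelist managed above is cap_ioctls_limit(2); a sketch (FIONREAD chosen arbitrarily):

#include <sys/capsicum.h>
#include <sys/ioctl.h>
#include <err.h>
#include <fcntl.h>

int
main(void)
{
	const unsigned long cmds[] = { FIONREAD };
	int fd;

	fd = open("/dev/null", O_RDONLY);
	if (fd == -1)
		err(1, "open");
	/* Only FIONREAD survives; any other ioctl now gets ENOTCAPABLE. */
	if (cap_ioctls_limit(fd, cmds, 1) == -1)
		err(1, "cap_ioctls_limit");
	return (0);
}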
*/ int cap_fcntl_check_fde(struct filedescent *fde, int cmd) { uint32_t fcntlcap; fcntlcap = (1 << cmd); KASSERT((CAP_FCNTL_ALL & fcntlcap) != 0, ("Unsupported fcntl=%d.", cmd)); if ((fde->fde_fcntls & fcntlcap) != 0) return (0); return (ENOTCAPABLE); } int cap_fcntl_check(struct filedesc *fdp, int fd, int cmd) { KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("%s: invalid fd=%d", __func__, fd)); return (cap_fcntl_check_fde(&fdp->fd_ofiles[fd], cmd)); } int sys_cap_fcntls_limit(struct thread *td, struct cap_fcntls_limit_args *uap) { struct filedesc *fdp; uint32_t fcntlrights; int fd; fd = uap->fd; fcntlrights = uap->fcntlrights; AUDIT_ARG_FD(fd); AUDIT_ARG_FCNTL_RIGHTS(fcntlrights); if ((fcntlrights & ~CAP_FCNTL_ALL) != 0) return (EINVAL); fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } if ((fcntlrights & ~fdp->fd_ofiles[fd].fde_fcntls) != 0) { FILEDESC_XUNLOCK(fdp); return (ENOTCAPABLE); } fdp->fd_ofiles[fd].fde_fcntls = fcntlrights; FILEDESC_XUNLOCK(fdp); return (0); } int sys_cap_fcntls_get(struct thread *td, struct cap_fcntls_get_args *uap) { struct filedesc *fdp; uint32_t rights; int fd; fd = uap->fd; AUDIT_ARG_FD(fd); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { FILEDESC_SUNLOCK(fdp); return (EBADF); } rights = fdp->fd_ofiles[fd].fde_fcntls; FILEDESC_SUNLOCK(fdp); return (copyout(&rights, uap->fcntlrightsp, sizeof(rights))); } #else /* !CAPABILITIES */ /* * Stub Capability functions for when options CAPABILITIES isn't compiled * into the kernel. */ int sys_cap_rights_limit(struct thread *td, struct cap_rights_limit_args *uap) { return (ENOSYS); } int sys___cap_rights_get(struct thread *td, struct __cap_rights_get_args *uap) { return (ENOSYS); } int sys_cap_ioctls_limit(struct thread *td, struct cap_ioctls_limit_args *uap) { return (ENOSYS); } int sys_cap_ioctls_get(struct thread *td, struct cap_ioctls_get_args *uap) { return (ENOSYS); } int sys_cap_fcntls_limit(struct thread *td, struct cap_fcntls_limit_args *uap) { return (ENOSYS); } int sys_cap_fcntls_get(struct thread *td, struct cap_fcntls_get_args *uap) { return (ENOSYS); } #endif /* CAPABILITIES */ Index: head/sys/kern/sys_pipe.c =================================================================== --- head/sys/kern/sys_pipe.c (revision 326270) +++ head/sys/kern/sys_pipe.c (revision 326271) @@ -1,1841 +1,1843 @@ /*- + * SPDX-License-Identifier: BSD-4-Clause + * * Copyright (c) 1996 John S. Dyson * Copyright (c) 2012 Giovanni Trematerra * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. */ /* * This file contains a high-performance replacement for the socket-based * pipes scheme originally used in FreeBSD/4.4Lite. It does not support * all features of sockets, but does do everything that pipes normally * do. 
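For completeness, the fcntl rights checked by cap_fcntl_check_fde() above are limited from userland with cap_fcntls_limit(2); a sketch:

#include <sys/capsicum.h>
#include <err.h>
#include <fcntl.h>

int
main(void)
{
	int fd;

	fd = open("/dev/null", O_RDONLY);
	if (fd == -1)
		err(1, "open");
	/* Permit F_GETFL only; F_SETFL on fd now fails with ENOTCAPABLE. */
	if (cap_fcntls_limit(fd, CAP_FCNTL_GETFL) == -1)
		err(1, "cap_fcntls_limit");
	return (0);
}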
*/ /* * This code has two modes of operation, a small write mode and a large * write mode. The small write mode acts like conventional pipes with * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT * and PIPE_SIZE in size, the sending process pins the underlying pages in * memory, and the receiving process copies directly from these pinned pages * in the sending process. * * If the sending process receives a signal, it is possible that it will * go away, and certainly its address space can change, because control * is returned back to the user-mode side. In that case, the pipe code * arranges to copy the buffer supplied by the user process, to a pageable * kernel buffer, and the receiving process will grab the data from the * pageable kernel buffer. Since signals don't happen all that often, * the copy operation is normally eliminated. * * The constant PIPE_MINDIRECT is chosen to make sure that buffering will * happen for small transfers so that the system will not spend all of * its time context switching. * * In order to limit the resource use of pipes, two sysctls exist: * * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable * address space available to us in pipe_map. This value is normally * autotuned, but may also be loader tuned. * * kern.ipc.pipekva - This read-only sysctl tracks the current amount of * memory in use by pipes. * * Based on how large pipekva is relative to maxpipekva, the following * will happen: * * 0% - 50%: * New pipes are given 16K of memory backing, pipes may dynamically * grow to as large as 64K where needed. * 50% - 75%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes may NOT grow. * 75% - 100%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes will be shrunk down to 4K whenever possible. * * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0. If * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE * resize which MUST occur for reverse-direction pipes when they are * first used. * * Additional information about the current state of pipes may be obtained * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail, * and kern.ipc.piperesizefail. * * Locking rules: There are two locks present here: A mutex, used via * PIPE_LOCK, and a flag, used via pipelock(). All locking is done via * the flag, as mutexes can not persist over uiomove. The mutex * exists only to guard access to the flag, and is not in itself a * locking mechanism. Also note that there is only a single mutex for * both directions of a pipe. * * As pipelock() may have to sleep before it can acquire the flag, it * is important to reread all data after a call to pipelock(); everything * in the structure may have changed. */ #include "opt_compat.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Use this define if you want to disable *fancy* VM things. Expect an * approx 30% decrease in transfer rate. This could be useful for * NetBSD or OpenBSD. */ /* #define PIPE_NODIRECT */ #define PIPE_PEER(pipe) \ (((pipe)->pipe_state & PIPE_NAMED) ? 
(pipe) : ((pipe)->pipe_peer)) /* * interfaces to the outside world */ static fo_rdwr_t pipe_read; static fo_rdwr_t pipe_write; static fo_truncate_t pipe_truncate; static fo_ioctl_t pipe_ioctl; static fo_poll_t pipe_poll; static fo_kqfilter_t pipe_kqfilter; static fo_stat_t pipe_stat; static fo_close_t pipe_close; static fo_chmod_t pipe_chmod; static fo_chown_t pipe_chown; static fo_fill_kinfo_t pipe_fill_kinfo; struct fileops pipeops = { .fo_read = pipe_read, .fo_write = pipe_write, .fo_truncate = pipe_truncate, .fo_ioctl = pipe_ioctl, .fo_poll = pipe_poll, .fo_kqfilter = pipe_kqfilter, .fo_stat = pipe_stat, .fo_close = pipe_close, .fo_chmod = pipe_chmod, .fo_chown = pipe_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = pipe_fill_kinfo, .fo_flags = DFLAG_PASSABLE }; static void filt_pipedetach(struct knote *kn); static void filt_pipedetach_notsup(struct knote *kn); static int filt_pipenotsup(struct knote *kn, long hint); static int filt_piperead(struct knote *kn, long hint); static int filt_pipewrite(struct knote *kn, long hint); static struct filterops pipe_nfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach_notsup, .f_event = filt_pipenotsup }; static struct filterops pipe_rfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_piperead }; static struct filterops pipe_wfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_pipewrite }; /* * Default pipe buffer size(s), this can be kind-of large now because pipe * space is pageable. The pipe code will try to maintain locality of * reference for performance reasons, so small amounts of outstanding I/O * will not wipe the cache. */ #define MINPIPESIZE (PIPE_SIZE/3) #define MAXPIPESIZE (2*PIPE_SIZE/3) static long amountpipekva; static int pipefragretry; static int pipeallocfail; static int piperesizefail; static int piperesizeallowed = 1; SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxpipekva, 0, "Pipe KVA limit"); SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD, &amountpipekva, 0, "Pipe KVA usage"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD, &pipefragretry, 0, "Pipe allocation retries due to fragmentation"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD, &pipeallocfail, 0, "Pipe allocation failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD, &piperesizefail, 0, "Pipe resize failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW, &piperesizeallowed, 0, "Pipe resizing allowed"); static void pipeinit(void *dummy __unused); static void pipeclose(struct pipe *cpipe); static void pipe_free_kmem(struct pipe *cpipe); static void pipe_create(struct pipe *pipe, int backing); static void pipe_paircreate(struct thread *td, struct pipepair **p_pp); static __inline int pipelock(struct pipe *cpipe, int catch); static __inline void pipeunlock(struct pipe *cpipe); #ifndef PIPE_NODIRECT static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio); static void pipe_destroy_write_buffer(struct pipe *wpipe); static int pipe_direct_write(struct pipe *wpipe, struct uio *uio); static void pipe_clone_write_buffer(struct pipe *wpipe); #endif static int pipespace(struct pipe *cpipe, int size); static int pipespace_new(struct pipe *cpipe, int size); static int pipe_zone_ctor(void *mem, int size, void *arg, int flags); static int pipe_zone_init(void *mem, int size, int flags); static void pipe_zone_fini(void *mem, int size); static uma_zone_t pipe_zone; static struct unrhdr *pipeino_unr; static dev_t pipedev_ino; 
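A compressed sketch of the kva sizing policy described in the comment above (the real decisions live in pipespace_new() and pipe_create(), which are outside this hunk; the function and parameter names here are hypothetical):

/* Pick an initial backing size from current pipe kva pressure. */
static int
pick_backing_size(long kva_inuse, long kva_limit)
{
	if (kva_inuse > kva_limit / 2)
		return (SMALL_PIPE_SIZE);	/* above 50%: PAGE_SIZE backing */
	return (PIPE_SIZE);			/* default 16K; may grow to 64K */
}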
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); static void pipeinit(void *dummy __unused) { pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair), pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini, UMA_ALIGN_PTR, 0); KASSERT(pipe_zone != NULL, ("pipe_zone not initialized")); pipeino_unr = new_unrhdr(1, INT32_MAX, NULL); KASSERT(pipeino_unr != NULL, ("pipe fake inodes not initialized")); pipedev_ino = devfs_alloc_cdp_inode(); KASSERT(pipedev_ino > 0, ("pipe dev inode not initialized")); } static int pipe_zone_ctor(void *mem, int size, void *arg, int flags) { struct pipepair *pp; struct pipe *rpipe, *wpipe; KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size")); pp = (struct pipepair *)mem; /* * We zero both pipe endpoints to make sure all the kmem pointers * are NULL, flag fields are zero'd, etc. We timestamp both * endpoints with the same time. */ rpipe = &pp->pp_rpipe; bzero(rpipe, sizeof(*rpipe)); vfs_timestamp(&rpipe->pipe_ctime); rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime; wpipe = &pp->pp_wpipe; bzero(wpipe, sizeof(*wpipe)); wpipe->pipe_ctime = rpipe->pipe_ctime; wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime; rpipe->pipe_peer = wpipe; rpipe->pipe_pair = pp; wpipe->pipe_peer = rpipe; wpipe->pipe_pair = pp; /* * Mark both endpoints as present; they will later get free'd * one at a time. When both are free'd, then the whole pair * is released. */ rpipe->pipe_present = PIPE_ACTIVE; wpipe->pipe_present = PIPE_ACTIVE; /* * Eventually, the MAC Framework may initialize the label * in ctor or init, but for now we do it elsewhere to avoid * blocking in ctor or init. */ pp->pp_label = NULL; return (0); } static int pipe_zone_init(void *mem, int size, int flags) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size")); pp = (struct pipepair *)mem; mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_NEW); return (0); } static void pipe_zone_fini(void *mem, int size) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size")); pp = (struct pipepair *)mem; mtx_destroy(&pp->pp_mtx); } static void pipe_paircreate(struct thread *td, struct pipepair **p_pp) { struct pipepair *pp; struct pipe *rpipe, *wpipe; *p_pp = pp = uma_zalloc(pipe_zone, M_WAITOK); #ifdef MAC /* * The MAC label is shared between the connected endpoints. As a * result mac_pipe_init() and mac_pipe_create() are called once * for the pair, and not on the endpoints. */ mac_pipe_init(pp); mac_pipe_create(td->td_ucred, pp); #endif rpipe = &pp->pp_rpipe; wpipe = &pp->pp_wpipe; knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe)); knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe)); /* Only the forward direction pipe is backed by default */ pipe_create(rpipe, 1); pipe_create(wpipe, 0); rpipe->pipe_state |= PIPE_DIRECTOK; wpipe->pipe_state |= PIPE_DIRECTOK; } void pipe_named_ctor(struct pipe **ppipe, struct thread *td) { struct pipepair *pp; pipe_paircreate(td, &pp); pp->pp_rpipe.pipe_state |= PIPE_NAMED; *ppipe = &pp->pp_rpipe; } void pipe_dtor(struct pipe *dpipe) { struct pipe *peer; ino_t ino; ino = dpipe->pipe_ino; peer = (dpipe->pipe_state & PIPE_NAMED) != 0 ? dpipe->pipe_peer : NULL; funsetown(&dpipe->pipe_sigio); pipeclose(dpipe); if (peer != NULL) { funsetown(&peer->pipe_sigio); pipeclose(peer); } if (ino != 0 && ino != (ino_t)-1) free_unr(pipeino_unr, ino); } /* * The pipe system call for the DTYPE_PIPE type of pipes. If we fail, let * the zone pick up the pieces via pipeclose().
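 *
 * From userland this path is reached through pipe(2) and pipe2(2); a
 * minimal sketch, assuming the documented pipe2(2) interface:
 *
 *	int fds[2];
 *
 *	if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) == -1)
 *		err(1, "pipe2");
 *
 * (fds is an illustrative name; <unistd.h>, <fcntl.h> and <err.h>
 * supply the declarations.) Flags other than O_CLOEXEC and O_NONBLOCK
 * are rejected with EINVAL by sys_pipe2() below.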
*/ int kern_pipe(struct thread *td, int fildes[2], int flags, struct filecaps *fcaps1, struct filecaps *fcaps2) { struct file *rf, *wf; struct pipe *rpipe, *wpipe; struct pipepair *pp; int fd, fflags, error; pipe_paircreate(td, &pp); rpipe = &pp->pp_rpipe; wpipe = &pp->pp_wpipe; error = falloc_caps(td, &rf, &fd, flags, fcaps1); if (error) { pipeclose(rpipe); pipeclose(wpipe); return (error); } /* An extra reference on `rf' has been held for us by falloc_caps(). */ fildes[0] = fd; fflags = FREAD | FWRITE; if ((flags & O_NONBLOCK) != 0) fflags |= FNONBLOCK; /* * Warning: once we've gotten past allocation of the fd for the * read-side, we can only drop the read side via fdrop() in order * to avoid races against processes which manage to dup() the read * side while we are blocked trying to allocate the write side. */ finit(rf, fflags, DTYPE_PIPE, rpipe, &pipeops); error = falloc_caps(td, &wf, &fd, flags, fcaps2); if (error) { fdclose(td, rf, fildes[0]); fdrop(rf, td); /* rpipe has been closed by fdrop(). */ pipeclose(wpipe); return (error); } /* An extra reference on `wf' has been held for us by falloc_caps(). */ finit(wf, fflags, DTYPE_PIPE, wpipe, &pipeops); fdrop(wf, td); fildes[1] = fd; fdrop(rf, td); return (0); } #ifdef COMPAT_FREEBSD10 /* ARGSUSED */ int freebsd10_pipe(struct thread *td, struct freebsd10_pipe_args *uap __unused) { int error; int fildes[2]; error = kern_pipe(td, fildes, 0, NULL, NULL); if (error) return (error); td->td_retval[0] = fildes[0]; td->td_retval[1] = fildes[1]; return (0); } #endif int sys_pipe2(struct thread *td, struct pipe2_args *uap) { int error, fildes[2]; if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK)) return (EINVAL); error = kern_pipe(td, fildes, uap->flags, NULL, NULL); if (error) return (error); error = copyout(fildes, uap->fildes, 2 * sizeof(int)); if (error) { (void)kern_close(td, fildes[0]); (void)kern_close(td, fildes[1]); } return (error); } /* * Allocate kva for pipe circular buffer, the space is pageable * This routine will 'realloc' the size of a pipe safely, if it fails * it will retain the old buffer. * If it fails it will return ENOMEM. 
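 *
 * The resize sites below rely on that guarantee: pipe_read() and
 * pipe_write() simply ignore a failed resize, e.g.
 *
 *	(void)pipespace(rpipe, SMALL_PIPE_SIZE);
 *
 * since on ENOMEM the old buffer, and any data queued in it, are left
 * untouched.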
*/ static int pipespace_new(cpipe, size) struct pipe *cpipe; int size; { caddr_t buffer; int error, cnt, firstseg; static int curfail = 0; static struct timeval lastfail; KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked")); KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW), ("pipespace: resize of direct writes not allowed")); retry: cnt = cpipe->pipe_buffer.cnt; if (cnt > size) size = cnt; size = round_page(size); buffer = (caddr_t) vm_map_min(pipe_map); error = vm_map_find(pipe_map, NULL, 0, (vm_offset_t *) &buffer, size, 0, VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error != KERN_SUCCESS) { if ((cpipe->pipe_buffer.buffer == NULL) && (size > SMALL_PIPE_SIZE)) { size = SMALL_PIPE_SIZE; pipefragretry++; goto retry; } if (cpipe->pipe_buffer.buffer == NULL) { pipeallocfail++; if (ppsratecheck(&lastfail, &curfail, 1)) printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n"); } else { piperesizefail++; } return (ENOMEM); } /* copy data, then free old resources if we're resizing */ if (cnt > 0) { if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) { firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out; bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, firstseg); if ((cnt - firstseg) > 0) bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg], cpipe->pipe_buffer.in); } else { bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, cnt); } } pipe_free_kmem(cpipe); cpipe->pipe_buffer.buffer = buffer; cpipe->pipe_buffer.size = size; cpipe->pipe_buffer.in = cnt; cpipe->pipe_buffer.out = 0; cpipe->pipe_buffer.cnt = cnt; atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size); return (0); } /* * Wrapper for pipespace_new() that performs locking assertions. */ static int pipespace(cpipe, size) struct pipe *cpipe; int size; { KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipespace")); return (pipespace_new(cpipe, size)); } /* * lock a pipe for I/O, blocking other access */ static __inline int pipelock(cpipe, catch) struct pipe *cpipe; int catch; { int error; PIPE_LOCK_ASSERT(cpipe, MA_OWNED); while (cpipe->pipe_state & PIPE_LOCKFL) { cpipe->pipe_state |= PIPE_LWANT; error = msleep(cpipe, PIPE_MTX(cpipe), catch ? (PRIBIO | PCATCH) : PRIBIO, "pipelk", 0); if (error != 0) return (error); } cpipe->pipe_state |= PIPE_LOCKFL; return (0); } /* * unlock a pipe I/O lock */ static __inline void pipeunlock(cpipe) struct pipe *cpipe; { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipeunlock")); cpipe->pipe_state &= ~PIPE_LOCKFL; if (cpipe->pipe_state & PIPE_LWANT) { cpipe->pipe_state &= ~PIPE_LWANT; wakeup(cpipe); } } void pipeselwakeup(cpipe) struct pipe *cpipe; { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); if (cpipe->pipe_state & PIPE_SEL) { selwakeuppri(&cpipe->pipe_sel, PSOCK); if (!SEL_WAITING(&cpipe->pipe_sel)) cpipe->pipe_state &= ~PIPE_SEL; } if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) pgsigio(&cpipe->pipe_sigio, SIGIO, 0); KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0); } /* * Initialize and allocate VM and memory for pipe. The structure * will start out zero'd from the ctor, so we just manage the kmem. */ static void pipe_create(pipe, backing) struct pipe *pipe; int backing; { if (backing) { /* * Note that these functions can fail if pipe map is exhausted * (as a result of too many pipes created), but we ignore the * error as it is not fatal and could be provoked by * unprivileged users. The only consequence is worse performance * with given pipe. 
*/ if (amountpipekva > maxpipekva / 2) (void)pipespace_new(pipe, SMALL_PIPE_SIZE); else (void)pipespace_new(pipe, PIPE_SIZE); } pipe->pipe_ino = -1; } /* ARGSUSED */ static int pipe_read(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; struct thread *td; int flags; { struct pipe *rpipe; int error; int nread = 0; int size; rpipe = fp->f_data; PIPE_LOCK(rpipe); ++rpipe->pipe_busy; error = pipelock(rpipe, 1); if (error) goto unlocked_error; #ifdef MAC error = mac_pipe_check_read(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (amountpipekva > (3 * maxpipekva) / 4) { if (!(rpipe->pipe_state & PIPE_DIRECTW) && (rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) && (rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) && (piperesizeallowed == 1)) { PIPE_UNLOCK(rpipe); pipespace(rpipe, SMALL_PIPE_SIZE); PIPE_LOCK(rpipe); } } while (uio->uio_resid) { /* * normal pipe buffer receive */ if (rpipe->pipe_buffer.cnt > 0) { size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; if (size > rpipe->pipe_buffer.cnt) size = rpipe->pipe_buffer.cnt; if (size > uio->uio_resid) size = uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], size, uio); PIPE_LOCK(rpipe); if (error) break; rpipe->pipe_buffer.out += size; if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) rpipe->pipe_buffer.out = 0; rpipe->pipe_buffer.cnt -= size; /* * If there is no more to read in the pipe, reset * its pointers to the beginning. This improves * cache hit stats. */ if (rpipe->pipe_buffer.cnt == 0) { rpipe->pipe_buffer.in = 0; rpipe->pipe_buffer.out = 0; } nread += size; #ifndef PIPE_NODIRECT /* * Direct copy, bypassing a kernel buffer. */ } else if ((size = rpipe->pipe_map.cnt) && (rpipe->pipe_state & PIPE_DIRECTW)) { if (size > uio->uio_resid) size = (u_int) uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove_fromphys(rpipe->pipe_map.ms, rpipe->pipe_map.pos, size, uio); PIPE_LOCK(rpipe); if (error) break; nread += size; rpipe->pipe_map.pos += size; rpipe->pipe_map.cnt -= size; if (rpipe->pipe_map.cnt == 0) { rpipe->pipe_state &= ~(PIPE_DIRECTW|PIPE_WANTW); wakeup(rpipe); } #endif } else { /* * detect EOF condition * read returns 0 on EOF, no need to set error */ if (rpipe->pipe_state & PIPE_EOF) break; /* * If the "write-side" has been blocked, wake it up now. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } /* * Break if some data was read. */ if (nread > 0) break; /* * Unlock the pipe buffer for our remaining processing. * We will either break out with an error or we will * sleep and relock to loop. */ pipeunlock(rpipe); /* * Handle non-blocking mode operation or * wait for more data. */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; } else { rpipe->pipe_state |= PIPE_WANTR; if ((error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "piperd", 0)) == 0) error = pipelock(rpipe, 1); } if (error) goto unlocked_error; } } #ifdef MAC locked_error: #endif pipeunlock(rpipe); /* XXX: should probably do this before getting any locks. */ if (error == 0) vfs_timestamp(&rpipe->pipe_atime); unlocked_error: --rpipe->pipe_busy; /* * PIPE_WANT processing only makes sense if pipe_busy is 0. */ if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); wakeup(rpipe); } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { /* * Handle write blocking hysteresis. 
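 *
 * That is, a sleeping writer is only woken once the buffer has drained
 * below MINPIPESIZE (one third of PIPE_SIZE), rather than after every
 * read; e.g. with a PIPE_SIZE of 16384 the wakeup fires once fewer
 * than 5461 bytes remain queued, which avoids ping-ponging between
 * reader and writer.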
*/ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } } if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) pipeselwakeup(rpipe); PIPE_UNLOCK(rpipe); return (error); } #ifndef PIPE_NODIRECT /* * Map the sending processes' buffer into kernel space and wire it. * This is similar to a physical write operation. */ static int pipe_build_write_buffer(wpipe, uio) struct pipe *wpipe; struct uio *uio; { u_int size; int i; PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); KASSERT(wpipe->pipe_state & PIPE_DIRECTW, ("Clone attempt on non-direct write pipe!")); if (uio->uio_iov->iov_len > wpipe->pipe_buffer.size) size = wpipe->pipe_buffer.size; else size = uio->uio_iov->iov_len; if ((i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ, wpipe->pipe_map.ms, PIPENPAGES)) < 0) return (EFAULT); /* * set up the control block */ wpipe->pipe_map.npages = i; wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; wpipe->pipe_map.cnt = size; /* * and update the uio data */ uio->uio_iov->iov_len -= size; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size; if (uio->uio_iov->iov_len == 0) uio->uio_iov++; uio->uio_resid -= size; uio->uio_offset += size; return (0); } /* * unmap and unwire the process buffer */ static void pipe_destroy_write_buffer(wpipe) struct pipe *wpipe; { PIPE_LOCK_ASSERT(wpipe, MA_OWNED); vm_page_unhold_pages(wpipe->pipe_map.ms, wpipe->pipe_map.npages); wpipe->pipe_map.npages = 0; } /* * In the case of a signal, the writing process might go away. This * code copies the data into the circular buffer so that the source * pages can be freed without loss of data. */ static void pipe_clone_write_buffer(wpipe) struct pipe *wpipe; { struct uio uio; struct iovec iov; int size; int pos; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); size = wpipe->pipe_map.cnt; pos = wpipe->pipe_map.pos; wpipe->pipe_buffer.in = size; wpipe->pipe_buffer.out = 0; wpipe->pipe_buffer.cnt = size; wpipe->pipe_state &= ~PIPE_DIRECTW; PIPE_UNLOCK(wpipe); iov.iov_base = wpipe->pipe_buffer.buffer; iov.iov_len = size; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = size; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = curthread; uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio); PIPE_LOCK(wpipe); pipe_destroy_write_buffer(wpipe); } /* * This implements the pipe buffer write mechanism. Note that only * a direct write OR a normal pipe write can be pending at any given time. * If there are any characters in the pipe buffer, the direct write will * be deferred until the receiving process grabs all of the bytes from * the pipe buffer. Then the direct mapping write is set-up. 
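 *
 * For illustration, a single blocking write(2) of at least
 * PIPE_MINDIRECT bytes is what selects this path from pipe_write():
 *
 *	char big[64 * 1024];	(illustrative buffer)
 *
 *	write(fds[1], big, sizeof(big));
 *
 * Smaller writes and writes on non-blocking descriptors take the
 * buffered path instead; if the pipe buffer still holds data, the
 * direct write is deferred until the reader drains it, as described
 * above.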
*/ static int pipe_direct_write(wpipe, uio) struct pipe *wpipe; struct uio *uio; { int error; retry: PIPE_LOCK_ASSERT(wpipe, MA_OWNED); error = pipelock(wpipe, 1); if (error != 0) goto error1; if ((wpipe->pipe_state & PIPE_EOF) != 0) { error = EPIPE; pipeunlock(wpipe); goto error1; } while (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdww", 0); if (error) goto error1; else goto retry; } wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ if (wpipe->pipe_buffer.cnt > 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwc", 0); if (error) goto error1; else goto retry; } wpipe->pipe_state |= PIPE_DIRECTW; PIPE_UNLOCK(wpipe); error = pipe_build_write_buffer(wpipe, uio); PIPE_LOCK(wpipe); if (error) { wpipe->pipe_state &= ~PIPE_DIRECTW; pipeunlock(wpipe); goto error1; } error = 0; while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { if (wpipe->pipe_state & PIPE_EOF) { pipe_destroy_write_buffer(wpipe); pipeselwakeup(wpipe); pipeunlock(wpipe); error = EPIPE; goto error1; } if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwt", 0); pipelock(wpipe, 0); } if (wpipe->pipe_state & PIPE_EOF) error = EPIPE; if (wpipe->pipe_state & PIPE_DIRECTW) { /* * this bit of trickery substitutes a kernel buffer for * the process that might be going away. */ pipe_clone_write_buffer(wpipe); } else { pipe_destroy_write_buffer(wpipe); } pipeunlock(wpipe); return (error); error1: wakeup(wpipe); return (error); } #endif static int pipe_write(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; struct thread *td; int flags; { int error = 0; int desiredsize; ssize_t orig_resid; struct pipe *wpipe, *rpipe; rpipe = fp->f_data; wpipe = PIPE_PEER(rpipe); PIPE_LOCK(rpipe); error = pipelock(wpipe, 1); if (error) { PIPE_UNLOCK(rpipe); return (error); } /* * detect loss of pipe read side, issue SIGPIPE if lost. 
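 *
 * The EPIPE returned here is converted into SIGPIPE delivery by the
 * generic write path; a writer that prefers the errno can ignore the
 * signal, e.g. (illustrative sketch):
 *
 *	signal(SIGPIPE, SIG_IGN);
 *	if (write(fds[1], "x", 1) == -1 && errno == EPIPE)
 *		...	(reader side has gone away)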
*/ if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (EPIPE); } #ifdef MAC error = mac_pipe_check_write(active_cred, wpipe->pipe_pair); if (error) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (error); } #endif ++wpipe->pipe_busy; /* Choose a larger size if it's advantageous */ desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size); while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) { if (piperesizeallowed != 1) break; if (amountpipekva > maxpipekva / 2) break; if (desiredsize == BIG_PIPE_SIZE) break; desiredsize = desiredsize * 2; } /* Choose a smaller size if we're in a OOM situation */ if ((amountpipekva > (3 * maxpipekva) / 4) && (wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) && (wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) && (piperesizeallowed == 1)) desiredsize = SMALL_PIPE_SIZE; /* Resize if the above determined that a new size was necessary */ if ((desiredsize != wpipe->pipe_buffer.size) && ((wpipe->pipe_state & PIPE_DIRECTW) == 0)) { PIPE_UNLOCK(wpipe); pipespace(wpipe, desiredsize); PIPE_LOCK(wpipe); } if (wpipe->pipe_buffer.size == 0) { /* * This can only happen for reverse direction use of pipes * in a complete OOM situation. */ error = ENOMEM; --wpipe->pipe_busy; pipeunlock(wpipe); PIPE_UNLOCK(wpipe); return (error); } pipeunlock(wpipe); orig_resid = uio->uio_resid; while (uio->uio_resid) { int space; pipelock(wpipe, 0); if (wpipe->pipe_state & PIPE_EOF) { pipeunlock(wpipe); error = EPIPE; break; } #ifndef PIPE_NODIRECT /* * If the transfer is large, we can gain performance if * we do process-to-process copies directly. * If the write is non-blocking, we don't use the * direct write mechanism. * * The direct write mechanism will detect the reader going * away on us. */ if (uio->uio_segflg == UIO_USERSPACE && uio->uio_iov->iov_len >= PIPE_MINDIRECT && wpipe->pipe_buffer.size >= PIPE_MINDIRECT && (fp->f_flag & FNONBLOCK) == 0) { pipeunlock(wpipe); error = pipe_direct_write(wpipe, uio); if (error) break; continue; } #endif /* * Pipe buffered writes cannot be coincidental with * direct writes. We wait until the currently executing * direct write is completed before we start filling the * pipe buffer. We break out if a signal occurs or the * reader goes away. */ if (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipbww", 0); if (error) break; else continue; } space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; /* Writes of size <= PIPE_BUF must be atomic. */ if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) space = 0; if (space > 0) { int size; /* Transfer size */ int segsize; /* first segment to transfer */ /* * Transfer size is minimum of uio transfer * and free space in pipe buffer. */ if (space > uio->uio_resid) size = uio->uio_resid; else size = space; /* * First segment to transfer is minimum of * transfer size and contiguous space in * pipe buffer. If first segment to transfer * is less than the transfer size, we've got * a wraparound in the buffer. 
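 *
 * Worked example: with a 16384-byte buffer, in == 12288 and 8192
 * bytes to move, segsize is 16384 - 12288 = 4096; the remaining
 * 8192 - 4096 = 4096 bytes are then copied to offset 0, and .in
 * ends up at 4096.
 */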
*/ segsize = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; if (segsize > size) segsize = size; /* Transfer first segment */ PIPE_UNLOCK(rpipe); error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], segsize, uio); PIPE_LOCK(rpipe); if (error == 0 && segsize < size) { KASSERT(wpipe->pipe_buffer.in + segsize == wpipe->pipe_buffer.size, ("Pipe buffer wraparound disappeared")); /* * Transfer remaining part now, to * support atomic writes. Wraparound * happened. */ PIPE_UNLOCK(rpipe); error = uiomove( &wpipe->pipe_buffer.buffer[0], size - segsize, uio); PIPE_LOCK(rpipe); } if (error == 0) { wpipe->pipe_buffer.in += size; if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) { KASSERT(wpipe->pipe_buffer.in == size - segsize + wpipe->pipe_buffer.size, ("Expected wraparound bad")); wpipe->pipe_buffer.in = size - segsize; } wpipe->pipe_buffer.cnt += size; KASSERT(wpipe->pipe_buffer.cnt <= wpipe->pipe_buffer.size, ("Pipe buffer overflow")); } pipeunlock(wpipe); if (error != 0) break; } else { /* * If the "read-side" has been blocked, wake it up now. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } /* * don't block on non-blocking I/O */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; pipeunlock(wpipe); break; } /* * We have no more space and have something to offer, * wake up select/poll. */ pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipewr", 0); if (error != 0) break; } } pipelock(wpipe, 0); --wpipe->pipe_busy; if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); wakeup(wpipe); } else if (wpipe->pipe_buffer.cnt > 0) { /* * If we have put any characters in the buffer, we wake up * the reader. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } } /* * Don't return EPIPE if any byte was written. * EINTR and other interrupts are handled by generic I/O layer. * Do not pretend that I/O succeeded for obvious user error * like EFAULT. */ if (uio->uio_resid != orig_resid && error == EPIPE) error = 0; if (error == 0) vfs_timestamp(&wpipe->pipe_mtime); /* * We have something to offer, * wake up select/poll. */ if (wpipe->pipe_buffer.cnt) pipeselwakeup(wpipe); pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (error); } /* ARGSUSED */ static int pipe_truncate(fp, length, active_cred, td) struct file *fp; off_t length; struct ucred *active_cred; struct thread *td; { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_state & PIPE_NAMED) error = vnops.fo_truncate(fp, length, active_cred, td); else error = invfo_truncate(fp, length, active_cred, td); return (error); } /* * we implement a very minimal set of ioctls for compatibility with sockets. 
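 *
 * E.g. FIONREAD mirrors its socket semantics, reporting the bytes
 * currently readable (a userland sketch; ioctl(2) and <sys/filio.h>
 * declare what is needed):
 *
 *	int avail;
 *
 *	if (ioctl(fds[0], FIONREAD, &avail) == 0)
 *		printf("%d bytes queued\n", avail);
 *
 * As the code below shows, a write-only descriptor reports 0.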
*/ static int pipe_ioctl(fp, cmd, data, active_cred, td) struct file *fp; u_long cmd; void *data; struct ucred *active_cred; struct thread *td; { struct pipe *mpipe = fp->f_data; int error; PIPE_LOCK(mpipe); #ifdef MAC error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data); if (error) { PIPE_UNLOCK(mpipe); return (error); } #endif error = 0; switch (cmd) { case FIONBIO: break; case FIOASYNC: if (*(int *)data) { mpipe->pipe_state |= PIPE_ASYNC; } else { mpipe->pipe_state &= ~PIPE_ASYNC; } break; case FIONREAD: if (!(fp->f_flag & FREAD)) { *(int *)data = 0; PIPE_UNLOCK(mpipe); return (0); } if (mpipe->pipe_state & PIPE_DIRECTW) *(int *)data = mpipe->pipe_map.cnt; else *(int *)data = mpipe->pipe_buffer.cnt; break; case FIOSETOWN: PIPE_UNLOCK(mpipe); error = fsetown(*(int *)data, &mpipe->pipe_sigio); goto out_unlocked; case FIOGETOWN: *(int *)data = fgetown(&mpipe->pipe_sigio); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: PIPE_UNLOCK(mpipe); error = fsetown(-(*(int *)data), &mpipe->pipe_sigio); goto out_unlocked; /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)data = -fgetown(&mpipe->pipe_sigio); break; default: error = ENOTTY; break; } PIPE_UNLOCK(mpipe); out_unlocked: return (error); } static int pipe_poll(fp, events, active_cred, td) struct file *fp; int events; struct ucred *active_cred; struct thread *td; { struct pipe *rpipe; struct pipe *wpipe; int levents, revents; #ifdef MAC int error; #endif revents = 0; rpipe = fp->f_data; wpipe = PIPE_PEER(rpipe); PIPE_LOCK(rpipe); #ifdef MAC error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM)) if ((rpipe->pipe_state & PIPE_DIRECTW) || (rpipe->pipe_buffer.cnt > 0)) revents |= events & (POLLIN | POLLRDNORM); if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM)) if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF) || (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF || wpipe->pipe_buffer.size == 0))) revents |= events & (POLLOUT | POLLWRNORM); levents = events & (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND); if (rpipe->pipe_state & PIPE_NAMED && fp->f_flag & FREAD && levents && fp->f_seqcount == rpipe->pipe_wgen) events |= POLLINIGNEOF; if ((events & POLLINIGNEOF) == 0) { if (rpipe->pipe_state & PIPE_EOF) { revents |= (events & (POLLIN | POLLRDNORM)); if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) revents |= POLLHUP; } } if (revents == 0) { if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM)) { selrecord(td, &rpipe->pipe_sel); if (SEL_WAITING(&rpipe->pipe_sel)) rpipe->pipe_state |= PIPE_SEL; } if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM)) { selrecord(td, &wpipe->pipe_sel); if (SEL_WAITING(&wpipe->pipe_sel)) wpipe->pipe_state |= PIPE_SEL; } } #ifdef MAC locked_error: #endif PIPE_UNLOCK(rpipe); return (revents); } /* * We shouldn't need locks here as we're doing a read and this should * be a natural race. */ static int pipe_stat(fp, ub, active_cred, td) struct file *fp; struct stat *ub; struct ucred *active_cred; struct thread *td; { struct pipe *pipe; int new_unr; #ifdef MAC int error; #endif pipe = fp->f_data; PIPE_LOCK(pipe); #ifdef MAC error = mac_pipe_check_stat(active_cred, pipe->pipe_pair); if (error) { PIPE_UNLOCK(pipe); return (error); } #endif /* For named pipes ask the underlying filesystem. 
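 *
 * For anonymous pipes the fields are filled in below, so fstat(2)
 * reports S_IFIFO with st_size equal to the unread byte count, e.g.:
 *
 *	struct stat sb;
 *
 *	if (fstat(fds[0], &sb) == 0 && S_ISFIFO(sb.st_mode))
 *		printf("%jd bytes unread\n", (intmax_t)sb.st_size);
 */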
if (pipe->pipe_state & PIPE_NAMED) { PIPE_UNLOCK(pipe); return (vnops.fo_stat(fp, ub, active_cred, td)); } /* * Lazily allocate an inode number for the pipe. Most pipe * users do not call fstat(2) on the pipe, which means that * postponing the inode allocation until it must be * returned to userland is useful. If alloc_unr failed, * assign st_ino zero instead of returning an error. * Special pipe_ino values: * -1 - not yet initialized; * 0 - alloc_unr failed, return 0 as st_ino forever. */ if (pipe->pipe_ino == (ino_t)-1) { new_unr = alloc_unr(pipeino_unr); if (new_unr != -1) pipe->pipe_ino = new_unr; else pipe->pipe_ino = 0; } PIPE_UNLOCK(pipe); bzero(ub, sizeof(*ub)); ub->st_mode = S_IFIFO; ub->st_blksize = PAGE_SIZE; if (pipe->pipe_state & PIPE_DIRECTW) ub->st_size = pipe->pipe_map.cnt; else ub->st_size = pipe->pipe_buffer.cnt; ub->st_blocks = howmany(ub->st_size, ub->st_blksize); ub->st_atim = pipe->pipe_atime; ub->st_mtim = pipe->pipe_mtime; ub->st_ctim = pipe->pipe_ctime; ub->st_uid = fp->f_cred->cr_uid; ub->st_gid = fp->f_cred->cr_gid; ub->st_dev = pipedev_ino; ub->st_ino = pipe->pipe_ino; /* * Left as 0: st_nlink, st_rdev, st_flags, st_gen. */ return (0); } /* ARGSUSED */ static int pipe_close(fp, td) struct file *fp; struct thread *td; { if (fp->f_vnode != NULL) return vnops.fo_close(fp, td); fp->f_ops = &badfileops; pipe_dtor(fp->f_data); fp->f_data = NULL; return (0); } static int pipe_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_state & PIPE_NAMED) error = vn_chmod(fp, mode, active_cred, td); else error = invfo_chmod(fp, mode, active_cred, td); return (error); } static int pipe_chown(fp, uid, gid, active_cred, td) struct file *fp; uid_t uid; gid_t gid; struct ucred *active_cred; struct thread *td; { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_state & PIPE_NAMED) error = vn_chown(fp, uid, gid, active_cred, td); else error = invfo_chown(fp, uid, gid, active_cred, td); return (error); } static int pipe_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct pipe *pi; if (fp->f_type == DTYPE_FIFO) return (vn_fill_kinfo(fp, kif, fdp)); kif->kf_type = KF_TYPE_PIPE; pi = fp->f_data; kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi; kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer; kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt; return (0); } static void pipe_free_kmem(cpipe) struct pipe *cpipe; { KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipe_free_kmem: pipe mutex locked")); if (cpipe->pipe_buffer.buffer != NULL) { atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size); vm_map_remove(pipe_map, (vm_offset_t)cpipe->pipe_buffer.buffer, (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size); cpipe->pipe_buffer.buffer = NULL; } #ifndef PIPE_NODIRECT { cpipe->pipe_map.cnt = 0; cpipe->pipe_map.pos = 0; cpipe->pipe_map.npages = 0; } #endif } /* * shutdown the pipe */ static void pipeclose(cpipe) struct pipe *cpipe; { struct pipepair *pp; struct pipe *ppipe; KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL")); PIPE_LOCK(cpipe); pipelock(cpipe, 0); pp = cpipe->pipe_pair; pipeselwakeup(cpipe); /* * If the other side is blocked, wake it up saying that * we want to close it down. */
*/ cpipe->pipe_state |= PIPE_EOF; while (cpipe->pipe_busy) { wakeup(cpipe); cpipe->pipe_state |= PIPE_WANT; pipeunlock(cpipe); msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0); pipelock(cpipe, 0); } /* * Disconnect from peer, if any. */ ppipe = cpipe->pipe_peer; if (ppipe->pipe_present == PIPE_ACTIVE) { pipeselwakeup(ppipe); ppipe->pipe_state |= PIPE_EOF; wakeup(ppipe); KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0); } /* * Mark this endpoint as free. Release kmem resources. We * don't mark this endpoint as unused until we've finished * doing that, or the pipe might disappear out from under * us. */ PIPE_UNLOCK(cpipe); pipe_free_kmem(cpipe); PIPE_LOCK(cpipe); cpipe->pipe_present = PIPE_CLOSING; pipeunlock(cpipe); /* * knlist_clear() may sleep dropping the PIPE_MTX. Set the * PIPE_FINALIZED, that allows other end to free the * pipe_pair, only after the knotes are completely dismantled. */ knlist_clear(&cpipe->pipe_sel.si_note, 1); cpipe->pipe_present = PIPE_FINALIZED; seldrain(&cpipe->pipe_sel); knlist_destroy(&cpipe->pipe_sel.si_note); /* * If both endpoints are now closed, release the memory for the * pipe pair. If not, unlock. */ if (ppipe->pipe_present == PIPE_FINALIZED) { PIPE_UNLOCK(cpipe); #ifdef MAC mac_pipe_destroy(pp); #endif uma_zfree(pipe_zone, cpipe->pipe_pair); } else PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int pipe_kqfilter(struct file *fp, struct knote *kn) { struct pipe *cpipe; /* * If a filter is requested that is not supported by this file * descriptor, don't return an error, but also don't ever generate an * event. */ if ((kn->kn_filter == EVFILT_READ) && !(fp->f_flag & FREAD)) { kn->kn_fop = &pipe_nfiltops; return (0); } if ((kn->kn_filter == EVFILT_WRITE) && !(fp->f_flag & FWRITE)) { kn->kn_fop = &pipe_nfiltops; return (0); } cpipe = fp->f_data; PIPE_LOCK(cpipe); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &pipe_rfiltops; break; case EVFILT_WRITE: kn->kn_fop = &pipe_wfiltops; if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) { /* other end of pipe has been closed */ PIPE_UNLOCK(cpipe); return (EPIPE); } cpipe = PIPE_PEER(cpipe); break; default: PIPE_UNLOCK(cpipe); return (EINVAL); } kn->kn_hook = cpipe; knlist_add(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); return (0); } static void filt_pipedetach(struct knote *kn) { struct pipe *cpipe = kn->kn_hook; PIPE_LOCK(cpipe); knlist_remove(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int filt_piperead(struct knote *kn, long hint) { struct pipe *rpipe = kn->kn_hook; struct pipe *wpipe = rpipe->pipe_peer; int ret; PIPE_LOCK_ASSERT(rpipe, MA_OWNED); kn->kn_data = rpipe->pipe_buffer.cnt; if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) kn->kn_data = rpipe->pipe_map.cnt; if ((rpipe->pipe_state & PIPE_EOF) || wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_flags |= EV_EOF; return (1); } ret = kn->kn_data > 0; return ret; } /*ARGSUSED*/ static int filt_pipewrite(struct knote *kn, long hint) { struct pipe *wpipe; wpipe = kn->kn_hook; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_data = 0; kn->kn_flags |= EV_EOF; return (1); } kn->kn_data = (wpipe->pipe_buffer.size > 0) ? 
(wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) : PIPE_BUF; if (wpipe->pipe_state & PIPE_DIRECTW) kn->kn_data = 0; return (kn->kn_data >= PIPE_BUF); } static void filt_pipedetach_notsup(struct knote *kn) { } static int filt_pipenotsup(struct knote *kn, long hint) { return (0); } Index: head/sys/kern/sys_procdesc.c =================================================================== --- head/sys/kern/sys_procdesc.c (revision 326270) +++ head/sys/kern/sys_procdesc.c (revision 326271) @@ -1,571 +1,573 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2009, 2016 Robert N. M. Watson * All rights reserved. * * This software was developed at the University of Cambridge Computer * Laboratory with support from a grant from Google, Inc. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * FreeBSD process descriptor facility. * * Some processes are represented by a file descriptor, which will be used in * preference to signaling and pids for the purposes of process management, * and is, in effect, a form of capability. When a process descriptor is * used with a process, it ceases to be visible to certain traditional UNIX * process facilities, such as waitpid(2). * * Some semantics: * * - At most one process descriptor will exist for any process, although * references to that descriptor may be held from many processes (or even * be in flight between processes over a local domain socket). * - Last close on the process descriptor will terminate the process using * SIGKILL and reparent it to init so that there's a process to reap it * when it's done exiting. * - If the process exits before the descriptor is closed, it will not * generate SIGCHLD on termination, or be picked up by waitpid(). * - The pdkill(2) system call may be used to deliver a signal to the process * using its process descriptor. * - The pdwait4(2) system call may be used to block (or not) on a process * descriptor to collect termination information. * * Open questions: * * - How to handle ptrace(2)? 
* - Will we want to add a pidtoprocdesc(2) system call to allow process * descriptors to be created for processes without pdfork(2)? */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(process_descriptors, "Process Descriptors"); static uma_zone_t procdesc_zone; static fo_poll_t procdesc_poll; static fo_kqfilter_t procdesc_kqfilter; static fo_stat_t procdesc_stat; static fo_close_t procdesc_close; static fo_fill_kinfo_t procdesc_fill_kinfo; static struct fileops procdesc_ops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = procdesc_poll, .fo_kqfilter = procdesc_kqfilter, .fo_stat = procdesc_stat, .fo_close = procdesc_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = procdesc_fill_kinfo, .fo_flags = DFLAG_PASSABLE, }; /* * Initialize with VFS so that process descriptors are available along with * other file descriptor types. As long as it runs before init(8) starts, * there shouldn't be a problem. */ static void procdesc_init(void *dummy __unused) { procdesc_zone = uma_zcreate("procdesc", sizeof(struct procdesc), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); if (procdesc_zone == NULL) panic("procdesc_init: procdesc_zone not initialized"); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, procdesc_init, NULL); /* * Return a locked process given a process descriptor, or ESRCH if it has * died. */ int procdesc_find(struct thread *td, int fd, cap_rights_t *rightsp, struct proc **p) { struct procdesc *pd; struct file *fp; int error; error = fget(td, fd, rightsp, &fp); if (error) return (error); if (fp->f_type != DTYPE_PROCDESC) { error = EBADF; goto out; } pd = fp->f_data; sx_slock(&proctree_lock); if (pd->pd_proc != NULL) { *p = pd->pd_proc; PROC_LOCK(*p); } else error = ESRCH; sx_sunlock(&proctree_lock); out: fdrop(fp, td); return (error); } /* * Function to be used by procstat(1) sysctls when returning procdesc * information. */ pid_t procdesc_pid(struct file *fp_procdesc) { struct procdesc *pd; KASSERT(fp_procdesc->f_type == DTYPE_PROCDESC, ("procdesc_pid: !procdesc")); pd = fp_procdesc->f_data; return (pd->pd_pid); } /* * Retrieve the PID associated with a process descriptor. */ int kern_pdgetpid(struct thread *td, int fd, cap_rights_t *rightsp, pid_t *pidp) { struct file *fp; int error; error = fget(td, fd, rightsp, &fp); if (error) return (error); if (fp->f_type != DTYPE_PROCDESC) { error = EBADF; goto out; } *pidp = procdesc_pid(fp); out: fdrop(fp, td); return (error); } /* * System call to return the pid of a process given its process descriptor. */ int sys_pdgetpid(struct thread *td, struct pdgetpid_args *uap) { cap_rights_t rights; pid_t pid; int error; AUDIT_ARG_FD(uap->fd); error = kern_pdgetpid(td, uap->fd, cap_rights_init(&rights, CAP_PDGETPID), &pid); if (error == 0) error = copyout(&pid, uap->pidp, sizeof(pid)); return (error); } /* * When a new process is forked by pdfork(), a file descriptor is allocated * by the fork code first, then the process is forked, and then we get a * chance to set up the process descriptor. Failure is not permitted at this * point, so procdesc_new() must succeed. 
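 *
 * For context, the usual userland pattern for the facility (a hedged
 * sketch assuming the documented pdfork(2)/pdkill(2) interfaces from
 * <sys/procdesc.h>):
 *
 *	int pd;
 *	pid_t pid;
 *
 *	pid = pdfork(&pd, 0);
 *	if (pid == 0)
 *		_exit(run_child());	(run_child is illustrative)
 *	pdkill(pd, SIGTERM);
 *	close(pd);	(last close kills a non-PD_DAEMON child)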
*/ void procdesc_new(struct proc *p, int flags) { struct procdesc *pd; pd = uma_zalloc(procdesc_zone, M_WAITOK | M_ZERO); pd->pd_proc = p; pd->pd_pid = p->p_pid; p->p_procdesc = pd; pd->pd_flags = 0; if (flags & PD_DAEMON) pd->pd_flags |= PDF_DAEMON; PROCDESC_LOCK_INIT(pd); knlist_init_mtx(&pd->pd_selinfo.si_note, &pd->pd_lock); /* * Process descriptors start out with two references: one from their * struct file, and the other from their struct proc. */ refcount_init(&pd->pd_refcount, 2); } /* * Create a new process descriptor for the process that refers to it. */ int procdesc_falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags, struct filecaps *fcaps) { int fflags; fflags = 0; if (flags & PD_CLOEXEC) fflags = O_CLOEXEC; return (falloc_caps(td, resultfp, resultfd, fflags, fcaps)); } /* * Initialize a file with a process descriptor. */ void procdesc_finit(struct procdesc *pdp, struct file *fp) { finit(fp, FREAD | FWRITE, DTYPE_PROCDESC, pdp, &procdesc_ops); } static void procdesc_free(struct procdesc *pd) { /* * When the last reference is released, we assert that the descriptor * has been closed, but not that the process has exited, as we will * detach the descriptor before the process dies if the descriptor is * closed, as we can't wait synchronously. */ if (refcount_release(&pd->pd_refcount)) { KASSERT(pd->pd_proc == NULL, ("procdesc_free: pd_proc != NULL")); KASSERT((pd->pd_flags & PDF_CLOSED), ("procdesc_free: !PDF_CLOSED")); knlist_destroy(&pd->pd_selinfo.si_note); PROCDESC_LOCK_DESTROY(pd); uma_zfree(procdesc_zone, pd); } } /* * procdesc_exit() - notify a process descriptor that its process is exiting. * We use the proctree_lock to ensure that process exit either happens * strictly before or strictly after a concurrent call to procdesc_close(). */ int procdesc_exit(struct proc *p) { struct procdesc *pd; sx_assert(&proctree_lock, SA_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p->p_procdesc != NULL, ("procdesc_exit: p_procdesc NULL")); pd = p->p_procdesc; PROCDESC_LOCK(pd); KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == initproc, ("procdesc_exit: closed && parent not init")); pd->pd_flags |= PDF_EXITED; pd->pd_xstat = KW_EXITCODE(p->p_xexit, p->p_xsig); /* * If the process descriptor has been closed, then we have nothing * to do; return 1 so that init will get SIGCHLD and do the reaping. * Clean up the procdesc now rather than letting it happen during * that reap. */ if (pd->pd_flags & PDF_CLOSED) { PROCDESC_UNLOCK(pd); pd->pd_proc = NULL; p->p_procdesc = NULL; procdesc_free(pd); return (1); } if (pd->pd_flags & PDF_SELECTED) { pd->pd_flags &= ~PDF_SELECTED; selwakeup(&pd->pd_selinfo); } KNOTE_LOCKED(&pd->pd_selinfo.si_note, NOTE_EXIT); PROCDESC_UNLOCK(pd); return (0); } /* * When a process descriptor is reaped, perhaps as a result of close() or * pdwait4(), release the process's reference on the process descriptor. */ void procdesc_reap(struct proc *p) { struct procdesc *pd; sx_assert(&proctree_lock, SA_XLOCKED); KASSERT(p->p_procdesc != NULL, ("procdesc_reap: p_procdesc == NULL")); pd = p->p_procdesc; pd->pd_proc = NULL; p->p_procdesc = NULL; procdesc_free(pd); } /* * procdesc_close() - last close on a process descriptor. If the process is * still running, terminate with SIGKILL (unless PDF_DAEMON is set) and let * init(8) clean up the mess; if not, we have to clean up the zombie ourselves.
*/ static int procdesc_close(struct file *fp, struct thread *td) { struct procdesc *pd; struct proc *p; KASSERT(fp->f_type == DTYPE_PROCDESC, ("procdesc_close: !procdesc")); pd = fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; sx_xlock(&proctree_lock); PROCDESC_LOCK(pd); pd->pd_flags |= PDF_CLOSED; PROCDESC_UNLOCK(pd); p = pd->pd_proc; if (p == NULL) { /* * This is the case where process' exit status was already * collected and procdesc_reap() was already called. */ sx_xunlock(&proctree_lock); } else { PROC_LOCK(p); AUDIT_ARG_PROCESS(p); if (p->p_state == PRS_ZOMBIE) { /* * If the process is already dead and just awaiting * reaping, do that now. This will release the * process's reference to the process descriptor when it * calls back into procdesc_reap(). */ PROC_SLOCK(p); proc_reap(curthread, p, NULL, 0); } else { /* * If the process is not yet dead, we need to kill it, * but we can't wait around synchronously for it to go * away, as that path leads to madness (and deadlocks). * First, detach the process from its descriptor so that * its exit status will be reported normally. */ pd->pd_proc = NULL; p->p_procdesc = NULL; procdesc_free(pd); /* * Next, reparent it to init(8) so that there's someone * to pick up the pieces; finally, terminate with * prejudice. */ p->p_sigparent = SIGCHLD; proc_reparent(p, initproc); if ((pd->pd_flags & PDF_DAEMON) == 0) kern_psignal(p, SIGKILL); PROC_UNLOCK(p); sx_xunlock(&proctree_lock); } } /* * Release the file descriptor's reference on the process descriptor. */ procdesc_free(pd); return (0); } static int procdesc_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct procdesc *pd; int revents; revents = 0; pd = fp->f_data; PROCDESC_LOCK(pd); if (pd->pd_flags & PDF_EXITED) revents |= POLLHUP; if (revents == 0) { selrecord(td, &pd->pd_selinfo); pd->pd_flags |= PDF_SELECTED; } PROCDESC_UNLOCK(pd); return (revents); } static void procdesc_kqops_detach(struct knote *kn) { struct procdesc *pd; pd = kn->kn_fp->f_data; knlist_remove(&pd->pd_selinfo.si_note, kn, 0); } static int procdesc_kqops_event(struct knote *kn, long hint) { struct procdesc *pd; u_int event; pd = kn->kn_fp->f_data; if (hint == 0) { /* * Initial test after registration. Generate a NOTE_EXIT in * case the process already terminated before registration. */ event = pd->pd_flags & PDF_EXITED ? NOTE_EXIT : 0; } else { /* Mask off extra data. */ event = (u_int)hint & NOTE_PCTRLMASK; } /* If the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* Process is gone, so flag the event as finished. 
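 *
 * The userland side of this, as a hedged sketch: after registering
 *
 *	struct kevent ev;
 *
 *	EV_SET(&ev, pd, EVFILT_PROCDESC, EV_ADD, NOTE_EXIT, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 *
 * the returned NOTE_EXIT event carries the exit status in ev.data,
 * taken from pd_xstat below (pd and kq are illustrative descriptors).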
*/ if (event == NOTE_EXIT) { kn->kn_flags |= EV_EOF | EV_ONESHOT; if (kn->kn_fflags & NOTE_EXIT) kn->kn_data = pd->pd_xstat; if (kn->kn_fflags == 0) kn->kn_flags |= EV_DROP; return (1); } return (kn->kn_fflags != 0); } static struct filterops procdesc_kqops = { .f_isfd = 1, .f_detach = procdesc_kqops_detach, .f_event = procdesc_kqops_event, }; static int procdesc_kqfilter(struct file *fp, struct knote *kn) { struct procdesc *pd; pd = fp->f_data; switch (kn->kn_filter) { case EVFILT_PROCDESC: kn->kn_fop = &procdesc_kqops; kn->kn_flags |= EV_CLEAR; knlist_add(&pd->pd_selinfo.si_note, kn, 0); return (0); default: return (EINVAL); } } static int procdesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct procdesc *pd; struct timeval pstart, boottime; /* * XXXRW: Perhaps we should cache some more information from the * process so that we can return it reliably here even after it has * died. For example, caching its credential data. */ bzero(sb, sizeof(*sb)); pd = fp->f_data; sx_slock(&proctree_lock); if (pd->pd_proc != NULL) { PROC_LOCK(pd->pd_proc); AUDIT_ARG_PROCESS(pd->pd_proc); /* Set birth and [acm] times to process start time. */ pstart = pd->pd_proc->p_stats->p_start; getboottime(&boottime); timevaladd(&pstart, &boottime); TIMEVAL_TO_TIMESPEC(&pstart, &sb->st_birthtim); sb->st_atim = sb->st_birthtim; sb->st_ctim = sb->st_birthtim; sb->st_mtim = sb->st_birthtim; if (pd->pd_proc->p_state != PRS_ZOMBIE) sb->st_mode = S_IFREG | S_IRWXU; else sb->st_mode = S_IFREG; sb->st_uid = pd->pd_proc->p_ucred->cr_ruid; sb->st_gid = pd->pd_proc->p_ucred->cr_rgid; PROC_UNLOCK(pd->pd_proc); } else sb->st_mode = S_IFREG; sx_sunlock(&proctree_lock); return (0); } static int procdesc_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct procdesc *pdp; kif->kf_type = KF_TYPE_PROCDESC; pdp = fp->f_data; kif->kf_un.kf_proc.kf_pid = pdp->pd_pid; return (0); } Index: head/sys/kern/sys_process.c =================================================================== --- head/sys/kern/sys_process.c (revision 326270) +++ head/sys/kern/sys_process.c (revision 326271) @@ -1,1486 +1,1488 @@ /*- + * SPDX-License-Identifier: BSD-4-Clause + * * Copyright (c) 1994, Sean Eric Fagan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Sean Eric Fagan. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include struct ptrace_io_desc32 { int piod_op; uint32_t piod_offs; uint32_t piod_addr; uint32_t piod_len; }; struct ptrace_vm_entry32 { int pve_entry; int pve_timestamp; uint32_t pve_start; uint32_t pve_end; uint32_t pve_offset; u_int pve_prot; u_int pve_pathlen; int32_t pve_fileid; u_int pve_fsid; uint32_t pve_path; }; #endif /* * Functions implemented using PROC_ACTION(): * * proc_read_regs(proc, regs) * Get the current user-visible register set from the process * and copy it into the regs structure (). * The process is stopped at the time read_regs is called. * * proc_write_regs(proc, regs) * Update the current register set from the passed in regs * structure. Take care to avoid clobbering special CPU * registers or privileged bits in the PSL. * Depending on the architecture this may have fix-up work to do, * especially if the IAR or PCW are modified. * The process is stopped at the time write_regs is called. * * proc_read_fpregs, proc_write_fpregs * deal with the floating point register set, otherwise as above. * * proc_read_dbregs, proc_write_dbregs * deal with the processor debug register set, otherwise as above. * * proc_sstep(proc) * Arrange for the process to trap after executing a single instruction. */ #define PROC_ACTION(action) do { \ int error; \ \ PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); \ if ((td->td_proc->p_flag & P_INMEM) == 0) \ error = EIO; \ else \ error = (action); \ return (error); \ } while(0) int proc_read_regs(struct thread *td, struct reg *regs) { PROC_ACTION(fill_regs(td, regs)); } int proc_write_regs(struct thread *td, struct reg *regs) { PROC_ACTION(set_regs(td, regs)); } int proc_read_dbregs(struct thread *td, struct dbreg *dbregs) { PROC_ACTION(fill_dbregs(td, dbregs)); } int proc_write_dbregs(struct thread *td, struct dbreg *dbregs) { PROC_ACTION(set_dbregs(td, dbregs)); } /* * Ptrace doesn't support fpregs at all, and there are no security holes * or translations for fpregs, so we can just copy them. */ int proc_read_fpregs(struct thread *td, struct fpreg *fpregs) { PROC_ACTION(fill_fpregs(td, fpregs)); } int proc_write_fpregs(struct thread *td, struct fpreg *fpregs) { PROC_ACTION(set_fpregs(td, fpregs)); } #ifdef COMPAT_FREEBSD32 /* For 32 bit binaries, we need to expose the 32 bit regs layouts. 
*/ int proc_read_regs32(struct thread *td, struct reg32 *regs32) { PROC_ACTION(fill_regs32(td, regs32)); } int proc_write_regs32(struct thread *td, struct reg32 *regs32) { PROC_ACTION(set_regs32(td, regs32)); } int proc_read_dbregs32(struct thread *td, struct dbreg32 *dbregs32) { PROC_ACTION(fill_dbregs32(td, dbregs32)); } int proc_write_dbregs32(struct thread *td, struct dbreg32 *dbregs32) { PROC_ACTION(set_dbregs32(td, dbregs32)); } int proc_read_fpregs32(struct thread *td, struct fpreg32 *fpregs32) { PROC_ACTION(fill_fpregs32(td, fpregs32)); } int proc_write_fpregs32(struct thread *td, struct fpreg32 *fpregs32) { PROC_ACTION(set_fpregs32(td, fpregs32)); } #endif int proc_sstep(struct thread *td) { PROC_ACTION(ptrace_single_step(td)); } int proc_rwmem(struct proc *p, struct uio *uio) { vm_map_t map; vm_offset_t pageno; /* page number */ vm_prot_t reqprot; int error, fault_flags, page_offset, writing; /* * Assert that someone has locked this vmspace. (Should be * curthread but we can't assert that.) This keeps the process * from exiting out from under us until this operation completes. */ PROC_ASSERT_HELD(p); PROC_LOCK_ASSERT(p, MA_NOTOWNED); /* * The map we want... */ map = &p->p_vmspace->vm_map; /* * If we are writing, then we request vm_fault() to create a private * copy of each page. Since these copies will not be writeable by the * process, we must explicitly request that they be dirtied. */ writing = uio->uio_rw == UIO_WRITE; reqprot = writing ? VM_PROT_COPY | VM_PROT_READ : VM_PROT_READ; fault_flags = writing ? VM_FAULT_DIRTY : VM_FAULT_NORMAL; /* * Only map in one page at a time. We don't have to, but it * makes things easier. This way is trivial - right? */ do { vm_offset_t uva; u_int len; vm_page_t m; uva = (vm_offset_t)uio->uio_offset; /* * Get the page number of this segment. */ pageno = trunc_page(uva); page_offset = uva - pageno; /* * How many bytes to copy */ len = min(PAGE_SIZE - page_offset, uio->uio_resid); /* * Fault and hold the page on behalf of the process. */ error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m); if (error != KERN_SUCCESS) { if (error == KERN_RESOURCE_SHORTAGE) error = ENOMEM; else error = EFAULT; break; } /* * Now do the i/o move. */ error = uiomove_fromphys(&m, page_offset, len, uio); /* Make the I-cache coherent for breakpoints. */ if (writing && error == 0) { vm_map_lock_read(map); if (vm_map_check_protection(map, pageno, pageno + PAGE_SIZE, VM_PROT_EXECUTE)) vm_sync_icache(map, uva, len); vm_map_unlock_read(map); } /* * Release the page.
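 *
 * (For the enclosing one-page-at-a-time loop, a worked example: a
 * 100-byte transfer starting at uva 0x1ffc with 4 KB pages moves
 * min(4096 - 4092, 100) = 4 bytes from the first page, then 96 from
 * the next.)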
*/ vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); } while (error == 0 && uio->uio_resid > 0); return (error); } static ssize_t proc_iop(struct thread *td, struct proc *p, vm_offset_t va, void *buf, size_t len, enum uio_rw rw) { struct iovec iov; struct uio uio; ssize_t slen; int error; MPASS(len < SSIZE_MAX); slen = (ssize_t)len; iov.iov_base = (caddr_t)buf; iov.iov_len = len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = va; uio.uio_resid = slen; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = rw; uio.uio_td = td; error = proc_rwmem(p, &uio); if (uio.uio_resid == slen) return (-1); return (slen - uio.uio_resid); } ssize_t proc_readmem(struct thread *td, struct proc *p, vm_offset_t va, void *buf, size_t len) { return (proc_iop(td, p, va, buf, len, UIO_READ)); } ssize_t proc_writemem(struct thread *td, struct proc *p, vm_offset_t va, void *buf, size_t len) { return (proc_iop(td, p, va, buf, len, UIO_WRITE)); } static int ptrace_vm_entry(struct thread *td, struct proc *p, struct ptrace_vm_entry *pve) { struct vattr vattr; vm_map_t map; vm_map_entry_t entry; vm_object_t obj, tobj, lobj; struct vmspace *vm; struct vnode *vp; char *freepath, *fullpath; u_int pathlen; int error, index; error = 0; obj = NULL; vm = vmspace_acquire_ref(p); map = &vm->vm_map; vm_map_lock_read(map); do { entry = map->header.next; index = 0; while (index < pve->pve_entry && entry != &map->header) { entry = entry->next; index++; } if (index != pve->pve_entry) { error = EINVAL; break; } while (entry != &map->header && (entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) { entry = entry->next; index++; } if (entry == &map->header) { error = ENOENT; break; } /* We got an entry. */ pve->pve_entry = index + 1; pve->pve_timestamp = map->timestamp; pve->pve_start = entry->start; pve->pve_end = entry->end - 1; pve->pve_offset = entry->offset; pve->pve_prot = entry->protection; /* Backing object's path needed? 
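 *
 * (Protocol, for illustration: the caller presets pve_pathlen to the
 * size of the buffer at pve_path; on success it is rewritten to
 * strlen(path) + 1, and ENAMETOOLONG means the supplied buffer was
 * too small.)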
*/ if (pve->pve_pathlen == 0) break; pathlen = pve->pve_pathlen; pve->pve_pathlen = 0; obj = entry->object.vm_object; if (obj != NULL) VM_OBJECT_RLOCK(obj); } while (0); vm_map_unlock_read(map); pve->pve_fsid = VNOVAL; pve->pve_fileid = VNOVAL; if (error == 0 && obj != NULL) { lobj = obj; for (tobj = obj; tobj != NULL; tobj = tobj->backing_object) { if (tobj != obj) VM_OBJECT_RLOCK(tobj); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); lobj = tobj; pve->pve_offset += tobj->backing_object_offset; } vp = vm_object_vnode(lobj); if (vp != NULL) vref(vp); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { freepath = NULL; fullpath = NULL; vn_fullpath(td, vp, &fullpath, &freepath); vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &vattr, td->td_ucred) == 0) { pve->pve_fileid = vattr.va_fileid; pve->pve_fsid = vattr.va_fsid; } vput(vp); if (fullpath != NULL) { pve->pve_pathlen = strlen(fullpath) + 1; if (pve->pve_pathlen <= pathlen) { error = copyout(fullpath, pve->pve_path, pve->pve_pathlen); } else error = ENAMETOOLONG; } if (freepath != NULL) free(freepath, M_TEMP); } } vmspace_free(vm); if (error == 0) CTR3(KTR_PTRACE, "PT_VM_ENTRY: pid %d, entry %d, start %p", p->p_pid, pve->pve_entry, pve->pve_start); return (error); } #ifdef COMPAT_FREEBSD32 static int ptrace_vm_entry32(struct thread *td, struct proc *p, struct ptrace_vm_entry32 *pve32) { struct ptrace_vm_entry pve; int error; pve.pve_entry = pve32->pve_entry; pve.pve_pathlen = pve32->pve_pathlen; pve.pve_path = (void *)(uintptr_t)pve32->pve_path; error = ptrace_vm_entry(td, p, &pve); if (error == 0) { pve32->pve_entry = pve.pve_entry; pve32->pve_timestamp = pve.pve_timestamp; pve32->pve_start = pve.pve_start; pve32->pve_end = pve.pve_end; pve32->pve_offset = pve.pve_offset; pve32->pve_prot = pve.pve_prot; pve32->pve_fileid = pve.pve_fileid; pve32->pve_fsid = pve.pve_fsid; } pve32->pve_pathlen = pve.pve_pathlen; return (error); } static void ptrace_lwpinfo_to32(const struct ptrace_lwpinfo *pl, struct ptrace_lwpinfo32 *pl32) { bzero(pl32, sizeof(*pl32)); pl32->pl_lwpid = pl->pl_lwpid; pl32->pl_event = pl->pl_event; pl32->pl_flags = pl->pl_flags; pl32->pl_sigmask = pl->pl_sigmask; pl32->pl_siglist = pl->pl_siglist; siginfo_to_siginfo32(&pl->pl_siginfo, &pl32->pl_siginfo); strcpy(pl32->pl_tdname, pl->pl_tdname); pl32->pl_child_pid = pl->pl_child_pid; pl32->pl_syscall_code = pl->pl_syscall_code; pl32->pl_syscall_narg = pl->pl_syscall_narg; } #endif /* COMPAT_FREEBSD32 */ /* * Process debugging system call. */ #ifndef _SYS_SYSPROTO_H_ struct ptrace_args { int req; pid_t pid; caddr_t addr; int data; }; #endif #ifdef COMPAT_FREEBSD32 /* * This CPP subterfuge is to try and reduce the number of ifdefs in * the body of the code. * COPYIN(uap->addr, &r.reg, sizeof r.reg); * becomes either: * copyin(uap->addr, &r.reg, sizeof r.reg); * or * copyin(uap->addr, &r.reg32, sizeof r.reg32); * .. except this is done at runtime. */ #define COPYIN(u, k, s) wrap32 ? \ copyin(u, k ## 32, s ## 32) : \ copyin(u, k, s) #define COPYOUT(k, u, s) wrap32 ? \ copyout(k ## 32, u, s ## 32) : \ copyout(k, u, s) #else #define COPYIN(u, k, s) copyin(u, k, s) #define COPYOUT(k, u, s) copyout(k, u, s) #endif int sys_ptrace(struct thread *td, struct ptrace_args *uap) { /* * XXX this obfuscation is to reduce stack usage, but the register * structs may be too large to put on the stack anyway. 
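/*
 * Editor's sketch (not part of this revision): a userland walk of a
 * stopped target's VM map using the PT_VM_ENTRY operation implemented by
 * ptrace_vm_entry() above. The kernel advances pve_entry past the entry
 * it just reported and returns ENOENT once the map is exhausted, so the
 * tracer only needs to reset the path buffer each iteration. Assumes
 * `pid` is already attached and stopped.
 */
#include <sys/types.h>
#include <sys/ptrace.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void
dump_vm_map(pid_t pid)
{
	struct ptrace_vm_entry pve;
	char path[1024];

	memset(&pve, 0, sizeof(pve));	/* pve_entry == 0: start of map */
	for (;;) {
		pve.pve_path = path;
		pve.pve_pathlen = sizeof(path);
		path[0] = '\0';
		if (ptrace(PT_VM_ENTRY, pid, (caddr_t)&pve, 0) == -1) {
			if (errno != ENOENT)
				perror("PT_VM_ENTRY");
			break;	/* ENOENT: past the last map entry */
		}
		printf("%#jx-%#jx prot %#x %s\n", (uintmax_t)pve.pve_start,
		    (uintmax_t)pve.pve_end, pve.pve_prot, path);
	}
}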
*/ union { struct ptrace_io_desc piod; struct ptrace_lwpinfo pl; struct ptrace_vm_entry pve; struct dbreg dbreg; struct fpreg fpreg; struct reg reg; #ifdef COMPAT_FREEBSD32 struct dbreg32 dbreg32; struct fpreg32 fpreg32; struct reg32 reg32; struct ptrace_io_desc32 piod32; struct ptrace_lwpinfo32 pl32; struct ptrace_vm_entry32 pve32; #endif char args[nitems(td->td_sa.args) * sizeof(register_t)]; int ptevents; } r; void *addr; int error = 0; #ifdef COMPAT_FREEBSD32 int wrap32 = 0; if (SV_CURPROC_FLAG(SV_ILP32)) wrap32 = 1; #endif AUDIT_ARG_PID(uap->pid); AUDIT_ARG_CMD(uap->req); AUDIT_ARG_VALUE(uap->data); addr = &r; switch (uap->req) { case PT_GET_EVENT_MASK: case PT_GETREGS: case PT_GETFPREGS: case PT_GETDBREGS: case PT_LWPINFO: case PT_GET_SC_ARGS: break; case PT_SETREGS: error = COPYIN(uap->addr, &r.reg, sizeof r.reg); break; case PT_SETFPREGS: error = COPYIN(uap->addr, &r.fpreg, sizeof r.fpreg); break; case PT_SETDBREGS: error = COPYIN(uap->addr, &r.dbreg, sizeof r.dbreg); break; case PT_SET_EVENT_MASK: if (uap->data != sizeof(r.ptevents)) error = EINVAL; else error = copyin(uap->addr, &r.ptevents, uap->data); break; case PT_IO: error = COPYIN(uap->addr, &r.piod, sizeof r.piod); break; case PT_VM_ENTRY: error = COPYIN(uap->addr, &r.pve, sizeof r.pve); break; default: addr = uap->addr; break; } if (error) return (error); error = kern_ptrace(td, uap->req, uap->pid, addr, uap->data); if (error) return (error); switch (uap->req) { case PT_VM_ENTRY: error = COPYOUT(&r.pve, uap->addr, sizeof r.pve); break; case PT_IO: error = COPYOUT(&r.piod, uap->addr, sizeof r.piod); break; case PT_GETREGS: error = COPYOUT(&r.reg, uap->addr, sizeof r.reg); break; case PT_GETFPREGS: error = COPYOUT(&r.fpreg, uap->addr, sizeof r.fpreg); break; case PT_GETDBREGS: error = COPYOUT(&r.dbreg, uap->addr, sizeof r.dbreg); break; case PT_GET_EVENT_MASK: /* NB: The size in uap->data is validated in kern_ptrace(). */ error = copyout(&r.ptevents, uap->addr, uap->data); break; case PT_LWPINFO: /* NB: The size in uap->data is validated in kern_ptrace(). */ error = copyout(&r.pl, uap->addr, uap->data); break; case PT_GET_SC_ARGS: error = copyout(r.args, uap->addr, MIN(uap->data, sizeof(r.args))); break; } return (error); } #undef COPYIN #undef COPYOUT #ifdef COMPAT_FREEBSD32 /* * PROC_READ(regs, td2, addr); * becomes either: * proc_read_regs(td2, addr); * or * proc_read_regs32(td2, addr); * .. except this is done at runtime. There is an additional * complication in that PROC_WRITE disallows 32 bit consumers * from writing to 64 bit address space targets. */ #define PROC_READ(w, t, a) wrap32 ? \ proc_read_ ## w ## 32(t, a) : \ proc_read_ ## w (t, a) #define PROC_WRITE(w, t, a) wrap32 ? \ (safe ? 
proc_write_ ## w ## 32(t, a) : EINVAL ) : \ proc_write_ ## w (t, a) #else #define PROC_READ(w, t, a) proc_read_ ## w (t, a) #define PROC_WRITE(w, t, a) proc_write_ ## w (t, a) #endif void proc_set_traced(struct proc *p, bool stop) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag |= P_TRACED; if (stop) p->p_flag2 |= P2_PTRACE_FSTP; p->p_ptevents = PTRACE_DEFAULT; p->p_oppid = p->p_pptr->p_pid; } int kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data) { struct iovec iov; struct uio uio; struct proc *curp, *p, *pp; struct thread *td2 = NULL, *td3; struct ptrace_io_desc *piod = NULL; struct ptrace_lwpinfo *pl; int error, num, tmp; int proctree_locked = 0; lwpid_t tid = 0, *buf; #ifdef COMPAT_FREEBSD32 int wrap32 = 0, safe = 0; struct ptrace_io_desc32 *piod32 = NULL; struct ptrace_lwpinfo32 *pl32 = NULL; struct ptrace_lwpinfo plr; #endif curp = td->td_proc; /* Lock proctree before locking the process. */ switch (req) { case PT_TRACE_ME: case PT_ATTACH: case PT_STEP: case PT_CONTINUE: case PT_TO_SCE: case PT_TO_SCX: case PT_SYSCALL: case PT_FOLLOW_FORK: case PT_LWP_EVENTS: case PT_GET_EVENT_MASK: case PT_SET_EVENT_MASK: case PT_DETACH: case PT_GET_SC_ARGS: sx_xlock(&proctree_lock); proctree_locked = 1; break; default: break; } if (req == PT_TRACE_ME) { p = td->td_proc; PROC_LOCK(p); } else { if (pid <= PID_MAX) { if ((p = pfind(pid)) == NULL) { if (proctree_locked) sx_xunlock(&proctree_lock); return (ESRCH); } } else { td2 = tdfind(pid, -1); if (td2 == NULL) { if (proctree_locked) sx_xunlock(&proctree_lock); return (ESRCH); } p = td2->td_proc; tid = pid; pid = p->p_pid; } } AUDIT_ARG_PROCESS(p); if ((p->p_flag & P_WEXIT) != 0) { error = ESRCH; goto fail; } if ((error = p_cansee(td, p)) != 0) goto fail; if ((error = p_candebug(td, p)) != 0) goto fail; /* * System processes can't be debugged. */ if ((p->p_flag & P_SYSTEM) != 0) { error = EINVAL; goto fail; } if (tid == 0) { if ((p->p_flag & P_STOPPED_TRACE) != 0) { KASSERT(p->p_xthread != NULL, ("NULL p_xthread")); td2 = p->p_xthread; } else { td2 = FIRST_THREAD_IN_PROC(p); } tid = td2->td_tid; } #ifdef COMPAT_FREEBSD32 /* * Test if we're a 32 bit client and what the target is. * Set the wrap controls accordingly. */ if (SV_CURPROC_FLAG(SV_ILP32)) { if (SV_PROC_FLAG(td2->td_proc, SV_ILP32)) safe = 1; wrap32 = 1; } #endif /* * Permissions check */ switch (req) { case PT_TRACE_ME: /* * Always legal, when there is a parent process which * could trace us. Otherwise, reject. */ if ((p->p_flag & P_TRACED) != 0) { error = EBUSY; goto fail; } if (p->p_pptr == initproc) { error = EPERM; goto fail; } break; case PT_ATTACH: /* Self */ if (p == td->td_proc) { error = EINVAL; goto fail; } /* Already traced */ if (p->p_flag & P_TRACED) { error = EBUSY; goto fail; } /* Can't trace an ancestor if you're being traced. */ if (curp->p_flag & P_TRACED) { for (pp = curp->p_pptr; pp != NULL; pp = pp->p_pptr) { if (pp == p) { error = EINVAL; goto fail; } } } /* OK */ break; case PT_CLEARSTEP: /* Allow thread to clear single step for itself */ if (td->td_tid == tid) break; /* FALLTHROUGH */ default: /* not being traced... */ if ((p->p_flag & P_TRACED) == 0) { error = EPERM; goto fail; } /* not being traced by YOU */ if (p->p_pptr != td->td_proc) { error = EBUSY; goto fail; } /* not currently stopped */ if ((p->p_flag & P_STOPPED_TRACE) == 0 || p->p_suspcount != p->p_numthreads || (p->p_flag & P_WAITED) == 0) { error = EBUSY; goto fail; } /* OK */ break; } /* Keep this process around until we finish this request. 
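/*
 * Editor's sketch (not part of this revision): the child side of the
 * PT_TRACE_ME request whose permission checks appear above. A debugger
 * typically forks, has the child volunteer for tracing, and execs the
 * target; the caller then waitpid()s for the post-exec stop.
 */
#include <sys/types.h>
#include <sys/ptrace.h>
#include <unistd.h>

static pid_t
spawn_traced(char *const argv[])
{
	pid_t pid = fork();

	if (pid == 0) {
		if (ptrace(PT_TRACE_ME, 0, NULL, 0) == -1)
			_exit(127);
		execvp(argv[0], argv);
		_exit(127);	/* exec failed */
	}
	return (pid);	/* -1 on fork failure; caller waits for the stop */
}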
*/ _PHOLD(p); #ifdef FIX_SSTEP /* * Single step fixup ala procfs */ FIX_SSTEP(td2); #endif /* * Actually do the requests */ td->td_retval[0] = 0; switch (req) { case PT_TRACE_ME: /* set my trace flag and "owner" so it can read/write me */ proc_set_traced(p, false); if (p->p_flag & P_PPWAIT) p->p_flag |= P_PPTRACE; CTR1(KTR_PTRACE, "PT_TRACE_ME: pid %d", p->p_pid); break; case PT_ATTACH: /* security check done above */ /* * It would be nice if the tracing relationship was separate * from the parent relationship but that would require * another set of links in the proc struct or for "wait" * to scan the entire proc table. To make life easier, * we just re-parent the process we're trying to trace. * The old parent is remembered so we can put things back * on a "detach". */ proc_set_traced(p, true); if (p->p_pptr != td->td_proc) { proc_reparent(p, td->td_proc); } CTR2(KTR_PTRACE, "PT_ATTACH: pid %d, oppid %d", p->p_pid, p->p_oppid); sx_xunlock(&proctree_lock); proctree_locked = 0; MPASS(p->p_xthread == NULL); MPASS((p->p_flag & P_STOPPED_TRACE) == 0); /* * If already stopped due to a stop signal, clear the * existing stop before triggering a traced SIGSTOP. */ if ((p->p_flag & P_STOPPED_SIG) != 0) { PROC_SLOCK(p); p->p_flag &= ~(P_STOPPED_SIG | P_WAITED); thread_unsuspend(p); PROC_SUNLOCK(p); } kern_psignal(p, SIGSTOP); break; case PT_CLEARSTEP: CTR2(KTR_PTRACE, "PT_CLEARSTEP: tid %d (pid %d)", td2->td_tid, p->p_pid); error = ptrace_clear_single_step(td2); break; case PT_SETSTEP: CTR2(KTR_PTRACE, "PT_SETSTEP: tid %d (pid %d)", td2->td_tid, p->p_pid); error = ptrace_single_step(td2); break; case PT_SUSPEND: CTR2(KTR_PTRACE, "PT_SUSPEND: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_SUSPEND; thread_lock(td2); td2->td_flags |= TDF_NEEDSUSPCHK; thread_unlock(td2); break; case PT_RESUME: CTR2(KTR_PTRACE, "PT_RESUME: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags &= ~TDB_SUSPEND; break; case PT_FOLLOW_FORK: CTR3(KTR_PTRACE, "PT_FOLLOW_FORK: pid %d %s -> %s", p->p_pid, p->p_ptevents & PTRACE_FORK ? "enabled" : "disabled", data ? "enabled" : "disabled"); if (data) p->p_ptevents |= PTRACE_FORK; else p->p_ptevents &= ~PTRACE_FORK; break; case PT_LWP_EVENTS: CTR3(KTR_PTRACE, "PT_LWP_EVENTS: pid %d %s -> %s", p->p_pid, p->p_ptevents & PTRACE_LWP ? "enabled" : "disabled", data ? 
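/*
 * Editor's sketch (not part of this revision): the tracer-side
 * counterpart of the PT_ATTACH handling above. PT_ATTACH queues a traced
 * SIGSTOP, so the tracer must waitpid() for the stop before issuing
 * further requests; addr == (caddr_t)1 on detach means "resume at the
 * current PC".
 */
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <machine/reg.h>
#include <err.h>

static void
peek_registers(pid_t pid)
{
	struct reg regs;
	int status;

	if (ptrace(PT_ATTACH, pid, NULL, 0) == -1)
		err(1, "PT_ATTACH");
	if (waitpid(pid, &status, 0) == -1)
		err(1, "waitpid");
	if (ptrace(PT_GETREGS, pid, (caddr_t)&regs, 0) == -1)
		err(1, "PT_GETREGS");
	/* ... inspect regs here ... */
	if (ptrace(PT_DETACH, pid, (caddr_t)1, 0) == -1)
		err(1, "PT_DETACH");
}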
"enabled" : "disabled"); if (data) p->p_ptevents |= PTRACE_LWP; else p->p_ptevents &= ~PTRACE_LWP; break; case PT_GET_EVENT_MASK: if (data != sizeof(p->p_ptevents)) { error = EINVAL; break; } CTR2(KTR_PTRACE, "PT_GET_EVENT_MASK: pid %d mask %#x", p->p_pid, p->p_ptevents); *(int *)addr = p->p_ptevents; break; case PT_SET_EVENT_MASK: if (data != sizeof(p->p_ptevents)) { error = EINVAL; break; } tmp = *(int *)addr; if ((tmp & ~(PTRACE_EXEC | PTRACE_SCE | PTRACE_SCX | PTRACE_FORK | PTRACE_LWP | PTRACE_VFORK)) != 0) { error = EINVAL; break; } CTR3(KTR_PTRACE, "PT_SET_EVENT_MASK: pid %d mask %#x -> %#x", p->p_pid, p->p_ptevents, tmp); p->p_ptevents = tmp; break; case PT_GET_SC_ARGS: CTR1(KTR_PTRACE, "PT_GET_SC_ARGS: pid %d", p->p_pid); if ((td2->td_dbgflags & (TDB_SCE | TDB_SCX)) == 0 #ifdef COMPAT_FREEBSD32 || (wrap32 && !safe) #endif ) { error = EINVAL; break; } bzero(addr, sizeof(td2->td_sa.args)); #ifdef COMPAT_FREEBSD32 if (wrap32) for (num = 0; num < nitems(td2->td_sa.args); num++) ((uint32_t *)addr)[num] = (uint32_t) td2->td_sa.args[num]; else #endif bcopy(td2->td_sa.args, addr, td2->td_sa.narg * sizeof(register_t)); break; case PT_STEP: case PT_CONTINUE: case PT_TO_SCE: case PT_TO_SCX: case PT_SYSCALL: case PT_DETACH: /* Zero means do not send any signal */ if (data < 0 || data > _SIG_MAXSIG) { error = EINVAL; break; } switch (req) { case PT_STEP: CTR3(KTR_PTRACE, "PT_STEP: tid %d (pid %d), sig = %d", td2->td_tid, p->p_pid, data); error = ptrace_single_step(td2); if (error) goto out; break; case PT_CONTINUE: case PT_TO_SCE: case PT_TO_SCX: case PT_SYSCALL: if (addr != (void *)1) { error = ptrace_set_pc(td2, (u_long)(uintfptr_t)addr); if (error) goto out; } switch (req) { case PT_TO_SCE: p->p_ptevents |= PTRACE_SCE; CTR4(KTR_PTRACE, "PT_TO_SCE: pid %d, events = %#x, PC = %#lx, sig = %d", p->p_pid, p->p_ptevents, (u_long)(uintfptr_t)addr, data); break; case PT_TO_SCX: p->p_ptevents |= PTRACE_SCX; CTR4(KTR_PTRACE, "PT_TO_SCX: pid %d, events = %#x, PC = %#lx, sig = %d", p->p_pid, p->p_ptevents, (u_long)(uintfptr_t)addr, data); break; case PT_SYSCALL: p->p_ptevents |= PTRACE_SYSCALL; CTR4(KTR_PTRACE, "PT_SYSCALL: pid %d, events = %#x, PC = %#lx, sig = %d", p->p_pid, p->p_ptevents, (u_long)(uintfptr_t)addr, data); break; case PT_CONTINUE: CTR3(KTR_PTRACE, "PT_CONTINUE: pid %d, PC = %#lx, sig = %d", p->p_pid, (u_long)(uintfptr_t)addr, data); break; } break; case PT_DETACH: /* * Reset the process parent. * * NB: This clears P_TRACED before reparenting * a detached process back to its original * parent. Otherwise the debugee will be set * as an orphan of the debugger. */ p->p_flag &= ~(P_TRACED | P_WAITED); if (p->p_oppid != p->p_pptr->p_pid) { PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); pp = proc_realparent(p); proc_reparent(p, pp); if (pp == initproc) p->p_sigparent = SIGCHLD; CTR3(KTR_PTRACE, "PT_DETACH: pid %d reparented to pid %d, sig %d", p->p_pid, pp->p_pid, data); } else CTR2(KTR_PTRACE, "PT_DETACH: pid %d, sig %d", p->p_pid, data); p->p_oppid = 0; p->p_ptevents = 0; FOREACH_THREAD_IN_PROC(p, td3) { if ((td3->td_dbgflags & TDB_FSTP) != 0) { sigqueue_delete(&td3->td_sigqueue, SIGSTOP); } td3->td_dbgflags &= ~(TDB_XSIG | TDB_FSTP | TDB_SUSPEND); } if ((p->p_flag2 & P2_PTRACE_FSTP) != 0) { sigqueue_delete(&p->p_sigqueue, SIGSTOP); p->p_flag2 &= ~P2_PTRACE_FSTP; } /* should we send SIGCHLD? 
*/ /* childproc_continued(p); */ break; } sx_xunlock(&proctree_lock); proctree_locked = 0; sendsig: MPASS(proctree_locked == 0); /* * Clear the pending event for the thread that just * reported its event (p_xthread). This may not be * the thread passed to PT_CONTINUE, PT_STEP, etc. if * the debugger is resuming a different thread. * * Deliver any pending signal via the reporting thread. */ MPASS(p->p_xthread != NULL); p->p_xthread->td_dbgflags &= ~TDB_XSIG; p->p_xthread->td_xsig = data; p->p_xthread = NULL; p->p_xsig = data; /* * P_WKILLED is insurance that a PT_KILL/SIGKILL * always works immediately, even if another thread is * unsuspended first and attempts to handle a * different signal or if the POSIX.1b style signal * queue cannot accommodate any new signals. */ if (data == SIGKILL) p->p_flag |= P_WKILLED; /* * Unsuspend all threads. To leave a thread * suspended, use PT_SUSPEND to suspend it before * continuing the process. */ PROC_SLOCK(p); p->p_flag &= ~(P_STOPPED_TRACE | P_STOPPED_SIG | P_WAITED); thread_unsuspend(p); PROC_SUNLOCK(p); break; case PT_WRITE_I: case PT_WRITE_D: td2->td_dbgflags |= TDB_USERWR; PROC_UNLOCK(p); error = 0; if (proc_writemem(td, p, (off_t)(uintptr_t)addr, &data, sizeof(int)) != sizeof(int)) error = ENOMEM; else CTR3(KTR_PTRACE, "PT_WRITE: pid %d: %p <= %#x", p->p_pid, addr, data); PROC_LOCK(p); break; case PT_READ_I: case PT_READ_D: PROC_UNLOCK(p); error = tmp = 0; if (proc_readmem(td, p, (off_t)(uintptr_t)addr, &tmp, sizeof(int)) != sizeof(int)) error = ENOMEM; else CTR3(KTR_PTRACE, "PT_READ: pid %d: %p >= %#x", p->p_pid, addr, tmp); td->td_retval[0] = tmp; PROC_LOCK(p); break; case PT_IO: #ifdef COMPAT_FREEBSD32 if (wrap32) { piod32 = addr; iov.iov_base = (void *)(uintptr_t)piod32->piod_addr; iov.iov_len = piod32->piod_len; uio.uio_offset = (off_t)(uintptr_t)piod32->piod_offs; uio.uio_resid = piod32->piod_len; } else #endif { piod = addr; iov.iov_base = piod->piod_addr; iov.iov_len = piod->piod_len; uio.uio_offset = (off_t)(uintptr_t)piod->piod_offs; uio.uio_resid = piod->piod_len; } uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_segflg = UIO_USERSPACE; uio.uio_td = td; #ifdef COMPAT_FREEBSD32 tmp = wrap32 ? 
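/*
 * Editor's sketch (not part of this revision): PT_READ_D as handled
 * above returns the word read in the syscall return value, so -1 is both
 * a valid datum and the error indicator; clearing errno first
 * disambiguates. PT_WRITE_D takes the word to store in `data`.
 */
#include <sys/types.h>
#include <sys/ptrace.h>
#include <errno.h>
#include <err.h>

static int
peek_word(pid_t pid, void *addr)
{
	int word;

	errno = 0;
	word = ptrace(PT_READ_D, pid, (caddr_t)addr, 0);
	if (word == -1 && errno != 0)
		err(1, "PT_READ_D");
	return (word);
}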
piod32->piod_op : piod->piod_op; #else tmp = piod->piod_op; #endif switch (tmp) { case PIOD_READ_D: case PIOD_READ_I: CTR3(KTR_PTRACE, "PT_IO: pid %d: READ (%p, %#x)", p->p_pid, (uintptr_t)uio.uio_offset, uio.uio_resid); uio.uio_rw = UIO_READ; break; case PIOD_WRITE_D: case PIOD_WRITE_I: CTR3(KTR_PTRACE, "PT_IO: pid %d: WRITE (%p, %#x)", p->p_pid, (uintptr_t)uio.uio_offset, uio.uio_resid); td2->td_dbgflags |= TDB_USERWR; uio.uio_rw = UIO_WRITE; break; default: error = EINVAL; goto out; } PROC_UNLOCK(p); error = proc_rwmem(p, &uio); #ifdef COMPAT_FREEBSD32 if (wrap32) piod32->piod_len -= uio.uio_resid; else #endif piod->piod_len -= uio.uio_resid; PROC_LOCK(p); break; case PT_KILL: CTR1(KTR_PTRACE, "PT_KILL: pid %d", p->p_pid); data = SIGKILL; goto sendsig; /* in PT_CONTINUE above */ case PT_SETREGS: CTR2(KTR_PTRACE, "PT_SETREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_USERWR; error = PROC_WRITE(regs, td2, addr); break; case PT_GETREGS: CTR2(KTR_PTRACE, "PT_GETREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); error = PROC_READ(regs, td2, addr); break; case PT_SETFPREGS: CTR2(KTR_PTRACE, "PT_SETFPREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_USERWR; error = PROC_WRITE(fpregs, td2, addr); break; case PT_GETFPREGS: CTR2(KTR_PTRACE, "PT_GETFPREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); error = PROC_READ(fpregs, td2, addr); break; case PT_SETDBREGS: CTR2(KTR_PTRACE, "PT_SETDBREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_USERWR; error = PROC_WRITE(dbregs, td2, addr); break; case PT_GETDBREGS: CTR2(KTR_PTRACE, "PT_GETDBREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); error = PROC_READ(dbregs, td2, addr); break; case PT_LWPINFO: if (data <= 0 || #ifdef COMPAT_FREEBSD32 (!wrap32 && data > sizeof(*pl)) || (wrap32 && data > sizeof(*pl32))) { #else data > sizeof(*pl)) { #endif error = EINVAL; break; } #ifdef COMPAT_FREEBSD32 if (wrap32) { pl = &plr; pl32 = addr; } else #endif pl = addr; bzero(pl, sizeof(*pl)); pl->pl_lwpid = td2->td_tid; pl->pl_event = PL_EVENT_NONE; pl->pl_flags = 0; if (td2->td_dbgflags & TDB_XSIG) { pl->pl_event = PL_EVENT_SIGNAL; if (td2->td_si.si_signo != 0 && #ifdef COMPAT_FREEBSD32 ((!wrap32 && data >= offsetof(struct ptrace_lwpinfo, pl_siginfo) + sizeof(pl->pl_siginfo)) || (wrap32 && data >= offsetof(struct ptrace_lwpinfo32, pl_siginfo) + sizeof(struct siginfo32))) #else data >= offsetof(struct ptrace_lwpinfo, pl_siginfo) + sizeof(pl->pl_siginfo) #endif ){ pl->pl_flags |= PL_FLAG_SI; pl->pl_siginfo = td2->td_si; } } if (td2->td_dbgflags & TDB_SCE) pl->pl_flags |= PL_FLAG_SCE; else if (td2->td_dbgflags & TDB_SCX) pl->pl_flags |= PL_FLAG_SCX; if (td2->td_dbgflags & TDB_EXEC) pl->pl_flags |= PL_FLAG_EXEC; if (td2->td_dbgflags & TDB_FORK) { pl->pl_flags |= PL_FLAG_FORKED; pl->pl_child_pid = td2->td_dbg_forked; if (td2->td_dbgflags & TDB_VFORK) pl->pl_flags |= PL_FLAG_VFORKED; } else if ((td2->td_dbgflags & (TDB_SCX | TDB_VFORK)) == TDB_VFORK) pl->pl_flags |= PL_FLAG_VFORK_DONE; if (td2->td_dbgflags & TDB_CHILD) pl->pl_flags |= PL_FLAG_CHILD; if (td2->td_dbgflags & TDB_BORN) pl->pl_flags |= PL_FLAG_BORN; if (td2->td_dbgflags & TDB_EXIT) pl->pl_flags |= PL_FLAG_EXITED; pl->pl_sigmask = td2->td_sigmask; pl->pl_siglist = td2->td_siglist; strcpy(pl->pl_tdname, td2->td_name); if ((td2->td_dbgflags & (TDB_SCE | TDB_SCX)) != 0) { pl->pl_syscall_code = td2->td_sa.code; pl->pl_syscall_narg = td2->td_sa.narg; } else { pl->pl_syscall_code = 0; pl->pl_syscall_narg = 0; } #ifdef COMPAT_FREEBSD32 if (wrap32) 
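/*
 * Editor's sketch (not part of this revision): bulk transfers via the
 * PT_IO case above avoid the word-at-a-time PT_READ/PT_WRITE loop. The
 * kernel subtracts the untransferred residue from piod_len, so a short
 * count can be detected on return. Target assumed stopped.
 */
#include <sys/types.h>
#include <sys/ptrace.h>
#include <err.h>

static size_t
read_block(pid_t pid, void *remote, void *local, size_t len)
{
	struct ptrace_io_desc piod;

	piod.piod_op = PIOD_READ_D;
	piod.piod_offs = remote;	/* address in the target */
	piod.piod_addr = local;		/* buffer in the tracer */
	piod.piod_len = len;
	if (ptrace(PT_IO, pid, (caddr_t)&piod, 0) == -1)
		err(1, "PT_IO");
	return (piod.piod_len);		/* bytes actually moved */
}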
ptrace_lwpinfo_to32(pl, pl32); #endif CTR6(KTR_PTRACE, "PT_LWPINFO: tid %d (pid %d) event %d flags %#x child pid %d syscall %d", td2->td_tid, p->p_pid, pl->pl_event, pl->pl_flags, pl->pl_child_pid, pl->pl_syscall_code); break; case PT_GETNUMLWPS: CTR2(KTR_PTRACE, "PT_GETNUMLWPS: pid %d: %d threads", p->p_pid, p->p_numthreads); td->td_retval[0] = p->p_numthreads; break; case PT_GETLWPLIST: CTR3(KTR_PTRACE, "PT_GETLWPLIST: pid %d: data %d, actual %d", p->p_pid, data, p->p_numthreads); if (data <= 0) { error = EINVAL; break; } num = imin(p->p_numthreads, data); PROC_UNLOCK(p); buf = malloc(num * sizeof(lwpid_t), M_TEMP, M_WAITOK); tmp = 0; PROC_LOCK(p); FOREACH_THREAD_IN_PROC(p, td2) { if (tmp >= num) break; buf[tmp++] = td2->td_tid; } PROC_UNLOCK(p); error = copyout(buf, addr, tmp * sizeof(lwpid_t)); free(buf, M_TEMP); if (!error) td->td_retval[0] = tmp; PROC_LOCK(p); break; case PT_VM_TIMESTAMP: CTR2(KTR_PTRACE, "PT_VM_TIMESTAMP: pid %d: timestamp %d", p->p_pid, p->p_vmspace->vm_map.timestamp); td->td_retval[0] = p->p_vmspace->vm_map.timestamp; break; case PT_VM_ENTRY: PROC_UNLOCK(p); #ifdef COMPAT_FREEBSD32 if (wrap32) error = ptrace_vm_entry32(td, p, addr); else #endif error = ptrace_vm_entry(td, p, addr); PROC_LOCK(p); break; default: #ifdef __HAVE_PTRACE_MACHDEP if (req >= PT_FIRSTMACH) { PROC_UNLOCK(p); error = cpu_ptrace(td2, req, addr, data); PROC_LOCK(p); } else #endif /* Unknown request. */ error = EINVAL; break; } out: /* Drop our hold on this process now that the request has completed. */ _PRELE(p); fail: PROC_UNLOCK(p); if (proctree_locked) sx_xunlock(&proctree_lock); return (error); } #undef PROC_READ #undef PROC_WRITE /* * Stop a process because of a debugging event; * stay stopped until p->p_step is cleared * (cleared by PIOCCONT in procfs). */ void stopevent(struct proc *p, unsigned int event, unsigned int val) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_step = 1; CTR3(KTR_PTRACE, "stopevent: pid %d event %u val %u", p->p_pid, event, val); do { if (event != S_EXIT) p->p_xsig = val; p->p_xthread = NULL; p->p_stype = event; /* Which event caused the stop? */ wakeup(&p->p_stype); /* Wake up any PIOCWAIT'ing procs */ msleep(&p->p_step, &p->p_mtx, PWAIT, "stopevent", 0); } while (p->p_step); } Index: head/sys/kern/sysv_ipc.c =================================================================== --- head/sys/kern/sysv_ipc.c (revision 326270) +++ head/sys/kern/sysv_ipc.c (revision 326271) @@ -1,246 +1,248 @@ /* $NetBSD: sysv_ipc.c,v 1.7 1994/06/29 06:33:11 cgd Exp $ */ /*- + * SPDX-License-Identifier: BSD-4-Clause + * * Copyright (c) 1994 Herb Peyerl * Copyright (c) 2006 nCircle Network Security, Inc. * All rights reserved. * * This software was developed by Robert N. M. Watson for the TrustedBSD * Project under contract to nCircle Network Security, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Herb Peyerl. * 4. 
The name of Herb Peyerl may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include void (*shmfork_hook)(struct proc *, struct proc *) = NULL; void (*shmexit_hook)(struct vmspace *) = NULL; /* called from kern_fork.c */ void shmfork(p1, p2) struct proc *p1, *p2; { if (shmfork_hook != NULL) shmfork_hook(p1, p2); return; } /* called from kern_exit.c */ void shmexit(struct vmspace *vm) { if (shmexit_hook != NULL) shmexit_hook(vm); return; } /* * Check for IPC permission. * * Note: The MAC Framework does not require any modifications to the * ipcperm() function, as access control checks are performed throughout the * implementation of each primitive. Those entry point calls complement the * ipcperm() discretionary checks. Unlike file system discretionary access * control, the original creator of an object is given the same rights as the * current owner. */ int ipcperm(struct thread *td, struct ipc_perm *perm, int acc_mode) { struct ucred *cred = td->td_ucred; int error, obj_mode, dac_granted, priv_granted; dac_granted = 0; if (cred->cr_uid == perm->cuid || cred->cr_uid == perm->uid) { obj_mode = perm->mode; dac_granted |= IPC_M; } else if (groupmember(perm->gid, cred) || groupmember(perm->cgid, cred)) { obj_mode = perm->mode; obj_mode <<= 3; } else { obj_mode = perm->mode; obj_mode <<= 6; } /* * While the System V IPC permission model allows IPC_M to be * granted, as part of the mode, our implementation requires * privilege to administer the object if not the owner or creator. */ #if 0 if (obj_mode & IPC_M) dac_granted |= IPC_M; #endif if (obj_mode & IPC_R) dac_granted |= IPC_R; if (obj_mode & IPC_W) dac_granted |= IPC_W; /* * Simple case: all required rights are granted by DAC. */ if ((dac_granted & acc_mode) == acc_mode) return (0); /* * Privilege is required to satisfy the request.
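/*
 * Editor's sketch (not part of this revision): the shifts in ipcperm()
 * above line the relevant rwx triplet up under the owner bits so the
 * IPC_R/IPC_W masks (octal 0400/0200) test it directly. This pure
 * function restates that computation for illustration only; real checks
 * must go through ipcperm() so privilege and MAC policy still apply.
 */
#include <sys/types.h>
#include <sys/ipc.h>

static int
example_dac_bits(mode_t mode, int is_owner, int is_group)
{
	int obj_mode, granted = 0;

	if (is_owner) {
		obj_mode = mode;	/* owner bits already at 0700 */
		granted |= IPC_M;	/* owner/creator may administer */
	} else if (is_group)
		obj_mode = mode << 3;	/* group bits 0070 -> 0700 */
	else
		obj_mode = mode << 6;	/* other bits 0007 -> 0700 */
	if (obj_mode & IPC_R)
		granted |= IPC_R;
	if (obj_mode & IPC_W)
		granted |= IPC_W;
	/* e.g. mode 0620, group member: granted == IPC_W only */
	return (granted);
}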
*/ priv_granted = 0; if ((acc_mode & IPC_M) && !(dac_granted & IPC_M)) { error = priv_check(td, PRIV_IPC_ADMIN); if (error == 0) priv_granted |= IPC_M; } if ((acc_mode & IPC_R) && !(dac_granted & IPC_R)) { error = priv_check(td, PRIV_IPC_READ); if (error == 0) priv_granted |= IPC_R; } if ((acc_mode & IPC_W) && !(dac_granted & IPC_W)) { error = priv_check(td, PRIV_IPC_WRITE); if (error == 0) priv_granted |= IPC_W; } if (((dac_granted | priv_granted) & acc_mode) == acc_mode) return (0); else return (EACCES); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) void ipcperm_old2new(struct ipc_perm_old *old, struct ipc_perm *new) { new->cuid = old->cuid; new->cgid = old->cgid; new->uid = old->uid; new->gid = old->gid; new->mode = old->mode; new->seq = old->seq; new->key = old->key; } void ipcperm_new2old(struct ipc_perm *new, struct ipc_perm_old *old) { /* XXX: How to handle ID's > USHORT_MAX? */ old->cuid = new->cuid; old->cgid = new->cgid; old->uid = new->uid; old->gid = new->gid; old->mode = new->mode; old->seq = new->seq; old->key = new->key; } #endif #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include #include #include #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) void freebsd32_ipcperm_old_in(struct ipc_perm32_old *ip32, struct ipc_perm *ip) { CP(*ip32, *ip, cuid); CP(*ip32, *ip, cgid); CP(*ip32, *ip, uid); CP(*ip32, *ip, gid); CP(*ip32, *ip, mode); CP(*ip32, *ip, seq); CP(*ip32, *ip, key); } void freebsd32_ipcperm_old_out(struct ipc_perm *ip, struct ipc_perm32_old *ip32) { CP(*ip, *ip32, cuid); CP(*ip, *ip32, cgid); CP(*ip, *ip32, uid); CP(*ip, *ip32, gid); CP(*ip, *ip32, mode); CP(*ip, *ip32, seq); CP(*ip, *ip32, key); } #endif void freebsd32_ipcperm_in(struct ipc_perm32 *ip32, struct ipc_perm *ip) { CP(*ip32, *ip, cuid); CP(*ip32, *ip, cgid); CP(*ip32, *ip, uid); CP(*ip32, *ip, gid); CP(*ip32, *ip, mode); CP(*ip32, *ip, seq); CP(*ip32, *ip, key); } void freebsd32_ipcperm_out(struct ipc_perm *ip, struct ipc_perm32 *ip32) { CP(*ip, *ip32, cuid); CP(*ip, *ip32, cgid); CP(*ip, *ip32, uid); CP(*ip, *ip32, gid); CP(*ip, *ip32, mode); CP(*ip, *ip32, seq); CP(*ip, *ip32, key); } #endif Index: head/sys/kern/sysv_msg.c =================================================================== --- head/sys/kern/sysv_msg.c (revision 326270) +++ head/sys/kern/sysv_msg.c (revision 326271) @@ -1,1880 +1,1882 @@ /*- * Implementation of SVID messages * * Author: Daniel Boulet * * Copyright 1993 Daniel Boulet and RTMX Inc. * * This system call was implemented by Daniel Boulet under contract from RTMX. * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. */ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003-2005 McAfee, Inc. * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. 
* * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(sysv_msg, "System V message queues support"); static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues"); static int msginit(void); static int msgunload(void); static int sysvmsg_modload(struct module *, int, void *); static void msq_remove(struct msqid_kernel *); static struct prison *msg_find_prison(struct ucred *); static int msq_prison_cansee(struct prison *, struct msqid_kernel *); static int msg_prison_check(void *, void *); static int msg_prison_set(void *, void *); static int msg_prison_get(void *, void *); static int msg_prison_remove(void *, void *); static void msg_prison_cleanup(struct prison *); #ifdef MSG_DEBUG #define DPRINTF(a) printf a #else #define DPRINTF(a) (void)0 #endif static void msg_freehdr(struct msg *msghdr); #ifndef MSGSSZ #define MSGSSZ 8 /* Each segment must be 2^N long */ #endif #ifndef MSGSEG #define MSGSEG 2048 /* must be less than 32767 */ #endif #define MSGMAX (MSGSSZ*MSGSEG) #ifndef MSGMNB #define MSGMNB 2048 /* max # of bytes in a queue */ #endif #ifndef MSGMNI #define MSGMNI 40 #endif #ifndef MSGTQL #define MSGTQL 40 #endif /* * Based on the configuration parameters described in an SVR2 (yes, two) * config(1m) man page. * * Each message is broken up and stored in segments that are msgssz bytes * long. For efficiency reasons, this should be a power of two. Also, * it doesn't make sense if it is less than 8 or greater than about 256. * Consequently, msginit in kern/sysv_msg.c checks that msgssz is a power of * two between 8 and 1024 inclusive (and panics if it isn't).
*/ struct msginfo msginfo = { MSGMAX, /* max chars in a message */ MSGMNI, /* # of message queue identifiers */ MSGMNB, /* max chars in a queue */ MSGTQL, /* max messages in system */ MSGSSZ, /* size of a message segment */ /* (must be small power of 2 greater than 4) */ MSGSEG /* number of message segments */ }; /* * macros to convert between msqid_ds's and msqid's. * (specific to this implementation) */ #define MSQID(ix,ds) ((ix) & 0xffff | (((ds).msg_perm.seq << 16) & 0xffff0000)) #define MSQID_IX(id) ((id) & 0xffff) #define MSQID_SEQ(id) (((id) >> 16) & 0xffff) /* * The rest of this file is specific to this particular implementation. */ struct msgmap { short next; /* next segment in buffer */ /* -1 -> available */ /* 0..(MSGSEG-1) -> index of next segment */ }; #define MSG_LOCKED 01000 /* Is this msqid_ds locked? */ static int nfree_msgmaps; /* # of free map entries */ static short free_msgmaps; /* head of linked list of free map entries */ static struct msg *free_msghdrs;/* list of free msg headers */ static char *msgpool; /* MSGMAX byte long msg buffer pool */ static struct msgmap *msgmaps; /* MSGSEG msgmap structures */ static struct msg *msghdrs; /* MSGTQL msg headers */ static struct msqid_kernel *msqids; /* MSGMNI msqid_kernel struct's */ static struct mtx msq_mtx; /* global mutex for message queues. */ static unsigned msg_prison_slot;/* prison OSD slot */ static struct syscall_helper_data msg_syscalls[] = { SYSCALL_INIT_HELPER(msgctl), SYSCALL_INIT_HELPER(msgget), SYSCALL_INIT_HELPER(msgsnd), SYSCALL_INIT_HELPER(msgrcv), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL_INIT_HELPER(msgsys), SYSCALL_INIT_HELPER_COMPAT(freebsd7_msgctl), #endif SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include static struct syscall_helper_data msg32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_msgctl), SYSCALL32_INIT_HELPER(freebsd32_msgsnd), SYSCALL32_INIT_HELPER(freebsd32_msgrcv), SYSCALL32_INIT_HELPER_COMPAT(msgget), SYSCALL32_INIT_HELPER(freebsd32_msgsys), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL32_INIT_HELPER(freebsd7_freebsd32_msgctl), #endif SYSCALL_INIT_LAST }; #endif static int msginit() { struct prison *pr; void **rsv; int i, error; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_CHECK] = msg_prison_check, [PR_METHOD_SET] = msg_prison_set, [PR_METHOD_GET] = msg_prison_get, [PR_METHOD_REMOVE] = msg_prison_remove, }; msginfo.msgmax = msginfo.msgseg * msginfo.msgssz; msgpool = malloc(msginfo.msgmax, M_MSG, M_WAITOK); msgmaps = malloc(sizeof(struct msgmap) * msginfo.msgseg, M_MSG, M_WAITOK); msghdrs = malloc(sizeof(struct msg) * msginfo.msgtql, M_MSG, M_WAITOK); msqids = malloc(sizeof(struct msqid_kernel) * msginfo.msgmni, M_MSG, M_WAITOK); /* * msginfo.msgssz should be a power of two for efficiency reasons. * It is also pretty silly if msginfo.msgssz is less than 8 * or greater than about 256 so ... 
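/*
 * Editor's sketch (not part of this revision): the MSQID()/MSQID_IX()/
 * MSQID_SEQ() macros above pack a table index into the low 16 bits of
 * the identifier and a per-slot sequence number into the high 16 bits,
 * so a stale id naming a recycled slot is caught by a sequence mismatch.
 * A round trip through the same arithmetic:
 */
#include <assert.h>

static void
example_msqid_roundtrip(void)
{
	int ix = 7;		/* slot in msqids[] */
	int seq = 0x1234;	/* msg_perm.seq for that slot */
	int id = (ix & 0xffff) | ((seq << 16) & 0xffff0000);

	assert((id & 0xffff) == ix);		/* MSQID_IX(id) */
	assert(((id >> 16) & 0xffff) == seq);	/* MSQID_SEQ(id) */
}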
*/ i = 8; while (i < 1024 && i != msginfo.msgssz) i <<= 1; if (i != msginfo.msgssz) { DPRINTF(("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz, msginfo.msgssz)); panic("msginfo.msgssz not a small power of 2"); } if (msginfo.msgseg > 32767) { DPRINTF(("msginfo.msgseg=%d\n", msginfo.msgseg)); panic("msginfo.msgseg > 32767"); } for (i = 0; i < msginfo.msgseg; i++) { if (i > 0) msgmaps[i-1].next = i; msgmaps[i].next = -1; /* implies entry is available */ } free_msgmaps = 0; nfree_msgmaps = msginfo.msgseg; for (i = 0; i < msginfo.msgtql; i++) { msghdrs[i].msg_type = 0; if (i > 0) msghdrs[i-1].msg_next = &msghdrs[i]; msghdrs[i].msg_next = NULL; #ifdef MAC mac_sysvmsg_init(&msghdrs[i]); #endif } free_msghdrs = &msghdrs[0]; for (i = 0; i < msginfo.msgmni; i++) { msqids[i].u.msg_qbytes = 0; /* implies entry is available */ msqids[i].u.msg_perm.seq = 0; /* reset to a known value */ msqids[i].u.msg_perm.mode = 0; #ifdef MAC mac_sysvmsq_init(&msqids[i]); #endif } mtx_init(&msq_mtx, "msq", NULL, MTX_DEF); /* Set current prisons according to their allow.sysvipc. */ msg_prison_slot = osd_jail_register(NULL, methods); rsv = osd_reserve(msg_prison_slot); prison_lock(&prison0); (void)osd_jail_set_reserved(&prison0, msg_prison_slot, rsv, &prison0); prison_unlock(&prison0); rsv = NULL; sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { if (rsv == NULL) rsv = osd_reserve(msg_prison_slot); prison_lock(pr); if ((pr->pr_allow & PR_ALLOW_SYSVIPC) && pr->pr_ref > 0) { (void)osd_jail_set_reserved(pr, msg_prison_slot, rsv, &prison0); rsv = NULL; } prison_unlock(pr); } if (rsv != NULL) osd_free_reserved(rsv); sx_sunlock(&allprison_lock); error = syscall_helper_register(msg_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(msg32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int msgunload() { struct msqid_kernel *msqkptr; int msqid; #ifdef MAC int i; #endif syscall_helper_unregister(msg_syscalls); #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(msg32_syscalls); #endif for (msqid = 0; msqid < msginfo.msgmni; msqid++) { msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes != 0 || (msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) break; } if (msqid != msginfo.msgmni) return (EBUSY); if (msg_prison_slot != 0) osd_jail_deregister(msg_prison_slot); #ifdef MAC for (i = 0; i < msginfo.msgtql; i++) mac_sysvmsg_destroy(&msghdrs[i]); for (msqid = 0; msqid < msginfo.msgmni; msqid++) mac_sysvmsq_destroy(&msqids[msqid]); #endif free(msgpool, M_MSG); free(msgmaps, M_MSG); free(msghdrs, M_MSG); free(msqids, M_MSG); mtx_destroy(&msq_mtx); return (0); } static int sysvmsg_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = msginit(); if (error != 0) msgunload(); break; case MOD_UNLOAD: error = msgunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvmsg_mod = { "sysvmsg", &sysvmsg_modload, NULL }; DECLARE_MODULE(sysvmsg, sysvmsg_mod, SI_SUB_SYSV_MSG, SI_ORDER_FIRST); MODULE_VERSION(sysvmsg, 1); static void msg_freehdr(msghdr) struct msg *msghdr; { while (msghdr->msg_ts > 0) { short next; if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg) panic("msghdr->msg_spot out of range"); next = msgmaps[msghdr->msg_spot].next; msgmaps[msghdr->msg_spot].next = free_msgmaps; free_msgmaps = msghdr->msg_spot; nfree_msgmaps++; msghdr->msg_spot = next; if (msghdr->msg_ts >= msginfo.msgssz) 
msghdr->msg_ts -= msginfo.msgssz; else msghdr->msg_ts = 0; } if (msghdr->msg_spot != -1) panic("msghdr->msg_spot != -1"); msghdr->msg_next = free_msghdrs; free_msghdrs = msghdr; #ifdef MAC mac_sysvmsg_cleanup(msghdr); #endif } static void msq_remove(struct msqid_kernel *msqkptr) { struct msg *msghdr; racct_sub_cred(msqkptr->cred, RACCT_NMSGQ, 1); racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, msqkptr->u.msg_qnum); racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msqkptr->u.msg_cbytes); crfree(msqkptr->cred); msqkptr->cred = NULL; /* Free the message headers */ msghdr = msqkptr->u.msg_first; while (msghdr != NULL) { struct msg *msghdr_tmp; /* Free the segments of each message */ msqkptr->u.msg_cbytes -= msghdr->msg_ts; msqkptr->u.msg_qnum--; msghdr_tmp = msghdr; msghdr = msghdr->msg_next; msg_freehdr(msghdr_tmp); } if (msqkptr->u.msg_cbytes != 0) panic("msg_cbytes is screwed up"); if (msqkptr->u.msg_qnum != 0) panic("msg_qnum is screwed up"); msqkptr->u.msg_qbytes = 0; /* Mark it as free */ #ifdef MAC mac_sysvmsq_cleanup(msqkptr); #endif wakeup(msqkptr); } static struct prison * msg_find_prison(struct ucred *cred) { struct prison *pr, *rpr; pr = cred->cr_prison; prison_lock(pr); rpr = osd_jail_get(pr, msg_prison_slot); prison_unlock(pr); return rpr; } static int msq_prison_cansee(struct prison *rpr, struct msqid_kernel *msqkptr) { if (msqkptr->cred == NULL || !(rpr == msqkptr->cred->cr_prison || prison_ischild(rpr, msqkptr->cred->cr_prison))) return (EINVAL); return (0); } #ifndef _SYS_SYSPROTO_H_ struct msgctl_args { int msqid; int cmd; struct msqid_ds *buf; }; #endif int sys_msgctl(struct thread *td, struct msgctl_args *uap) { int msqid = uap->msqid; int cmd = uap->cmd; struct msqid_ds msqbuf; int error; DPRINTF(("call to msgctl(%d, %d, %p)\n", msqid, cmd, uap->buf)); if (cmd == IPC_SET && (error = copyin(uap->buf, &msqbuf, sizeof(msqbuf))) != 0) return (error); error = kern_msgctl(td, msqid, cmd, &msqbuf); if (cmd == IPC_STAT && error == 0) error = copyout(&msqbuf, uap->buf, sizeof(struct msqid_ds)); return (error); } int kern_msgctl(td, msqid, cmd, msqbuf) struct thread *td; int msqid; int cmd; struct msqid_ds *msqbuf; { int rval, error, msqix; struct msqid_kernel *msqkptr; struct prison *rpr; rpr = msg_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); AUDIT_ARG_SVIPC_CMD(cmd); AUDIT_ARG_SVIPC_ID(msqid); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); return (EINVAL); } msqkptr = &msqids[msqix]; mtx_lock(&msq_mtx); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such msqid\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } error = msq_prison_cansee(rpr, msqkptr); if (error != 0) { DPRINTF(("requester can't see prison\n")); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqctl(td->td_ucred, msqkptr, cmd); if (error != 0) goto done2; #endif error = 0; rval = 0; switch (cmd) { case IPC_RMID: { #ifdef MAC struct msg *msghdr; #endif if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M))) goto done2; #ifdef MAC /* * Check that the thread has MAC access permissions to * individual msghdrs. Note: We need to do this in a * separate loop because the actual loop alters the * msq/msghdr info as it progresses, and there is no going * back if half the way through we discover that the * thread cannot free a certain msghdr. The msq will get * into an inconsistent state. 
*/ for (msghdr = msqkptr->u.msg_first; msghdr != NULL; msghdr = msghdr->msg_next) { error = mac_sysvmsq_check_msgrmid(td->td_ucred, msghdr); if (error != 0) goto done2; } #endif msq_remove(msqkptr); } break; case IPC_SET: AUDIT_ARG_SVIPC_PERM(&msqbuf->msg_perm); if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M))) goto done2; if (msqbuf->msg_qbytes > msqkptr->u.msg_qbytes) { error = priv_check(td, PRIV_IPC_MSGSIZE); if (error) goto done2; } if (msqbuf->msg_qbytes > msginfo.msgmnb) { DPRINTF(("can't increase msg_qbytes beyond %d" "(truncating)\n", msginfo.msgmnb)); msqbuf->msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */ } if (msqbuf->msg_qbytes == 0) { DPRINTF(("can't reduce msg_qbytes to 0\n")); error = EINVAL; /* non-standard errno! */ goto done2; } msqkptr->u.msg_perm.uid = msqbuf->msg_perm.uid; /* change the owner */ msqkptr->u.msg_perm.gid = msqbuf->msg_perm.gid; /* change the owner */ msqkptr->u.msg_perm.mode = (msqkptr->u.msg_perm.mode & ~0777) | (msqbuf->msg_perm.mode & 0777); msqkptr->u.msg_qbytes = msqbuf->msg_qbytes; msqkptr->u.msg_ctime = time_second; break; case IPC_STAT: if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) { DPRINTF(("requester doesn't have read access\n")); goto done2; } *msqbuf = msqkptr->u; if (td->td_ucred->cr_prison != msqkptr->cred->cr_prison) msqbuf->msg_perm.key = IPC_PRIVATE; break; default: DPRINTF(("invalid command %d\n", cmd)); error = EINVAL; goto done2; } if (error == 0) td->td_retval[0] = rval; done2: mtx_unlock(&msq_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct msgget_args { key_t key; int msgflg; }; #endif int sys_msgget(struct thread *td, struct msgget_args *uap) { int msqid, error = 0; int key = uap->key; int msgflg = uap->msgflg; struct ucred *cred = td->td_ucred; struct msqid_kernel *msqkptr = NULL; DPRINTF(("msgget(0x%x, 0%o)\n", key, msgflg)); if (msg_find_prison(cred) == NULL) return (ENOSYS); mtx_lock(&msq_mtx); if (key != IPC_PRIVATE) { for (msqid = 0; msqid < msginfo.msgmni; msqid++) { msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes != 0 && msqkptr->cred != NULL && msqkptr->cred->cr_prison == cred->cr_prison && msqkptr->u.msg_perm.key == key) break; } if (msqid < msginfo.msgmni) { DPRINTF(("found public key\n")); if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) { DPRINTF(("not exclusive\n")); error = EEXIST; goto done2; } AUDIT_ARG_SVIPC_ID(IXSEQ_TO_IPCID(msqid, msqkptr->u.msg_perm)); if ((error = ipcperm(td, &msqkptr->u.msg_perm, msgflg & 0700))) { DPRINTF(("requester doesn't have 0%o access\n", msgflg & 0700)); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqget(cred, msqkptr); if (error != 0) goto done2; #endif goto found; } } DPRINTF(("need to allocate the msqid_ds\n")); if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) { for (msqid = 0; msqid < msginfo.msgmni; msqid++) { /* * Look for an unallocated and unlocked msqid_ds. * msqid_ds's can be locked by msgsnd or msgrcv while * they are copying the message in/out. We can't * re-use the entry until they release it. 
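/*
 * Editor's sketch (not part of this revision): the userland view of the
 * IPC_STAT/IPC_SET paths in kern_msgctl() above. Per the code: raising
 * msg_qbytes needs privilege, values above the kern.ipc.msgmnb limit are
 * silently clamped, qbytes of zero is rejected, and IPC_SET honors only
 * the owner ids, the low permission bits and msg_qbytes.
 */
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <err.h>

static void
shrink_queue(int msqid)
{
	struct msqid_ds ds;

	if (msgctl(msqid, IPC_STAT, &ds) == -1)
		err(1, "IPC_STAT");
	ds.msg_qbytes /= 2;	/* lowering needs no privilege; 0 is EINVAL */
	if (msgctl(msqid, IPC_SET, &ds) == -1)
		err(1, "IPC_SET");
}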
*/ msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes == 0 && (msqkptr->u.msg_perm.mode & MSG_LOCKED) == 0) break; } if (msqid == msginfo.msgmni) { DPRINTF(("no more msqid_ds's available\n")); error = ENOSPC; goto done2; } #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_NMSGQ, 1); PROC_UNLOCK(td->td_proc); if (error != 0) { error = ENOSPC; goto done2; } } #endif DPRINTF(("msqid %d is available\n", msqid)); msqkptr->u.msg_perm.key = key; msqkptr->u.msg_perm.cuid = cred->cr_uid; msqkptr->u.msg_perm.uid = cred->cr_uid; msqkptr->u.msg_perm.cgid = cred->cr_gid; msqkptr->u.msg_perm.gid = cred->cr_gid; msqkptr->u.msg_perm.mode = (msgflg & 0777); msqkptr->cred = crhold(cred); /* Make sure that the returned msqid is unique */ msqkptr->u.msg_perm.seq = (msqkptr->u.msg_perm.seq + 1) & 0x7fff; msqkptr->u.msg_first = NULL; msqkptr->u.msg_last = NULL; msqkptr->u.msg_cbytes = 0; msqkptr->u.msg_qnum = 0; msqkptr->u.msg_qbytes = msginfo.msgmnb; msqkptr->u.msg_lspid = 0; msqkptr->u.msg_lrpid = 0; msqkptr->u.msg_stime = 0; msqkptr->u.msg_rtime = 0; msqkptr->u.msg_ctime = time_second; #ifdef MAC mac_sysvmsq_create(cred, msqkptr); #endif AUDIT_ARG_SVIPC_PERM(&msqkptr->u.msg_perm); } else { DPRINTF(("didn't find it and wasn't asked to create it\n")); error = ENOENT; goto done2; } found: /* Construct the unique msqid */ td->td_retval[0] = IXSEQ_TO_IPCID(msqid, msqkptr->u.msg_perm); done2: mtx_unlock(&msq_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct msgsnd_args { int msqid; const void *msgp; /* XXX msgp is actually mtext. */ size_t msgsz; int msgflg; }; #endif int kern_msgsnd(struct thread *td, int msqid, const void *msgp, size_t msgsz, int msgflg, long mtype) { int msqix, segs_needed, error = 0; struct msqid_kernel *msqkptr; struct msg *msghdr; struct prison *rpr; short next; #ifdef RACCT size_t saved_msgsz; #endif rpr = msg_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); mtx_lock(&msq_mtx); AUDIT_ARG_SVIPC_ID(msqid); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); error = EINVAL; goto done2; } msqkptr = &msqids[msqix]; AUDIT_ARG_SVIPC_PERM(&msqkptr->u.msg_perm); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such message queue id\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } if ((error = msq_prison_cansee(rpr, msqkptr))) { DPRINTF(("requester can't see prison\n")); goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_W))) { DPRINTF(("requester doesn't have write access\n")); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqsnd(td->td_ucred, msqkptr); if (error != 0) goto done2; #endif #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); if (racct_add(td->td_proc, RACCT_MSGQQUEUED, 1)) { PROC_UNLOCK(td->td_proc); error = EAGAIN; goto done2; } saved_msgsz = msgsz; if (racct_add(td->td_proc, RACCT_MSGQSIZE, msgsz)) { racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1); PROC_UNLOCK(td->td_proc); error = EAGAIN; goto done2; } PROC_UNLOCK(td->td_proc); } #endif segs_needed = howmany(msgsz, msginfo.msgssz); DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz, segs_needed)); for (;;) { int need_more_resources = 0; /* * check msgsz * (inside this loop in case msg_qbytes changes while we sleep) */ if (msgsz > msqkptr->u.msg_qbytes) { DPRINTF(("msgsz > msqkptr->u.msg_qbytes\n")); error = EINVAL; goto done3; } 
if (msqkptr->u.msg_perm.mode & MSG_LOCKED) { DPRINTF(("msqid is locked\n")); need_more_resources = 1; } if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) { DPRINTF(("msgsz + msg_cbytes > msg_qbytes\n")); need_more_resources = 1; } if (segs_needed > nfree_msgmaps) { DPRINTF(("segs_needed > nfree_msgmaps\n")); need_more_resources = 1; } if (free_msghdrs == NULL) { DPRINTF(("no more msghdrs\n")); need_more_resources = 1; } if (need_more_resources) { int we_own_it; if ((msgflg & IPC_NOWAIT) != 0) { DPRINTF(("need more resources but caller " "doesn't want to wait\n")); error = EAGAIN; goto done3; } if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) { DPRINTF(("we don't own the msqid_ds\n")); we_own_it = 0; } else { /* Force later arrivals to wait for our request */ DPRINTF(("we own the msqid_ds\n")); msqkptr->u.msg_perm.mode |= MSG_LOCKED; we_own_it = 1; } DPRINTF(("msgsnd: goodnight\n")); error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH, "msgsnd", hz); DPRINTF(("msgsnd: good morning, error=%d\n", error)); if (we_own_it) msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; if (error == EWOULDBLOCK) { DPRINTF(("msgsnd: timed out\n")); continue; } if (error != 0) { DPRINTF(("msgsnd: interrupted system call\n")); error = EINTR; goto done3; } /* * Make sure that the msq queue still exists */ if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("msqid deleted\n")); error = EIDRM; goto done3; } } else { DPRINTF(("got all the resources that we need\n")); break; } } /* * We have the resources that we need. * Make sure! */ if (msqkptr->u.msg_perm.mode & MSG_LOCKED) panic("msg_perm.mode & MSG_LOCKED"); if (segs_needed > nfree_msgmaps) panic("segs_needed > nfree_msgmaps"); if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) panic("msgsz + msg_cbytes > msg_qbytes"); if (free_msghdrs == NULL) panic("no more msghdrs"); /* * Re-lock the msqid_ds in case we page-fault when copying in the * message */ if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) panic("msqid_ds is already locked"); msqkptr->u.msg_perm.mode |= MSG_LOCKED; /* * Allocate a message header */ msghdr = free_msghdrs; free_msghdrs = msghdr->msg_next; msghdr->msg_spot = -1; msghdr->msg_ts = msgsz; msghdr->msg_type = mtype; #ifdef MAC /* * XXXMAC: Should the mac_sysvmsq_check_msgmsq check follow here * immediately? Or, should it be checked just before the msg is * enqueued in the msgq (as it is done now)? 
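/*
 * Editor's sketch (not part of this revision): the sender side of the
 * IPC_NOWAIT policy implemented in the loop above. When the queue is
 * full, locked, or segments/headers are exhausted, a non-blocking sender
 * gets EAGAIN instead of sleeping on the msqid.
 */
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <errno.h>

static int
try_send(int id, const void *msgp, size_t msgsz)
{
	if (msgsnd(id, msgp, msgsz, IPC_NOWAIT) == 0)
		return (0);
	return (errno);	/* EAGAIN: queue full; anything else is a hard error */
}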
*/ mac_sysvmsg_create(td->td_ucred, msqkptr, msghdr); #endif /* * Allocate space for the message */ while (segs_needed > 0) { if (nfree_msgmaps <= 0) panic("not enough msgmaps"); if (free_msgmaps == -1) panic("nil free_msgmaps"); next = free_msgmaps; if (next <= -1) panic("next too low #1"); if (next >= msginfo.msgseg) panic("next out of range #1"); DPRINTF(("allocating segment %d to message\n", next)); free_msgmaps = msgmaps[next].next; nfree_msgmaps--; msgmaps[next].next = msghdr->msg_spot; msghdr->msg_spot = next; segs_needed--; } /* * Validate the message type */ if (msghdr->msg_type < 1) { msg_freehdr(msghdr); msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; wakeup(msqkptr); DPRINTF(("mtype (%ld) < 1\n", msghdr->msg_type)); error = EINVAL; goto done3; } /* * Copy in the message body */ next = msghdr->msg_spot; while (msgsz > 0) { size_t tlen; if (msgsz > msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz; if (next <= -1) panic("next too low #2"); if (next >= msginfo.msgseg) panic("next out of range #2"); mtx_unlock(&msq_mtx); if ((error = copyin(msgp, &msgpool[next * msginfo.msgssz], tlen)) != 0) { mtx_lock(&msq_mtx); DPRINTF(("error %d copying in message segment\n", error)); msg_freehdr(msghdr); msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; wakeup(msqkptr); goto done3; } mtx_lock(&msq_mtx); msgsz -= tlen; msgp = (const char *)msgp + tlen; next = msgmaps[next].next; } if (next != -1) panic("didn't use all the msg segments"); /* * We've got the message. Unlock the msqid_ds. */ msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; /* * Make sure that the msqid_ds is still allocated. */ if (msqkptr->u.msg_qbytes == 0) { msg_freehdr(msghdr); wakeup(msqkptr); error = EIDRM; goto done3; } #ifdef MAC /* * Note: Since the task/thread allocates the msghdr and usually * primes it with its own MAC label, for a majority of policies, it * won't be necessary to check whether the msghdr has access * permissions to the msgq. The mac_sysvmsq_check_msqsnd check would * suffice in that case. However, this hook may be required where * individual policies derive a non-identical label for the msghdr * from the current thread label and may want to check the msghdr * enqueue permissions, along with read/write permissions to the * msgq. 
*/ error = mac_sysvmsq_check_msgmsq(td->td_ucred, msghdr, msqkptr); if (error != 0) { msg_freehdr(msghdr); wakeup(msqkptr); goto done3; } #endif /* * Put the message into the queue */ if (msqkptr->u.msg_first == NULL) { msqkptr->u.msg_first = msghdr; msqkptr->u.msg_last = msghdr; } else { msqkptr->u.msg_last->msg_next = msghdr; msqkptr->u.msg_last = msghdr; } msqkptr->u.msg_last->msg_next = NULL; msqkptr->u.msg_cbytes += msghdr->msg_ts; msqkptr->u.msg_qnum++; msqkptr->u.msg_lspid = td->td_proc->p_pid; msqkptr->u.msg_stime = time_second; wakeup(msqkptr); td->td_retval[0] = 0; done3: #ifdef RACCT if (racct_enable && error != 0) { PROC_LOCK(td->td_proc); racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1); racct_sub(td->td_proc, RACCT_MSGQSIZE, saved_msgsz); PROC_UNLOCK(td->td_proc); } #endif done2: mtx_unlock(&msq_mtx); return (error); } int sys_msgsnd(struct thread *td, struct msgsnd_args *uap) { int error; long mtype; DPRINTF(("call to msgsnd(%d, %p, %zu, %d)\n", uap->msqid, uap->msgp, uap->msgsz, uap->msgflg)); if ((error = copyin(uap->msgp, &mtype, sizeof(mtype))) != 0) { DPRINTF(("error %d copying the message type\n", error)); return (error); } return (kern_msgsnd(td, uap->msqid, (const char *)uap->msgp + sizeof(mtype), uap->msgsz, uap->msgflg, mtype)); } #ifndef _SYS_SYSPROTO_H_ struct msgrcv_args { int msqid; void *msgp; size_t msgsz; long msgtyp; int msgflg; }; #endif /* XXX msgp is actually mtext. */ int kern_msgrcv(struct thread *td, int msqid, void *msgp, size_t msgsz, long msgtyp, int msgflg, long *mtype) { size_t len; struct msqid_kernel *msqkptr; struct msg *msghdr; struct prison *rpr; int msqix, error = 0; short next; rpr = msg_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); AUDIT_ARG_SVIPC_ID(msqid); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); return (EINVAL); } msqkptr = &msqids[msqix]; mtx_lock(&msq_mtx); AUDIT_ARG_SVIPC_PERM(&msqkptr->u.msg_perm); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such message queue id\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } if ((error = msq_prison_cansee(rpr, msqkptr))) { DPRINTF(("requester can't see prison\n")); goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) { DPRINTF(("requester doesn't have read access\n")); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqrcv(td->td_ucred, msqkptr); if (error != 0) goto done2; #endif msghdr = NULL; while (msghdr == NULL) { if (msgtyp == 0) { msghdr = msqkptr->u.msg_first; if (msghdr != NULL) { if (msgsz < msghdr->msg_ts && (msgflg & MSG_NOERROR) == 0) { DPRINTF(("first message on the queue " "is too big (want %zu, got %d)\n", msgsz, msghdr->msg_ts)); error = E2BIG; goto done2; } #ifdef MAC error = mac_sysvmsq_check_msgrcv(td->td_ucred, msghdr); if (error != 0) goto done2; #endif if (msqkptr->u.msg_first == msqkptr->u.msg_last) { msqkptr->u.msg_first = NULL; msqkptr->u.msg_last = NULL; } else { msqkptr->u.msg_first = msghdr->msg_next; if (msqkptr->u.msg_first == NULL) panic("msg_first/last screwed up #1"); } } } else { struct msg *previous; struct msg **prev; previous = NULL; prev = &(msqkptr->u.msg_first); while ((msghdr = *prev) != NULL) { /* * Is this message's type an exact match or is * this message's type less than or equal to * the absolute value of a negative msgtyp? 
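/*
 * Editor's sketch (not part of this revision): a full userland round
 * trip through sys_msgsnd()/sys_msgrcv() above. The long mtype leads the
 * buffer and is split off by the syscall wrappers; msgsz counts only the
 * mtext portion.
 */
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <err.h>
#include <stdio.h>
#include <string.h>

struct mymsg {
	long mtype;
	char mtext[64];
};

int
main(void)
{
	struct mymsg m = { .mtype = 1 };
	int id;

	if ((id = msgget(IPC_PRIVATE, IPC_CREAT | 0600)) == -1)
		err(1, "msgget");
	strlcpy(m.mtext, "hello", sizeof(m.mtext));
	if (msgsnd(id, &m, sizeof(m.mtext), 0) == -1)
		err(1, "msgsnd");
	if (msgrcv(id, &m, sizeof(m.mtext), 1, 0) == -1)	/* type 1 only */
		err(1, "msgrcv");
	printf("got type %ld: %s\n", m.mtype, m.mtext);
	(void)msgctl(id, IPC_RMID, NULL);
	return (0);
}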
* Note that the second half of this test can * NEVER be true if msgtyp is positive since * msg_type is always positive! */ if (msgtyp == msghdr->msg_type || msghdr->msg_type <= -msgtyp) { DPRINTF(("found message type %ld, " "requested %ld\n", msghdr->msg_type, msgtyp)); if (msgsz < msghdr->msg_ts && (msgflg & MSG_NOERROR) == 0) { DPRINTF(("requested message " "on the queue is too big " "(want %zu, got %hu)\n", msgsz, msghdr->msg_ts)); error = E2BIG; goto done2; } #ifdef MAC error = mac_sysvmsq_check_msgrcv( td->td_ucred, msghdr); if (error != 0) goto done2; #endif *prev = msghdr->msg_next; if (msghdr == msqkptr->u.msg_last) { if (previous == NULL) { if (prev != &msqkptr->u.msg_first) panic("msg_first/last screwed up #2"); msqkptr->u.msg_first = NULL; msqkptr->u.msg_last = NULL; } else { if (prev == &msqkptr->u.msg_first) panic("msg_first/last screwed up #3"); msqkptr->u.msg_last = previous; } } break; } previous = msghdr; prev = &(msghdr->msg_next); } } /* * We've either extracted the msghdr for the appropriate * message or there isn't one. * If there is one then bail out of this loop. */ if (msghdr != NULL) break; /* * Hmph! No message found. Does the user want to wait? */ if ((msgflg & IPC_NOWAIT) != 0) { DPRINTF(("no appropriate message found (msgtyp=%ld)\n", msgtyp)); /* The SVID says to return ENOMSG. */ error = ENOMSG; goto done2; } /* * Wait for something to happen */ DPRINTF(("msgrcv: goodnight\n")); error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH, "msgrcv", 0); DPRINTF(("msgrcv: good morning (error=%d)\n", error)); if (error != 0) { DPRINTF(("msgrcv: interrupted system call\n")); error = EINTR; goto done2; } /* * Make sure that the msq queue still exists */ if (msqkptr->u.msg_qbytes == 0 || msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("msqid deleted\n")); error = EIDRM; goto done2; } } /* * Return the message to the user. * * First, do the bookkeeping (before we risk being interrupted). */ msqkptr->u.msg_cbytes -= msghdr->msg_ts; msqkptr->u.msg_qnum--; msqkptr->u.msg_lrpid = td->td_proc->p_pid; msqkptr->u.msg_rtime = time_second; racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, 1); racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msghdr->msg_ts); /* * Make msgsz the actual amount that we'll be returning. * Note that this effectively truncates the message if it is too long * (since msgsz is never increased). */ DPRINTF(("found a message, msgsz=%zu, msg_ts=%hu\n", msgsz, msghdr->msg_ts)); if (msgsz > msghdr->msg_ts) msgsz = msghdr->msg_ts; *mtype = msghdr->msg_type; /* * Return the segments to the user */ next = msghdr->msg_spot; for (len = 0; len < msgsz; len += msginfo.msgssz) { size_t tlen; if (msgsz - len > msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz - len; if (next <= -1) panic("next too low #3"); if (next >= msginfo.msgseg) panic("next out of range #3"); mtx_unlock(&msq_mtx); error = copyout(&msgpool[next * msginfo.msgssz], msgp, tlen); mtx_lock(&msq_mtx); if (error != 0) { DPRINTF(("error (%d) copying out message segment\n", error)); msg_freehdr(msghdr); wakeup(msqkptr); goto done2; } msgp = (char *)msgp + tlen; next = msgmaps[next].next; } /* * Done, return the actual number of bytes copied out. 
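 *
 * An illustrative (non-normative) userland counterpart, reusing the
 * assumed "qid": msgtyp 0 takes the first message, a positive value
 * demands an exact type match, and a negative value accepts any type
 * up to its absolute value, exactly as the selection loop above does:
 *
 *	struct { long mtype; char mtext[64]; } m;
 *	ssize_t n;
 *
 *	n = msgrcv(qid, &m, sizeof(m.mtext), 0, MSG_NOERROR);
 *
 * MSG_NOERROR requests the silent truncation implemented just above.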
*/ msg_freehdr(msghdr); wakeup(msqkptr); td->td_retval[0] = msgsz; done2: mtx_unlock(&msq_mtx); return (error); } int sys_msgrcv(struct thread *td, struct msgrcv_args *uap) { int error; long mtype; DPRINTF(("call to msgrcv(%d, %p, %zu, %ld, %d)\n", uap->msqid, uap->msgp, uap->msgsz, uap->msgtyp, uap->msgflg)); if ((error = kern_msgrcv(td, uap->msqid, (char *)uap->msgp + sizeof(mtype), uap->msgsz, uap->msgtyp, uap->msgflg, &mtype)) != 0) return (error); if ((error = copyout(&mtype, uap->msgp, sizeof(mtype))) != 0) DPRINTF(("error %d copying the message type\n", error)); return (error); } static int sysctl_msqids(SYSCTL_HANDLER_ARGS) { struct msqid_kernel tmsqk; struct prison *pr, *rpr; int error, i; pr = req->td->td_ucred->cr_prison; rpr = msg_find_prison(req->td->td_ucred); error = 0; for (i = 0; i < msginfo.msgmni; i++) { mtx_lock(&msq_mtx); if (msqids[i].u.msg_qbytes == 0 || rpr == NULL || msq_prison_cansee(rpr, &msqids[i]) != 0) bzero(&tmsqk, sizeof(tmsqk)); else { tmsqk = msqids[i]; if (tmsqk.cred->cr_prison != pr) tmsqk.u.msg_perm.key = IPC_PRIVATE; } mtx_unlock(&msq_mtx); error = SYSCTL_OUT(req, &tmsqk, sizeof(tmsqk)); if (error != 0) break; } return (error); } SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0, "Maximum message size"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgmni, CTLFLAG_RDTUN, &msginfo.msgmni, 0, "Number of message queue identifiers"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgmnb, CTLFLAG_RDTUN, &msginfo.msgmnb, 0, "Maximum number of bytes in a queue"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgtql, CTLFLAG_RDTUN, &msginfo.msgtql, 0, "Maximum number of messages in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgssz, CTLFLAG_RDTUN, &msginfo.msgssz, 0, "Size of a message segment"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgseg, CTLFLAG_RDTUN, &msginfo.msgseg, 0, "Number of message segments"); SYSCTL_PROC(_kern_ipc, OID_AUTO, msqids, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_msqids, "", "Message queue IDs"); static int msg_prison_check(void *obj, void *data) { struct prison *pr = obj; struct prison *prpr; struct vfsoptlist *opts = data; int error, jsys; /* * sysvmsg is a jailsys integer. * It must be "disable" if the parent jail is disabled. */ error = vfs_copyopt(opts, "sysvmsg", &jsys, sizeof(jsys)); if (error != ENOENT) { if (error != 0) return (error); switch (jsys) { case JAIL_SYS_DISABLE: break; case JAIL_SYS_NEW: case JAIL_SYS_INHERIT: prison_lock(pr->pr_parent); prpr = osd_jail_get(pr->pr_parent, msg_prison_slot); prison_unlock(pr->pr_parent); if (prpr == NULL) return (EPERM); break; default: return (EINVAL); } } return (0); } static int msg_prison_set(void *obj, void *data) { struct prison *pr = obj; struct prison *tpr, *orpr, *nrpr, *trpr; struct vfsoptlist *opts = data; void *rsv; int jsys, descend; /* * sysvmsg controls which jail is the root of the associated msgs (this * jail or same as the parent), or if the feature is available at all. */ if (vfs_copyopt(opts, "sysvmsg", &jsys, sizeof(jsys)) == ENOENT) jsys = vfs_flagopt(opts, "allow.sysvipc", NULL, 0) ? JAIL_SYS_INHERIT : vfs_flagopt(opts, "allow.nosysvipc", NULL, 0) ? JAIL_SYS_DISABLE : -1; if (jsys == JAIL_SYS_DISABLE) { prison_lock(pr); orpr = osd_jail_get(pr, msg_prison_slot); if (orpr != NULL) osd_jail_del(pr, msg_prison_slot); prison_unlock(pr); if (orpr != NULL) { if (orpr == pr) msg_prison_cleanup(pr); /* Disable all child jails as well. 
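 * This is the path taken when, for example, "jail -m name=j1
 * sysvmsg=disable" is applied to a running jail (the jail name here
 * is purely illustrative).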
*/ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, msg_prison_slot); if (trpr != NULL) { osd_jail_del(tpr, msg_prison_slot); prison_unlock(tpr); if (trpr == tpr) msg_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } else if (jsys != -1) { if (jsys == JAIL_SYS_NEW) nrpr = pr; else { prison_lock(pr->pr_parent); nrpr = osd_jail_get(pr->pr_parent, msg_prison_slot); prison_unlock(pr->pr_parent); } rsv = osd_reserve(msg_prison_slot); prison_lock(pr); orpr = osd_jail_get(pr, msg_prison_slot); if (orpr != nrpr) (void)osd_jail_set_reserved(pr, msg_prison_slot, rsv, nrpr); else osd_free_reserved(rsv); prison_unlock(pr); if (orpr != nrpr) { if (orpr == pr) msg_prison_cleanup(pr); if (orpr != NULL) { /* Change child jails matching the old root, */ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, msg_prison_slot); if (trpr == orpr) { (void)osd_jail_set(tpr, msg_prison_slot, nrpr); prison_unlock(tpr); if (trpr == tpr) msg_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } } return (0); } static int msg_prison_get(void *obj, void *data) { struct prison *pr = obj; struct prison *rpr; struct vfsoptlist *opts = data; int error, jsys; /* Set sysvmsg based on the jail's root prison. */ prison_lock(pr); rpr = osd_jail_get(pr, msg_prison_slot); prison_unlock(pr); jsys = rpr == NULL ? JAIL_SYS_DISABLE : rpr == pr ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, "sysvmsg", &jsys, sizeof(jsys)); if (error == ENOENT) error = 0; return (error); } static int msg_prison_remove(void *obj, void *data __unused) { struct prison *pr = obj; struct prison *rpr; prison_lock(pr); rpr = osd_jail_get(pr, msg_prison_slot); prison_unlock(pr); if (rpr == pr) msg_prison_cleanup(pr); return (0); } static void msg_prison_cleanup(struct prison *pr) { struct msqid_kernel *msqkptr; int i; /* Remove any msqs that belong to this jail. 
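 * Sleeping senders and receivers are not torn down here; they notice
 * the removal through the msg_qbytes/seq revalidation above and fail
 * with EIDRM.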
*/ mtx_lock(&msq_mtx); for (i = 0; i < msginfo.msgmni; i++) { msqkptr = &msqids[i]; if (msqkptr->u.msg_qbytes != 0 && msqkptr->cred != NULL && msqkptr->cred->cr_prison == pr) msq_remove(msqkptr); } mtx_unlock(&msq_mtx); } SYSCTL_JAIL_PARAM_SYS_NODE(sysvmsg, CTLFLAG_RW, "SYSV message queues"); #ifdef COMPAT_FREEBSD32 int freebsd32_msgsys(struct thread *td, struct freebsd32_msgsys_args *uap) { #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) AUDIT_ARG_SVIPC_WHICH(uap->which); switch (uap->which) { case 0: return (freebsd7_freebsd32_msgctl(td, (struct freebsd7_freebsd32_msgctl_args *)&uap->a2)); case 2: return (freebsd32_msgsnd(td, (struct freebsd32_msgsnd_args *)&uap->a2)); case 3: return (freebsd32_msgrcv(td, (struct freebsd32_msgrcv_args *)&uap->a2)); default: return (sys_msgsys(td, (struct msgsys_args *)uap)); } #else return (nosys(td, NULL)); #endif } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) int freebsd7_freebsd32_msgctl(struct thread *td, struct freebsd7_freebsd32_msgctl_args *uap) { struct msqid_ds msqbuf; struct msqid_ds32_old msqbuf32; int error; if (uap->cmd == IPC_SET) { error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32)); if (error) return (error); freebsd32_ipcperm_old_in(&msqbuf32.msg_perm, &msqbuf.msg_perm); PTRIN_CP(msqbuf32, msqbuf, msg_first); PTRIN_CP(msqbuf32, msqbuf, msg_last); CP(msqbuf32, msqbuf, msg_cbytes); CP(msqbuf32, msqbuf, msg_qnum); CP(msqbuf32, msqbuf, msg_qbytes); CP(msqbuf32, msqbuf, msg_lspid); CP(msqbuf32, msqbuf, msg_lrpid); CP(msqbuf32, msqbuf, msg_stime); CP(msqbuf32, msqbuf, msg_rtime); CP(msqbuf32, msqbuf, msg_ctime); } error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf); if (error) return (error); if (uap->cmd == IPC_STAT) { bzero(&msqbuf32, sizeof(msqbuf32)); freebsd32_ipcperm_old_out(&msqbuf.msg_perm, &msqbuf32.msg_perm); PTROUT_CP(msqbuf, msqbuf32, msg_first); PTROUT_CP(msqbuf, msqbuf32, msg_last); CP(msqbuf, msqbuf32, msg_cbytes); CP(msqbuf, msqbuf32, msg_qnum); CP(msqbuf, msqbuf32, msg_qbytes); CP(msqbuf, msqbuf32, msg_lspid); CP(msqbuf, msqbuf32, msg_lrpid); CP(msqbuf, msqbuf32, msg_stime); CP(msqbuf, msqbuf32, msg_rtime); CP(msqbuf, msqbuf32, msg_ctime); error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32)); } return (error); } #endif int freebsd32_msgctl(struct thread *td, struct freebsd32_msgctl_args *uap) { struct msqid_ds msqbuf; struct msqid_ds32 msqbuf32; int error; if (uap->cmd == IPC_SET) { error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32)); if (error) return (error); freebsd32_ipcperm_in(&msqbuf32.msg_perm, &msqbuf.msg_perm); PTRIN_CP(msqbuf32, msqbuf, msg_first); PTRIN_CP(msqbuf32, msqbuf, msg_last); CP(msqbuf32, msqbuf, msg_cbytes); CP(msqbuf32, msqbuf, msg_qnum); CP(msqbuf32, msqbuf, msg_qbytes); CP(msqbuf32, msqbuf, msg_lspid); CP(msqbuf32, msqbuf, msg_lrpid); CP(msqbuf32, msqbuf, msg_stime); CP(msqbuf32, msqbuf, msg_rtime); CP(msqbuf32, msqbuf, msg_ctime); } error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf); if (error) return (error); if (uap->cmd == IPC_STAT) { freebsd32_ipcperm_out(&msqbuf.msg_perm, &msqbuf32.msg_perm); PTROUT_CP(msqbuf, msqbuf32, msg_first); PTROUT_CP(msqbuf, msqbuf32, msg_last); CP(msqbuf, msqbuf32, msg_cbytes); CP(msqbuf, msqbuf32, msg_qnum); CP(msqbuf, msqbuf32, msg_qbytes); CP(msqbuf, msqbuf32, msg_lspid); CP(msqbuf, msqbuf32, msg_lrpid); CP(msqbuf, msqbuf32, msg_stime); CP(msqbuf, msqbuf32, msg_rtime); CP(msqbuf, msqbuf32, msg_ctime); error = 
copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32)); } return (error); } int freebsd32_msgsnd(struct thread *td, struct freebsd32_msgsnd_args *uap) { const void *msgp; long mtype; int32_t mtype32; int error; msgp = PTRIN(uap->msgp); if ((error = copyin(msgp, &mtype32, sizeof(mtype32))) != 0) return (error); mtype = mtype32; return (kern_msgsnd(td, uap->msqid, (const char *)msgp + sizeof(mtype32), uap->msgsz, uap->msgflg, mtype)); } int freebsd32_msgrcv(struct thread *td, struct freebsd32_msgrcv_args *uap) { void *msgp; long mtype; int32_t mtype32; int error; msgp = PTRIN(uap->msgp); if ((error = kern_msgrcv(td, uap->msqid, (char *)msgp + sizeof(mtype32), uap->msgsz, uap->msgtyp, uap->msgflg, &mtype)) != 0) return (error); mtype32 = (int32_t)mtype; return (copyout(&mtype32, msgp, sizeof(mtype32))); } #endif #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *msgcalls[] = { (sy_call_t *)freebsd7_msgctl, (sy_call_t *)sys_msgget, (sy_call_t *)sys_msgsnd, (sy_call_t *)sys_msgrcv }; /* * Entry point for all MSG calls. * * XXX actually varargs. * struct msgsys_args { * int which; * int a2; * int a3; * int a4; * int a5; * int a6; * } *uap; */ int sys_msgsys(struct thread *td, struct msgsys_args *uap) { int error; AUDIT_ARG_SVIPC_WHICH(uap->which); if (uap->which < 0 || uap->which >= nitems(msgcalls)) return (EINVAL); error = (*msgcalls[uap->which])(td, &uap->a2); return (error); } #ifndef CP #define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0) #endif #ifndef _SYS_SYSPROTO_H_ struct freebsd7_msgctl_args { int msqid; int cmd; struct msqid_ds_old *buf; }; #endif int freebsd7_msgctl(struct thread *td, struct freebsd7_msgctl_args *uap) { struct msqid_ds_old msqold; struct msqid_ds msqbuf; int error; DPRINTF(("call to freebsd7_msgctl(%d, %d, %p)\n", uap->msqid, uap->cmd, uap->buf)); if (uap->cmd == IPC_SET) { error = copyin(uap->buf, &msqold, sizeof(msqold)); if (error) return (error); ipcperm_old2new(&msqold.msg_perm, &msqbuf.msg_perm); CP(msqold, msqbuf, msg_first); CP(msqold, msqbuf, msg_last); CP(msqold, msqbuf, msg_cbytes); CP(msqold, msqbuf, msg_qnum); CP(msqold, msqbuf, msg_qbytes); CP(msqold, msqbuf, msg_lspid); CP(msqold, msqbuf, msg_lrpid); CP(msqold, msqbuf, msg_stime); CP(msqold, msqbuf, msg_rtime); CP(msqold, msqbuf, msg_ctime); } error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf); if (error) return (error); if (uap->cmd == IPC_STAT) { bzero(&msqold, sizeof(msqold)); ipcperm_new2old(&msqbuf.msg_perm, &msqold.msg_perm); CP(msqbuf, msqold, msg_first); CP(msqbuf, msqold, msg_last); CP(msqbuf, msqold, msg_cbytes); CP(msqbuf, msqold, msg_qnum); CP(msqbuf, msqold, msg_qbytes); CP(msqbuf, msqold, msg_lspid); CP(msqbuf, msqold, msg_lrpid); CP(msqbuf, msqold, msg_stime); CP(msqbuf, msqold, msg_rtime); CP(msqbuf, msqold, msg_ctime); error = copyout(&msqold, uap->buf, sizeof(struct msqid_ds_old)); } return (error); } #undef CP #endif /* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 || COMPAT_FREEBSD7 */ Index: head/sys/kern/sysv_sem.c =================================================================== --- head/sys/kern/sysv_sem.c (revision 326270) +++ head/sys/kern/sysv_sem.c (revision 326271) @@ -1,1972 +1,1974 @@ /*- * Implementation of SVID semaphores * * Author: Daniel Boulet * * This software is provided ``AS IS'' without any warranties of any kind. 
*/ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003-2005 McAfee, Inc. * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(sysv_sem, "System V semaphores support"); static MALLOC_DEFINE(M_SEM, "sem", "SVID compatible semaphores"); #ifdef SEM_DEBUG #define DPRINTF(a) printf a #else #define DPRINTF(a) #endif static int seminit(void); static int sysvsem_modload(struct module *, int, void *); static int semunload(void); static void semexit_myhook(void *arg, struct proc *p); static int sysctl_sema(SYSCTL_HANDLER_ARGS); static int semvalid(int semid, struct prison *rpr, struct semid_kernel *semakptr); static void sem_remove(int semidx, struct ucred *cred); static struct prison *sem_find_prison(struct ucred *); static int sem_prison_cansee(struct prison *, struct semid_kernel *); static int sem_prison_check(void *, void *); static int sem_prison_set(void *, void *); static int sem_prison_get(void *, void *); static int sem_prison_remove(void *, void *); static void sem_prison_cleanup(struct prison *); #ifndef _SYS_SYSPROTO_H_ struct __semctl_args; int __semctl(struct thread *td, struct __semctl_args *uap); struct semget_args; int semget(struct thread *td, struct semget_args *uap); struct semop_args; int semop(struct thread *td, struct semop_args *uap); #endif static struct sem_undo *semu_alloc(struct thread *td); static int semundo_adjust(struct thread *td, struct sem_undo **supptr, int semid, int semseq, int semnum, int adjval); static void semundo_clear(int semid, int semnum); static struct mtx sem_mtx; /* semaphore global lock */ static struct mtx sem_undo_mtx; static int semtot = 0; static struct semid_kernel *sema; /* semaphore id pool */ static struct mtx *sema_mtx; /* semaphore id pool mutexes*/ static struct sem *sem; /* semaphore pool */ LIST_HEAD(, sem_undo) semu_list; /* list of active undo structures */ LIST_HEAD(, sem_undo) semu_free_list; /* list of free undo structures */ static int *semu; /* undo structure pool */ static eventhandler_tag semexit_tag; static unsigned sem_prison_slot; /* prison OSD slot */ #define SEMUNDO_MTX sem_undo_mtx #define SEMUNDO_LOCK() mtx_lock(&SEMUNDO_MTX); #define SEMUNDO_UNLOCK() mtx_unlock(&SEMUNDO_MTX); #define SEMUNDO_LOCKASSERT(how) mtx_assert(&SEMUNDO_MTX, (how)); struct sem { u_short semval; /* semaphore value */ pid_t sempid; /* pid of last operation */ u_short semncnt; /* # awaiting semval > cval */ u_short semzcnt; /* # awaiting semval = 0 */ }; /* * Undo structure (one per process) */ struct sem_undo { LIST_ENTRY(sem_undo) un_next; /* ptr to next active undo structure */ struct proc *un_proc; /* owner of this structure */ short un_cnt; /* # of active entries */ struct undo { short un_adjval; /* adjust on exit values */ short un_num; /* semaphore # */ int un_id; /* semid */ unsigned short un_seq; } un_ent[1]; /* undo entries */ }; /* * Configuration parameters */ #ifndef SEMMNI #define SEMMNI 50 /* # of semaphore identifiers */ #endif #ifndef SEMMNS #define SEMMNS 340 /* # of semaphores in system */ #endif #ifndef SEMUME #define SEMUME 50 /* max # of undo entries per process */ #endif #ifndef SEMMNU #define SEMMNU 150 /* # of undo structures in system */ #endif /* shouldn't need tuning */ #ifndef SEMMSL #define SEMMSL SEMMNS /* max # of semaphores per id */ #endif #ifndef SEMOPM #define SEMOPM 100 /* max # of operations per semop call */ #endif #define SEMVMX 32767 /* semaphore maximum 
value */ #define SEMAEM 16384 /* adjust on exit max value */ /* * Due to the way semaphore memory is allocated, we have to ensure that * SEMUSZ is properly aligned. */ #define SEM_ALIGN(bytes) roundup2(bytes, sizeof(long)) /* actual size of an undo structure */ #define SEMUSZ SEM_ALIGN(offsetof(struct sem_undo, un_ent[SEMUME])) /* * Macro to find a particular sem_undo vector */ #define SEMU(ix) \ ((struct sem_undo *)(((intptr_t)semu)+ix * seminfo.semusz)) /* * semaphore info struct */ struct seminfo seminfo = { SEMMNI, /* # of semaphore identifiers */ SEMMNS, /* # of semaphores in system */ SEMMNU, /* # of undo structures in system */ SEMMSL, /* max # of semaphores per id */ SEMOPM, /* max # of operations per semop call */ SEMUME, /* max # of undo entries per process */ SEMUSZ, /* size in bytes of undo structure */ SEMVMX, /* semaphore maximum value */ SEMAEM /* adjust on exit max value */ }; SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RDTUN, &seminfo.semmni, 0, "Number of semaphore identifiers"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RDTUN, &seminfo.semmns, 0, "Maximum number of semaphores in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmnu, CTLFLAG_RDTUN, &seminfo.semmnu, 0, "Maximum number of undo structures in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmsl, CTLFLAG_RWTUN, &seminfo.semmsl, 0, "Max semaphores per id"); SYSCTL_INT(_kern_ipc, OID_AUTO, semopm, CTLFLAG_RDTUN, &seminfo.semopm, 0, "Max operations per semop call"); SYSCTL_INT(_kern_ipc, OID_AUTO, semume, CTLFLAG_RDTUN, &seminfo.semume, 0, "Max undo entries per process"); SYSCTL_INT(_kern_ipc, OID_AUTO, semusz, CTLFLAG_RDTUN, &seminfo.semusz, 0, "Size in bytes of undo structure"); SYSCTL_INT(_kern_ipc, OID_AUTO, semvmx, CTLFLAG_RWTUN, &seminfo.semvmx, 0, "Semaphore maximum value"); SYSCTL_INT(_kern_ipc, OID_AUTO, semaem, CTLFLAG_RWTUN, &seminfo.semaem, 0, "Adjust on exit max value"); SYSCTL_PROC(_kern_ipc, OID_AUTO, sema, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_sema, "", "Semaphore id pool"); static struct syscall_helper_data sem_syscalls[] = { SYSCALL_INIT_HELPER(__semctl), SYSCALL_INIT_HELPER(semget), SYSCALL_INIT_HELPER(semop), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL_INIT_HELPER(semsys), SYSCALL_INIT_HELPER_COMPAT(freebsd7___semctl), #endif SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include static struct syscall_helper_data sem32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_semctl), SYSCALL32_INIT_HELPER_COMPAT(semget), SYSCALL32_INIT_HELPER_COMPAT(semop), SYSCALL32_INIT_HELPER(freebsd32_semsys), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL32_INIT_HELPER(freebsd7_freebsd32_semctl), #endif SYSCALL_INIT_LAST }; #endif static int seminit(void) { struct prison *pr; void **rsv; int i, error; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_CHECK] = sem_prison_check, [PR_METHOD_SET] = sem_prison_set, [PR_METHOD_GET] = sem_prison_get, [PR_METHOD_REMOVE] = sem_prison_remove, }; sem = malloc(sizeof(struct sem) * seminfo.semmns, M_SEM, M_WAITOK); sema = malloc(sizeof(struct semid_kernel) * seminfo.semmni, M_SEM, M_WAITOK); sema_mtx = malloc(sizeof(struct mtx) * seminfo.semmni, M_SEM, M_WAITOK | M_ZERO); semu = malloc(seminfo.semmnu * seminfo.semusz, M_SEM, M_WAITOK); for (i = 0; i < seminfo.semmni; i++) { sema[i].u.sem_base = 0; sema[i].u.sem_perm.mode = 0; sema[i].u.sem_perm.seq = 
0; #ifdef MAC mac_sysvsem_init(&sema[i]); #endif } for (i = 0; i < seminfo.semmni; i++) mtx_init(&sema_mtx[i], "semid", NULL, MTX_DEF); LIST_INIT(&semu_free_list); for (i = 0; i < seminfo.semmnu; i++) { struct sem_undo *suptr = SEMU(i); suptr->un_proc = NULL; LIST_INSERT_HEAD(&semu_free_list, suptr, un_next); } LIST_INIT(&semu_list); mtx_init(&sem_mtx, "sem", NULL, MTX_DEF); mtx_init(&sem_undo_mtx, "semu", NULL, MTX_DEF); semexit_tag = EVENTHANDLER_REGISTER(process_exit, semexit_myhook, NULL, EVENTHANDLER_PRI_ANY); /* Set current prisons according to their allow.sysvipc. */ sem_prison_slot = osd_jail_register(NULL, methods); rsv = osd_reserve(sem_prison_slot); prison_lock(&prison0); (void)osd_jail_set_reserved(&prison0, sem_prison_slot, rsv, &prison0); prison_unlock(&prison0); rsv = NULL; sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { if (rsv == NULL) rsv = osd_reserve(sem_prison_slot); prison_lock(pr); if ((pr->pr_allow & PR_ALLOW_SYSVIPC) && pr->pr_ref > 0) { (void)osd_jail_set_reserved(pr, sem_prison_slot, rsv, &prison0); rsv = NULL; } prison_unlock(pr); } if (rsv != NULL) osd_free_reserved(rsv); sx_sunlock(&allprison_lock); error = syscall_helper_register(sem_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(sem32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int semunload(void) { int i; /* XXXKIB */ if (semtot != 0) return (EBUSY); #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(sem32_syscalls); #endif syscall_helper_unregister(sem_syscalls); EVENTHANDLER_DEREGISTER(process_exit, semexit_tag); if (sem_prison_slot != 0) osd_jail_deregister(sem_prison_slot); #ifdef MAC for (i = 0; i < seminfo.semmni; i++) mac_sysvsem_destroy(&sema[i]); #endif free(sem, M_SEM); free(sema, M_SEM); free(semu, M_SEM); for (i = 0; i < seminfo.semmni; i++) mtx_destroy(&sema_mtx[i]); free(sema_mtx, M_SEM); mtx_destroy(&sem_mtx); mtx_destroy(&sem_undo_mtx); return (0); } static int sysvsem_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = seminit(); if (error != 0) semunload(); break; case MOD_UNLOAD: error = semunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvsem_mod = { "sysvsem", &sysvsem_modload, NULL }; DECLARE_MODULE(sysvsem, sysvsem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST); MODULE_VERSION(sysvsem, 1); /* * Allocate a new sem_undo structure for a process * (returns ptr to structure or NULL if no more room) */ static struct sem_undo * semu_alloc(struct thread *td) { struct sem_undo *suptr; SEMUNDO_LOCKASSERT(MA_OWNED); if ((suptr = LIST_FIRST(&semu_free_list)) == NULL) return (NULL); LIST_REMOVE(suptr, un_next); LIST_INSERT_HEAD(&semu_list, suptr, un_next); suptr->un_cnt = 0; suptr->un_proc = td->td_proc; return (suptr); } static int semu_try_free(struct sem_undo *suptr) { SEMUNDO_LOCKASSERT(MA_OWNED); if (suptr->un_cnt != 0) return (0); LIST_REMOVE(suptr, un_next); LIST_INSERT_HEAD(&semu_free_list, suptr, un_next); return (1); } /* * Adjust a particular entry for a particular proc */ static int semundo_adjust(struct thread *td, struct sem_undo **supptr, int semid, int semseq, int semnum, int adjval) { struct proc *p = td->td_proc; struct sem_undo *suptr; struct undo *sunptr; int i; SEMUNDO_LOCKASSERT(MA_OWNED); /* Look for and remember the sem_undo if the caller doesn't provide it */ suptr = *supptr; if (suptr == NULL) { LIST_FOREACH(suptr, 
&semu_list, un_next) { if (suptr->un_proc == p) { *supptr = suptr; break; } } if (suptr == NULL) { if (adjval == 0) return(0); suptr = semu_alloc(td); if (suptr == NULL) return (ENOSPC); *supptr = suptr; } } /* * Look for the requested entry and adjust it (delete if adjval becomes * 0). */ sunptr = &suptr->un_ent[0]; for (i = 0; i < suptr->un_cnt; i++, sunptr++) { if (sunptr->un_id != semid || sunptr->un_num != semnum) continue; if (adjval != 0) { adjval += sunptr->un_adjval; if (adjval > seminfo.semaem || adjval < -seminfo.semaem) return (ERANGE); } sunptr->un_adjval = adjval; if (sunptr->un_adjval == 0) { suptr->un_cnt--; if (i < suptr->un_cnt) suptr->un_ent[i] = suptr->un_ent[suptr->un_cnt]; if (suptr->un_cnt == 0) semu_try_free(suptr); } return (0); } /* Didn't find the right entry - create it */ if (adjval == 0) return (0); if (adjval > seminfo.semaem || adjval < -seminfo.semaem) return (ERANGE); if (suptr->un_cnt != seminfo.semume) { sunptr = &suptr->un_ent[suptr->un_cnt]; suptr->un_cnt++; sunptr->un_adjval = adjval; sunptr->un_id = semid; sunptr->un_num = semnum; sunptr->un_seq = semseq; } else return (EINVAL); return (0); } static void semundo_clear(int semid, int semnum) { struct sem_undo *suptr, *suptr1; struct undo *sunptr; int i; SEMUNDO_LOCKASSERT(MA_OWNED); LIST_FOREACH_SAFE(suptr, &semu_list, un_next, suptr1) { sunptr = &suptr->un_ent[0]; for (i = 0; i < suptr->un_cnt; i++, sunptr++) { if (sunptr->un_id != semid) continue; if (semnum == -1 || sunptr->un_num == semnum) { suptr->un_cnt--; if (i < suptr->un_cnt) { suptr->un_ent[i] = suptr->un_ent[suptr->un_cnt]; continue; } semu_try_free(suptr); } if (semnum != -1) break; } } } static int semvalid(int semid, struct prison *rpr, struct semid_kernel *semakptr) { return ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || semakptr->u.sem_perm.seq != IPCID_TO_SEQ(semid) || sem_prison_cansee(rpr, semakptr) ? EINVAL : 0); } static void sem_remove(int semidx, struct ucred *cred) { struct semid_kernel *semakptr; int i; KASSERT(semidx >= 0 && semidx < seminfo.semmni, ("semidx out of bounds")); semakptr = &sema[semidx]; semakptr->u.sem_perm.cuid = cred ? cred->cr_uid : 0; semakptr->u.sem_perm.uid = cred ? cred->cr_uid : 0; semakptr->u.sem_perm.mode = 0; racct_sub_cred(semakptr->cred, RACCT_NSEM, semakptr->u.sem_nsems); crfree(semakptr->cred); semakptr->cred = NULL; SEMUNDO_LOCK(); semundo_clear(semidx, -1); SEMUNDO_UNLOCK(); #ifdef MAC mac_sysvsem_cleanup(semakptr); #endif wakeup(semakptr); for (i = 0; i < seminfo.semmni; i++) { if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && sema[i].u.sem_base > semakptr->u.sem_base) mtx_lock_flags(&sema_mtx[i], LOP_DUPOK); } for (i = semakptr->u.sem_base - sem; i < semtot; i++) sem[i] = sem[i + semakptr->u.sem_nsems]; for (i = 0; i < seminfo.semmni; i++) { if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && sema[i].u.sem_base > semakptr->u.sem_base) { sema[i].u.sem_base -= semakptr->u.sem_nsems; mtx_unlock(&sema_mtx[i]); } } semtot -= semakptr->u.sem_nsems; } static struct prison * sem_find_prison(struct ucred *cred) { struct prison *pr, *rpr; pr = cred->cr_prison; prison_lock(pr); rpr = osd_jail_get(pr, sem_prison_slot); prison_unlock(pr); return rpr; } static int sem_prison_cansee(struct prison *rpr, struct semid_kernel *semakptr) { if (semakptr->cred == NULL || !(rpr == semakptr->cred->cr_prison || prison_ischild(rpr, semakptr->cred->cr_prison))) return (EINVAL); return (0); } /* * Note that the user-mode half of this passes a union, not a pointer. 
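 *
 * An illustrative (non-normative) fragment of that user-mode half,
 * with "id" assumed to come from semget(); the union itself is passed
 * by value as the fourth argument:
 *
 *	union semun un;
 *
 *	un.val = 1;
 *	if (semctl(id, 0, SETVAL, un) == -1)
 *		warn("semctl");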
*/ #ifndef _SYS_SYSPROTO_H_ struct __semctl_args { int semid; int semnum; int cmd; union semun *arg; }; #endif int sys___semctl(struct thread *td, struct __semctl_args *uap) { struct semid_ds dsbuf; union semun arg, semun; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(arg.buf, &dsbuf, sizeof(dsbuf)); if (error) return (error); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = arg.array; break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: error = copyout(&dsbuf, arg.buf, sizeof(dsbuf)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } int kern_semctl(struct thread *td, int semid, int semnum, int cmd, union semun *arg, register_t *rval) { u_short *array; struct ucred *cred = td->td_ucred; int i, error; struct prison *rpr; struct semid_ds *sbuf; struct semid_kernel *semakptr; struct mtx *sema_mtxp; u_short usval, count; int semidx; DPRINTF(("call to semctl(%d, %d, %d, 0x%p)\n", semid, semnum, cmd, arg)); AUDIT_ARG_SVIPC_CMD(cmd); AUDIT_ARG_SVIPC_ID(semid); rpr = sem_find_prison(td->td_ucred); if (sem == NULL) return (ENOSYS); array = NULL; switch(cmd) { case SEM_STAT: /* * For this command we assume semid is an array index * rather than an IPC id. */ if (semid < 0 || semid >= seminfo.semmni) return (EINVAL); semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) { error = EINVAL; goto done2; } if ((error = sem_prison_cansee(rpr, semakptr))) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; #ifdef MAC error = mac_sysvsem_check_semctl(cred, semakptr, cmd); if (error != 0) goto done2; #endif bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds)); if (cred->cr_prison != semakptr->cred->cr_prison) arg->buf->sem_perm.key = IPC_PRIVATE; *rval = IXSEQ_TO_IPCID(semid, semakptr->u.sem_perm); mtx_unlock(sema_mtxp); return (0); } semidx = IPCID_TO_IX(semid); if (semidx < 0 || semidx >= seminfo.semmni) return (EINVAL); semakptr = &sema[semidx]; sema_mtxp = &sema_mtx[semidx]; if (cmd == IPC_RMID) mtx_lock(&sem_mtx); mtx_lock(sema_mtxp); #ifdef MAC error = mac_sysvsem_check_semctl(cred, semakptr, cmd); if (error != 0) goto done2; #endif error = 0; *rval = 0; switch (cmd) { case IPC_RMID: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M))) goto done2; sem_remove(semidx, cred); break; case IPC_SET: AUDIT_ARG_SVIPC_PERM(&arg->buf->sem_perm); if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M))) goto done2; sbuf = arg->buf; semakptr->u.sem_perm.uid = sbuf->sem_perm.uid; semakptr->u.sem_perm.gid = sbuf->sem_perm.gid; semakptr->u.sem_perm.mode = (semakptr->u.sem_perm.mode & ~0777) | (sbuf->sem_perm.mode & 0777); semakptr->u.sem_ctime = time_second; break; case IPC_STAT: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds)); if (cred->cr_prison != semakptr->cred->cr_prison) arg->buf->sem_perm.key = 
IPC_PRIVATE; break; case GETNCNT: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semncnt; break; case GETPID: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].sempid; break; case GETVAL: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semval; break; case GETALL: /* * Unfortunately, callers of this function don't know * in advance how many semaphores are in this set. * While we could just allocate the maximum size array * and pass the actual size back to the caller, that * won't work for SETALL since we can't copyin() more * data than the user specified as we may return a * spurious EFAULT. * * Note that the number of semaphores in a set is * fixed for the life of that set. The only way that * the 'count' could change while are blocked in * malloc() is if this semaphore set were destroyed * and a new one created with the same index. * However, semvalid() will catch that due to the * sequence number unless exactly 0x8000 (or a * multiple thereof) semaphore sets for the same index * are created and destroyed while we are in malloc! * */ count = semakptr->u.sem_nsems; mtx_unlock(sema_mtxp); array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK); mtx_lock(sema_mtxp); if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; KASSERT(count == semakptr->u.sem_nsems, ("nsems changed")); if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; for (i = 0; i < semakptr->u.sem_nsems; i++) array[i] = semakptr->u.sem_base[i].semval; mtx_unlock(sema_mtxp); error = copyout(array, arg->array, count * sizeof(*array)); mtx_lock(sema_mtxp); break; case GETZCNT: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semzcnt; break; case SETVAL: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } if (arg->val < 0 || arg->val > seminfo.semvmx) { error = ERANGE; goto done2; } semakptr->u.sem_base[semnum].semval = arg->val; SEMUNDO_LOCK(); semundo_clear(semidx, semnum); SEMUNDO_UNLOCK(); wakeup(semakptr); break; case SETALL: /* * See comment on GETALL for why 'count' shouldn't change * and why we require a userland buffer. 
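 *
 * An illustrative (non-normative) userland SETALL under those
 * constraints, with "id" and "nsems" assumed known from semget():
 *
 *	u_short *vals = calloc(nsems, sizeof(*vals));
 *	union semun un;
 *
 *	un.array = vals;
 *	if (semctl(id, 0, SETALL, un) == -1)
 *		warn("semctl");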
*/ count = semakptr->u.sem_nsems; mtx_unlock(sema_mtxp); array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK); error = copyin(arg->array, array, count * sizeof(*array)); mtx_lock(sema_mtxp); if (error) break; if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; KASSERT(count == semakptr->u.sem_nsems, ("nsems changed")); if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W))) goto done2; for (i = 0; i < semakptr->u.sem_nsems; i++) { usval = array[i]; if (usval > seminfo.semvmx) { error = ERANGE; break; } semakptr->u.sem_base[i].semval = usval; } SEMUNDO_LOCK(); semundo_clear(semidx, -1); SEMUNDO_UNLOCK(); wakeup(semakptr); break; default: error = EINVAL; break; } done2: mtx_unlock(sema_mtxp); if (cmd == IPC_RMID) mtx_unlock(&sem_mtx); if (array != NULL) free(array, M_TEMP); return(error); } #ifndef _SYS_SYSPROTO_H_ struct semget_args { key_t key; int nsems; int semflg; }; #endif int sys_semget(struct thread *td, struct semget_args *uap) { int semid, error = 0; int key = uap->key; int nsems = uap->nsems; int semflg = uap->semflg; struct ucred *cred = td->td_ucred; DPRINTF(("semget(0x%x, %d, 0%o)\n", key, nsems, semflg)); AUDIT_ARG_VALUE(semflg); if (sem_find_prison(cred) == NULL) return (ENOSYS); mtx_lock(&sem_mtx); if (key != IPC_PRIVATE) { for (semid = 0; semid < seminfo.semmni; semid++) { if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) && sema[semid].cred != NULL && sema[semid].cred->cr_prison == cred->cr_prison && sema[semid].u.sem_perm.key == key) break; } if (semid < seminfo.semmni) { AUDIT_ARG_SVIPC_ID(semid); DPRINTF(("found public key\n")); if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) { DPRINTF(("not exclusive\n")); error = EEXIST; goto done2; } if ((error = ipcperm(td, &sema[semid].u.sem_perm, semflg & 0700))) { goto done2; } if (nsems > 0 && sema[semid].u.sem_nsems < nsems) { DPRINTF(("too small\n")); error = EINVAL; goto done2; } #ifdef MAC error = mac_sysvsem_check_semget(cred, &sema[semid]); if (error != 0) goto done2; #endif goto found; } } DPRINTF(("need to allocate the semid_kernel\n")); if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) { if (nsems <= 0 || nsems > seminfo.semmsl) { DPRINTF(("nsems out of range (0<%d<=%d)\n", nsems, seminfo.semmsl)); error = EINVAL; goto done2; } if (nsems > seminfo.semmns - semtot) { DPRINTF(( "not enough semaphores left (need %d, got %d)\n", nsems, seminfo.semmns - semtot)); error = ENOSPC; goto done2; } for (semid = 0; semid < seminfo.semmni; semid++) { if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0) break; } if (semid == seminfo.semmni) { DPRINTF(("no more semid_kernel's available\n")); error = ENOSPC; goto done2; } #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_NSEM, nsems); PROC_UNLOCK(td->td_proc); if (error != 0) { error = ENOSPC; goto done2; } } #endif DPRINTF(("semid %d is available\n", semid)); mtx_lock(&sema_mtx[semid]); KASSERT((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0, ("Lost semaphore %d", semid)); sema[semid].u.sem_perm.key = key; sema[semid].u.sem_perm.cuid = cred->cr_uid; sema[semid].u.sem_perm.uid = cred->cr_uid; sema[semid].u.sem_perm.cgid = cred->cr_gid; sema[semid].u.sem_perm.gid = cred->cr_gid; sema[semid].u.sem_perm.mode = (semflg & 0777) | SEM_ALLOC; sema[semid].cred = crhold(cred); sema[semid].u.sem_perm.seq = (sema[semid].u.sem_perm.seq + 1) & 0x7fff; sema[semid].u.sem_nsems = nsems; sema[semid].u.sem_otime = 0; sema[semid].u.sem_ctime = time_second; sema[semid].u.sem_base = &sem[semtot]; semtot += nsems; bzero(sema[semid].u.sem_base, 
sizeof(sema[semid].u.sem_base[0])*nsems); #ifdef MAC mac_sysvsem_create(cred, &sema[semid]); #endif mtx_unlock(&sema_mtx[semid]); DPRINTF(("sembase = %p, next = %p\n", sema[semid].u.sem_base, &sem[semtot])); } else { DPRINTF(("didn't find it and wasn't asked to create it\n")); error = ENOENT; goto done2; } found: td->td_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].u.sem_perm); done2: mtx_unlock(&sem_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct semop_args { int semid; struct sembuf *sops; size_t nsops; }; #endif int sys_semop(struct thread *td, struct semop_args *uap) { #define SMALL_SOPS 8 struct sembuf small_sops[SMALL_SOPS]; int semid = uap->semid; size_t nsops = uap->nsops; struct prison *rpr; struct sembuf *sops; struct semid_kernel *semakptr; struct sembuf *sopptr = NULL; struct sem *semptr = NULL; struct sem_undo *suptr; struct mtx *sema_mtxp; size_t i, j, k; int error; int do_wakeup, do_undos; unsigned short seq; #ifdef SEM_DEBUG sops = NULL; #endif DPRINTF(("call to semop(%d, %p, %u)\n", semid, sops, nsops)); AUDIT_ARG_SVIPC_ID(semid); rpr = sem_find_prison(td->td_ucred); if (sem == NULL) return (ENOSYS); semid = IPCID_TO_IX(semid); /* Convert back to zero origin */ if (semid < 0 || semid >= seminfo.semmni) return (EINVAL); /* Allocate memory for sem_ops */ if (nsops <= SMALL_SOPS) sops = small_sops; else if (nsops > seminfo.semopm) { DPRINTF(("too many sops (max=%d, nsops=%d)\n", seminfo.semopm, nsops)); return (E2BIG); } else { #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); if (nsops > racct_get_available(td->td_proc, RACCT_NSEMOP)) { PROC_UNLOCK(td->td_proc); return (E2BIG); } PROC_UNLOCK(td->td_proc); } #endif sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK); } if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) { DPRINTF(("error = %d from copyin(%p, %p, %d)\n", error, uap->sops, sops, nsops * sizeof(sops[0]))); if (sops != small_sops) free(sops, M_SEM); return (error); } semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) { error = EINVAL; goto done2; } seq = semakptr->u.sem_perm.seq; if (seq != IPCID_TO_SEQ(uap->semid)) { error = EINVAL; goto done2; } if ((error = sem_prison_cansee(rpr, semakptr)) != 0) goto done2; /* * Initial pass through sops to see what permissions are needed. * Also perform any checks that don't need repeating on each * attempt to satisfy the request vector. */ j = 0; /* permission needed */ do_undos = 0; for (i = 0; i < nsops; i++) { sopptr = &sops[i]; if (sopptr->sem_num >= semakptr->u.sem_nsems) { error = EFBIG; goto done2; } if (sopptr->sem_flg & SEM_UNDO && sopptr->sem_op != 0) do_undos = 1; j |= (sopptr->sem_op == 0) ? SEM_R : SEM_A; } if ((error = ipcperm(td, &semakptr->u.sem_perm, j))) { DPRINTF(("error = %d from ipaccess\n", error)); goto done2; } #ifdef MAC error = mac_sysvsem_check_semop(td->td_ucred, semakptr, j); if (error != 0) goto done2; #endif /* * Loop trying to satisfy the vector of requests. * If we reach a point where we must wait, any requests already * performed are rolled back and we go to sleep until some other * process wakes us up. At this point, we start all over again. * * This ensures that from the perspective of other tasks, a set * of requests is atomic (never partially satisfied). 
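 *
 * An illustrative (non-normative) userland vector that relies on this
 * atomicity: wait for semaphore 0 to reach zero, then raise semaphore
 * 1, with both ops taking effect together or not at all ("id" assumed
 * from semget()):
 *
 *	struct sembuf v[2] = {
 *		{ .sem_num = 0, .sem_op = 0, .sem_flg = 0 },
 *		{ .sem_num = 1, .sem_op = 1, .sem_flg = SEM_UNDO },
 *	};
 *
 *	if (semop(id, v, 2) == -1)
 *		warn("semop");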
*/ for (;;) { do_wakeup = 0; error = 0; /* error return if necessary */ for (i = 0; i < nsops; i++) { sopptr = &sops[i]; semptr = &semakptr->u.sem_base[sopptr->sem_num]; DPRINTF(( "semop: semakptr=%p, sem_base=%p, " "semptr=%p, sem[%d]=%d : op=%d, flag=%s\n", semakptr, semakptr->u.sem_base, semptr, sopptr->sem_num, semptr->semval, sopptr->sem_op, (sopptr->sem_flg & IPC_NOWAIT) ? "nowait" : "wait")); if (sopptr->sem_op < 0) { if (semptr->semval + sopptr->sem_op < 0) { DPRINTF(("semop: can't do it now\n")); break; } else { semptr->semval += sopptr->sem_op; if (semptr->semval == 0 && semptr->semzcnt > 0) do_wakeup = 1; } } else if (sopptr->sem_op == 0) { if (semptr->semval != 0) { DPRINTF(("semop: not zero now\n")); break; } } else if (semptr->semval + sopptr->sem_op > seminfo.semvmx) { error = ERANGE; break; } else { if (semptr->semncnt > 0) do_wakeup = 1; semptr->semval += sopptr->sem_op; } } /* * Did we get through the entire vector? */ if (i >= nsops) goto done; /* * No ... rollback anything that we've already done */ DPRINTF(("semop: rollback 0 through %d\n", i-1)); for (j = 0; j < i; j++) semakptr->u.sem_base[sops[j].sem_num].semval -= sops[j].sem_op; /* If we detected an error, return it */ if (error != 0) goto done2; /* * If the request that we couldn't satisfy has the * NOWAIT flag set then return with EAGAIN. */ if (sopptr->sem_flg & IPC_NOWAIT) { error = EAGAIN; goto done2; } if (sopptr->sem_op == 0) semptr->semzcnt++; else semptr->semncnt++; DPRINTF(("semop: good night!\n")); error = msleep(semakptr, sema_mtxp, (PZERO - 4) | PCATCH, "semwait", 0); DPRINTF(("semop: good morning (error=%d)!\n", error)); /* return code is checked below, after sem[nz]cnt-- */ /* * Make sure that the semaphore still exists */ seq = semakptr->u.sem_perm.seq; if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || seq != IPCID_TO_SEQ(uap->semid)) { error = EIDRM; goto done2; } /* * Renew the semaphore's pointer after wakeup since * during msleep sem_base may have been modified and semptr * is not valid any more */ semptr = &semakptr->u.sem_base[sopptr->sem_num]; /* * The semaphore is still alive. Readjust the count of * waiting processes. */ if (sopptr->sem_op == 0) semptr->semzcnt--; else semptr->semncnt--; /* * Is it really morning, or was our sleep interrupted? * (Delayed check of msleep() return code because we * need to decrement sem[nz]cnt either way.) */ if (error != 0) { error = EINTR; goto done2; } DPRINTF(("semop: good morning!\n")); } done: /* * Process any SEM_UNDO requests. */ if (do_undos) { SEMUNDO_LOCK(); suptr = NULL; for (i = 0; i < nsops; i++) { /* * We only need to deal with SEM_UNDO's for non-zero * op's. */ int adjval; if ((sops[i].sem_flg & SEM_UNDO) == 0) continue; adjval = sops[i].sem_op; if (adjval == 0) continue; error = semundo_adjust(td, &suptr, semid, seq, sops[i].sem_num, -adjval); if (error == 0) continue; /* * Oh-Oh! We ran out of either sem_undo's or undo's. * Rollback the adjustments to this point and then * rollback the semaphore ups and down so we can return * with an error with all structures restored. We * rollback the undo's in the exact reverse order that * we applied them. This guarantees that we won't run * out of space as we roll things back out. 
*/ for (j = 0; j < i; j++) { k = i - j - 1; if ((sops[k].sem_flg & SEM_UNDO) == 0) continue; adjval = sops[k].sem_op; if (adjval == 0) continue; if (semundo_adjust(td, &suptr, semid, seq, sops[k].sem_num, adjval) != 0) panic("semop - can't undo undos"); } for (j = 0; j < nsops; j++) semakptr->u.sem_base[sops[j].sem_num].semval -= sops[j].sem_op; DPRINTF(("error = %d from semundo_adjust\n", error)); SEMUNDO_UNLOCK(); goto done2; } /* loop through the sops */ SEMUNDO_UNLOCK(); } /* if (do_undos) */ /* We're definitely done - set the sempid's and time */ for (i = 0; i < nsops; i++) { sopptr = &sops[i]; semptr = &semakptr->u.sem_base[sopptr->sem_num]; semptr->sempid = td->td_proc->p_pid; } semakptr->u.sem_otime = time_second; /* * Do a wakeup if any semaphore was up'd whilst something was * sleeping on it. */ if (do_wakeup) { DPRINTF(("semop: doing wakeup\n")); wakeup(semakptr); DPRINTF(("semop: back from wakeup\n")); } DPRINTF(("semop: done\n")); td->td_retval[0] = 0; done2: mtx_unlock(sema_mtxp); if (sops != small_sops) free(sops, M_SEM); return (error); } /* * Go through the undo structures for this process and apply the adjustments to * semaphores. */ static void semexit_myhook(void *arg, struct proc *p) { struct sem_undo *suptr; struct semid_kernel *semakptr; struct mtx *sema_mtxp; int semid, semnum, adjval, ix; unsigned short seq; /* * Go through the chain of undo vectors looking for one * associated with this process. */ if (LIST_EMPTY(&semu_list)) return; SEMUNDO_LOCK(); LIST_FOREACH(suptr, &semu_list, un_next) { if (suptr->un_proc == p) break; } if (suptr == NULL) { SEMUNDO_UNLOCK(); return; } LIST_REMOVE(suptr, un_next); DPRINTF(("proc @%p has undo structure with %d entries\n", p, suptr->un_cnt)); /* * If there are any active undo elements then process them. */ if (suptr->un_cnt > 0) { SEMUNDO_UNLOCK(); for (ix = 0; ix < suptr->un_cnt; ix++) { semid = suptr->un_ent[ix].un_id; semnum = suptr->un_ent[ix].un_num; adjval = suptr->un_ent[ix].un_adjval; seq = suptr->un_ent[ix].un_seq; semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || (semakptr->u.sem_perm.seq != seq)) { mtx_unlock(sema_mtxp); continue; } if (semnum >= semakptr->u.sem_nsems) panic("semexit - semnum out of range"); DPRINTF(( "semexit: %p id=%d num=%d(adj=%d) ; sem=%d\n", suptr->un_proc, suptr->un_ent[ix].un_id, suptr->un_ent[ix].un_num, suptr->un_ent[ix].un_adjval, semakptr->u.sem_base[semnum].semval)); if (adjval < 0 && semakptr->u.sem_base[semnum].semval < -adjval) semakptr->u.sem_base[semnum].semval = 0; else semakptr->u.sem_base[semnum].semval += adjval; wakeup(semakptr); DPRINTF(("semexit: back from wakeup\n")); mtx_unlock(sema_mtxp); } SEMUNDO_LOCK(); } /* * Deallocate the undo vector. 
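 * Every surviving SEM_UNDO adjustment has been applied (clamped at
 * zero above), so the structure can simply go back on the free list.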
*/ DPRINTF(("removing vector\n")); suptr->un_proc = NULL; suptr->un_cnt = 0; LIST_INSERT_HEAD(&semu_free_list, suptr, un_next); SEMUNDO_UNLOCK(); } static int sysctl_sema(SYSCTL_HANDLER_ARGS) { struct prison *pr, *rpr; struct semid_kernel tsemak; int error, i; pr = req->td->td_ucred->cr_prison; rpr = sem_find_prison(req->td->td_ucred); error = 0; for (i = 0; i < seminfo.semmni; i++) { mtx_lock(&sema_mtx[i]); if ((sema[i].u.sem_perm.mode & SEM_ALLOC) == 0 || rpr == NULL || sem_prison_cansee(rpr, &sema[i]) != 0) bzero(&tsemak, sizeof(tsemak)); else { tsemak = sema[i]; if (tsemak.cred->cr_prison != pr) tsemak.u.sem_perm.key = IPC_PRIVATE; } mtx_unlock(&sema_mtx[i]); error = SYSCTL_OUT(req, &tsemak, sizeof(tsemak)); if (error != 0) break; } return (error); } static int sem_prison_check(void *obj, void *data) { struct prison *pr = obj; struct prison *prpr; struct vfsoptlist *opts = data; int error, jsys; /* * sysvsem is a jailsys integer. * It must be "disable" if the parent jail is disabled. */ error = vfs_copyopt(opts, "sysvsem", &jsys, sizeof(jsys)); if (error != ENOENT) { if (error != 0) return (error); switch (jsys) { case JAIL_SYS_DISABLE: break; case JAIL_SYS_NEW: case JAIL_SYS_INHERIT: prison_lock(pr->pr_parent); prpr = osd_jail_get(pr->pr_parent, sem_prison_slot); prison_unlock(pr->pr_parent); if (prpr == NULL) return (EPERM); break; default: return (EINVAL); } } return (0); } static int sem_prison_set(void *obj, void *data) { struct prison *pr = obj; struct prison *tpr, *orpr, *nrpr, *trpr; struct vfsoptlist *opts = data; void *rsv; int jsys, descend; /* * sysvsem controls which jail is the root of the associated sems (this * jail or same as the parent), or if the feature is available at all. */ if (vfs_copyopt(opts, "sysvsem", &jsys, sizeof(jsys)) == ENOENT) jsys = vfs_flagopt(opts, "allow.sysvipc", NULL, 0) ? JAIL_SYS_INHERIT : vfs_flagopt(opts, "allow.nosysvipc", NULL, 0) ? JAIL_SYS_DISABLE : -1; if (jsys == JAIL_SYS_DISABLE) { prison_lock(pr); orpr = osd_jail_get(pr, sem_prison_slot); if (orpr != NULL) osd_jail_del(pr, sem_prison_slot); prison_unlock(pr); if (orpr != NULL) { if (orpr == pr) sem_prison_cleanup(pr); /* Disable all child jails as well. */ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, sem_prison_slot); if (trpr != NULL) { osd_jail_del(tpr, sem_prison_slot); prison_unlock(tpr); if (trpr == tpr) sem_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } else if (jsys != -1) { if (jsys == JAIL_SYS_NEW) nrpr = pr; else { prison_lock(pr->pr_parent); nrpr = osd_jail_get(pr->pr_parent, sem_prison_slot); prison_unlock(pr->pr_parent); } rsv = osd_reserve(sem_prison_slot); prison_lock(pr); orpr = osd_jail_get(pr, sem_prison_slot); if (orpr != nrpr) (void)osd_jail_set_reserved(pr, sem_prison_slot, rsv, nrpr); else osd_free_reserved(rsv); prison_unlock(pr); if (orpr != nrpr) { if (orpr == pr) sem_prison_cleanup(pr); if (orpr != NULL) { /* Change child jails matching the old root, */ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, sem_prison_slot); if (trpr == orpr) { (void)osd_jail_set(tpr, sem_prison_slot, nrpr); prison_unlock(tpr); if (trpr == tpr) sem_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } } return (0); } static int sem_prison_get(void *obj, void *data) { struct prison *pr = obj; struct prison *rpr; struct vfsoptlist *opts = data; int error, jsys; /* Set sysvsem based on the jail's root prison. 
*/ prison_lock(pr); rpr = osd_jail_get(pr, sem_prison_slot); prison_unlock(pr); jsys = rpr == NULL ? JAIL_SYS_DISABLE : rpr == pr ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, "sysvsem", &jsys, sizeof(jsys)); if (error == ENOENT) error = 0; return (error); } static int sem_prison_remove(void *obj, void *data __unused) { struct prison *pr = obj; struct prison *rpr; prison_lock(pr); rpr = osd_jail_get(pr, sem_prison_slot); prison_unlock(pr); if (rpr == pr) sem_prison_cleanup(pr); return (0); } static void sem_prison_cleanup(struct prison *pr) { int i; /* Remove any sems that belong to this jail. */ mtx_lock(&sem_mtx); for (i = 0; i < seminfo.semmni; i++) { if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && sema[i].cred != NULL && sema[i].cred->cr_prison == pr) { mtx_lock(&sema_mtx[i]); sem_remove(i, NULL); mtx_unlock(&sema_mtx[i]); } } mtx_unlock(&sem_mtx); } SYSCTL_JAIL_PARAM_SYS_NODE(sysvsem, CTLFLAG_RW, "SYSV semaphores"); #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *semcalls[] = { (sy_call_t *)freebsd7___semctl, (sy_call_t *)sys_semget, (sy_call_t *)sys_semop }; /* * Entry point for all SEM calls. */ int sys_semsys(td, uap) struct thread *td; /* XXX actually varargs. */ struct semsys_args /* { int which; int a2; int a3; int a4; int a5; } */ *uap; { int error; AUDIT_ARG_SVIPC_WHICH(uap->which); if (uap->which < 0 || uap->which >= nitems(semcalls)) return (EINVAL); error = (*semcalls[uap->which])(td, &uap->a2); return (error); } #ifndef CP #define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0) #endif #ifndef _SYS_SYSPROTO_H_ struct freebsd7___semctl_args { int semid; int semnum; int cmd; union semun_old *arg; }; #endif int freebsd7___semctl(struct thread *td, struct freebsd7___semctl_args *uap) { struct semid_ds_old dsold; struct semid_ds dsbuf; union semun_old arg; union semun semun; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(arg.buf, &dsold, sizeof(dsold)); if (error) return (error); ipcperm_old2new(&dsold.sem_perm, &dsbuf.sem_perm); CP(dsold, dsbuf, sem_base); CP(dsold, dsbuf, sem_nsems); CP(dsold, dsbuf, sem_otime); CP(dsold, dsbuf, sem_ctime); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = arg.array; break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: bzero(&dsold, sizeof(dsold)); ipcperm_new2old(&dsbuf.sem_perm, &dsold.sem_perm); CP(dsbuf, dsold, sem_base); CP(dsbuf, dsold, sem_nsems); CP(dsbuf, dsold, sem_otime); CP(dsbuf, dsold, sem_ctime); error = copyout(&dsold, arg.buf, sizeof(dsold)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } #endif /* COMPAT_FREEBSD{4,5,6,7} */ #ifdef COMPAT_FREEBSD32 int freebsd32_semsys(struct thread *td, struct freebsd32_semsys_args *uap) { #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) AUDIT_ARG_SVIPC_WHICH(uap->which); switch (uap->which) { case 0: return (freebsd7_freebsd32_semctl(td, (struct freebsd7_freebsd32_semctl_args *)&uap->a2)); default: return 
(sys_semsys(td, (struct semsys_args *)uap)); } #else return (nosys(td, NULL)); #endif } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) int freebsd7_freebsd32_semctl(struct thread *td, struct freebsd7_freebsd32_semctl_args *uap) { struct semid_ds32_old dsbuf32; struct semid_ds dsbuf; union semun semun; union semun32 arg; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32)); if (error) return (error); freebsd32_ipcperm_old_in(&dsbuf32.sem_perm, &dsbuf.sem_perm); PTRIN_CP(dsbuf32, dsbuf, sem_base); CP(dsbuf32, dsbuf, sem_nsems); CP(dsbuf32, dsbuf, sem_otime); CP(dsbuf32, dsbuf, sem_ctime); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = PTRIN(arg.array); break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: bzero(&dsbuf32, sizeof(dsbuf32)); freebsd32_ipcperm_old_out(&dsbuf.sem_perm, &dsbuf32.sem_perm); PTROUT_CP(dsbuf, dsbuf32, sem_base); CP(dsbuf, dsbuf32, sem_nsems); CP(dsbuf, dsbuf32, sem_otime); CP(dsbuf, dsbuf32, sem_ctime); error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } #endif int freebsd32_semctl(struct thread *td, struct freebsd32_semctl_args *uap) { struct semid_ds32 dsbuf32; struct semid_ds dsbuf; union semun semun; union semun32 arg; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32)); if (error) return (error); freebsd32_ipcperm_in(&dsbuf32.sem_perm, &dsbuf.sem_perm); PTRIN_CP(dsbuf32, dsbuf, sem_base); CP(dsbuf32, dsbuf, sem_nsems); CP(dsbuf32, dsbuf, sem_otime); CP(dsbuf32, dsbuf, sem_ctime); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = PTRIN(arg.array); break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: bzero(&dsbuf32, sizeof(dsbuf32)); freebsd32_ipcperm_out(&dsbuf.sem_perm, &dsbuf32.sem_perm); PTROUT_CP(dsbuf, dsbuf32, sem_base); CP(dsbuf, dsbuf32, sem_nsems); CP(dsbuf, dsbuf32, sem_otime); CP(dsbuf, dsbuf32, sem_ctime); error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } #endif /* COMPAT_FREEBSD32 */ Index: head/sys/kern/sysv_shm.c =================================================================== --- head/sys/kern/sysv_shm.c (revision 326270) +++ head/sys/kern/sysv_shm.c (revision 326271) @@ -1,1663 +1,1665 @@ /* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */ /*- + * SPDX-License-Identifier: BSD-4-Clause AND BSD-2-Clause-FreeBSD + * * Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Adam Glass and Charles * Hannum. * 4. The names of the authors may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2003-2005 McAfee, Inc. * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(sysv_shm, "System V shared memory segments support"); static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments"); static int shmget_allocate_segment(struct thread *td, struct shmget_args *uap, int mode); static int shmget_existing(struct thread *td, struct shmget_args *uap, int mode, int segnum); #define SHMSEG_FREE 0x0200 #define SHMSEG_REMOVED 0x0400 #define SHMSEG_ALLOCATED 0x0800 static int shm_last_free, shm_nused, shmalloced; vm_size_t shm_committed; static struct shmid_kernel *shmsegs; static unsigned shm_prison_slot; struct shmmap_state { vm_offset_t va; int shmid; }; static void shm_deallocate_segment(struct shmid_kernel *); static int shm_find_segment_by_key(struct prison *, key_t); static struct shmid_kernel *shm_find_segment(struct prison *, int, bool); static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *); static void shmrealloc(void); static int shminit(void); static int sysvshm_modload(struct module *, int, void *); static int shmunload(void); static void shmexit_myhook(struct vmspace *vm); static void shmfork_myhook(struct proc *p1, struct proc *p2); static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS); static void shm_remove(struct shmid_kernel *, int); static struct prison *shm_find_prison(struct ucred *); static int shm_prison_cansee(struct prison *, struct shmid_kernel *); static int shm_prison_check(void *, void *); static int shm_prison_set(void *, void *); static int shm_prison_get(void *, void *); static int shm_prison_remove(void *, void *); static void shm_prison_cleanup(struct prison *); /* * Tuneable values. */ #ifndef SHMMAXPGS #define SHMMAXPGS 131072 /* Note: sysv shared memory is swap backed. 
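The compile-time defaults here are exported through the kern.ipc.* sysctls declared just below. A short userland probe of two of them, shmmax (bytes) and shmall (pages); a sketch only, assuming a stock kernel where both OIDs are present:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        unsigned long shmmax, shmall;
        size_t len;

        len = sizeof(shmmax);           /* kern.ipc.shmmax: max segment size */
        if (sysctlbyname("kern.ipc.shmmax", &shmmax, &len, NULL, 0) == -1)
                return (1);
        len = sizeof(shmall);           /* kern.ipc.shmall: total page budget */
        if (sysctlbyname("kern.ipc.shmall", &shmall, &len, NULL, 0) == -1)
                return (1);
        printf("shmmax=%lu bytes, shmall=%lu pages\n", shmmax, shmall);
        return (0);
}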
*/ #endif #ifndef SHMMAX #define SHMMAX (SHMMAXPGS*PAGE_SIZE) #endif #ifndef SHMMIN #define SHMMIN 1 #endif #ifndef SHMMNI #define SHMMNI 192 #endif #ifndef SHMSEG #define SHMSEG 128 #endif #ifndef SHMALL #define SHMALL (SHMMAXPGS) #endif struct shminfo shminfo = { .shmmax = SHMMAX, .shmmin = SHMMIN, .shmmni = SHMMNI, .shmseg = SHMSEG, .shmall = SHMALL }; static int shm_use_phys; static int shm_allow_removed = 1; SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RWTUN, &shminfo.shmmax, 0, "Maximum shared memory segment size"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RWTUN, &shminfo.shmmin, 0, "Minimum shared memory segment size"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RDTUN, &shminfo.shmmni, 0, "Number of shared memory identifiers"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RDTUN, &shminfo.shmseg, 0, "Number of segments per process"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RWTUN, &shminfo.shmall, 0, "Maximum number of pages available for shared memory"); SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RWTUN, &shm_use_phys, 0, "Enable/Disable locking of shared memory pages in core"); SYSCTL_INT(_kern_ipc, OID_AUTO, shm_allow_removed, CTLFLAG_RWTUN, &shm_allow_removed, 0, "Enable/Disable attachment to attached segments marked for removal"); SYSCTL_PROC(_kern_ipc, OID_AUTO, shmsegs, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_shmsegs, "", "Current number of shared memory segments allocated"); static struct sx sysvshmsx; #define SYSVSHM_LOCK() sx_xlock(&sysvshmsx) #define SYSVSHM_UNLOCK() sx_xunlock(&sysvshmsx) #define SYSVSHM_ASSERT_LOCKED() sx_assert(&sysvshmsx, SA_XLOCKED) static int shm_find_segment_by_key(struct prison *pr, key_t key) { int i; for (i = 0; i < shmalloced; i++) if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) && shmsegs[i].cred != NULL && shmsegs[i].cred->cr_prison == pr && shmsegs[i].u.shm_perm.key == key) return (i); return (-1); } /* * Finds segment either by shmid if is_shmid is true, or by segnum if * is_shmid is false. */ static struct shmid_kernel * shm_find_segment(struct prison *rpr, int arg, bool is_shmid) { struct shmid_kernel *shmseg; int segnum; segnum = is_shmid ? 
IPCID_TO_IX(arg) : arg; if (segnum < 0 || segnum >= shmalloced) return (NULL); shmseg = &shmsegs[segnum]; if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 || (!shm_allow_removed && (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0) || (is_shmid && shmseg->u.shm_perm.seq != IPCID_TO_SEQ(arg)) || shm_prison_cansee(rpr, shmseg) != 0) return (NULL); return (shmseg); } static void shm_deallocate_segment(struct shmid_kernel *shmseg) { vm_size_t size; SYSVSHM_ASSERT_LOCKED(); vm_object_deallocate(shmseg->object); shmseg->object = NULL; size = round_page(shmseg->u.shm_segsz); shm_committed -= btoc(size); shm_nused--; shmseg->u.shm_perm.mode = SHMSEG_FREE; #ifdef MAC mac_sysvshm_cleanup(shmseg); #endif racct_sub_cred(shmseg->cred, RACCT_NSHM, 1); racct_sub_cred(shmseg->cred, RACCT_SHMSIZE, size); crfree(shmseg->cred); shmseg->cred = NULL; } static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *shmmap_s) { struct shmid_kernel *shmseg; int segnum, result; vm_size_t size; SYSVSHM_ASSERT_LOCKED(); segnum = IPCID_TO_IX(shmmap_s->shmid); KASSERT(segnum >= 0 && segnum < shmalloced, ("segnum %d shmalloced %d", segnum, shmalloced)); shmseg = &shmsegs[segnum]; size = round_page(shmseg->u.shm_segsz); result = vm_map_remove(&vm->vm_map, shmmap_s->va, shmmap_s->va + size); if (result != KERN_SUCCESS) return (EINVAL); shmmap_s->shmid = -1; shmseg->u.shm_dtime = time_second; if (--shmseg->u.shm_nattch == 0 && (shmseg->u.shm_perm.mode & SHMSEG_REMOVED)) { shm_deallocate_segment(shmseg); shm_last_free = segnum; } return (0); } static void shm_remove(struct shmid_kernel *shmseg, int segnum) { shmseg->u.shm_perm.key = IPC_PRIVATE; shmseg->u.shm_perm.mode |= SHMSEG_REMOVED; if (shmseg->u.shm_nattch == 0) { shm_deallocate_segment(shmseg); shm_last_free = segnum; } } static struct prison * shm_find_prison(struct ucred *cred) { struct prison *pr, *rpr; pr = cred->cr_prison; prison_lock(pr); rpr = osd_jail_get(pr, shm_prison_slot); prison_unlock(pr); return rpr; } static int shm_prison_cansee(struct prison *rpr, struct shmid_kernel *shmseg) { if (shmseg->cred == NULL || !(rpr == shmseg->cred->cr_prison || prison_ischild(rpr, shmseg->cred->cr_prison))) return (EINVAL); return (0); } static int kern_shmdt_locked(struct thread *td, const void *shmaddr) { struct proc *p = td->td_proc; struct shmmap_state *shmmap_s; #if defined(AUDIT) || defined(MAC) struct shmid_kernel *shmsegptr; #endif #ifdef MAC int error; #endif int i; SYSVSHM_ASSERT_LOCKED(); if (shm_find_prison(td->td_ucred) == NULL) return (ENOSYS); shmmap_s = p->p_vmspace->vm_shm; if (shmmap_s == NULL) return (EINVAL); AUDIT_ARG_SVIPC_ID(shmmap_s->shmid); for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) { if (shmmap_s->shmid != -1 && shmmap_s->va == (vm_offset_t)shmaddr) { break; } } if (i == shminfo.shmseg) return (EINVAL); #if (defined(AUDIT) && defined(KDTRACE_HOOKS)) || defined(MAC) shmsegptr = &shmsegs[IPCID_TO_IX(shmmap_s->shmid)]; #endif #ifdef MAC error = mac_sysvshm_check_shmdt(td->td_ucred, shmsegptr); if (error != 0) return (error); #endif return (shm_delete_mapping(p->p_vmspace, shmmap_s)); } #ifndef _SYS_SYSPROTO_H_ struct shmdt_args { const void *shmaddr; }; #endif int sys_shmdt(struct thread *td, struct shmdt_args *uap) { int error; SYSVSHM_LOCK(); error = kern_shmdt_locked(td, uap->shmaddr); SYSVSHM_UNLOCK(); return (error); } static int kern_shmat_locked(struct thread *td, int shmid, const void *shmaddr, int shmflg) { struct prison *rpr; struct proc *p = td->td_proc; struct shmid_kernel *shmseg; struct shmmap_state *shmmap_s; 
vm_offset_t attach_va; vm_prot_t prot; vm_size_t size; int error, i, rv; AUDIT_ARG_SVIPC_ID(shmid); AUDIT_ARG_VALUE(shmflg); SYSVSHM_ASSERT_LOCKED(); rpr = shm_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); shmmap_s = p->p_vmspace->vm_shm; if (shmmap_s == NULL) { shmmap_s = malloc(shminfo.shmseg * sizeof(struct shmmap_state), M_SHM, M_WAITOK); for (i = 0; i < shminfo.shmseg; i++) shmmap_s[i].shmid = -1; KASSERT(p->p_vmspace->vm_shm == NULL, ("raced")); p->p_vmspace->vm_shm = shmmap_s; } shmseg = shm_find_segment(rpr, shmid, true); if (shmseg == NULL) return (EINVAL); error = ipcperm(td, &shmseg->u.shm_perm, (shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W); if (error != 0) return (error); #ifdef MAC error = mac_sysvshm_check_shmat(td->td_ucred, shmseg, shmflg); if (error != 0) return (error); #endif for (i = 0; i < shminfo.shmseg; i++) { if (shmmap_s->shmid == -1) break; shmmap_s++; } if (i >= shminfo.shmseg) return (EMFILE); size = round_page(shmseg->u.shm_segsz); prot = VM_PROT_READ; if ((shmflg & SHM_RDONLY) == 0) prot |= VM_PROT_WRITE; if (shmaddr != NULL) { if ((shmflg & SHM_RND) != 0) attach_va = rounddown2((vm_offset_t)shmaddr, SHMLBA); else if (((vm_offset_t)shmaddr & (SHMLBA-1)) == 0) attach_va = (vm_offset_t)shmaddr; else return (EINVAL); } else { /* * This is just a hint to vm_map_find() about where to * put it. */ attach_va = round_page((vm_offset_t)p->p_vmspace->vm_daddr + lim_max(td, RLIMIT_DATA)); } vm_object_reference(shmseg->object); rv = vm_map_find(&p->p_vmspace->vm_map, shmseg->object, 0, &attach_va, size, 0, shmaddr != NULL ? VMFS_NO_SPACE : VMFS_OPTIMAL_SPACE, prot, prot, MAP_INHERIT_SHARE | MAP_PREFAULT_PARTIAL); if (rv != KERN_SUCCESS) { vm_object_deallocate(shmseg->object); return (ENOMEM); } shmmap_s->va = attach_va; shmmap_s->shmid = shmid; shmseg->u.shm_lpid = p->p_pid; shmseg->u.shm_atime = time_second; shmseg->u.shm_nattch++; td->td_retval[0] = attach_va; return (error); } int kern_shmat(struct thread *td, int shmid, const void *shmaddr, int shmflg) { int error; SYSVSHM_LOCK(); error = kern_shmat_locked(td, shmid, shmaddr, shmflg); SYSVSHM_UNLOCK(); return (error); } #ifndef _SYS_SYSPROTO_H_ struct shmat_args { int shmid; const void *shmaddr; int shmflg; }; #endif int sys_shmat(struct thread *td, struct shmat_args *uap) { return (kern_shmat(td, uap->shmid, uap->shmaddr, uap->shmflg)); } static int kern_shmctl_locked(struct thread *td, int shmid, int cmd, void *buf, size_t *bufsz) { struct prison *rpr; struct shmid_kernel *shmseg; struct shmid_ds *shmidp; struct shm_info shm_info; int error; SYSVSHM_ASSERT_LOCKED(); rpr = shm_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); AUDIT_ARG_SVIPC_ID(shmid); AUDIT_ARG_SVIPC_CMD(cmd); switch (cmd) { /* * It is possible that kern_shmctl is being called from the Linux ABI * layer, in which case, we will need to implement IPC_INFO. It should * be noted that other shmctl calls will be funneled through here for * Linux binaries as well. * * NB: The Linux ABI layer will convert this data to structure(s) more * consistent with the Linux ABI. */ case IPC_INFO: memcpy(buf, &shminfo, sizeof(shminfo)); if (bufsz) *bufsz = sizeof(shminfo); td->td_retval[0] = shmalloced; return (0); case SHM_INFO: { shm_info.used_ids = shm_nused; shm_info.shm_rss = 0; /*XXX where to get from ? */ shm_info.shm_tot = 0; /*XXX where to get from ? */ shm_info.shm_swp = 0; /*XXX where to get from ? */ shm_info.swap_attempts = 0; /*XXX where to get from ? */ shm_info.swap_successes = 0; /*XXX where to get from ? 
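The attach path above is exactly what a plain shmget()/shmat() pair from userland exercises. A minimal round trip as an illustration (the 4096-byte size and 0600 mode are arbitrary choices, and error handling is reduced to perror()):

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
        int shmid;
        char *p;

        /* Private segment: allocated by shmget_allocate_segment(). */
        shmid = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);
        if (shmid == -1) { perror("shmget"); return (1); }

        /* NULL address lets kern_shmat_locked() choose its own hint. */
        p = shmat(shmid, NULL, 0);
        if (p == (char *)-1) { perror("shmat"); return (1); }
        strcpy(p, "hello");

        (void)shmdt(p);                         /* drops shm_nattch */
        (void)shmctl(shmid, IPC_RMID, NULL);    /* freed at last detach */
        return (0);
}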
*/ memcpy(buf, &shm_info, sizeof(shm_info)); if (bufsz != NULL) *bufsz = sizeof(shm_info); td->td_retval[0] = shmalloced; return (0); } } shmseg = shm_find_segment(rpr, shmid, cmd != SHM_STAT); if (shmseg == NULL) return (EINVAL); #ifdef MAC error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, cmd); if (error != 0) return (error); #endif switch (cmd) { case SHM_STAT: case IPC_STAT: shmidp = (struct shmid_ds *)buf; error = ipcperm(td, &shmseg->u.shm_perm, IPC_R); if (error != 0) return (error); memcpy(shmidp, &shmseg->u, sizeof(struct shmid_ds)); if (td->td_ucred->cr_prison != shmseg->cred->cr_prison) shmidp->shm_perm.key = IPC_PRIVATE; if (bufsz != NULL) *bufsz = sizeof(struct shmid_ds); if (cmd == SHM_STAT) { td->td_retval[0] = IXSEQ_TO_IPCID(shmid, shmseg->u.shm_perm); } break; case IPC_SET: shmidp = (struct shmid_ds *)buf; AUDIT_ARG_SVIPC_PERM(&shmidp->shm_perm); error = ipcperm(td, &shmseg->u.shm_perm, IPC_M); if (error != 0) return (error); shmseg->u.shm_perm.uid = shmidp->shm_perm.uid; shmseg->u.shm_perm.gid = shmidp->shm_perm.gid; shmseg->u.shm_perm.mode = (shmseg->u.shm_perm.mode & ~ACCESSPERMS) | (shmidp->shm_perm.mode & ACCESSPERMS); shmseg->u.shm_ctime = time_second; break; case IPC_RMID: error = ipcperm(td, &shmseg->u.shm_perm, IPC_M); if (error != 0) return (error); shm_remove(shmseg, IPCID_TO_IX(shmid)); break; #if 0 case SHM_LOCK: case SHM_UNLOCK: #endif default: error = EINVAL; break; } return (error); } int kern_shmctl(struct thread *td, int shmid, int cmd, void *buf, size_t *bufsz) { int error; SYSVSHM_LOCK(); error = kern_shmctl_locked(td, shmid, cmd, buf, bufsz); SYSVSHM_UNLOCK(); return (error); } #ifndef _SYS_SYSPROTO_H_ struct shmctl_args { int shmid; int cmd; struct shmid_ds *buf; }; #endif int sys_shmctl(struct thread *td, struct shmctl_args *uap) { int error; struct shmid_ds buf; size_t bufsz; /* * The only reason IPC_INFO, SHM_INFO, SHM_STAT exist is to support * Linux binaries. If we see the call come through the FreeBSD ABI, * return an error back to the user since we do not support this. 
*/ if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO || uap->cmd == SHM_STAT) return (EINVAL); /* IPC_SET needs to copyin the buffer before calling kern_shmctl */ if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &buf, sizeof(struct shmid_ds)))) goto done; } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_STAT: error = copyout(&buf, uap->buf, bufsz); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } static int shmget_existing(struct thread *td, struct shmget_args *uap, int mode, int segnum) { struct shmid_kernel *shmseg; #ifdef MAC int error; #endif SYSVSHM_ASSERT_LOCKED(); KASSERT(segnum >= 0 && segnum < shmalloced, ("segnum %d shmalloced %d", segnum, shmalloced)); shmseg = &shmsegs[segnum]; if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL)) return (EEXIST); #ifdef MAC error = mac_sysvshm_check_shmget(td->td_ucred, shmseg, uap->shmflg); if (error != 0) return (error); #endif if (uap->size != 0 && uap->size > shmseg->u.shm_segsz) return (EINVAL); td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm); return (0); } static int shmget_allocate_segment(struct thread *td, struct shmget_args *uap, int mode) { struct ucred *cred = td->td_ucred; struct shmid_kernel *shmseg; vm_object_t shm_object; int i, segnum; size_t size; SYSVSHM_ASSERT_LOCKED(); if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax) return (EINVAL); if (shm_nused >= shminfo.shmmni) /* Any shmids left? */ return (ENOSPC); size = round_page(uap->size); if (shm_committed + btoc(size) > shminfo.shmall) return (ENOMEM); if (shm_last_free < 0) { shmrealloc(); /* Maybe expand the shmsegs[] array. */ for (i = 0; i < shmalloced; i++) if (shmsegs[i].u.shm_perm.mode & SHMSEG_FREE) break; if (i == shmalloced) return (ENOSPC); segnum = i; } else { segnum = shm_last_free; shm_last_free = -1; } KASSERT(segnum >= 0 && segnum < shmalloced, ("segnum %d shmalloced %d", segnum, shmalloced)); shmseg = &shmsegs[segnum]; #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); if (racct_add(td->td_proc, RACCT_NSHM, 1)) { PROC_UNLOCK(td->td_proc); return (ENOSPC); } if (racct_add(td->td_proc, RACCT_SHMSIZE, size)) { racct_sub(td->td_proc, RACCT_NSHM, 1); PROC_UNLOCK(td->td_proc); return (ENOMEM); } PROC_UNLOCK(td->td_proc); } #endif /* * We make sure that we have allocated a pager before we need * to. */ shm_object = vm_pager_allocate(shm_use_phys ? 
OBJT_PHYS : OBJT_SWAP, 0, size, VM_PROT_DEFAULT, 0, cred); if (shm_object == NULL) { #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); racct_sub(td->td_proc, RACCT_NSHM, 1); racct_sub(td->td_proc, RACCT_SHMSIZE, size); PROC_UNLOCK(td->td_proc); } #endif return (ENOMEM); } shm_object->pg_color = 0; VM_OBJECT_WLOCK(shm_object); vm_object_clear_flag(shm_object, OBJ_ONEMAPPING); vm_object_set_flag(shm_object, OBJ_COLORED | OBJ_NOSPLIT); VM_OBJECT_WUNLOCK(shm_object); shmseg->object = shm_object; shmseg->u.shm_perm.cuid = shmseg->u.shm_perm.uid = cred->cr_uid; shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = cred->cr_gid; shmseg->u.shm_perm.mode = (mode & ACCESSPERMS) | SHMSEG_ALLOCATED; shmseg->u.shm_perm.key = uap->key; shmseg->u.shm_perm.seq = (shmseg->u.shm_perm.seq + 1) & 0x7fff; shmseg->cred = crhold(cred); shmseg->u.shm_segsz = uap->size; shmseg->u.shm_cpid = td->td_proc->p_pid; shmseg->u.shm_lpid = shmseg->u.shm_nattch = 0; shmseg->u.shm_atime = shmseg->u.shm_dtime = 0; #ifdef MAC mac_sysvshm_create(cred, shmseg); #endif shmseg->u.shm_ctime = time_second; shm_committed += btoc(size); shm_nused++; td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm); return (0); } #ifndef _SYS_SYSPROTO_H_ struct shmget_args { key_t key; size_t size; int shmflg; }; #endif int sys_shmget(struct thread *td, struct shmget_args *uap) { int segnum, mode; int error; if (shm_find_prison(td->td_ucred) == NULL) return (ENOSYS); mode = uap->shmflg & ACCESSPERMS; SYSVSHM_LOCK(); if (uap->key == IPC_PRIVATE) { error = shmget_allocate_segment(td, uap, mode); } else { segnum = shm_find_segment_by_key(td->td_ucred->cr_prison, uap->key); if (segnum >= 0) error = shmget_existing(td, uap, mode, segnum); else if ((uap->shmflg & IPC_CREAT) == 0) error = ENOENT; else error = shmget_allocate_segment(td, uap, mode); } SYSVSHM_UNLOCK(); return (error); } static void shmfork_myhook(struct proc *p1, struct proc *p2) { struct shmmap_state *shmmap_s; size_t size; int i; SYSVSHM_LOCK(); size = shminfo.shmseg * sizeof(struct shmmap_state); shmmap_s = malloc(size, M_SHM, M_WAITOK); bcopy(p1->p_vmspace->vm_shm, shmmap_s, size); p2->p_vmspace->vm_shm = shmmap_s; for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) { if (shmmap_s->shmid != -1) { KASSERT(IPCID_TO_IX(shmmap_s->shmid) >= 0 && IPCID_TO_IX(shmmap_s->shmid) < shmalloced, ("segnum %d shmalloced %d", IPCID_TO_IX(shmmap_s->shmid), shmalloced)); shmsegs[IPCID_TO_IX(shmmap_s->shmid)].u.shm_nattch++; } } SYSVSHM_UNLOCK(); } static void shmexit_myhook(struct vmspace *vm) { struct shmmap_state *base, *shm; int i; base = vm->vm_shm; if (base != NULL) { vm->vm_shm = NULL; SYSVSHM_LOCK(); for (i = 0, shm = base; i < shminfo.shmseg; i++, shm++) { if (shm->shmid != -1) shm_delete_mapping(vm, shm); } SYSVSHM_UNLOCK(); free(base, M_SHM); } } static void shmrealloc(void) { struct shmid_kernel *newsegs; int i; SYSVSHM_ASSERT_LOCKED(); if (shmalloced >= shminfo.shmmni) return; newsegs = malloc(shminfo.shmmni * sizeof(*newsegs), M_SHM, M_WAITOK); for (i = 0; i < shmalloced; i++) bcopy(&shmsegs[i], &newsegs[i], sizeof(newsegs[0])); for (; i < shminfo.shmmni; i++) { newsegs[i].u.shm_perm.mode = SHMSEG_FREE; newsegs[i].u.shm_perm.seq = 0; #ifdef MAC mac_sysvshm_init(&newsegs[i]); #endif } free(shmsegs, M_SHM); shmsegs = newsegs; shmalloced = shminfo.shmmni; } static struct syscall_helper_data shm_syscalls[] = { SYSCALL_INIT_HELPER(shmat), SYSCALL_INIT_HELPER(shmctl), SYSCALL_INIT_HELPER(shmdt), SYSCALL_INIT_HELPER(shmget), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ 
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL_INIT_HELPER_COMPAT(freebsd7_shmctl), #endif #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) SYSCALL_INIT_HELPER(shmsys), #endif SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include static struct syscall_helper_data shm32_syscalls[] = { SYSCALL32_INIT_HELPER_COMPAT(shmat), SYSCALL32_INIT_HELPER_COMPAT(shmdt), SYSCALL32_INIT_HELPER_COMPAT(shmget), SYSCALL32_INIT_HELPER(freebsd32_shmsys), SYSCALL32_INIT_HELPER(freebsd32_shmctl), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL32_INIT_HELPER(freebsd7_freebsd32_shmctl), #endif SYSCALL_INIT_LAST }; #endif static int shminit(void) { struct prison *pr; void **rsv; int i, error; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_CHECK] = shm_prison_check, [PR_METHOD_SET] = shm_prison_set, [PR_METHOD_GET] = shm_prison_get, [PR_METHOD_REMOVE] = shm_prison_remove, }; #ifndef BURN_BRIDGES if (TUNABLE_ULONG_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall) != 0) printf("kern.ipc.shmmaxpgs is now called kern.ipc.shmall!\n"); #endif if (shminfo.shmmax == SHMMAX) { /* Initialize shmmax dealing with possible overflow. */ for (i = PAGE_SIZE; i != 0; i--) { shminfo.shmmax = shminfo.shmall * i; if ((shminfo.shmmax / shminfo.shmall) == (u_long)i) break; } } shmalloced = shminfo.shmmni; shmsegs = malloc(shmalloced * sizeof(shmsegs[0]), M_SHM, M_WAITOK); for (i = 0; i < shmalloced; i++) { shmsegs[i].u.shm_perm.mode = SHMSEG_FREE; shmsegs[i].u.shm_perm.seq = 0; #ifdef MAC mac_sysvshm_init(&shmsegs[i]); #endif } shm_last_free = 0; shm_nused = 0; shm_committed = 0; sx_init(&sysvshmsx, "sysvshmsx"); shmexit_hook = &shmexit_myhook; shmfork_hook = &shmfork_myhook; /* Set current prisons according to their allow.sysvipc. */ shm_prison_slot = osd_jail_register(NULL, methods); rsv = osd_reserve(shm_prison_slot); prison_lock(&prison0); (void)osd_jail_set_reserved(&prison0, shm_prison_slot, rsv, &prison0); prison_unlock(&prison0); rsv = NULL; sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { if (rsv == NULL) rsv = osd_reserve(shm_prison_slot); prison_lock(pr); if ((pr->pr_allow & PR_ALLOW_SYSVIPC) && pr->pr_ref > 0) { (void)osd_jail_set_reserved(pr, shm_prison_slot, rsv, &prison0); rsv = NULL; } prison_unlock(pr); } if (rsv != NULL) osd_free_reserved(rsv); sx_sunlock(&allprison_lock); error = syscall_helper_register(shm_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(shm32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int shmunload(void) { int i; if (shm_nused > 0) return (EBUSY); #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(shm32_syscalls); #endif syscall_helper_unregister(shm_syscalls); if (shm_prison_slot != 0) osd_jail_deregister(shm_prison_slot); for (i = 0; i < shmalloced; i++) { #ifdef MAC mac_sysvshm_destroy(&shmsegs[i]); #endif /* * Objects might be still mapped into the processes * address spaces. Actual free would happen on the * last mapping destruction. 
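One subtlety in shminit() above: shmmax defaults to shmall times the page size, but the u_long product can wrap, so the loop retries smaller multipliers until dividing the product back recovers the multiplier. The same guard restated standalone, with an extra zero check that the kernel context makes unnecessary:

static unsigned long
scale_shmmax(unsigned long shmall, unsigned long pagesize)
{
        unsigned long i, max = 0;

        if (shmall == 0)
                return (0);
        /* Largest i <= pagesize for which shmall * i does not wrap,
         * detected by dividing the product back out. */
        for (i = pagesize; i != 0; i--) {
                max = shmall * i;
                if (max / shmall == i)
                        break;
        }
        return (max);
}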
*/ if (shmsegs[i].u.shm_perm.mode != SHMSEG_FREE) vm_object_deallocate(shmsegs[i].object); } free(shmsegs, M_SHM); shmexit_hook = NULL; shmfork_hook = NULL; sx_destroy(&sysvshmsx); return (0); } static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS) { struct shmid_kernel tshmseg; struct prison *pr, *rpr; int error, i; SYSVSHM_LOCK(); pr = req->td->td_ucred->cr_prison; rpr = shm_find_prison(req->td->td_ucred); error = 0; for (i = 0; i < shmalloced; i++) { if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 || rpr == NULL || shm_prison_cansee(rpr, &shmsegs[i]) != 0) { bzero(&tshmseg, sizeof(tshmseg)); tshmseg.u.shm_perm.mode = SHMSEG_FREE; } else { tshmseg = shmsegs[i]; if (tshmseg.cred->cr_prison != pr) tshmseg.u.shm_perm.key = IPC_PRIVATE; } error = SYSCTL_OUT(req, &tshmseg, sizeof(tshmseg)); if (error != 0) break; } SYSVSHM_UNLOCK(); return (error); } static int shm_prison_check(void *obj, void *data) { struct prison *pr = obj; struct prison *prpr; struct vfsoptlist *opts = data; int error, jsys; /* * sysvshm is a jailsys integer. * It must be "disable" if the parent jail is disabled. */ error = vfs_copyopt(opts, "sysvshm", &jsys, sizeof(jsys)); if (error != ENOENT) { if (error != 0) return (error); switch (jsys) { case JAIL_SYS_DISABLE: break; case JAIL_SYS_NEW: case JAIL_SYS_INHERIT: prison_lock(pr->pr_parent); prpr = osd_jail_get(pr->pr_parent, shm_prison_slot); prison_unlock(pr->pr_parent); if (prpr == NULL) return (EPERM); break; default: return (EINVAL); } } return (0); } static int shm_prison_set(void *obj, void *data) { struct prison *pr = obj; struct prison *tpr, *orpr, *nrpr, *trpr; struct vfsoptlist *opts = data; void *rsv; int jsys, descend; /* * sysvshm controls which jail is the root of the associated segments * (this jail or same as the parent), or if the feature is available * at all. */ if (vfs_copyopt(opts, "sysvshm", &jsys, sizeof(jsys)) == ENOENT) jsys = vfs_flagopt(opts, "allow.sysvipc", NULL, 0) ? JAIL_SYS_INHERIT : vfs_flagopt(opts, "allow.nosysvipc", NULL, 0) ? JAIL_SYS_DISABLE : -1; if (jsys == JAIL_SYS_DISABLE) { prison_lock(pr); orpr = osd_jail_get(pr, shm_prison_slot); if (orpr != NULL) osd_jail_del(pr, shm_prison_slot); prison_unlock(pr); if (orpr != NULL) { if (orpr == pr) shm_prison_cleanup(pr); /* Disable all child jails as well. 
*/ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, shm_prison_slot); if (trpr != NULL) { osd_jail_del(tpr, shm_prison_slot); prison_unlock(tpr); if (trpr == tpr) shm_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } else if (jsys != -1) { if (jsys == JAIL_SYS_NEW) nrpr = pr; else { prison_lock(pr->pr_parent); nrpr = osd_jail_get(pr->pr_parent, shm_prison_slot); prison_unlock(pr->pr_parent); } rsv = osd_reserve(shm_prison_slot); prison_lock(pr); orpr = osd_jail_get(pr, shm_prison_slot); if (orpr != nrpr) (void)osd_jail_set_reserved(pr, shm_prison_slot, rsv, nrpr); else osd_free_reserved(rsv); prison_unlock(pr); if (orpr != nrpr) { if (orpr == pr) shm_prison_cleanup(pr); if (orpr != NULL) { /* Change child jails matching the old root, */ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, shm_prison_slot); if (trpr == orpr) { (void)osd_jail_set(tpr, shm_prison_slot, nrpr); prison_unlock(tpr); if (trpr == tpr) shm_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } } return (0); } static int shm_prison_get(void *obj, void *data) { struct prison *pr = obj; struct prison *rpr; struct vfsoptlist *opts = data; int error, jsys; /* Set sysvshm based on the jail's root prison. */ prison_lock(pr); rpr = osd_jail_get(pr, shm_prison_slot); prison_unlock(pr); jsys = rpr == NULL ? JAIL_SYS_DISABLE : rpr == pr ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, "sysvshm", &jsys, sizeof(jsys)); if (error == ENOENT) error = 0; return (error); } static int shm_prison_remove(void *obj, void *data __unused) { struct prison *pr = obj; struct prison *rpr; SYSVSHM_LOCK(); prison_lock(pr); rpr = osd_jail_get(pr, shm_prison_slot); prison_unlock(pr); if (rpr == pr) shm_prison_cleanup(pr); SYSVSHM_UNLOCK(); return (0); } static void shm_prison_cleanup(struct prison *pr) { struct shmid_kernel *shmseg; int i; /* Remove any segments that belong to this jail. */ for (i = 0; i < shmalloced; i++) { shmseg = &shmsegs[i]; if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) && shmseg->cred != NULL && shmseg->cred->cr_prison == pr) { shm_remove(shmseg, i); } } } SYSCTL_JAIL_PARAM_SYS_NODE(sysvshm, CTLFLAG_RW, "SYSV shared memory"); #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) struct oshmid_ds { struct ipc_perm_old shm_perm; /* operation perms */ int shm_segsz; /* size of segment (bytes) */ u_short shm_cpid; /* pid, creator */ u_short shm_lpid; /* pid, last operation */ short shm_nattch; /* no. 
of current attaches */ time_t shm_atime; /* last attach time */ time_t shm_dtime; /* last detach time */ time_t shm_ctime; /* last change time */ void *shm_handle; /* internal handle for shm segment */ }; struct oshmctl_args { int shmid; int cmd; struct oshmid_ds *ubuf; }; static int oshmctl(struct thread *td, struct oshmctl_args *uap) { #ifdef COMPAT_43 int error = 0; struct prison *rpr; struct shmid_kernel *shmseg; struct oshmid_ds outbuf; rpr = shm_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); if (uap->cmd != IPC_STAT) { return (freebsd7_shmctl(td, (struct freebsd7_shmctl_args *)uap)); } SYSVSHM_LOCK(); shmseg = shm_find_segment(rpr, uap->shmid, true); if (shmseg == NULL) { SYSVSHM_UNLOCK(); return (EINVAL); } error = ipcperm(td, &shmseg->u.shm_perm, IPC_R); if (error != 0) { SYSVSHM_UNLOCK(); return (error); } #ifdef MAC error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, uap->cmd); if (error != 0) { SYSVSHM_UNLOCK(); return (error); } #endif ipcperm_new2old(&shmseg->u.shm_perm, &outbuf.shm_perm); outbuf.shm_segsz = shmseg->u.shm_segsz; outbuf.shm_cpid = shmseg->u.shm_cpid; outbuf.shm_lpid = shmseg->u.shm_lpid; outbuf.shm_nattch = shmseg->u.shm_nattch; outbuf.shm_atime = shmseg->u.shm_atime; outbuf.shm_dtime = shmseg->u.shm_dtime; outbuf.shm_ctime = shmseg->u.shm_ctime; outbuf.shm_handle = shmseg->object; SYSVSHM_UNLOCK(); return (copyout(&outbuf, uap->ubuf, sizeof(outbuf))); #else return (EINVAL); #endif } /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *shmcalls[] = { (sy_call_t *)sys_shmat, (sy_call_t *)oshmctl, (sy_call_t *)sys_shmdt, (sy_call_t *)sys_shmget, (sy_call_t *)freebsd7_shmctl }; #ifndef _SYS_SYSPROTO_H_ /* XXX actually varargs. */ struct shmsys_args { int which; int a2; int a3; int a4; }; #endif int sys_shmsys(struct thread *td, struct shmsys_args *uap) { AUDIT_ARG_SVIPC_WHICH(uap->which); if (uap->which < 0 || uap->which >= nitems(shmcalls)) return (EINVAL); return ((*shmcalls[uap->which])(td, &uap->a2)); } #endif /* i386 && (COMPAT_FREEBSD4 || COMPAT_43) */ #ifdef COMPAT_FREEBSD32 int freebsd32_shmsys(struct thread *td, struct freebsd32_shmsys_args *uap) { #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) AUDIT_ARG_SVIPC_WHICH(uap->which); switch (uap->which) { case 0: { /* shmat */ struct shmat_args ap; ap.shmid = uap->a2; ap.shmaddr = PTRIN(uap->a3); ap.shmflg = uap->a4; return (sysent[SYS_shmat].sy_call(td, &ap)); } case 2: { /* shmdt */ struct shmdt_args ap; ap.shmaddr = PTRIN(uap->a2); return (sysent[SYS_shmdt].sy_call(td, &ap)); } case 3: { /* shmget */ struct shmget_args ap; ap.key = uap->a2; ap.size = uap->a3; ap.shmflg = uap->a4; return (sysent[SYS_shmget].sy_call(td, &ap)); } case 4: { /* shmctl */ struct freebsd7_freebsd32_shmctl_args ap; ap.shmid = uap->a2; ap.cmd = uap->a3; ap.buf = PTRIN(uap->a4); return (freebsd7_freebsd32_shmctl(td, &ap)); } case 1: /* oshmctl */ default: return (EINVAL); } #else return (nosys(td, NULL)); #endif } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) int freebsd7_freebsd32_shmctl(struct thread *td, struct freebsd7_freebsd32_shmctl_args *uap) { int error; union { struct shmid_ds shmid_ds; struct shm_info shm_info; struct shminfo shminfo; } u; union { struct shmid_ds32_old shmid_ds32; struct shm_info32 shm_info32; struct shminfo32 shminfo32; } u32; size_t sz; if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &u32.shmid_ds32, 
sizeof(u32.shmid_ds32)))) goto done; freebsd32_ipcperm_old_in(&u32.shmid_ds32.shm_perm, &u.shmid_ds.shm_perm); CP(u32.shmid_ds32, u.shmid_ds, shm_segsz); CP(u32.shmid_ds32, u.shmid_ds, shm_lpid); CP(u32.shmid_ds32, u.shmid_ds, shm_cpid); CP(u32.shmid_ds32, u.shmid_ds, shm_nattch); CP(u32.shmid_ds32, u.shmid_ds, shm_atime); CP(u32.shmid_ds32, u.shmid_ds, shm_dtime); CP(u32.shmid_ds32, u.shmid_ds, shm_ctime); } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_INFO: CP(u.shminfo, u32.shminfo32, shmmax); CP(u.shminfo, u32.shminfo32, shmmin); CP(u.shminfo, u32.shminfo32, shmmni); CP(u.shminfo, u32.shminfo32, shmseg); CP(u.shminfo, u32.shminfo32, shmall); error = copyout(&u32.shminfo32, uap->buf, sizeof(u32.shminfo32)); break; case SHM_INFO: CP(u.shm_info, u32.shm_info32, used_ids); CP(u.shm_info, u32.shm_info32, shm_rss); CP(u.shm_info, u32.shm_info32, shm_tot); CP(u.shm_info, u32.shm_info32, shm_swp); CP(u.shm_info, u32.shm_info32, swap_attempts); CP(u.shm_info, u32.shm_info32, swap_successes); error = copyout(&u32.shm_info32, uap->buf, sizeof(u32.shm_info32)); break; case SHM_STAT: case IPC_STAT: freebsd32_ipcperm_old_out(&u.shmid_ds.shm_perm, &u32.shmid_ds32.shm_perm); if (u.shmid_ds.shm_segsz > INT32_MAX) u32.shmid_ds32.shm_segsz = INT32_MAX; else CP(u.shmid_ds, u32.shmid_ds32, shm_segsz); CP(u.shmid_ds, u32.shmid_ds32, shm_lpid); CP(u.shmid_ds, u32.shmid_ds32, shm_cpid); CP(u.shmid_ds, u32.shmid_ds32, shm_nattch); CP(u.shmid_ds, u32.shmid_ds32, shm_atime); CP(u.shmid_ds, u32.shmid_ds32, shm_dtime); CP(u.shmid_ds, u32.shmid_ds32, shm_ctime); u32.shmid_ds32.shm_internal = 0; error = copyout(&u32.shmid_ds32, uap->buf, sizeof(u32.shmid_ds32)); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } #endif int freebsd32_shmctl(struct thread *td, struct freebsd32_shmctl_args *uap) { int error; union { struct shmid_ds shmid_ds; struct shm_info shm_info; struct shminfo shminfo; } u; union { struct shmid_ds32 shmid_ds32; struct shm_info32 shm_info32; struct shminfo32 shminfo32; } u32; size_t sz; if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &u32.shmid_ds32, sizeof(u32.shmid_ds32)))) goto done; freebsd32_ipcperm_in(&u32.shmid_ds32.shm_perm, &u.shmid_ds.shm_perm); CP(u32.shmid_ds32, u.shmid_ds, shm_segsz); CP(u32.shmid_ds32, u.shmid_ds, shm_lpid); CP(u32.shmid_ds32, u.shmid_ds, shm_cpid); CP(u32.shmid_ds32, u.shmid_ds, shm_nattch); CP(u32.shmid_ds32, u.shmid_ds, shm_atime); CP(u32.shmid_ds32, u.shmid_ds, shm_dtime); CP(u32.shmid_ds32, u.shmid_ds, shm_ctime); } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_INFO: CP(u.shminfo, u32.shminfo32, shmmax); CP(u.shminfo, u32.shminfo32, shmmin); CP(u.shminfo, u32.shminfo32, shmmni); CP(u.shminfo, u32.shminfo32, shmseg); CP(u.shminfo, u32.shminfo32, shmall); error = copyout(&u32.shminfo32, uap->buf, sizeof(u32.shminfo32)); break; case SHM_INFO: CP(u.shm_info, u32.shm_info32, used_ids); CP(u.shm_info, u32.shm_info32, shm_rss); CP(u.shm_info, u32.shm_info32, shm_tot); CP(u.shm_info, u32.shm_info32, shm_swp); CP(u.shm_info, u32.shm_info32, swap_attempts); CP(u.shm_info, u32.shm_info32, swap_successes); error = copyout(&u32.shm_info32, uap->buf, sizeof(u32.shm_info32)); break; case SHM_STAT: case IPC_STAT: freebsd32_ipcperm_out(&u.shmid_ds.shm_perm, &u32.shmid_ds32.shm_perm); if 
(u.shmid_ds.shm_segsz > INT32_MAX) u32.shmid_ds32.shm_segsz = INT32_MAX; else CP(u.shmid_ds, u32.shmid_ds32, shm_segsz); CP(u.shmid_ds, u32.shmid_ds32, shm_lpid); CP(u.shmid_ds, u32.shmid_ds32, shm_cpid); CP(u.shmid_ds, u32.shmid_ds32, shm_nattch); CP(u.shmid_ds, u32.shmid_ds32, shm_atime); CP(u.shmid_ds, u32.shmid_ds32, shm_dtime); CP(u.shmid_ds, u32.shmid_ds32, shm_ctime); error = copyout(&u32.shmid_ds32, uap->buf, sizeof(u32.shmid_ds32)); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } #endif #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) #ifndef CP #define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0) #endif #ifndef _SYS_SYSPROTO_H_ struct freebsd7_shmctl_args { int shmid; int cmd; struct shmid_ds_old *buf; }; #endif int freebsd7_shmctl(struct thread *td, struct freebsd7_shmctl_args *uap) { int error; struct shmid_ds_old old; struct shmid_ds buf; size_t bufsz; /* * The only reason IPC_INFO, SHM_INFO, SHM_STAT exist is to support * Linux binaries. If we see the call come through the FreeBSD ABI, * return an error back to the user since we do not support this. */ if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO || uap->cmd == SHM_STAT) return (EINVAL); /* IPC_SET needs to copyin the buffer before calling kern_shmctl */ if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &old, sizeof(old)))) goto done; ipcperm_old2new(&old.shm_perm, &buf.shm_perm); CP(old, buf, shm_segsz); CP(old, buf, shm_lpid); CP(old, buf, shm_cpid); CP(old, buf, shm_nattch); CP(old, buf, shm_atime); CP(old, buf, shm_dtime); CP(old, buf, shm_ctime); } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_STAT: ipcperm_new2old(&buf.shm_perm, &old.shm_perm); if (buf.shm_segsz > INT_MAX) old.shm_segsz = INT_MAX; else CP(buf, old, shm_segsz); CP(buf, old, shm_lpid); CP(buf, old, shm_cpid); if (buf.shm_nattch > SHRT_MAX) old.shm_nattch = SHRT_MAX; else CP(buf, old, shm_nattch); CP(buf, old, shm_atime); CP(buf, old, shm_dtime); CP(buf, old, shm_ctime); old.shm_internal = NULL; error = copyout(&old, uap->buf, sizeof(old)); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } #endif /* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 || COMPAT_FREEBSD7 */ static int sysvshm_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = shminit(); if (error != 0) shmunload(); break; case MOD_UNLOAD: error = shmunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvshm_mod = { "sysvshm", &sysvshm_modload, NULL }; DECLARE_MODULE(sysvshm, sysvshm_mod, SI_SUB_SYSV_SHM, SI_ORDER_FIRST); MODULE_VERSION(sysvshm, 1); Index: head/sys/kern/tty.c =================================================================== --- head/sys/kern/tty.c (revision 326270) +++ head/sys/kern/tty.c (revision 326271) @@ -1,2347 +1,2349 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008 Ed Schouten * All rights reserved. * * Portions of this software were developed under sponsorship from Snow * B.V., the Netherlands. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_compat.h" #include #include #include #include #include #include #include #include #ifdef COMPAT_43TTY #include #endif /* COMPAT_43TTY */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TTYDEFCHARS #include #undef TTYDEFCHARS #include #include #include static MALLOC_DEFINE(M_TTY, "tty", "tty device"); static void tty_rel_free(struct tty *tp); static TAILQ_HEAD(, tty) tty_list = TAILQ_HEAD_INITIALIZER(tty_list); static struct sx tty_list_sx; SX_SYSINIT(tty_list, &tty_list_sx, "tty list"); static unsigned int tty_list_count = 0; /* Character device of /dev/console. */ static struct cdev *dev_console; static const char *dev_console_filename; /* * Flags that are supported and stored by this implementation. */ #define TTYSUP_IFLAG (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK|ISTRIP|\ INLCR|IGNCR|ICRNL|IXON|IXOFF|IXANY|IMAXBEL) #define TTYSUP_OFLAG (OPOST|ONLCR|TAB3|ONOEOT|OCRNL|ONOCR|ONLRET) #define TTYSUP_LFLAG (ECHOKE|ECHOE|ECHOK|ECHO|ECHONL|ECHOPRT|\ ECHOCTL|ISIG|ICANON|ALTWERASE|IEXTEN|TOSTOP|\ FLUSHO|NOKERNINFO|NOFLSH) #define TTYSUP_CFLAG (CIGNORE|CSIZE|CSTOPB|CREAD|PARENB|PARODD|\ HUPCL|CLOCAL|CCTS_OFLOW|CRTS_IFLOW|CDTR_IFLOW|\ CDSR_OFLOW|CCAR_OFLOW) #define TTY_CALLOUT(tp,d) (dev2unit(d) & TTYUNIT_CALLOUT) static int tty_drainwait = 5 * 60; SYSCTL_INT(_kern, OID_AUTO, tty_drainwait, CTLFLAG_RWTUN, &tty_drainwait, 0, "Default output drain timeout in seconds"); /* * Set TTY buffer sizes. */ #define TTYBUF_MAX 65536 /* * Allocate buffer space if necessary, and set low watermarks, based on speed. * Note that the ttyxxxq_setsize() functions may drop and then reacquire the tty * lock during memory allocation. They will return ENXIO if the tty disappears * while unlocked. */ static int tty_watermarks(struct tty *tp) { size_t bs = 0; int error; /* Provide an input buffer for 2 seconds of data. */ if (tp->t_termios.c_cflag & CREAD) bs = MIN(tp->t_termios.c_ispeed / 5, TTYBUF_MAX); error = ttyinq_setsize(&tp->t_inq, tp, bs); if (error != 0) return (error); /* Set low watermark at 10% (when 90% is available). */ tp->t_inlow = (ttyinq_getallocatedsize(&tp->t_inq) * 9) / 10; /* Provide an output buffer for 2 seconds of data. 
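A note on the sizing rule used by tty_watermarks() here: with start and stop bits, N baud moves roughly N / 10 bytes per second, so two seconds of traffic is about N / 5 bytes, clamped at TTYBUF_MAX. A sketch of that arithmetic (speed_t comes from <termios.h>; the literal 65536 mirrors TTYBUF_MAX above):

#include <termios.h>
#include <stddef.h>

static size_t
two_seconds_of_data(speed_t baud)
{
        size_t bs = (size_t)baud / 5;           /* ~2 s at ~10 bits per byte */

        return (bs < 65536 ? bs : 65536);       /* TTYBUF_MAX clamp */
}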
*/ bs = MIN(tp->t_termios.c_ospeed / 5, TTYBUF_MAX); error = ttyoutq_setsize(&tp->t_outq, tp, bs); if (error != 0) return (error); /* Set low watermark at 10% (when 90% is available). */ tp->t_outlow = (ttyoutq_getallocatedsize(&tp->t_outq) * 9) / 10; return (0); } static int tty_drain(struct tty *tp, int leaving) { sbintime_t timeout_at; size_t bytes; int error; if (ttyhook_hashook(tp, getc_inject)) /* buffer is inaccessible */ return (0); /* * For close(), use the recent historic timeout of "1 second without * making progress". For tcdrain(), use t_drainwait as the timeout, * with zero meaning "no timeout" which gives POSIX behavior. */ if (leaving) timeout_at = getsbinuptime() + SBT_1S; else if (tp->t_drainwait != 0) timeout_at = getsbinuptime() + SBT_1S * tp->t_drainwait; else timeout_at = 0; /* * Poll the output buffer and the hardware for completion, at 10 Hz. * Polling is required for devices which are not able to signal an * interrupt when the transmitter becomes idle (most USB serial devs). * The unusual structure of this loop ensures we check for busy one more * time after tty_timedwait() returns EWOULDBLOCK, so that success has * higher priority than timeout if the IO completed in the last 100 ms. */ error = 0; bytes = ttyoutq_bytesused(&tp->t_outq); for (;;) { if (ttyoutq_bytesused(&tp->t_outq) == 0 && !ttydevsw_busy(tp)) return (0); if (error != 0) return (error); ttydevsw_outwakeup(tp); error = tty_timedwait(tp, &tp->t_outwait, hz / 10); if (error != 0 && error != EWOULDBLOCK) return (error); else if (timeout_at == 0 || getsbinuptime() < timeout_at) error = 0; else if (leaving && ttyoutq_bytesused(&tp->t_outq) < bytes) { /* In close, making progress, grant an extra second. */ error = 0; timeout_at += SBT_1S; bytes = ttyoutq_bytesused(&tp->t_outq); } } } /* * Though ttydev_enter() and ttydev_leave() seem to be related, they * don't have to be used together. ttydev_enter() is used by the cdev * operations to prevent an actual operation from being processed when * the TTY has been abandoned. ttydev_leave() is used by ttydev_open() * and ttydev_close() to determine whether per-TTY data should be * deallocated. */ static __inline int ttydev_enter(struct tty *tp) { tty_lock(tp); if (tty_gone(tp) || !tty_opened(tp)) { /* Device is already gone. */ tty_unlock(tp); return (ENXIO); } return (0); } static void ttydev_leave(struct tty *tp) { tty_lock_assert(tp, MA_OWNED); if (tty_opened(tp) || tp->t_flags & TF_OPENCLOSE) { /* Device is still opened somewhere. */ tty_unlock(tp); return; } tp->t_flags |= TF_OPENCLOSE; /* Stop asynchronous I/O. */ funsetown(&tp->t_sigio); /* Remove console TTY. */ if (constty == tp) constty_clear(); /* Drain any output. */ if (!tty_gone(tp)) tty_drain(tp, 1); ttydisc_close(tp); /* Free i/o queues now since they might be large. */ ttyinq_free(&tp->t_inq); tp->t_inlow = 0; ttyoutq_free(&tp->t_outq); tp->t_outlow = 0; knlist_clear(&tp->t_inpoll.si_note, 1); knlist_clear(&tp->t_outpoll.si_note, 1); if (!tty_gone(tp)) ttydevsw_close(tp); tp->t_flags &= ~TF_OPENCLOSE; cv_broadcast(&tp->t_dcdwait); tty_rel_free(tp); } /* * Operations that are exposed through the character device in /dev. */ static int ttydev_open(struct cdev *dev, int oflags, int devtype __unused, struct thread *td) { struct tty *tp; int error; tp = dev->si_drv1; error = 0; tty_lock(tp); if (tty_gone(tp)) { /* Device is already gone. */ tty_unlock(tp); return (ENXIO); } /* * Block when other processes are currently opening or closing * the TTY. 
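From userland, tcdrain() ends up in tty_drain(tp, 0), so the kern.tty_drainwait default of five minutes bounds how long a write-then-drain sequence can hang on a wedged port, and setting it to zero restores the unbounded POSIX wait. A hedged usage sketch (fd is assumed to be an already opened serial device):

#include <termios.h>
#include <unistd.h>
#include <stddef.h>

int
flush_and_wait(int fd, const char *buf, size_t len)
{
        if (write(fd, buf, len) == -1)
                return (-1);
        /* Blocks until the output queue and hardware go idle, or fails
         * once the drain timeout expires without completion. */
        return (tcdrain(fd));
}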
*/ while (tp->t_flags & TF_OPENCLOSE) { error = tty_wait(tp, &tp->t_dcdwait); if (error != 0) { tty_unlock(tp); return (error); } } tp->t_flags |= TF_OPENCLOSE; /* * Make sure the "tty" and "cua" device cannot be opened at the * same time. The console is a "tty" device. */ if (TTY_CALLOUT(tp, dev)) { if (tp->t_flags & (TF_OPENED_CONS | TF_OPENED_IN)) { error = EBUSY; goto done; } } else { if (tp->t_flags & TF_OPENED_OUT) { error = EBUSY; goto done; } } if (tp->t_flags & TF_EXCLUDE && priv_check(td, PRIV_TTY_EXCLUSIVE)) { error = EBUSY; goto done; } if (!tty_opened(tp)) { /* Set proper termios flags. */ if (TTY_CALLOUT(tp, dev)) tp->t_termios = tp->t_termios_init_out; else tp->t_termios = tp->t_termios_init_in; ttydevsw_param(tp, &tp->t_termios); /* Prevent modem control on callout devices and /dev/console. */ if (TTY_CALLOUT(tp, dev) || dev == dev_console) tp->t_termios.c_cflag |= CLOCAL; ttydevsw_modem(tp, SER_DTR|SER_RTS, 0); error = ttydevsw_open(tp); if (error != 0) goto done; ttydisc_open(tp); error = tty_watermarks(tp); if (error != 0) goto done; } /* Wait for Carrier Detect. */ if ((oflags & O_NONBLOCK) == 0 && (tp->t_termios.c_cflag & CLOCAL) == 0) { while ((ttydevsw_modem(tp, 0, 0) & SER_DCD) == 0) { error = tty_wait(tp, &tp->t_dcdwait); if (error != 0) goto done; } } if (dev == dev_console) tp->t_flags |= TF_OPENED_CONS; else if (TTY_CALLOUT(tp, dev)) tp->t_flags |= TF_OPENED_OUT; else tp->t_flags |= TF_OPENED_IN; MPASS((tp->t_flags & (TF_OPENED_CONS | TF_OPENED_IN)) == 0 || (tp->t_flags & TF_OPENED_OUT) == 0); done: tp->t_flags &= ~TF_OPENCLOSE; cv_broadcast(&tp->t_dcdwait); ttydev_leave(tp); return (error); } static int ttydev_close(struct cdev *dev, int fflag, int devtype __unused, struct thread *td __unused) { struct tty *tp = dev->si_drv1; tty_lock(tp); /* * Don't actually close the device if it is being used as the * console. */ MPASS((tp->t_flags & (TF_OPENED_CONS | TF_OPENED_IN)) == 0 || (tp->t_flags & TF_OPENED_OUT) == 0); if (dev == dev_console) tp->t_flags &= ~TF_OPENED_CONS; else tp->t_flags &= ~(TF_OPENED_IN|TF_OPENED_OUT); if (tp->t_flags & TF_OPENED) { tty_unlock(tp); return (0); } /* If revoking, flush output now to avoid draining it later. */ if (fflag & FREVOKE) tty_flush(tp, FWRITE); tp->t_flags &= ~TF_EXCLUDE; /* Properly wake up threads that are stuck - revoke(). */ tp->t_revokecnt++; tty_wakeup(tp, FREAD|FWRITE); cv_broadcast(&tp->t_bgwait); cv_broadcast(&tp->t_dcdwait); ttydev_leave(tp); return (0); } static __inline int tty_is_ctty(struct tty *tp, struct proc *p) { tty_lock_assert(tp, MA_OWNED); return (p->p_session == tp->t_session && p->p_flag & P_CONTROLT); } int tty_wait_background(struct tty *tp, struct thread *td, int sig) { struct proc *p = td->td_proc; struct pgrp *pg; ksiginfo_t ksi; int error; MPASS(sig == SIGTTIN || sig == SIGTTOU); tty_lock_assert(tp, MA_OWNED); for (;;) { PROC_LOCK(p); /* * The process should only sleep, when: * - This terminal is the controlling terminal * - Its process group is not the foreground process * group * - The parent process isn't waiting for the child to * exit * - the signal to send to the process isn't masked */ if (!tty_is_ctty(tp, p) || p->p_pgrp == tp->t_pgrp) { /* Allow the action to happen. */ PROC_UNLOCK(p); return (0); } if (SIGISMEMBER(p->p_sigacts->ps_sigignore, sig) || SIGISMEMBER(td->td_sigmask, sig)) { /* Only allow them in write()/ioctl(). */ PROC_UNLOCK(p); return (sig == SIGTTOU ? 0 : EIO); } pg = p->p_pgrp; if (p->p_flag & P_PPWAIT || pg->pg_jobc == 0) { /* Don't allow the action to happen. 
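tty_wait_background() above is the kernel half of job control stops: a background process touching the terminal gets SIGTTIN or SIGTTOU and sleeps until its group becomes the foreground group. The userland-visible condition is just a process-group comparison, sketched here with POSIX calls (fd is assumed to be the controlling terminal):

#include <unistd.h>

static int
in_foreground(int fd)
{
        /* Mirrors the p_pgrp == tp->t_pgrp check in tty_wait_background(). */
        return (tcgetpgrp(fd) == getpgrp());
}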
*/ PROC_UNLOCK(p); return (EIO); } PROC_UNLOCK(p); /* * Send the signal and sleep until we're the new * foreground process group. */ if (sig != 0) { ksiginfo_init(&ksi); ksi.ksi_code = SI_KERNEL; ksi.ksi_signo = sig; sig = 0; } PGRP_LOCK(pg); pgsignal(pg, ksi.ksi_signo, 1, &ksi); PGRP_UNLOCK(pg); error = tty_wait(tp, &tp->t_bgwait); if (error) return (error); } } static int ttydev_read(struct cdev *dev, struct uio *uio, int ioflag) { struct tty *tp = dev->si_drv1; int error; error = ttydev_enter(tp); if (error) goto done; error = ttydisc_read(tp, uio, ioflag); tty_unlock(tp); /* * The read() call should not throw an error when the device is * being destroyed. Silently convert it to an EOF. */ done: if (error == ENXIO) error = 0; return (error); } static int ttydev_write(struct cdev *dev, struct uio *uio, int ioflag) { struct tty *tp = dev->si_drv1; int error; error = ttydev_enter(tp); if (error) return (error); if (tp->t_termios.c_lflag & TOSTOP) { error = tty_wait_background(tp, curthread, SIGTTOU); if (error) goto done; } if (ioflag & IO_NDELAY && tp->t_flags & TF_BUSY_OUT) { /* Allow non-blocking writes to bypass serialization. */ error = ttydisc_write(tp, uio, ioflag); } else { /* Serialize write() calls. */ while (tp->t_flags & TF_BUSY_OUT) { error = tty_wait(tp, &tp->t_outserwait); if (error) goto done; } tp->t_flags |= TF_BUSY_OUT; error = ttydisc_write(tp, uio, ioflag); tp->t_flags &= ~TF_BUSY_OUT; cv_signal(&tp->t_outserwait); } done: tty_unlock(tp); return (error); } static int ttydev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct tty *tp = dev->si_drv1; int error; error = ttydev_enter(tp); if (error) return (error); switch (cmd) { case TIOCCBRK: case TIOCCONS: case TIOCDRAIN: case TIOCEXCL: case TIOCFLUSH: case TIOCNXCL: case TIOCSBRK: case TIOCSCTTY: case TIOCSETA: case TIOCSETAF: case TIOCSETAW: case TIOCSPGRP: case TIOCSTART: case TIOCSTAT: case TIOCSTI: case TIOCSTOP: case TIOCSWINSZ: #if 0 case TIOCSDRAINWAIT: case TIOCSETD: #endif #ifdef COMPAT_43TTY case TIOCLBIC: case TIOCLBIS: case TIOCLSET: case TIOCSETC: case OTIOCSETD: case TIOCSETN: case TIOCSETP: case TIOCSLTC: #endif /* COMPAT_43TTY */ /* * If the ioctl() causes the TTY to be modified, let it * wait in the background. */ error = tty_wait_background(tp, curthread, SIGTTOU); if (error) goto done; } if (cmd == TIOCSETA || cmd == TIOCSETAW || cmd == TIOCSETAF) { struct termios *old = &tp->t_termios; struct termios *new = (struct termios *)data; struct termios *lock = TTY_CALLOUT(tp, dev) ? &tp->t_termios_lock_out : &tp->t_termios_lock_in; int cc; /* * Lock state devices. Just overwrite the values of the * commands that are currently in use. 
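The lock-state merge applied just below keeps every termios bit that is set in the lock device: new = (old & lock) | (new & ~lock), repeated for each flag word, each control character, and both speeds. Factored out as a helper for clarity (tcflag_t from <termios.h>; the function name is ours):

#include <termios.h>

static tcflag_t
merge_locked(tcflag_t oldf, tcflag_t newf, tcflag_t lock)
{
        /* Locked bits keep their old value; unlocked bits take the new one. */
        return ((oldf & lock) | (newf & ~lock));
}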
*/ new->c_iflag = (old->c_iflag & lock->c_iflag) | (new->c_iflag & ~lock->c_iflag); new->c_oflag = (old->c_oflag & lock->c_oflag) | (new->c_oflag & ~lock->c_oflag); new->c_cflag = (old->c_cflag & lock->c_cflag) | (new->c_cflag & ~lock->c_cflag); new->c_lflag = (old->c_lflag & lock->c_lflag) | (new->c_lflag & ~lock->c_lflag); for (cc = 0; cc < NCCS; ++cc) if (lock->c_cc[cc]) new->c_cc[cc] = old->c_cc[cc]; if (lock->c_ispeed) new->c_ispeed = old->c_ispeed; if (lock->c_ospeed) new->c_ospeed = old->c_ospeed; } error = tty_ioctl(tp, cmd, data, fflag, td); done: tty_unlock(tp); return (error); } static int ttydev_poll(struct cdev *dev, int events, struct thread *td) { struct tty *tp = dev->si_drv1; int error, revents = 0; error = ttydev_enter(tp); if (error) return ((events & (POLLIN|POLLRDNORM)) | POLLHUP); if (events & (POLLIN|POLLRDNORM)) { /* See if we can read something. */ if (ttydisc_read_poll(tp) > 0) revents |= events & (POLLIN|POLLRDNORM); } if (tp->t_flags & TF_ZOMBIE) { /* Hangup flag on zombie state. */ revents |= POLLHUP; } else if (events & (POLLOUT|POLLWRNORM)) { /* See if we can write something. */ if (ttydisc_write_poll(tp) > 0) revents |= events & (POLLOUT|POLLWRNORM); } if (revents == 0) { if (events & (POLLIN|POLLRDNORM)) selrecord(td, &tp->t_inpoll); if (events & (POLLOUT|POLLWRNORM)) selrecord(td, &tp->t_outpoll); } tty_unlock(tp); return (revents); } static int ttydev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr) { struct tty *tp = dev->si_drv1; int error; /* Handle mmap() through the driver. */ error = ttydev_enter(tp); if (error) return (-1); error = ttydevsw_mmap(tp, offset, paddr, nprot, memattr); tty_unlock(tp); return (error); } /* * kqueue support. */ static void tty_kqops_read_detach(struct knote *kn) { struct tty *tp = kn->kn_hook; knlist_remove(&tp->t_inpoll.si_note, kn, 0); } static int tty_kqops_read_event(struct knote *kn, long hint __unused) { struct tty *tp = kn->kn_hook; tty_lock_assert(tp, MA_OWNED); if (tty_gone(tp) || tp->t_flags & TF_ZOMBIE) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_read_poll(tp); return (kn->kn_data > 0); } } static void tty_kqops_write_detach(struct knote *kn) { struct tty *tp = kn->kn_hook; knlist_remove(&tp->t_outpoll.si_note, kn, 0); } static int tty_kqops_write_event(struct knote *kn, long hint __unused) { struct tty *tp = kn->kn_hook; tty_lock_assert(tp, MA_OWNED); if (tty_gone(tp)) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_write_poll(tp); return (kn->kn_data > 0); } } static struct filterops tty_kqops_read = { .f_isfd = 1, .f_detach = tty_kqops_read_detach, .f_event = tty_kqops_read_event, }; static struct filterops tty_kqops_write = { .f_isfd = 1, .f_detach = tty_kqops_write_detach, .f_event = tty_kqops_write_event, }; static int ttydev_kqfilter(struct cdev *dev, struct knote *kn) { struct tty *tp = dev->si_drv1; int error; error = ttydev_enter(tp); if (error) return (error); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_hook = tp; kn->kn_fop = &tty_kqops_read; knlist_add(&tp->t_inpoll.si_note, kn, 1); break; case EVFILT_WRITE: kn->kn_hook = tp; kn->kn_fop = &tty_kqops_write; knlist_add(&tp->t_outpoll.si_note, kn, 1); break; default: error = EINVAL; break; } tty_unlock(tp); return (error); } static struct cdevsw ttydev_cdevsw = { .d_version = D_VERSION, .d_open = ttydev_open, .d_close = ttydev_close, .d_read = ttydev_read, .d_write = ttydev_write, .d_ioctl = ttydev_ioctl, .d_kqfilter = ttydev_kqfilter, .d_poll = 
ttydev_poll, .d_mmap = ttydev_mmap, .d_name = "ttydev", .d_flags = D_TTY, }; /* * Init/lock-state devices */ static int ttyil_open(struct cdev *dev, int oflags __unused, int devtype __unused, struct thread *td) { struct tty *tp; int error; tp = dev->si_drv1; error = 0; tty_lock(tp); if (tty_gone(tp)) error = ENODEV; tty_unlock(tp); return (error); } static int ttyil_close(struct cdev *dev __unused, int flag __unused, int mode __unused, struct thread *td __unused) { return (0); } static int ttyil_rdwr(struct cdev *dev __unused, struct uio *uio __unused, int ioflag __unused) { return (ENODEV); } static int ttyil_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct tty *tp = dev->si_drv1; int error; tty_lock(tp); if (tty_gone(tp)) { error = ENODEV; goto done; } error = ttydevsw_cioctl(tp, dev2unit(dev), cmd, data, td); if (error != ENOIOCTL) goto done; error = 0; switch (cmd) { case TIOCGETA: /* Obtain terminal flags through tcgetattr(). */ *(struct termios*)data = *(struct termios*)dev->si_drv2; break; case TIOCSETA: /* Set terminal flags through tcsetattr(). */ error = priv_check(td, PRIV_TTY_SETA); if (error) break; *(struct termios*)dev->si_drv2 = *(struct termios*)data; break; case TIOCGETD: *(int *)data = TTYDISC; break; case TIOCGWINSZ: bzero(data, sizeof(struct winsize)); break; default: error = ENOTTY; } done: tty_unlock(tp); return (error); } static struct cdevsw ttyil_cdevsw = { .d_version = D_VERSION, .d_open = ttyil_open, .d_close = ttyil_close, .d_read = ttyil_rdwr, .d_write = ttyil_rdwr, .d_ioctl = ttyil_ioctl, .d_name = "ttyil", .d_flags = D_TTY, }; static void tty_init_termios(struct tty *tp) { struct termios *t = &tp->t_termios_init_in; t->c_cflag = TTYDEF_CFLAG; t->c_iflag = TTYDEF_IFLAG; t->c_lflag = TTYDEF_LFLAG; t->c_oflag = TTYDEF_OFLAG; t->c_ispeed = TTYDEF_SPEED; t->c_ospeed = TTYDEF_SPEED; memcpy(&t->c_cc, ttydefchars, sizeof ttydefchars); tp->t_termios_init_out = *t; } void tty_init_console(struct tty *tp, speed_t s) { struct termios *ti = &tp->t_termios_init_in; struct termios *to = &tp->t_termios_init_out; if (s != 0) { ti->c_ispeed = ti->c_ospeed = s; to->c_ispeed = to->c_ospeed = s; } ti->c_cflag |= CLOCAL; to->c_cflag |= CLOCAL; } /* * Standard device routine implementations, mostly meant for * pseudo-terminal device drivers. When a driver creates a new terminal * device class, missing routines are patched. */ static int ttydevsw_defopen(struct tty *tp __unused) { return (0); } static void ttydevsw_defclose(struct tty *tp __unused) { } static void ttydevsw_defoutwakeup(struct tty *tp __unused) { panic("Terminal device has output, while not implemented"); } static void ttydevsw_definwakeup(struct tty *tp __unused) { } static int ttydevsw_defioctl(struct tty *tp __unused, u_long cmd __unused, caddr_t data __unused, struct thread *td __unused) { return (ENOIOCTL); } static int ttydevsw_defcioctl(struct tty *tp __unused, int unit __unused, u_long cmd __unused, caddr_t data __unused, struct thread *td __unused) { return (ENOIOCTL); } static int ttydevsw_defparam(struct tty *tp __unused, struct termios *t) { /* * Allow the baud rate to be adjusted for pseudo-devices, but at * least restrict it to 115200 to prevent excessive buffer * usage. Also disallow 0, to prevent foot shooting. 
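 */

/*
 * Userspace sketch (not part of this file): waiting for terminal input
 * with kqueue(2) exercises the EVFILT_READ filter installed by
 * ttydev_kqfilter() above; kn_data reports ttydisc_read_poll().
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent kev;
	int kq = kqueue();

	if (kq == -1)
		return (1);
	EV_SET(&kev, STDIN_FILENO, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		return (1);
	if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1)
		printf("%jd bytes readable%s\n", (intmax_t)kev.data,
		    (kev.flags & EV_EOF) ? " (EOF)" : "");
	return (0);
}

/*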
*/ if (t->c_ispeed < B50) t->c_ispeed = B50; else if (t->c_ispeed > B115200) t->c_ispeed = B115200; if (t->c_ospeed < B50) t->c_ospeed = B50; else if (t->c_ospeed > B115200) t->c_ospeed = B115200; t->c_cflag |= CREAD; return (0); } static int ttydevsw_defmodem(struct tty *tp __unused, int sigon __unused, int sigoff __unused) { /* Simulate a carrier to make the TTY layer happy. */ return (SER_DCD); } static int ttydevsw_defmmap(struct tty *tp __unused, vm_ooffset_t offset __unused, vm_paddr_t *paddr __unused, int nprot __unused, vm_memattr_t *memattr __unused) { return (-1); } static void ttydevsw_defpktnotify(struct tty *tp __unused, char event __unused) { } static void ttydevsw_deffree(void *softc __unused) { panic("Terminal device freed without a free-handler"); } static bool ttydevsw_defbusy(struct tty *tp __unused) { return (FALSE); } /* * TTY allocation and deallocation. TTY devices can be deallocated when * the driver doesn't use it anymore, when the TTY isn't a session's * controlling TTY and when the device node isn't opened through devfs. */ struct tty * tty_alloc(struct ttydevsw *tsw, void *sc) { return (tty_alloc_mutex(tsw, sc, NULL)); } struct tty * tty_alloc_mutex(struct ttydevsw *tsw, void *sc, struct mtx *mutex) { struct tty *tp; /* Make sure the driver defines all routines. */ #define PATCH_FUNC(x) do { \ if (tsw->tsw_ ## x == NULL) \ tsw->tsw_ ## x = ttydevsw_def ## x; \ } while (0) PATCH_FUNC(open); PATCH_FUNC(close); PATCH_FUNC(outwakeup); PATCH_FUNC(inwakeup); PATCH_FUNC(ioctl); PATCH_FUNC(cioctl); PATCH_FUNC(param); PATCH_FUNC(modem); PATCH_FUNC(mmap); PATCH_FUNC(pktnotify); PATCH_FUNC(free); PATCH_FUNC(busy); #undef PATCH_FUNC tp = malloc(sizeof(struct tty), M_TTY, M_WAITOK|M_ZERO); tp->t_devsw = tsw; tp->t_devswsoftc = sc; tp->t_flags = tsw->tsw_flags; tp->t_drainwait = tty_drainwait; tty_init_termios(tp); cv_init(&tp->t_inwait, "ttyin"); cv_init(&tp->t_outwait, "ttyout"); cv_init(&tp->t_outserwait, "ttyosr"); cv_init(&tp->t_bgwait, "ttybg"); cv_init(&tp->t_dcdwait, "ttydcd"); /* Allow drivers to use a custom mutex to lock the TTY. */ if (mutex != NULL) { tp->t_mtx = mutex; } else { tp->t_mtx = &tp->t_mtxobj; mtx_init(&tp->t_mtxobj, "ttymtx", NULL, MTX_DEF); } knlist_init_mtx(&tp->t_inpoll.si_note, tp->t_mtx); knlist_init_mtx(&tp->t_outpoll.si_note, tp->t_mtx); return (tp); } static void tty_dealloc(void *arg) { struct tty *tp = arg; /* * ttyydev_leave() usually frees the i/o queues earlier, but it is * not always called between queue allocation and here. The queues * may be allocated by ioctls on a pty control device without the * corresponding pty slave device ever being open, or after it is * closed. */ ttyinq_free(&tp->t_inq); ttyoutq_free(&tp->t_outq); seldrain(&tp->t_inpoll); seldrain(&tp->t_outpoll); knlist_destroy(&tp->t_inpoll.si_note); knlist_destroy(&tp->t_outpoll.si_note); cv_destroy(&tp->t_inwait); cv_destroy(&tp->t_outwait); cv_destroy(&tp->t_bgwait); cv_destroy(&tp->t_dcdwait); cv_destroy(&tp->t_outserwait); if (tp->t_mtx == &tp->t_mtxobj) mtx_destroy(&tp->t_mtxobj); ttydevsw_free(tp); free(tp, M_TTY); } static void tty_rel_free(struct tty *tp) { struct cdev *dev; tty_lock_assert(tp, MA_OWNED); #define TF_ACTIVITY (TF_GONE|TF_OPENED|TF_HOOK|TF_OPENCLOSE) if (tp->t_sessioncnt != 0 || (tp->t_flags & TF_ACTIVITY) != TF_GONE) { /* TTY is still in use. */ tty_unlock(tp); return; } /* TTY can be deallocated. 
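 */

/*
 * Driver-side sketch (not part of this file; all "mydev_" names are
 * hypothetical): a minimal ttydevsw only fills in the hooks it cares
 * about, since tty_alloc_mutex() patches the missing ones with the
 * ttydevsw_def*() routines above.  tsw_free is effectively mandatory,
 * because the default free-handler panics.
 */
#include <sys/param.h>
#include <sys/tty.h>
#include <sys/ttydevsw.h>
#include <sys/ttydisc.h>

static void
mydev_outwakeup(struct tty *tp)
{
	char c;

	/* Drain the output queue; a real driver would transmit here. */
	while (ttydisc_getc(tp, &c, 1) == 1)
		;
}

static void
mydev_free(void *softc __unused)
{
	/* Mandatory: ttydevsw_deffree() panics, as shown above. */
}

static struct ttydevsw mydev_ttydevsw = {
	.tsw_flags	= TF_INITLOCK | TF_CALLOUT,
	.tsw_outwakeup	= mydev_outwakeup,
	.tsw_free	= mydev_free,
};

static void
mydev_attach(void *softc, int unit)
{
	struct tty *tp;

	tp = tty_alloc(&mydev_ttydevsw, softc);
	/* Creates ttymy0{,.init,.lock} and cuamy0{,.init,.lock}. */
	tty_makedev(tp, NULL, "my%d", unit);
}

/*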
*/ dev = tp->t_dev; tp->t_dev = NULL; tty_unlock(tp); if (dev != NULL) { sx_xlock(&tty_list_sx); TAILQ_REMOVE(&tty_list, tp, t_list); tty_list_count--; sx_xunlock(&tty_list_sx); destroy_dev_sched_cb(dev, tty_dealloc, tp); } } void tty_rel_pgrp(struct tty *tp, struct pgrp *pg) { MPASS(tp->t_sessioncnt > 0); tty_lock_assert(tp, MA_OWNED); if (tp->t_pgrp == pg) tp->t_pgrp = NULL; tty_unlock(tp); } void tty_rel_sess(struct tty *tp, struct session *sess) { MPASS(tp->t_sessioncnt > 0); /* Current session has left. */ if (tp->t_session == sess) { tp->t_session = NULL; MPASS(tp->t_pgrp == NULL); } tp->t_sessioncnt--; tty_rel_free(tp); } void tty_rel_gone(struct tty *tp) { MPASS(!tty_gone(tp)); /* Simulate carrier removal. */ ttydisc_modem(tp, 0); /* Wake up all blocked threads. */ tty_wakeup(tp, FREAD|FWRITE); cv_broadcast(&tp->t_bgwait); cv_broadcast(&tp->t_dcdwait); tp->t_flags |= TF_GONE; tty_rel_free(tp); } /* * Exposing information about current TTY's through sysctl */ static void tty_to_xtty(struct tty *tp, struct xtty *xt) { tty_lock_assert(tp, MA_OWNED); xt->xt_size = sizeof(struct xtty); xt->xt_insize = ttyinq_getsize(&tp->t_inq); xt->xt_incc = ttyinq_bytescanonicalized(&tp->t_inq); xt->xt_inlc = ttyinq_bytesline(&tp->t_inq); xt->xt_inlow = tp->t_inlow; xt->xt_outsize = ttyoutq_getsize(&tp->t_outq); xt->xt_outcc = ttyoutq_bytesused(&tp->t_outq); xt->xt_outlow = tp->t_outlow; xt->xt_column = tp->t_column; xt->xt_pgid = tp->t_pgrp ? tp->t_pgrp->pg_id : 0; xt->xt_sid = tp->t_session ? tp->t_session->s_sid : 0; xt->xt_flags = tp->t_flags; xt->xt_dev = tp->t_dev ? dev2udev(tp->t_dev) : (uint32_t)NODEV; } static int sysctl_kern_ttys(SYSCTL_HANDLER_ARGS) { unsigned long lsize; struct xtty *xtlist, *xt; struct tty *tp; int error; sx_slock(&tty_list_sx); lsize = tty_list_count * sizeof(struct xtty); if (lsize == 0) { sx_sunlock(&tty_list_sx); return (0); } xtlist = xt = malloc(lsize, M_TTY, M_WAITOK); TAILQ_FOREACH(tp, &tty_list, t_list) { tty_lock(tp); tty_to_xtty(tp, xt); tty_unlock(tp); xt++; } sx_sunlock(&tty_list_sx); error = SYSCTL_OUT(req, xtlist, lsize); free(xtlist, M_TTY); return (error); } SYSCTL_PROC(_kern, OID_AUTO, ttys, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_kern_ttys, "S,xtty", "List of TTYs"); /* * Device node creation. Device has been set up, now we can expose it to * the user. */ int tty_makedevf(struct tty *tp, struct ucred *cred, int flags, const char *fmt, ...) { va_list ap; struct make_dev_args args; struct cdev *dev, *init, *lock, *cua, *cinit, *clock; const char *prefix = "tty"; char name[SPECNAMELEN - 3]; /* for "tty" and "cua". */ uid_t uid; gid_t gid; mode_t mode; int error; /* Remove "tty" prefix from devices like PTY's. */ if (tp->t_flags & TF_NOPREFIX) prefix = ""; va_start(ap, fmt); vsnrprintf(name, sizeof name, 32, fmt, ap); va_end(ap); if (cred == NULL) { /* System device. */ uid = UID_ROOT; gid = GID_WHEEL; mode = S_IRUSR|S_IWUSR; } else { /* User device. */ uid = cred->cr_ruid; gid = GID_TTY; mode = S_IRUSR|S_IWUSR|S_IWGRP; } flags = flags & TTYMK_CLONING ? MAKEDEV_REF : 0; flags |= MAKEDEV_CHECKNAME; /* Master call-in device. */ make_dev_args_init(&args); args.mda_flags = flags; args.mda_devsw = &ttydev_cdevsw; args.mda_cr = cred; args.mda_uid = uid; args.mda_gid = gid; args.mda_mode = mode; args.mda_si_drv1 = tp; error = make_dev_s(&args, &dev, "%s%s", prefix, name); if (error != 0) return (error); tp->t_dev = dev; init = lock = cua = cinit = clock = NULL; /* Slave call-in devices. 
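 */

/*
 * Userspace sketch (not part of this file): reading the "kern.ttys"
 * list exported by sysctl_kern_ttys() above, assuming struct xtty is
 * visible through <sys/tty.h> the way pstat(8) uses it.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/tty.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct xtty *xt;
	size_t i, len;

	if (sysctlbyname("kern.ttys", NULL, &len, NULL, 0) == -1)
		return (1);
	if ((xt = malloc(len)) == NULL)
		return (1);
	if (sysctlbyname("kern.ttys", xt, &len, NULL, 0) == -1)
		return (1);
	for (i = 0; i < len / sizeof(*xt); i++)
		printf("sid %d pgid %d flags %#x\n", (int)xt[i].xt_sid,
		    (int)xt[i].xt_pgid, (unsigned)xt[i].xt_flags);
	free(xt);
	return (0);
}

/*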
*/ if (tp->t_flags & TF_INITLOCK) { args.mda_devsw = &ttyil_cdevsw; args.mda_unit = TTYUNIT_INIT; args.mda_si_drv1 = tp; args.mda_si_drv2 = &tp->t_termios_init_in; error = make_dev_s(&args, &init, "%s%s.init", prefix, name); if (error != 0) goto fail; dev_depends(dev, init); args.mda_unit = TTYUNIT_LOCK; args.mda_si_drv2 = &tp->t_termios_lock_in; error = make_dev_s(&args, &lock, "%s%s.lock", prefix, name); if (error != 0) goto fail; dev_depends(dev, lock); } /* Call-out devices. */ if (tp->t_flags & TF_CALLOUT) { make_dev_args_init(&args); args.mda_flags = flags; args.mda_devsw = &ttydev_cdevsw; args.mda_cr = cred; args.mda_uid = UID_UUCP; args.mda_gid = GID_DIALER; args.mda_mode = 0660; args.mda_unit = TTYUNIT_CALLOUT; args.mda_si_drv1 = tp; error = make_dev_s(&args, &cua, "cua%s", name); if (error != 0) goto fail; dev_depends(dev, cua); /* Slave call-out devices. */ if (tp->t_flags & TF_INITLOCK) { args.mda_devsw = &ttyil_cdevsw; args.mda_unit = TTYUNIT_CALLOUT | TTYUNIT_INIT; args.mda_si_drv2 = &tp->t_termios_init_out; error = make_dev_s(&args, &cinit, "cua%s.init", name); if (error != 0) goto fail; dev_depends(dev, cinit); args.mda_unit = TTYUNIT_CALLOUT | TTYUNIT_LOCK; args.mda_si_drv2 = &tp->t_termios_lock_out; error = make_dev_s(&args, &clock, "cua%s.lock", name); if (error != 0) goto fail; dev_depends(dev, clock); } } sx_xlock(&tty_list_sx); TAILQ_INSERT_TAIL(&tty_list, tp, t_list); tty_list_count++; sx_xunlock(&tty_list_sx); return (0); fail: destroy_dev(dev); if (init) destroy_dev(init); if (lock) destroy_dev(lock); if (cinit) destroy_dev(cinit); if (clock) destroy_dev(clock); return (error); } /* * Signalling processes. */ void tty_signal_sessleader(struct tty *tp, int sig) { struct proc *p; tty_lock_assert(tp, MA_OWNED); MPASS(sig >= 1 && sig < NSIG); /* Make signals start output again. */ tp->t_flags &= ~TF_STOPPED; if (tp->t_session != NULL && tp->t_session->s_leader != NULL) { p = tp->t_session->s_leader; PROC_LOCK(p); kern_psignal(p, sig); PROC_UNLOCK(p); } } void tty_signal_pgrp(struct tty *tp, int sig) { ksiginfo_t ksi; tty_lock_assert(tp, MA_OWNED); MPASS(sig >= 1 && sig < NSIG); /* Make signals start output again. */ tp->t_flags &= ~TF_STOPPED; if (sig == SIGINFO && !(tp->t_termios.c_lflag & NOKERNINFO)) tty_info(tp); if (tp->t_pgrp != NULL) { ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; PGRP_LOCK(tp->t_pgrp); pgsignal(tp->t_pgrp, sig, 1, &ksi); PGRP_UNLOCK(tp->t_pgrp); } } void tty_wakeup(struct tty *tp, int flags) { if (tp->t_flags & TF_ASYNC && tp->t_sigio != NULL) pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL)); if (flags & FWRITE) { cv_broadcast(&tp->t_outwait); selwakeup(&tp->t_outpoll); KNOTE_LOCKED(&tp->t_outpoll.si_note, 0); } if (flags & FREAD) { cv_broadcast(&tp->t_inwait); selwakeup(&tp->t_inpoll); KNOTE_LOCKED(&tp->t_inpoll.si_note, 0); } } int tty_wait(struct tty *tp, struct cv *cv) { int error; int revokecnt = tp->t_revokecnt; tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED); MPASS(!tty_gone(tp)); error = cv_wait_sig(cv, tp->t_mtx); /* Bail out when the device slipped away. */ if (tty_gone(tp)) return (ENXIO); /* Restart the system call when we may have been revoked. */ if (tp->t_revokecnt != revokecnt) return (ERESTART); return (error); } int tty_timedwait(struct tty *tp, struct cv *cv, int hz) { int error; int revokecnt = tp->t_revokecnt; tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED); MPASS(!tty_gone(tp)); error = cv_timedwait_sig(cv, tp->t_mtx, hz); /* Bail out when the device slipped away. 
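 */

/*
 * Userspace sketch (not part of this file): programming the ".init"
 * state device created above, so every later open() of the real port
 * starts from the stored termios.  "/dev/ttyu0.init" is an example
 * path, and the TIOCSETA handled by ttyil_ioctl() requires
 * PRIV_TTY_SETA (i.e. root).
 */
#include <fcntl.h>
#include <termios.h>
#include <unistd.h>

int
main(void)
{
	struct termios t;
	int fd = open("/dev/ttyu0.init", O_RDWR);

	if (fd == -1 || tcgetattr(fd, &t) == -1)
		return (1);
	cfsetspeed(&t, B115200);	/* default the port to 115200 bps */
	if (tcsetattr(fd, TCSANOW, &t) == -1)
		return (1);
	return (close(fd) == -1);
}

/*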
*/ if (tty_gone(tp)) return (ENXIO); /* Restart the system call when we may have been revoked. */ if (tp->t_revokecnt != revokecnt) return (ERESTART); return (error); } void tty_flush(struct tty *tp, int flags) { if (flags & FWRITE) { tp->t_flags &= ~TF_HIWAT_OUT; ttyoutq_flush(&tp->t_outq); tty_wakeup(tp, FWRITE); if (!tty_gone(tp)) { ttydevsw_outwakeup(tp); ttydevsw_pktnotify(tp, TIOCPKT_FLUSHWRITE); } } if (flags & FREAD) { tty_hiwat_in_unblock(tp); ttyinq_flush(&tp->t_inq); tty_wakeup(tp, FREAD); if (!tty_gone(tp)) { ttydevsw_inwakeup(tp); ttydevsw_pktnotify(tp, TIOCPKT_FLUSHREAD); } } } void tty_set_winsize(struct tty *tp, const struct winsize *wsz) { if (memcmp(&tp->t_winsize, wsz, sizeof(*wsz)) == 0) return; tp->t_winsize = *wsz; tty_signal_pgrp(tp, SIGWINCH); } static int tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, struct thread *td) { int error; switch (cmd) { /* * Modem commands. * The SER_* and TIOCM_* flags are the same, but one bit * shifted. I don't know why. */ case TIOCSDTR: ttydevsw_modem(tp, SER_DTR, 0); return (0); case TIOCCDTR: ttydevsw_modem(tp, 0, SER_DTR); return (0); case TIOCMSET: { int bits = *(int *)data; ttydevsw_modem(tp, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1, ((~bits) & (TIOCM_DTR | TIOCM_RTS)) >> 1); return (0); } case TIOCMBIS: { int bits = *(int *)data; ttydevsw_modem(tp, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1, 0); return (0); } case TIOCMBIC: { int bits = *(int *)data; ttydevsw_modem(tp, 0, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1); return (0); } case TIOCMGET: *(int *)data = TIOCM_LE + (ttydevsw_modem(tp, 0, 0) << 1); return (0); case FIOASYNC: if (*(int *)data) tp->t_flags |= TF_ASYNC; else tp->t_flags &= ~TF_ASYNC; return (0); case FIONBIO: /* This device supports non-blocking operation. */ return (0); case FIONREAD: *(int *)data = ttyinq_bytescanonicalized(&tp->t_inq); return (0); case FIONWRITE: case TIOCOUTQ: *(int *)data = ttyoutq_bytesused(&tp->t_outq); return (0); case FIOSETOWN: if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc)) /* Not allowed to set ownership. */ return (ENOTTY); /* Temporarily unlock the TTY to set ownership. */ tty_unlock(tp); error = fsetown(*(int *)data, &tp->t_sigio); tty_lock(tp); return (error); case FIOGETOWN: if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc)) /* Not allowed to set ownership. */ return (ENOTTY); /* Get ownership. */ *(int *)data = fgetown(&tp->t_sigio); return (0); case TIOCGETA: /* Obtain terminal flags through tcgetattr(). */ *(struct termios*)data = tp->t_termios; return (0); case TIOCSETA: case TIOCSETAW: case TIOCSETAF: { struct termios *t = data; /* * Who makes up these funny rules? According to POSIX, * input baud rate is set equal to the output baud rate * when zero. */ if (t->c_ispeed == 0) t->c_ispeed = t->c_ospeed; /* Discard any unsupported bits. */ t->c_iflag &= TTYSUP_IFLAG; t->c_oflag &= TTYSUP_OFLAG; t->c_lflag &= TTYSUP_LFLAG; t->c_cflag &= TTYSUP_CFLAG; /* Set terminal flags through tcsetattr(). */ if (cmd == TIOCSETAW || cmd == TIOCSETAF) { error = tty_drain(tp, 0); if (error) return (error); if (cmd == TIOCSETAF) tty_flush(tp, FREAD); } /* * Only call param() when the flags really change. */ if ((t->c_cflag & CIGNORE) == 0 && (tp->t_termios.c_cflag != t->c_cflag || ((tp->t_termios.c_iflag ^ t->c_iflag) & (IXON|IXOFF|IXANY)) || tp->t_termios.c_ispeed != t->c_ispeed || tp->t_termios.c_ospeed != t->c_ospeed)) { error = ttydevsw_param(tp, t); if (error) return (error); /* XXX: CLOCAL? 
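 */

/*
 * Userspace sketch (not part of this file): driving the modem-control
 * cases above.  The port path is an example; as noted, the kernel
 * shifts between the TIOCM_* and SER_* bit layouts internally.
 */
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
	int bits = TIOCM_DTR;
	int fd = open("/dev/cuau0", O_RDWR | O_NONBLOCK);

	if (fd == -1)
		return (1);
	if (ioctl(fd, TIOCMBIS, &bits) == -1)	/* assert DTR */
		return (1);
	if (ioctl(fd, TIOCMGET, &bits) == -1)	/* read all lines back */
		return (1);
	printf("carrier %s\n", (bits & TIOCM_CD) ? "up" : "down");
	return (0);
}

/*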
*/ tp->t_termios.c_cflag = t->c_cflag & ~CIGNORE; tp->t_termios.c_ispeed = t->c_ispeed; tp->t_termios.c_ospeed = t->c_ospeed; /* Baud rate has changed - update watermarks. */ error = tty_watermarks(tp); if (error) return (error); } /* Copy new non-device driver parameters. */ tp->t_termios.c_iflag = t->c_iflag; tp->t_termios.c_oflag = t->c_oflag; tp->t_termios.c_lflag = t->c_lflag; memcpy(&tp->t_termios.c_cc, t->c_cc, sizeof t->c_cc); ttydisc_optimize(tp); if ((t->c_lflag & ICANON) == 0) { /* * When in non-canonical mode, wake up all * readers. Canonicalize any partial input. VMIN * and VTIME could also be adjusted. */ ttyinq_canonicalize(&tp->t_inq); tty_wakeup(tp, FREAD); } /* * For packet mode: notify the PTY consumer that VSTOP * and VSTART may have been changed. */ if (tp->t_termios.c_iflag & IXON && tp->t_termios.c_cc[VSTOP] == CTRL('S') && tp->t_termios.c_cc[VSTART] == CTRL('Q')) ttydevsw_pktnotify(tp, TIOCPKT_DOSTOP); else ttydevsw_pktnotify(tp, TIOCPKT_NOSTOP); return (0); } case TIOCGETD: /* For compatibility - we only support TTYDISC. */ *(int *)data = TTYDISC; return (0); case TIOCGPGRP: if (!tty_is_ctty(tp, td->td_proc)) return (ENOTTY); if (tp->t_pgrp != NULL) *(int *)data = tp->t_pgrp->pg_id; else *(int *)data = NO_PID; return (0); case TIOCGSID: if (!tty_is_ctty(tp, td->td_proc)) return (ENOTTY); MPASS(tp->t_session); *(int *)data = tp->t_session->s_sid; return (0); case TIOCSCTTY: { struct proc *p = td->td_proc; /* XXX: This looks awful. */ tty_unlock(tp); sx_xlock(&proctree_lock); tty_lock(tp); if (!SESS_LEADER(p)) { /* Only the session leader may do this. */ sx_xunlock(&proctree_lock); return (EPERM); } if (tp->t_session != NULL && tp->t_session == p->p_session) { /* This is already our controlling TTY. */ sx_xunlock(&proctree_lock); return (0); } if (p->p_session->s_ttyp != NULL || (tp->t_session != NULL && tp->t_session->s_ttyvp != NULL && tp->t_session->s_ttyvp->v_type != VBAD)) { /* * There is already a relation between a TTY and * a session, or the caller is not the session * leader. * * Allow the TTY to be stolen when the vnode is * invalid, but the reference to the TTY is * still active. This allows immediate reuse of * TTYs of which the session leader has been * killed or the TTY revoked. */ sx_xunlock(&proctree_lock); return (EPERM); } /* Connect the session to the TTY. */ tp->t_session = p->p_session; tp->t_session->s_ttyp = tp; tp->t_sessioncnt++; sx_xunlock(&proctree_lock); /* Assign foreground process group. */ tp->t_pgrp = p->p_pgrp; PROC_LOCK(p); p->p_flag |= P_CONTROLT; PROC_UNLOCK(p); return (0); } case TIOCSPGRP: { struct pgrp *pg; /* * XXX: Temporarily unlock the TTY to locate the process * group. This code would be lot nicer if we would ever * decompose proctree_lock. */ tty_unlock(tp); sx_slock(&proctree_lock); pg = pgfind(*(int *)data); if (pg != NULL) PGRP_UNLOCK(pg); if (pg == NULL || pg->pg_session != td->td_proc->p_session) { sx_sunlock(&proctree_lock); tty_lock(tp); return (EPERM); } tty_lock(tp); /* * Determine if this TTY is the controlling TTY after * relocking the TTY. */ if (!tty_is_ctty(tp, td->td_proc)) { sx_sunlock(&proctree_lock); return (ENOTTY); } tp->t_pgrp = pg; sx_sunlock(&proctree_lock); /* Wake up the background process groups. */ cv_broadcast(&tp->t_bgwait); return (0); } case TIOCFLUSH: { int flags = *(int *)data; if (flags == 0) flags = (FREAD|FWRITE); else flags &= (FREAD|FWRITE); tty_flush(tp, flags); return (0); } case TIOCDRAIN: /* Drain TTY output. 
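 */

/*
 * Userspace sketch (not part of this file): acquiring a controlling
 * terminal through the TIOCSCTTY case above.  The device path is an
 * example; setsid(2) fails if the caller already leads a process
 * group, so real daemons fork(2) first.
 */
#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	int fd;

	if (setsid() == -1)			/* must be a session leader */
		return (1);
	fd = open("/dev/ttyv0", O_RDWR);
	if (fd == -1)
		return (1);
	return (ioctl(fd, TIOCSCTTY) == -1);	/* connect session and TTY */
}

/*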
*/ return tty_drain(tp, 0); case TIOCGDRAINWAIT: *(int *)data = tp->t_drainwait; return (0); case TIOCSDRAINWAIT: error = priv_check(td, PRIV_TTY_DRAINWAIT); if (error == 0) tp->t_drainwait = *(int *)data; return (error); case TIOCCONS: /* Set terminal as console TTY. */ if (*(int *)data) { error = priv_check(td, PRIV_TTY_CONSOLE); if (error) return (error); /* * XXX: constty should really need to be locked! * XXX: allow disconnected constty's to be stolen! */ if (constty == tp) return (0); if (constty != NULL) return (EBUSY); tty_unlock(tp); constty_set(tp); tty_lock(tp); } else if (constty == tp) { constty_clear(); } return (0); case TIOCGWINSZ: /* Obtain window size. */ *(struct winsize*)data = tp->t_winsize; return (0); case TIOCSWINSZ: /* Set window size. */ tty_set_winsize(tp, data); return (0); case TIOCEXCL: tp->t_flags |= TF_EXCLUDE; return (0); case TIOCNXCL: tp->t_flags &= ~TF_EXCLUDE; return (0); case TIOCSTOP: tp->t_flags |= TF_STOPPED; ttydevsw_pktnotify(tp, TIOCPKT_STOP); return (0); case TIOCSTART: tp->t_flags &= ~TF_STOPPED; ttydevsw_outwakeup(tp); ttydevsw_pktnotify(tp, TIOCPKT_START); return (0); case TIOCSTAT: tty_info(tp); return (0); case TIOCSTI: if ((fflag & FREAD) == 0 && priv_check(td, PRIV_TTY_STI)) return (EPERM); if (!tty_is_ctty(tp, td->td_proc) && priv_check(td, PRIV_TTY_STI)) return (EACCES); ttydisc_rint(tp, *(char *)data, 0); ttydisc_rint_done(tp); return (0); } #ifdef COMPAT_43TTY return tty_ioctl_compat(tp, cmd, data, fflag, td); #else /* !COMPAT_43TTY */ return (ENOIOCTL); #endif /* COMPAT_43TTY */ } int tty_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, struct thread *td) { int error; tty_lock_assert(tp, MA_OWNED); if (tty_gone(tp)) return (ENXIO); error = ttydevsw_ioctl(tp, cmd, data, td); if (error == ENOIOCTL) error = tty_generic_ioctl(tp, cmd, data, fflag, td); return (error); } dev_t tty_udev(struct tty *tp) { if (tp->t_dev) return (dev2udev(tp->t_dev)); else return (NODEV); } int tty_checkoutq(struct tty *tp) { /* 256 bytes should be enough to print a log message. */ return (ttyoutq_bytesleft(&tp->t_outq) >= 256); } void tty_hiwat_in_block(struct tty *tp) { if ((tp->t_flags & TF_HIWAT_IN) == 0 && tp->t_termios.c_iflag & IXOFF && tp->t_termios.c_cc[VSTOP] != _POSIX_VDISABLE) { /* * Input flow control. Only enter the high watermark when we * can successfully store the VSTOP character. */ if (ttyoutq_write_nofrag(&tp->t_outq, &tp->t_termios.c_cc[VSTOP], 1) == 0) tp->t_flags |= TF_HIWAT_IN; } else { /* No input flow control. */ tp->t_flags |= TF_HIWAT_IN; } } void tty_hiwat_in_unblock(struct tty *tp) { if (tp->t_flags & TF_HIWAT_IN && tp->t_termios.c_iflag & IXOFF && tp->t_termios.c_cc[VSTART] != _POSIX_VDISABLE) { /* * Input flow control. Only leave the high watermark when we * can successfully store the VSTART character. */ if (ttyoutq_write_nofrag(&tp->t_outq, &tp->t_termios.c_cc[VSTART], 1) == 0) tp->t_flags &= ~TF_HIWAT_IN; } else { /* No input flow control. */ tp->t_flags &= ~TF_HIWAT_IN; } if (!tty_gone(tp)) ttydevsw_inwakeup(tp); } /* * TTY hooks interface. */ static int ttyhook_defrint(struct tty *tp, char c, int flags) { if (ttyhook_rint_bypass(tp, &c, 1) != 1) return (-1); return (0); } int ttyhook_register(struct tty **rtp, struct proc *p, int fd, struct ttyhook *th, void *softc) { struct tty *tp; struct file *fp; struct cdev *dev; struct cdevsw *cdp; struct filedesc *fdp; cap_rights_t rights; int error, ref; /* Validate the file descriptor. 
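 */

/*
 * Userspace sketch (not part of this file): resizing through the
 * TIOCSWINSZ case above.  tty_set_winsize() only raises SIGWINCH in
 * the foreground process group when the size actually changes.
 */
#include <sys/ioctl.h>
#include <unistd.h>

int
main(void)
{
	struct winsize ws = { .ws_row = 50, .ws_col = 132 };

	return (ioctl(STDIN_FILENO, TIOCSWINSZ, &ws) == -1);
}

/*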
*/ fdp = p->p_fd; error = fget_unlocked(fdp, fd, cap_rights_init(&rights, CAP_TTYHOOK), &fp, NULL); if (error != 0) return (error); if (fp->f_ops == &badfileops) { error = EBADF; goto done1; } /* * Make sure the vnode is bound to a character device. * Unlocked check for the vnode type is ok there, because we * only shall prevent calling devvn_refthread on the file that * never has been opened over a character device. */ if (fp->f_type != DTYPE_VNODE || fp->f_vnode->v_type != VCHR) { error = EINVAL; goto done1; } /* Make sure it is a TTY. */ cdp = devvn_refthread(fp->f_vnode, &dev, &ref); if (cdp == NULL) { error = ENXIO; goto done1; } if (dev != fp->f_data) { error = ENXIO; goto done2; } if (cdp != &ttydev_cdevsw) { error = ENOTTY; goto done2; } tp = dev->si_drv1; /* Try to attach the hook to the TTY. */ error = EBUSY; tty_lock(tp); MPASS((tp->t_hook == NULL) == ((tp->t_flags & TF_HOOK) == 0)); if (tp->t_flags & TF_HOOK) goto done3; tp->t_flags |= TF_HOOK; tp->t_hook = th; tp->t_hooksoftc = softc; *rtp = tp; error = 0; /* Maybe we can switch into bypass mode now. */ ttydisc_optimize(tp); /* Silently convert rint() calls to rint_bypass() when possible. */ if (!ttyhook_hashook(tp, rint) && ttyhook_hashook(tp, rint_bypass)) th->th_rint = ttyhook_defrint; done3: tty_unlock(tp); done2: dev_relthread(dev, ref); done1: fdrop(fp, curthread); return (error); } void ttyhook_unregister(struct tty *tp) { tty_lock_assert(tp, MA_OWNED); MPASS(tp->t_flags & TF_HOOK); /* Disconnect the hook. */ tp->t_flags &= ~TF_HOOK; tp->t_hook = NULL; /* Maybe we need to leave bypass mode. */ ttydisc_optimize(tp); /* Maybe deallocate the TTY as well. */ tty_rel_free(tp); } /* * /dev/console handling. */ static int ttyconsdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct tty *tp; /* System has no console device. */ if (dev_console_filename == NULL) return (ENXIO); /* Look up corresponding TTY by device name. */ sx_slock(&tty_list_sx); TAILQ_FOREACH(tp, &tty_list, t_list) { if (strcmp(dev_console_filename, tty_devname(tp)) == 0) { dev_console->si_drv1 = tp; break; } } sx_sunlock(&tty_list_sx); /* System console has no TTY associated. */ if (dev_console->si_drv1 == NULL) return (ENXIO); return (ttydev_open(dev, oflags, devtype, td)); } static int ttyconsdev_write(struct cdev *dev, struct uio *uio, int ioflag) { log_console(uio); return (ttydev_write(dev, uio, ioflag)); } /* * /dev/console is a little different than normal TTY's. When opened, * it determines which TTY to use. When data gets written to it, it * will be logged in the kernel message buffer. */ static struct cdevsw ttyconsdev_cdevsw = { .d_version = D_VERSION, .d_open = ttyconsdev_open, .d_close = ttydev_close, .d_read = ttydev_read, .d_write = ttyconsdev_write, .d_ioctl = ttydev_ioctl, .d_kqfilter = ttydev_kqfilter, .d_poll = ttydev_poll, .d_mmap = ttydev_mmap, .d_name = "ttyconsdev", .d_flags = D_TTY, }; static void ttyconsdev_init(void *unused __unused) { dev_console = make_dev_credf(MAKEDEV_ETERNAL, &ttyconsdev_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, "console"); } SYSINIT(tty, SI_SUB_DRIVERS, SI_ORDER_FIRST, ttyconsdev_init, NULL); void ttyconsdev_select(const char *name) { dev_console_filename = name; } /* * Debugging routines. */ #include "opt_ddb.h" #ifdef DDB #include #include static const struct { int flag; char val; } ttystates[] = { #if 0 { TF_NOPREFIX, 'N' }, #endif { TF_INITLOCK, 'I' }, { TF_CALLOUT, 'C' }, /* Keep these together -> 'Oi' and 'Oo'. 
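 */

/*
 * Kernel-side sketch (not part of this file; the "myhook" names are
 * hypothetical): a consumer in the style of snp(4) watching slave-side
 * input.  Only th_rint_bypass is supplied; as shown above,
 * ttyhook_register() converts plain rint() consumers automatically
 * via ttyhook_defrint.
 */
#include <sys/param.h>
#include <sys/tty.h>
#include <sys/ttyhook.h>

static size_t
myhook_rint_bypass(struct tty *tp __unused, const void *buf __unused,
    size_t len)
{
	/* A real consumer would copy "buf" out; we just accept it. */
	return (len);
}

static struct ttyhook myhook = {
	.th_rint_bypass = myhook_rint_bypass,
};
/* Attached with: ttyhook_register(&tp, p, fd, &myhook, softc). */

/*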
*/ { TF_OPENED, 'O' }, { TF_OPENED_IN, 'i' }, { TF_OPENED_OUT, 'o' }, { TF_OPENED_CONS, 'c' }, { TF_GONE, 'G' }, { TF_OPENCLOSE, 'B' }, { TF_ASYNC, 'Y' }, { TF_LITERAL, 'L' }, /* Keep these together -> 'Hi' and 'Ho'. */ { TF_HIWAT, 'H' }, { TF_HIWAT_IN, 'i' }, { TF_HIWAT_OUT, 'o' }, { TF_STOPPED, 'S' }, { TF_EXCLUDE, 'X' }, { TF_BYPASS, 'l' }, { TF_ZOMBIE, 'Z' }, { TF_HOOK, 's' }, /* Keep these together -> 'bi' and 'bo'. */ { TF_BUSY, 'b' }, { TF_BUSY_IN, 'i' }, { TF_BUSY_OUT, 'o' }, { 0, '\0'}, }; #define TTY_FLAG_BITS \ "\20\1NOPREFIX\2INITLOCK\3CALLOUT\4OPENED_IN" \ "\5OPENED_OUT\6OPENED_CONS\7GONE\10OPENCLOSE" \ "\11ASYNC\12LITERAL\13HIWAT_IN\14HIWAT_OUT" \ "\15STOPPED\16EXCLUDE\17BYPASS\20ZOMBIE" \ "\21HOOK\22BUSY_IN\23BUSY_OUT" #define DB_PRINTSYM(name, addr) \ db_printf("%s " #name ": ", sep); \ db_printsym((db_addr_t) addr, DB_STGY_ANY); \ db_printf("\n"); static void _db_show_devsw(const char *sep, const struct ttydevsw *tsw) { db_printf("%sdevsw: ", sep); db_printsym((db_addr_t)tsw, DB_STGY_ANY); db_printf(" (%p)\n", tsw); DB_PRINTSYM(open, tsw->tsw_open); DB_PRINTSYM(close, tsw->tsw_close); DB_PRINTSYM(outwakeup, tsw->tsw_outwakeup); DB_PRINTSYM(inwakeup, tsw->tsw_inwakeup); DB_PRINTSYM(ioctl, tsw->tsw_ioctl); DB_PRINTSYM(param, tsw->tsw_param); DB_PRINTSYM(modem, tsw->tsw_modem); DB_PRINTSYM(mmap, tsw->tsw_mmap); DB_PRINTSYM(pktnotify, tsw->tsw_pktnotify); DB_PRINTSYM(free, tsw->tsw_free); } static void _db_show_hooks(const char *sep, const struct ttyhook *th) { db_printf("%shook: ", sep); db_printsym((db_addr_t)th, DB_STGY_ANY); db_printf(" (%p)\n", th); if (th == NULL) return; DB_PRINTSYM(rint, th->th_rint); DB_PRINTSYM(rint_bypass, th->th_rint_bypass); DB_PRINTSYM(rint_done, th->th_rint_done); DB_PRINTSYM(rint_poll, th->th_rint_poll); DB_PRINTSYM(getc_inject, th->th_getc_inject); DB_PRINTSYM(getc_capture, th->th_getc_capture); DB_PRINTSYM(getc_poll, th->th_getc_poll); DB_PRINTSYM(close, th->th_close); } static void _db_show_termios(const char *name, const struct termios *t) { db_printf("%s: iflag 0x%x oflag 0x%x cflag 0x%x " "lflag 0x%x ispeed %u ospeed %u\n", name, t->c_iflag, t->c_oflag, t->c_cflag, t->c_lflag, t->c_ispeed, t->c_ospeed); } /* DDB command to show TTY statistics. */ DB_SHOW_COMMAND(tty, db_show_tty) { struct tty *tp; if (!have_addr) { db_printf("usage: show tty \n"); return; } tp = (struct tty *)addr; db_printf("%p: %s\n", tp, tty_devname(tp)); db_printf("\tmtx: %p\n", tp->t_mtx); db_printf("\tflags: 0x%b\n", tp->t_flags, TTY_FLAG_BITS); db_printf("\trevokecnt: %u\n", tp->t_revokecnt); /* Buffering mechanisms. */ db_printf("\tinq: %p begin %u linestart %u reprint %u end %u " "nblocks %u quota %u\n", &tp->t_inq, tp->t_inq.ti_begin, tp->t_inq.ti_linestart, tp->t_inq.ti_reprint, tp->t_inq.ti_end, tp->t_inq.ti_nblocks, tp->t_inq.ti_quota); db_printf("\toutq: %p begin %u end %u nblocks %u quota %u\n", &tp->t_outq, tp->t_outq.to_begin, tp->t_outq.to_end, tp->t_outq.to_nblocks, tp->t_outq.to_quota); db_printf("\tinlow: %zu\n", tp->t_inlow); db_printf("\toutlow: %zu\n", tp->t_outlow); _db_show_termios("\ttermios", &tp->t_termios); db_printf("\twinsize: row %u col %u xpixel %u ypixel %u\n", tp->t_winsize.ws_row, tp->t_winsize.ws_col, tp->t_winsize.ws_xpixel, tp->t_winsize.ws_ypixel); db_printf("\tcolumn: %u\n", tp->t_column); db_printf("\twritepos: %u\n", tp->t_writepos); db_printf("\tcompatflags: 0x%x\n", tp->t_compatflags); /* Init/lock-state devices. 
*/ _db_show_termios("\ttermios_init_in", &tp->t_termios_init_in); _db_show_termios("\ttermios_init_out", &tp->t_termios_init_out); _db_show_termios("\ttermios_lock_in", &tp->t_termios_lock_in); _db_show_termios("\ttermios_lock_out", &tp->t_termios_lock_out); /* Hooks */ _db_show_devsw("\t", tp->t_devsw); _db_show_hooks("\t", tp->t_hook); /* Process info. */ db_printf("\tpgrp: %p gid %d jobc %d\n", tp->t_pgrp, tp->t_pgrp ? tp->t_pgrp->pg_id : 0, tp->t_pgrp ? tp->t_pgrp->pg_jobc : 0); db_printf("\tsession: %p", tp->t_session); if (tp->t_session != NULL) db_printf(" count %u leader %p tty %p sid %d login %s", tp->t_session->s_count, tp->t_session->s_leader, tp->t_session->s_ttyp, tp->t_session->s_sid, tp->t_session->s_login); db_printf("\n"); db_printf("\tsessioncnt: %u\n", tp->t_sessioncnt); db_printf("\tdevswsoftc: %p\n", tp->t_devswsoftc); db_printf("\thooksoftc: %p\n", tp->t_hooksoftc); db_printf("\tdev: %p\n", tp->t_dev); } /* DDB command to list TTYs. */ DB_SHOW_ALL_COMMAND(ttys, db_show_all_ttys) { struct tty *tp; size_t isiz, osiz; int i, j; /* Make the output look like `pstat -t'. */ db_printf("PTR "); #if defined(__LP64__) db_printf(" "); #endif db_printf(" LINE INQ CAN LIN LOW OUTQ USE LOW " "COL SESS PGID STATE\n"); TAILQ_FOREACH(tp, &tty_list, t_list) { isiz = tp->t_inq.ti_nblocks * TTYINQ_DATASIZE; osiz = tp->t_outq.to_nblocks * TTYOUTQ_DATASIZE; db_printf("%p %10s %5zu %4u %4u %4zu %5zu %4u %4zu %5u %5d " "%5d ", tp, tty_devname(tp), isiz, tp->t_inq.ti_linestart - tp->t_inq.ti_begin, tp->t_inq.ti_end - tp->t_inq.ti_linestart, isiz - tp->t_inlow, osiz, tp->t_outq.to_end - tp->t_outq.to_begin, osiz - tp->t_outlow, MIN(tp->t_column, 99999), tp->t_session ? tp->t_session->s_sid : 0, tp->t_pgrp ? tp->t_pgrp->pg_id : 0); /* Flag bits. */ for (i = j = 0; ttystates[i].flag; i++) if (tp->t_flags & ttystates[i].flag) { db_printf("%c", ttystates[i].val); j++; } if (j == 0) db_printf("-"); db_printf("\n"); } } #endif /* DDB */ Index: head/sys/kern/tty_inq.c =================================================================== --- head/sys/kern/tty_inq.c (revision 326270) +++ head/sys/kern/tty_inq.c (revision 326271) @@ -1,495 +1,497 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008 Ed Schouten * All rights reserved. * * Portions of this software were developed under sponsorship from Snow * B.V., the Netherlands. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include /* * TTY input queue buffering. * * Unlike the output queue, the input queue has more features that are * needed to properly implement various features offered by the TTY * interface: * * - Data can be removed from the tail of the queue, which is used to * implement backspace. * - Once in a while, input has to be `canonicalized'. When ICANON is * turned on, this will be done after a CR has been inserted. * Otherwise, it should be done after any character has been inserted. * - The input queue can store one bit per byte, called the quoting bit. * This bit is used by TTYDISC to make backspace work on quoted * characters. * * In most cases, there is probably less input than output, so unlike * the outq, we'll stick to 128 byte blocks here. */ static int ttyinq_flush_secure = 1; SYSCTL_INT(_kern, OID_AUTO, tty_inq_flush_secure, CTLFLAG_RW, &ttyinq_flush_secure, 0, "Zero buffers while flushing"); #define TTYINQ_QUOTESIZE (TTYINQ_DATASIZE / BMSIZE) #define BMSIZE 32 #define GETBIT(tib,boff) \ ((tib)->tib_quotes[(boff) / BMSIZE] & (1 << ((boff) % BMSIZE))) #define SETBIT(tib,boff) \ ((tib)->tib_quotes[(boff) / BMSIZE] |= (1 << ((boff) % BMSIZE))) #define CLRBIT(tib,boff) \ ((tib)->tib_quotes[(boff) / BMSIZE] &= ~(1 << ((boff) % BMSIZE))) struct ttyinq_block { struct ttyinq_block *tib_prev; struct ttyinq_block *tib_next; uint32_t tib_quotes[TTYINQ_QUOTESIZE]; char tib_data[TTYINQ_DATASIZE]; }; static uma_zone_t ttyinq_zone; #define TTYINQ_INSERT_TAIL(ti, tib) do { \ if (ti->ti_end == 0) { \ tib->tib_prev = NULL; \ tib->tib_next = ti->ti_firstblock; \ ti->ti_firstblock = tib; \ } else { \ tib->tib_prev = ti->ti_lastblock; \ tib->tib_next = ti->ti_lastblock->tib_next; \ ti->ti_lastblock->tib_next = tib; \ } \ if (tib->tib_next != NULL) \ tib->tib_next->tib_prev = tib; \ ti->ti_nblocks++; \ } while (0) #define TTYINQ_REMOVE_HEAD(ti) do { \ ti->ti_firstblock = ti->ti_firstblock->tib_next; \ if (ti->ti_firstblock != NULL) \ ti->ti_firstblock->tib_prev = NULL; \ ti->ti_nblocks--; \ } while (0) #define TTYINQ_RECYCLE(ti, tib) do { \ if (ti->ti_quota <= ti->ti_nblocks) \ uma_zfree(ttyinq_zone, tib); \ else \ TTYINQ_INSERT_TAIL(ti, tib); \ } while (0) int ttyinq_setsize(struct ttyinq *ti, struct tty *tp, size_t size) { struct ttyinq_block *tib; ti->ti_quota = howmany(size, TTYINQ_DATASIZE); while (ti->ti_quota > ti->ti_nblocks) { /* * List is getting bigger. * Add new blocks to the tail of the list. * * We must unlock the TTY temporarily, because we need * to allocate memory. This won't be a problem, because * in the worst case, another thread ends up here, which * may cause us to allocate too many blocks, but this * will be caught by the loop below. 
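 */

/*
 * Standalone sketch (not part of this file): the quoting bitmap keeps
 * one bit per data byte.  These are simplified copies of the GETBIT
 * and SETBIT macros above, operating on a bare array:
 */
#include <stdint.h>
#include <stdio.h>

#define BMSIZE		32
#define GETBIT(q, b)	((q)[(b) / BMSIZE] & (1U << ((b) % BMSIZE)))
#define SETBIT(q, b)	((q)[(b) / BMSIZE] |= (1U << ((b) % BMSIZE)))

int
main(void)
{
	uint32_t quotes[128 / BMSIZE] = { 0 };	/* one 128-byte block */

	SETBIT(quotes, 13);	/* e.g. byte 13 was escaped with ^V */
	printf("byte 13 quoted: %s\n", GETBIT(quotes, 13) ? "yes" : "no");
	return (0);
}

/*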
*/ tty_unlock(tp); tib = uma_zalloc(ttyinq_zone, M_WAITOK); tty_lock(tp); if (tty_gone(tp)) { uma_zfree(ttyinq_zone, tib); return (ENXIO); } TTYINQ_INSERT_TAIL(ti, tib); } return (0); } void ttyinq_free(struct ttyinq *ti) { struct ttyinq_block *tib; ttyinq_flush(ti); ti->ti_quota = 0; while ((tib = ti->ti_firstblock) != NULL) { TTYINQ_REMOVE_HEAD(ti); uma_zfree(ttyinq_zone, tib); } MPASS(ti->ti_nblocks == 0); } int ttyinq_read_uio(struct ttyinq *ti, struct tty *tp, struct uio *uio, size_t rlen, size_t flen) { MPASS(rlen <= uio->uio_resid); while (rlen > 0) { int error; struct ttyinq_block *tib; size_t cbegin, cend, clen; /* See if there still is data. */ if (ti->ti_begin == ti->ti_linestart) return (0); tib = ti->ti_firstblock; if (tib == NULL) return (0); /* * The end address should be the lowest of these three: * - The write pointer * - The blocksize - we can't read beyond the block * - The end address if we could perform the full read */ cbegin = ti->ti_begin; cend = MIN(MIN(ti->ti_linestart, ti->ti_begin + rlen), TTYINQ_DATASIZE); clen = cend - cbegin; MPASS(clen >= flen); rlen -= clen; /* * We can prevent buffering in some cases: * - We need to read the block until the end. * - We don't need to read the block until the end, but * there is no data beyond it, which allows us to move * the write pointer to a new block. */ if (cend == TTYINQ_DATASIZE || cend == ti->ti_end) { /* * Fast path: zero copy. Remove the first block, * so we can unlock the TTY temporarily. */ TTYINQ_REMOVE_HEAD(ti); ti->ti_begin = 0; /* * Because we remove the first block, we must * fix up the block offsets. */ #define CORRECT_BLOCK(t) do { \ if (t <= TTYINQ_DATASIZE) \ t = 0; \ else \ t -= TTYINQ_DATASIZE; \ } while (0) CORRECT_BLOCK(ti->ti_linestart); CORRECT_BLOCK(ti->ti_reprint); CORRECT_BLOCK(ti->ti_end); #undef CORRECT_BLOCK /* * Temporary unlock and copy the data to * userspace. We may need to flush trailing * bytes, like EOF characters. */ tty_unlock(tp); error = uiomove(tib->tib_data + cbegin, clen - flen, uio); tty_lock(tp); /* Block can now be readded to the list. */ TTYINQ_RECYCLE(ti, tib); } else { char ob[TTYINQ_DATASIZE - 1]; /* * Slow path: store data in a temporary buffer. */ memcpy(ob, tib->tib_data + cbegin, clen - flen); ti->ti_begin += clen; MPASS(ti->ti_begin < TTYINQ_DATASIZE); /* Temporary unlock and copy the data to userspace. */ tty_unlock(tp); error = uiomove(ob, clen - flen, uio); tty_lock(tp); } if (error != 0) return (error); if (tty_gone(tp)) return (ENXIO); } return (0); } static __inline void ttyinq_set_quotes(struct ttyinq_block *tib, size_t offset, size_t length, int value) { if (value) { /* Set the bits. */ for (; length > 0; length--, offset++) SETBIT(tib, offset); } else { /* Unset the bits. */ for (; length > 0; length--, offset++) CLRBIT(tib, offset); } } size_t ttyinq_write(struct ttyinq *ti, const void *buf, size_t nbytes, int quote) { const char *cbuf = buf; struct ttyinq_block *tib; unsigned int boff; size_t l; while (nbytes > 0) { boff = ti->ti_end % TTYINQ_DATASIZE; if (ti->ti_end == 0) { /* First time we're being used or drained. */ MPASS(ti->ti_begin == 0); tib = ti->ti_firstblock; if (tib == NULL) { /* Queue has no blocks. */ break; } ti->ti_lastblock = tib; } else if (boff == 0) { /* We reached the end of this block on last write. */ tib = ti->ti_lastblock->tib_next; if (tib == NULL) { /* We've reached the watermark. */ break; } ti->ti_lastblock = tib; } else { tib = ti->ti_lastblock; } /* Don't copy more than was requested. 
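 */

/*
 * Driver-side sketch (not part of this file; "mydev_rxintr" is
 * hypothetical): received bytes normally enter this queue through the
 * line discipline rather than through ttyinq_write() directly.
 */
#include <sys/param.h>
#include <sys/tty.h>
#include <sys/ttydisc.h>

static void
mydev_rxintr(struct tty *tp, const char *buf, size_t len)
{
	size_t i;

	tty_lock(tp);
	for (i = 0; i < len; i++)
		if (ttydisc_rint(tp, buf[i], 0) != 0)
			break;		/* input queue full; byte dropped */
	ttydisc_rint_done(tp);		/* wake up readers once per batch */
	tty_unlock(tp);
}

/*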
*/ l = MIN(nbytes, TTYINQ_DATASIZE - boff); MPASS(l > 0); memcpy(tib->tib_data + boff, cbuf, l); /* Set the quoting bits for the proper region. */ ttyinq_set_quotes(tib, boff, l, quote); cbuf += l; nbytes -= l; ti->ti_end += l; } return (cbuf - (const char *)buf); } int ttyinq_write_nofrag(struct ttyinq *ti, const void *buf, size_t nbytes, int quote) { size_t ret; if (ttyinq_bytesleft(ti) < nbytes) return (-1); /* We should always be able to write it back. */ ret = ttyinq_write(ti, buf, nbytes, quote); MPASS(ret == nbytes); return (0); } void ttyinq_canonicalize(struct ttyinq *ti) { ti->ti_linestart = ti->ti_reprint = ti->ti_end; ti->ti_startblock = ti->ti_reprintblock = ti->ti_lastblock; } size_t ttyinq_findchar(struct ttyinq *ti, const char *breakc, size_t maxlen, char *lastc) { struct ttyinq_block *tib = ti->ti_firstblock; unsigned int boff = ti->ti_begin; unsigned int bend = MIN(MIN(TTYINQ_DATASIZE, ti->ti_linestart), ti->ti_begin + maxlen); MPASS(maxlen > 0); if (tib == NULL) return (0); while (boff < bend) { if (strchr(breakc, tib->tib_data[boff]) && !GETBIT(tib, boff)) { *lastc = tib->tib_data[boff]; return (boff - ti->ti_begin + 1); } boff++; } /* Not found - just process the entire block. */ return (bend - ti->ti_begin); } void ttyinq_flush(struct ttyinq *ti) { struct ttyinq_block *tib; ti->ti_begin = 0; ti->ti_linestart = 0; ti->ti_reprint = 0; ti->ti_end = 0; /* Zero all data in the input queue to get rid of passwords. */ if (ttyinq_flush_secure) { for (tib = ti->ti_firstblock; tib != NULL; tib = tib->tib_next) bzero(&tib->tib_data, sizeof tib->tib_data); } } int ttyinq_peekchar(struct ttyinq *ti, char *c, int *quote) { unsigned int boff; struct ttyinq_block *tib = ti->ti_lastblock; if (ti->ti_linestart == ti->ti_end) return (-1); MPASS(ti->ti_end > 0); boff = (ti->ti_end - 1) % TTYINQ_DATASIZE; *c = tib->tib_data[boff]; *quote = GETBIT(tib, boff); return (0); } void ttyinq_unputchar(struct ttyinq *ti) { MPASS(ti->ti_linestart < ti->ti_end); if (--ti->ti_end % TTYINQ_DATASIZE == 0) { /* Roll back to the previous block. */ ti->ti_lastblock = ti->ti_lastblock->tib_prev; /* * This can only fail if we are unputchar()'ing the * first character in the queue. */ MPASS((ti->ti_lastblock == NULL) == (ti->ti_end == 0)); } } void ttyinq_reprintpos_set(struct ttyinq *ti) { ti->ti_reprint = ti->ti_end; ti->ti_reprintblock = ti->ti_lastblock; } void ttyinq_reprintpos_reset(struct ttyinq *ti) { ti->ti_reprint = ti->ti_linestart; ti->ti_reprintblock = ti->ti_startblock; } static void ttyinq_line_iterate(struct ttyinq *ti, ttyinq_line_iterator_t *iterator, void *data, unsigned int offset, struct ttyinq_block *tib) { unsigned int boff; /* Use the proper block when we're at the queue head. */ if (offset == 0) tib = ti->ti_firstblock; /* Iterate all characters and call the iterator function. */ for (; offset < ti->ti_end; offset++) { boff = offset % TTYINQ_DATASIZE; MPASS(tib != NULL); /* Call back the iterator function. */ iterator(data, tib->tib_data[boff], GETBIT(tib, boff)); /* Last byte iterated - go to the next block. 
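 */

/*
 * Userspace sketch (not part of this file): the password-scrubbing
 * behaviour of ttyinq_flush() above is governed by the writable
 * kern.tty_inq_flush_secure sysctl declared earlier in this file.
 */
#include <sys/types.h>
#include <sys/sysctl.h>

int
main(void)
{
	int on = 1;

	/* Zero flushed input buffers (the default); needs root. */
	return (sysctlbyname("kern.tty_inq_flush_secure", NULL, NULL,
	    &on, sizeof(on)) == -1);
}

/*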
*/ if (boff == TTYINQ_DATASIZE - 1) tib = tib->tib_next; MPASS(tib != NULL); } } void ttyinq_line_iterate_from_linestart(struct ttyinq *ti, ttyinq_line_iterator_t *iterator, void *data) { ttyinq_line_iterate(ti, iterator, data, ti->ti_linestart, ti->ti_startblock); } void ttyinq_line_iterate_from_reprintpos(struct ttyinq *ti, ttyinq_line_iterator_t *iterator, void *data) { ttyinq_line_iterate(ti, iterator, data, ti->ti_reprint, ti->ti_reprintblock); } static void ttyinq_startup(void *dummy) { ttyinq_zone = uma_zcreate("ttyinq", sizeof(struct ttyinq_block), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(ttyinq, SI_SUB_DRIVERS, SI_ORDER_FIRST, ttyinq_startup, NULL); Index: head/sys/kern/tty_outq.c =================================================================== --- head/sys/kern/tty_outq.c (revision 326270) +++ head/sys/kern/tty_outq.c (revision 326271) @@ -1,345 +1,347 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008 Ed Schouten * All rights reserved. * * Portions of this software were developed under sponsorship from Snow * B.V., the Netherlands. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include /* * TTY output queue buffering. * * The previous design of the TTY layer offered the so-called clists. * These clists were used for both the input queues and the output * queue. We don't use certain features on the output side, like quoting * bits for parity marking and such. This mechanism is similar to the * old clists, but only contains the features we need to buffer the * output. 
*/ struct ttyoutq_block { struct ttyoutq_block *tob_next; char tob_data[TTYOUTQ_DATASIZE]; }; static uma_zone_t ttyoutq_zone; #define TTYOUTQ_INSERT_TAIL(to, tob) do { \ if (to->to_end == 0) { \ tob->tob_next = to->to_firstblock; \ to->to_firstblock = tob; \ } else { \ tob->tob_next = to->to_lastblock->tob_next; \ to->to_lastblock->tob_next = tob; \ } \ to->to_nblocks++; \ } while (0) #define TTYOUTQ_REMOVE_HEAD(to) do { \ to->to_firstblock = to->to_firstblock->tob_next; \ to->to_nblocks--; \ } while (0) #define TTYOUTQ_RECYCLE(to, tob) do { \ if (to->to_quota <= to->to_nblocks) \ uma_zfree(ttyoutq_zone, tob); \ else \ TTYOUTQ_INSERT_TAIL(to, tob); \ } while(0) void ttyoutq_flush(struct ttyoutq *to) { to->to_begin = 0; to->to_end = 0; } int ttyoutq_setsize(struct ttyoutq *to, struct tty *tp, size_t size) { struct ttyoutq_block *tob; to->to_quota = howmany(size, TTYOUTQ_DATASIZE); while (to->to_quota > to->to_nblocks) { /* * List is getting bigger. * Add new blocks to the tail of the list. * * We must unlock the TTY temporarily, because we need * to allocate memory. This won't be a problem, because * in the worst case, another thread ends up here, which * may cause us to allocate too many blocks, but this * will be caught by the loop below. */ tty_unlock(tp); tob = uma_zalloc(ttyoutq_zone, M_WAITOK); tty_lock(tp); if (tty_gone(tp)) { uma_zfree(ttyoutq_zone, tob); return (ENXIO); } TTYOUTQ_INSERT_TAIL(to, tob); } return (0); } void ttyoutq_free(struct ttyoutq *to) { struct ttyoutq_block *tob; ttyoutq_flush(to); to->to_quota = 0; while ((tob = to->to_firstblock) != NULL) { TTYOUTQ_REMOVE_HEAD(to); uma_zfree(ttyoutq_zone, tob); } MPASS(to->to_nblocks == 0); } size_t ttyoutq_read(struct ttyoutq *to, void *buf, size_t len) { char *cbuf = buf; while (len > 0) { struct ttyoutq_block *tob; size_t cbegin, cend, clen; /* See if there still is data. */ if (to->to_begin == to->to_end) break; tob = to->to_firstblock; if (tob == NULL) break; /* * The end address should be the lowest of these three: * - The write pointer * - The blocksize - we can't read beyond the block * - The end address if we could perform the full read */ cbegin = to->to_begin; cend = MIN(MIN(to->to_end, to->to_begin + len), TTYOUTQ_DATASIZE); clen = cend - cbegin; /* Copy the data out of the buffers. */ memcpy(cbuf, tob->tob_data + cbegin, clen); cbuf += clen; len -= clen; if (cend == to->to_end) { /* Read the complete queue. */ to->to_begin = 0; to->to_end = 0; } else if (cend == TTYOUTQ_DATASIZE) { /* Read the block until the end. */ TTYOUTQ_REMOVE_HEAD(to); to->to_begin = 0; to->to_end -= TTYOUTQ_DATASIZE; TTYOUTQ_RECYCLE(to, tob); } else { /* Read the block partially. */ to->to_begin += clen; } } return (cbuf - (char *)buf); } /* * An optimized version of ttyoutq_read() which can be used in pseudo * TTY drivers to directly copy data from the outq to userspace, instead * of buffering it. * * We can only copy data directly if we need to read the entire block * back to the user, because we temporarily remove the block from the * queue. Otherwise we need to copy it to a temporary buffer first, to * make sure data remains in the correct order. */ int ttyoutq_read_uio(struct ttyoutq *to, struct tty *tp, struct uio *uio) { while (uio->uio_resid > 0) { int error; struct ttyoutq_block *tob; size_t cbegin, cend, clen; /* See if there still is data. 
*/ if (to->to_begin == to->to_end) return (0); tob = to->to_firstblock; if (tob == NULL) return (0); /* * The end address should be the lowest of these three: * - The write pointer * - The blocksize - we can't read beyond the block * - The end address if we could perform the full read */ cbegin = to->to_begin; cend = MIN(MIN(to->to_end, to->to_begin + uio->uio_resid), TTYOUTQ_DATASIZE); clen = cend - cbegin; /* * We can prevent buffering in some cases: * - We need to read the block until the end. * - We don't need to read the block until the end, but * there is no data beyond it, which allows us to move * the write pointer to a new block. */ if (cend == TTYOUTQ_DATASIZE || cend == to->to_end) { /* * Fast path: zero copy. Remove the first block, * so we can unlock the TTY temporarily. */ TTYOUTQ_REMOVE_HEAD(to); to->to_begin = 0; if (to->to_end <= TTYOUTQ_DATASIZE) to->to_end = 0; else to->to_end -= TTYOUTQ_DATASIZE; /* Temporary unlock and copy the data to userspace. */ tty_unlock(tp); error = uiomove(tob->tob_data + cbegin, clen, uio); tty_lock(tp); /* Block can now be readded to the list. */ TTYOUTQ_RECYCLE(to, tob); } else { char ob[TTYOUTQ_DATASIZE - 1]; /* * Slow path: store data in a temporary buffer. */ memcpy(ob, tob->tob_data + cbegin, clen); to->to_begin += clen; MPASS(to->to_begin < TTYOUTQ_DATASIZE); /* Temporary unlock and copy the data to userspace. */ tty_unlock(tp); error = uiomove(ob, clen, uio); tty_lock(tp); } if (error != 0) return (error); } return (0); } size_t ttyoutq_write(struct ttyoutq *to, const void *buf, size_t nbytes) { const char *cbuf = buf; struct ttyoutq_block *tob; unsigned int boff; size_t l; while (nbytes > 0) { boff = to->to_end % TTYOUTQ_DATASIZE; if (to->to_end == 0) { /* First time we're being used or drained. */ MPASS(to->to_begin == 0); tob = to->to_firstblock; if (tob == NULL) { /* Queue has no blocks. */ break; } to->to_lastblock = tob; } else if (boff == 0) { /* We reached the end of this block on last write. */ tob = to->to_lastblock->tob_next; if (tob == NULL) { /* We've reached the watermark. */ break; } to->to_lastblock = tob; } else { tob = to->to_lastblock; } /* Don't copy more than was requested. */ l = MIN(nbytes, TTYOUTQ_DATASIZE - boff); MPASS(l > 0); memcpy(tob->tob_data + boff, cbuf, l); cbuf += l; nbytes -= l; to->to_end += l; } return (cbuf - (const char *)buf); } int ttyoutq_write_nofrag(struct ttyoutq *to, const void *buf, size_t nbytes) { size_t ret; if (ttyoutq_bytesleft(to) < nbytes) return (-1); /* We should always be able to write it back. */ ret = ttyoutq_write(to, buf, nbytes); MPASS(ret == nbytes); return (0); } static void ttyoutq_startup(void *dummy) { ttyoutq_zone = uma_zcreate("ttyoutq", sizeof(struct ttyoutq_block), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(ttyoutq, SI_SUB_DRIVERS, SI_ORDER_FIRST, ttyoutq_startup, NULL); Index: head/sys/kern/tty_pts.c =================================================================== --- head/sys/kern/tty_pts.c (revision 326270) +++ head/sys/kern/tty_pts.c (revision 326271) @@ -1,869 +1,871 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008 Ed Schouten * All rights reserved. * * Portions of this software were developed under sponsorship from Snow * B.V., the Netherlands. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* Add compatibility bits for FreeBSD. */ #define PTS_COMPAT /* Add pty(4) compat bits. */ #define PTS_EXTERNAL /* Add bits to make Linux binaries work. */ #define PTS_LINUX #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Our utmp(5) format is limited to 8-byte TTY line names. This means * we can at most allocate 1000 pseudo-terminals ("pts/999"). Allow * users to increase this number, assuming they have manually increased * UT_LINESIZE. */ static struct unrhdr *pts_pool; static MALLOC_DEFINE(M_PTS, "pts", "pseudo tty device"); /* * Per-PTS structure. * * List of locks * (t) locked by tty_lock() * (c) const until freeing */ struct pts_softc { int pts_unit; /* (c) Device unit number. */ unsigned int pts_flags; /* (t) Device flags. */ #define PTS_PKT 0x1 /* Packet mode. */ #define PTS_FINISHED 0x2 /* Return errors on read()/write(). */ char pts_pkt; /* (t) Unread packet mode data. */ struct cv pts_inwait; /* (t) Blocking write() on master. */ struct selinfo pts_inpoll; /* (t) Select queue for write(). */ struct cv pts_outwait; /* (t) Blocking read() on master. */ struct selinfo pts_outpoll; /* (t) Select queue for read(). */ #ifdef PTS_EXTERNAL struct cdev *pts_cdev; /* (c) Master device node. */ #endif /* PTS_EXTERNAL */ struct ucred *pts_cred; /* (c) Resource limit. */ }; /* * Controller-side file operations. */ static int ptsdev_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0; char pkt; if (uio->uio_resid == 0) return (0); tty_lock(tp); for (;;) { /* * Implement packet mode. When packet mode is turned on, * the first byte contains a bitmask of events that * occurred (start, stop, flush, window size, etc). */ if (psc->pts_flags & PTS_PKT && psc->pts_pkt) { pkt = psc->pts_pkt; psc->pts_pkt = 0; tty_unlock(tp); error = ureadc(pkt, uio); return (error); } /* * Transmit regular data. * * XXX: We shouldn't use ttydisc_getc_poll()! Even * though in this implementation, there is likely going * to be data, we should just call ttydisc_getc_uio() * and use its return value to sleep. 
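For context, a minimal userspace consumer of the packet mode handled above might look like this; enable_pkt() and read_packet() are hypothetical helper names, while TIOCPKT and TIOCPKT_DATA are the real definitions pulled in through <sys/ioctl.h>.

#include <sys/ioctl.h>
#include <stdio.h>
#include <unistd.h>

/* Turn on packet mode on an already-open pseudo-terminal master. */
static int
enable_pkt(int mfd)
{
        int on = 1;

        return (ioctl(mfd, TIOCPKT, &on));
}

/* Classify the status byte that now precedes every read. */
static ssize_t
read_packet(int mfd, char *buf, size_t len)
{
        ssize_t n;

        n = read(mfd, buf, len);
        if (n > 0 && buf[0] == TIOCPKT_DATA)
                printf("%zd data bytes follow\n", n - 1);
        else if (n > 0)
                printf("control event %#x\n", (unsigned char)buf[0]);
        return (n);
}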
*/ if (ttydisc_getc_poll(tp)) { if (psc->pts_flags & PTS_PKT) { /* * XXX: Small race. Fortunately PTY * consumers aren't multithreaded. */ tty_unlock(tp); error = ureadc(TIOCPKT_DATA, uio); if (error) return (error); tty_lock(tp); } error = ttydisc_getc_uio(tp, uio); break; } /* Maybe the device isn't used anyway. */ if (psc->pts_flags & PTS_FINISHED) break; /* Wait for more data. */ if (fp->f_flag & O_NONBLOCK) { error = EWOULDBLOCK; break; } error = cv_wait_sig(&psc->pts_outwait, tp->t_mtx); if (error != 0) break; } tty_unlock(tp); return (error); } static int ptsdev_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); char ib[256], *ibstart; size_t iblen, rintlen; int error = 0; if (uio->uio_resid == 0) return (0); for (;;) { ibstart = ib; iblen = MIN(uio->uio_resid, sizeof ib); error = uiomove(ib, iblen, uio); tty_lock(tp); if (error != 0) { iblen = 0; goto done; } /* * When possible, avoid the slow path. rint_bypass() * copies all input to the input queue at once. */ MPASS(iblen > 0); do { rintlen = ttydisc_rint_simple(tp, ibstart, iblen); ibstart += rintlen; iblen -= rintlen; if (iblen == 0) { /* All data written. */ break; } /* Maybe the device isn't used anyway. */ if (psc->pts_flags & PTS_FINISHED) { error = EIO; goto done; } /* Wait for more data. */ if (fp->f_flag & O_NONBLOCK) { error = EWOULDBLOCK; goto done; } /* Wake up users on the slave side. */ ttydisc_rint_done(tp); error = cv_wait_sig(&psc->pts_inwait, tp->t_mtx); if (error != 0) goto done; } while (iblen > 0); if (uio->uio_resid == 0) break; tty_unlock(tp); } done: ttydisc_rint_done(tp); tty_unlock(tp); /* * Don't account for the part of the buffer that we couldn't * pass to the TTY. */ uio->uio_resid += iblen; return (error); } static int ptsdev_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0, sig; switch (cmd) { case FIODTYPE: *(int *)data = D_TTY; return (0); case FIONBIO: /* This device supports non-blocking operation. */ return (0); case FIONREAD: tty_lock(tp); if (psc->pts_flags & PTS_FINISHED) { /* Force read() to be called. */ *(int *)data = 1; } else { *(int *)data = ttydisc_getc_poll(tp); } tty_unlock(tp); return (0); case FIODGNAME: { struct fiodgname_arg *fgn; const char *p; int i; /* Reverse device name lookups, for ptsname() and ttyname(). */ fgn = data; p = tty_devname(tp); i = strlen(p) + 1; if (i > fgn->len) return (EINVAL); return copyout(p, fgn->buf, i); } /* * We need to implement TIOCGPGRP and TIOCGSID here again. When * called on the pseudo-terminal master, it should not check if * the terminal is the foreground terminal of the calling * process. * * TIOCGETA is also implemented here. Various Linux PTY routines * often call isatty(), which is implemented by tcgetattr(). */ #ifdef PTS_LINUX case TIOCGETA: /* Obtain terminal flags through tcgetattr(). */ tty_lock(tp); *(struct termios*)data = tp->t_termios; tty_unlock(tp); return (0); #endif /* PTS_LINUX */ case TIOCSETAF: case TIOCSETAW: /* * We must make sure we turn tcsetattr() calls of TCSAFLUSH and * TCSADRAIN into something different. If an application would * call TCSAFLUSH or TCSADRAIN on the master descriptor, it may * deadlock waiting for all data to be read. */ cmd = TIOCSETA; break; #if defined(PTS_COMPAT) || defined(PTS_LINUX) case TIOCGPTN: /* * Get the device unit number. 
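An illustrative userspace counterpart to this compat path, assuming TIOCGPTN is visible through <sys/ioctl.h> as it is for the Linux-emulation consumers it serves:

#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
        unsigned int unit;
        int mfd;

        mfd = posix_openpt(O_RDWR | O_NOCTTY);
        if (mfd < 0 || ioctl(mfd, TIOCGPTN, &unit) < 0) {
                perror("posix_openpt/TIOCGPTN");
                return (1);
        }
        /* The unit maps onto the "pts/%u" name created by tty_makedev(). */
        printf("slave is /dev/pts/%u\n", unit);
        return (0);
}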
*/ if (psc->pts_unit < 0) return (ENOTTY); *(unsigned int *)data = psc->pts_unit; return (0); #endif /* PTS_COMPAT || PTS_LINUX */ case TIOCGPGRP: /* Get the foreground process group ID. */ tty_lock(tp); if (tp->t_pgrp != NULL) *(int *)data = tp->t_pgrp->pg_id; else *(int *)data = NO_PID; tty_unlock(tp); return (0); case TIOCGSID: /* Get the session leader process ID. */ tty_lock(tp); if (tp->t_session == NULL) error = ENOTTY; else *(int *)data = tp->t_session->s_sid; tty_unlock(tp); return (error); case TIOCPTMASTER: /* Yes, we are a pseudo-terminal master. */ return (0); case TIOCSIG: /* Signal the foreground process group. */ sig = *(int *)data; if (sig < 1 || sig >= NSIG) return (EINVAL); tty_lock(tp); tty_signal_pgrp(tp, sig); tty_unlock(tp); return (0); case TIOCPKT: /* Enable/disable packet mode. */ tty_lock(tp); if (*(int *)data) psc->pts_flags |= PTS_PKT; else psc->pts_flags &= ~PTS_PKT; tty_unlock(tp); return (0); } /* Just redirect this ioctl to the slave device. */ tty_lock(tp); error = tty_ioctl(tp, cmd, data, fp->f_flag, td); tty_unlock(tp); if (error == ENOIOCTL) error = ENOTTY; return (error); } static int ptsdev_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int revents = 0; tty_lock(tp); if (psc->pts_flags & PTS_FINISHED) { /* Slave device is not opened. */ tty_unlock(tp); return ((events & (POLLIN|POLLRDNORM)) | POLLHUP); } if (events & (POLLIN|POLLRDNORM)) { /* See if we can getc something. */ if (ttydisc_getc_poll(tp) || (psc->pts_flags & PTS_PKT && psc->pts_pkt)) revents |= events & (POLLIN|POLLRDNORM); } if (events & (POLLOUT|POLLWRNORM)) { /* See if we can rint something. */ if (ttydisc_rint_poll(tp)) revents |= events & (POLLOUT|POLLWRNORM); } /* * No need to check for POLLHUP here. This device cannot be used * as a callout device, which means we always have a carrier, * because the master is. */ if (revents == 0) { /* * This code might look misleading, but the naming of * poll events on this side is the opposite of the slave * device. */ if (events & (POLLIN|POLLRDNORM)) selrecord(td, &psc->pts_outpoll); if (events & (POLLOUT|POLLWRNORM)) selrecord(td, &psc->pts_inpoll); } tty_unlock(tp); return (revents); } /* * kqueue support. 
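Before the in-kernel filters below, a sketch of how a process would consume them; wait_readable() is a hypothetical helper built on the standard kqueue(2)/kevent(2) interface.

#include <sys/types.h>
#include <sys/event.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Block until the master becomes readable via the EVFILT_READ filter
 * registered by ptsdev_kqfilter(); kev.data then holds the byte count
 * reported by ttydisc_getc_poll().
 */
static int
wait_readable(int mfd)
{
        struct kevent kev;
        int kq, n;

        kq = kqueue();
        if (kq < 0)
                return (-1);
        EV_SET(&kev, mfd, EVFILT_READ, EV_ADD, 0, 0, NULL);
        n = kevent(kq, &kev, 1, &kev, 1, NULL); /* register and wait */
        if (n > 0 && (kev.flags & EV_EOF))
                printf("slave side finished\n");
        close(kq);
        return (n);
}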
*/ static void pts_kqops_read_detach(struct knote *kn) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); knlist_remove(&psc->pts_outpoll.si_note, kn, 0); } static int pts_kqops_read_event(struct knote *kn, long hint) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); if (psc->pts_flags & PTS_FINISHED) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_getc_poll(tp); return (kn->kn_data > 0); } } static void pts_kqops_write_detach(struct knote *kn) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); knlist_remove(&psc->pts_inpoll.si_note, kn, 0); } static int pts_kqops_write_event(struct knote *kn, long hint) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); if (psc->pts_flags & PTS_FINISHED) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_rint_poll(tp); return (kn->kn_data > 0); } } static struct filterops pts_kqops_read = { .f_isfd = 1, .f_detach = pts_kqops_read_detach, .f_event = pts_kqops_read_event, }; static struct filterops pts_kqops_write = { .f_isfd = 1, .f_detach = pts_kqops_write_detach, .f_event = pts_kqops_write_event, }; static int ptsdev_kqfilter(struct file *fp, struct knote *kn) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0; tty_lock(tp); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &pts_kqops_read; knlist_add(&psc->pts_outpoll.si_note, kn, 1); break; case EVFILT_WRITE: kn->kn_fop = &pts_kqops_write; knlist_add(&psc->pts_inpoll.si_note, kn, 1); break; default: error = EINVAL; break; } tty_unlock(tp); return (error); } static int ptsdev_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct tty *tp = fp->f_data; #ifdef PTS_EXTERNAL struct pts_softc *psc = tty_softc(tp); #endif /* PTS_EXTERNAL */ struct cdev *dev = tp->t_dev; /* * According to POSIX, we must implement an fstat(). This also * makes this implementation compatible with Linux binaries, * because Linux calls fstat() on the pseudo-terminal master to * obtain st_rdev. * * XXX: POSIX also mentions we must fill in st_dev, but how? */ bzero(sb, sizeof *sb); #ifdef PTS_EXTERNAL if (psc->pts_cdev != NULL) sb->st_ino = sb->st_rdev = dev2udev(psc->pts_cdev); else #endif /* PTS_EXTERNAL */ sb->st_ino = sb->st_rdev = tty_udev(tp); sb->st_atim = dev->si_atime; sb->st_ctim = dev->si_ctime; sb->st_mtim = dev->si_mtime; sb->st_uid = dev->si_uid; sb->st_gid = dev->si_gid; sb->st_mode = dev->si_mode | S_IFCHR; return (0); } static int ptsdev_close(struct file *fp, struct thread *td) { struct tty *tp = fp->f_data; /* Deallocate TTY device. */ tty_lock(tp); tty_rel_gone(tp); /* * Open of /dev/ptmx or /dev/ptyXX changes the type of file * from DTYPE_VNODE to DTYPE_PTS. vn_open() increases vnode * use count, we need to decrement it, and possibly do other * required cleanup. 
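A sketch of the userspace view the fstat() implementation above enables; print_master_dev() is a hypothetical helper.

#include <sys/types.h>
#include <sys/stat.h>
#include <stdint.h>
#include <stdio.h>

/* Linux emulation relies on st_rdev being filled in for the master. */
static int
print_master_dev(int mfd)
{
        struct stat sb;

        if (fstat(mfd, &sb) == -1)
                return (-1);
        printf("rdev %#jx, mode %#o\n", (uintmax_t)sb.st_rdev,
            (unsigned int)sb.st_mode);
        return (0);
}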
*/ if (fp->f_vnode != NULL) return (vnops.fo_close(fp, td)); return (0); } static int ptsdev_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct tty *tp; kif->kf_type = KF_TYPE_PTS; tp = fp->f_data; kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp); kif->kf_un.kf_pts.kf_pts_dev_freebsd11 = kif->kf_un.kf_pts.kf_pts_dev; /* truncate */ strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path)); return (0); } static struct fileops ptsdev_ops = { .fo_read = ptsdev_read, .fo_write = ptsdev_write, .fo_truncate = invfo_truncate, .fo_ioctl = ptsdev_ioctl, .fo_poll = ptsdev_poll, .fo_kqfilter = ptsdev_kqfilter, .fo_stat = ptsdev_stat, .fo_close = ptsdev_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = ptsdev_fill_kinfo, .fo_flags = DFLAG_PASSABLE, }; /* * Driver-side hooks. */ static void ptsdrv_outwakeup(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); cv_broadcast(&psc->pts_outwait); selwakeup(&psc->pts_outpoll); KNOTE_LOCKED(&psc->pts_outpoll.si_note, 0); } static void ptsdrv_inwakeup(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); cv_broadcast(&psc->pts_inwait); selwakeup(&psc->pts_inpoll); KNOTE_LOCKED(&psc->pts_inpoll.si_note, 0); } static int ptsdrv_open(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); psc->pts_flags &= ~PTS_FINISHED; return (0); } static void ptsdrv_close(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); /* Wake up any blocked readers/writers. */ psc->pts_flags |= PTS_FINISHED; ptsdrv_outwakeup(tp); ptsdrv_inwakeup(tp); } static void ptsdrv_pktnotify(struct tty *tp, char event) { struct pts_softc *psc = tty_softc(tp); /* * Clear conflicting flags. */ switch (event) { case TIOCPKT_STOP: psc->pts_pkt &= ~TIOCPKT_START; break; case TIOCPKT_START: psc->pts_pkt &= ~TIOCPKT_STOP; break; case TIOCPKT_NOSTOP: psc->pts_pkt &= ~TIOCPKT_DOSTOP; break; case TIOCPKT_DOSTOP: psc->pts_pkt &= ~TIOCPKT_NOSTOP; break; } psc->pts_pkt |= event; ptsdrv_outwakeup(tp); } static void ptsdrv_free(void *softc) { struct pts_softc *psc = softc; /* Make device number available again. */ if (psc->pts_unit >= 0) free_unr(pts_pool, psc->pts_unit); chgptscnt(psc->pts_cred->cr_ruidinfo, -1, 0); racct_sub_cred(psc->pts_cred, RACCT_NPTS, 1); crfree(psc->pts_cred); seldrain(&psc->pts_inpoll); seldrain(&psc->pts_outpoll); knlist_destroy(&psc->pts_inpoll.si_note); knlist_destroy(&psc->pts_outpoll.si_note); #ifdef PTS_EXTERNAL /* Destroy master device as well. */ if (psc->pts_cdev != NULL) destroy_dev_sched(psc->pts_cdev); #endif /* PTS_EXTERNAL */ free(psc, M_PTS); } static struct ttydevsw pts_class = { .tsw_flags = TF_NOPREFIX, .tsw_outwakeup = ptsdrv_outwakeup, .tsw_inwakeup = ptsdrv_inwakeup, .tsw_open = ptsdrv_open, .tsw_close = ptsdrv_close, .tsw_pktnotify = ptsdrv_pktnotify, .tsw_free = ptsdrv_free, }; #ifndef PTS_EXTERNAL static #endif /* !PTS_EXTERNAL */ int pts_alloc(int fflags, struct thread *td, struct file *fp) { int unit, ok, error; struct tty *tp; struct pts_softc *psc; struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; /* Resource limiting. */ PROC_LOCK(p); error = racct_add(p, RACCT_NPTS, 1); if (error != 0) { PROC_UNLOCK(p); return (EAGAIN); } ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPTS)); if (!ok) { racct_sub(p, RACCT_NPTS, 1); PROC_UNLOCK(p); return (EAGAIN); } PROC_UNLOCK(p); /* Try to allocate a new pts unit number. 
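The EAGAIN paths above enforce RLIMIT_NPTS; a process can inspect its own ceiling with the standard resource-limit calls. show_pts_limit() is a hypothetical helper.

#include <sys/types.h>
#include <sys/resource.h>
#include <stdint.h>
#include <stdio.h>

/* Report how many pseudo-terminals this process may still allocate. */
static int
show_pts_limit(void)
{
        struct rlimit rl;

        if (getrlimit(RLIMIT_NPTS, &rl) == -1)
                return (-1);
        printf("pts limit: cur %jd, max %jd\n",
            (intmax_t)rl.rlim_cur, (intmax_t)rl.rlim_max);
        return (0);
}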
*/ unit = alloc_unr(pts_pool); if (unit < 0) { racct_sub(p, RACCT_NPTS, 1); chgptscnt(cred->cr_ruidinfo, -1, 0); return (EAGAIN); } /* Allocate TTY and softc. */ psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO); cv_init(&psc->pts_inwait, "ptsin"); cv_init(&psc->pts_outwait, "ptsout"); psc->pts_unit = unit; psc->pts_cred = crhold(cred); tp = tty_alloc(&pts_class, psc); knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx); knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx); /* Expose the slave device as well. */ tty_makedev(tp, td->td_ucred, "pts/%u", psc->pts_unit); finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops); return (0); } #ifdef PTS_EXTERNAL int pts_alloc_external(int fflags, struct thread *td, struct file *fp, struct cdev *dev, const char *name) { int ok, error; struct tty *tp; struct pts_softc *psc; struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; /* Resource limiting. */ PROC_LOCK(p); error = racct_add(p, RACCT_NPTS, 1); if (error != 0) { PROC_UNLOCK(p); return (EAGAIN); } ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPTS)); if (!ok) { racct_sub(p, RACCT_NPTS, 1); PROC_UNLOCK(p); return (EAGAIN); } PROC_UNLOCK(p); /* Allocate TTY and softc. */ psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO); cv_init(&psc->pts_inwait, "ptsin"); cv_init(&psc->pts_outwait, "ptsout"); psc->pts_unit = -1; psc->pts_cdev = dev; psc->pts_cred = crhold(cred); tp = tty_alloc(&pts_class, psc); knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx); knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx); /* Expose the slave device as well. */ tty_makedev(tp, td->td_ucred, "%s", name); finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops); return (0); } #endif /* PTS_EXTERNAL */ int sys_posix_openpt(struct thread *td, struct posix_openpt_args *uap) { int error, fd; struct file *fp; /* * POSIX states it's unspecified when other flags are passed. We * don't allow this. */ if (uap->flags & ~(O_RDWR|O_NOCTTY|O_CLOEXEC)) return (EINVAL); error = falloc(td, &fp, &fd, uap->flags); if (error) return (error); /* Allocate the actual pseudo-TTY. */ error = pts_alloc(FFLAGS(uap->flags & O_ACCMODE), td, fp); if (error != 0) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } /* Pass it back to userspace. */ td->td_retval[0] = fd; fdrop(fp, td); return (0); } static void pts_init(void *unused) { pts_pool = new_unrhdr(0, INT_MAX, NULL); } SYSINIT(pts, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, pts_init, NULL); Index: head/sys/kern/tty_tty.c =================================================================== --- head/sys/kern/tty_tty.c (revision 326270) +++ head/sys/kern/tty_tty.c (revision 326271) @@ -1,96 +1,98 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003 Poul-Henning Kamp. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include static d_open_t cttyopen; static struct cdevsw ctty_cdevsw = { .d_version = D_VERSION, .d_open = cttyopen, .d_name = "ctty", }; static struct cdev *ctty; static int cttyopen(struct cdev *dev, int flag, int mode, struct thread *td) { return (ENXIO); } static void ctty_clone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) { struct proc *p; if (*dev != NULL) return; if (strcmp(name, "tty")) return; p = curproc; sx_sunlock(&clone_drain_lock); sx_slock(&proctree_lock); sx_slock(&clone_drain_lock); dev_lock(); if (!(p->p_flag & P_CONTROLT)) *dev = ctty; else if (p->p_session->s_ttyvp == NULL) *dev = ctty; else if (p->p_session->s_ttyvp->v_type == VBAD || p->p_session->s_ttyvp->v_rdev == NULL) { /* e.g. s_ttyvp was revoked */ *dev = ctty; } else *dev = p->p_session->s_ttyvp->v_rdev; dev_refl(*dev); dev_unlock(); sx_sunlock(&proctree_lock); } static void ctty_drvinit(void *unused) { EVENTHANDLER_REGISTER(dev_clone, ctty_clone, 0, 1000); ctty = make_dev_credf(MAKEDEV_ETERNAL, &ctty_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0666, "ctty"); } SYSINIT(cttydev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,ctty_drvinit,NULL); Index: head/sys/kern/tty_ttydisc.c =================================================================== --- head/sys/kern/tty_ttydisc.c (revision 326270) +++ head/sys/kern/tty_ttydisc.c (revision 326271) @@ -1,1265 +1,1267 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008 Ed Schouten * All rights reserved. * * Portions of this software were developed under sponsorship from Snow * B.V., the Netherlands. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include /* * Standard TTYDISC `termios' line discipline. */ /* Statistics. */ static unsigned long tty_nin = 0; SYSCTL_ULONG(_kern, OID_AUTO, tty_nin, CTLFLAG_RD, &tty_nin, 0, "Total amount of bytes received"); static unsigned long tty_nout = 0; SYSCTL_ULONG(_kern, OID_AUTO, tty_nout, CTLFLAG_RD, &tty_nout, 0, "Total amount of bytes transmitted"); /* termios comparison macro's. */ #define CMP_CC(v,c) (tp->t_termios.c_cc[v] != _POSIX_VDISABLE && \ tp->t_termios.c_cc[v] == (c)) #define CMP_FLAG(field,opt) (tp->t_termios.c_ ## field ## flag & (opt)) /* Characters that cannot be modified through c_cc. */ #define CTAB '\t' #define CNL '\n' #define CCR '\r' /* Character is a control character. */ #define CTL_VALID(c) ((c) == 0x7f || (unsigned char)(c) < 0x20) /* Control character should be processed on echo. */ #define CTL_ECHO(c,q) (!(q) && ((c) == CERASE2 || (c) == CTAB || \ (c) == CNL || (c) == CCR)) /* Control character should be printed using ^X notation. */ #define CTL_PRINT(c,q) ((c) == 0x7f || ((unsigned char)(c) < 0x20 && \ ((q) || ((c) != CTAB && (c) != CNL)))) /* Character is whitespace. */ #define CTL_WHITE(c) ((c) == ' ' || (c) == CTAB) /* Character is alphanumeric. */ #define CTL_ALNUM(c) (((c) >= '0' && (c) <= '9') || \ ((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) #define TTY_STACKBUF 256 void ttydisc_open(struct tty *tp) { ttydisc_optimize(tp); } void ttydisc_close(struct tty *tp) { /* Clean up our flags when leaving the discipline. */ tp->t_flags &= ~(TF_STOPPED|TF_HIWAT|TF_ZOMBIE); /* * POSIX states that we must drain output and flush input on * last close. Draining has already been done if possible. */ tty_flush(tp, FREAD | FWRITE); if (ttyhook_hashook(tp, close)) ttyhook_close(tp); } static int ttydisc_read_canonical(struct tty *tp, struct uio *uio, int ioflag) { char breakc[4] = { CNL }; /* enough to hold \n, VEOF and VEOL. */ int error; size_t clen, flen = 0, n = 1; unsigned char lastc = _POSIX_VDISABLE; #define BREAK_ADD(c) do { \ if (tp->t_termios.c_cc[c] != _POSIX_VDISABLE) \ breakc[n++] = tp->t_termios.c_cc[c]; \ } while (0) /* Determine which characters we should trigger on. */ BREAK_ADD(VEOF); BREAK_ADD(VEOL); #undef BREAK_ADD breakc[n] = '\0'; do { error = tty_wait_background(tp, curthread, SIGTTIN); if (error) return (error); /* * Quite a tricky case: unlike the old TTY * implementation, this implementation copies data back * to userspace in large chunks. Unfortunately, we can't * calculate the line length on beforehand if it crosses * ttyinq_block boundaries, because multiple reads could * then make this code read beyond the newline. * * This is why we limit the read to: * - The size the user has requested * - The blocksize (done in tty_inq.c) * - The amount of bytes until the newline * * This causes the line length to be recalculated after * each block has been copied to userspace. This will * cause the TTY layer to return data in chunks using * the blocksize (except the first and last blocks). */ clen = ttyinq_findchar(&tp->t_inq, breakc, uio->uio_resid, &lastc); /* No more data. */ if (clen == 0) { if (tp->t_flags & TF_ZOMBIE) return (0); else if (ioflag & IO_NDELAY) return (EWOULDBLOCK); error = tty_wait(tp, &tp->t_inwait); if (error) return (error); continue; } /* Don't send the EOF char back to userspace. */ if (CMP_CC(VEOF, lastc)) flen = 1; MPASS(flen <= clen); /* Read and throw away the EOF character. 
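The canonical-mode behaviour implemented above is what userspace opts into with ICANON; a minimal configuration sketch follows. set_canonical() is a hypothetical helper, and 0x04 is the conventional ^D.

#include <termios.h>
#include <unistd.h>

/*
 * With ICANON set, read() returns at most one line, and a VEOF byte is
 * consumed by the kernel (the "flen" skip above) rather than delivered.
 */
static int
set_canonical(int fd)
{
        struct termios t;

        if (tcgetattr(fd, &t) == -1)
                return (-1);
        t.c_lflag |= ICANON;
        t.c_cc[VEOF] = 0x04;                    /* ^D */
        t.c_cc[VEOL] = _POSIX_VDISABLE;         /* no extra line delimiter */
        return (tcsetattr(fd, TCSANOW, &t));
}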
*/ error = ttyinq_read_uio(&tp->t_inq, tp, uio, clen, flen); if (error) return (error); } while (uio->uio_resid > 0 && lastc == _POSIX_VDISABLE); return (0); } static int ttydisc_read_raw_no_timer(struct tty *tp, struct uio *uio, int ioflag) { size_t vmin = tp->t_termios.c_cc[VMIN]; ssize_t oresid = uio->uio_resid; int error; MPASS(tp->t_termios.c_cc[VTIME] == 0); /* * This routine implements the easy cases of read()s while in * non-canonical mode, namely case B and D, where we don't have * any timers at all. */ for (;;) { error = tty_wait_background(tp, curthread, SIGTTIN); if (error) return (error); error = ttyinq_read_uio(&tp->t_inq, tp, uio, uio->uio_resid, 0); if (error) return (error); if (uio->uio_resid == 0 || (oresid - uio->uio_resid) >= vmin) return (0); /* We have to wait for more. */ if (tp->t_flags & TF_ZOMBIE) return (0); else if (ioflag & IO_NDELAY) return (EWOULDBLOCK); error = tty_wait(tp, &tp->t_inwait); if (error) return (error); } } static int ttydisc_read_raw_read_timer(struct tty *tp, struct uio *uio, int ioflag, int oresid) { size_t vmin = MAX(tp->t_termios.c_cc[VMIN], 1); unsigned int vtime = tp->t_termios.c_cc[VTIME]; struct timeval end, now, left; int error, hz; MPASS(tp->t_termios.c_cc[VTIME] != 0); /* Determine when the read should be expired. */ end.tv_sec = vtime / 10; end.tv_usec = (vtime % 10) * 100000; getmicrotime(&now); timevaladd(&end, &now); for (;;) { error = tty_wait_background(tp, curthread, SIGTTIN); if (error) return (error); error = ttyinq_read_uio(&tp->t_inq, tp, uio, uio->uio_resid, 0); if (error) return (error); if (uio->uio_resid == 0 || (oresid - uio->uio_resid) >= vmin) return (0); /* Calculate how long we should wait. */ getmicrotime(&now); if (timevalcmp(&now, &end, >)) return (0); left = end; timevalsub(&left, &now); hz = tvtohz(&left); /* * We have to wait for more. If the timer expires, we * should return a 0-byte read. */ if (tp->t_flags & TF_ZOMBIE) return (0); else if (ioflag & IO_NDELAY) return (EWOULDBLOCK); error = tty_timedwait(tp, &tp->t_inwait, hz); if (error) return (error == EWOULDBLOCK ? 0 : error); } return (0); } static int ttydisc_read_raw_interbyte_timer(struct tty *tp, struct uio *uio, int ioflag) { size_t vmin = tp->t_termios.c_cc[VMIN]; ssize_t oresid = uio->uio_resid; int error; MPASS(tp->t_termios.c_cc[VMIN] != 0); MPASS(tp->t_termios.c_cc[VTIME] != 0); /* * When using the interbyte timer, the timer should be started * after the first byte has been received. We just call into the * generic read timer code after we've received the first byte. */ for (;;) { error = tty_wait_background(tp, curthread, SIGTTIN); if (error) return (error); error = ttyinq_read_uio(&tp->t_inq, tp, uio, uio->uio_resid, 0); if (error) return (error); if (uio->uio_resid == 0 || (oresid - uio->uio_resid) >= vmin) return (0); /* * Not enough data, but we did receive some, which means * we'll now start using the interbyte timer. */ if (oresid != uio->uio_resid) break; /* We have to wait for more. 
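The three raw-read helpers correspond to the classic VMIN/VTIME cases. This hypothetical set_interbyte() selects the interbyte-timer case the surrounding function implements: block for the first byte, then allow at most half a second of silence between the rest.

#include <termios.h>

static int
set_interbyte(int fd)
{
        struct termios t;

        if (tcgetattr(fd, &t) == -1)
                return (-1);
        t.c_lflag &= ~ICANON;
        t.c_cc[VMIN] = 32;      /* return once 32 bytes arrive... */
        t.c_cc[VTIME] = 5;      /* ...or after 0.5s of interbyte silence */
        return (tcsetattr(fd, TCSANOW, &t));
}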
*/ if (tp->t_flags & TF_ZOMBIE) return (0); else if (ioflag & IO_NDELAY) return (EWOULDBLOCK); error = tty_wait(tp, &tp->t_inwait); if (error) return (error); } return ttydisc_read_raw_read_timer(tp, uio, ioflag, oresid); } int ttydisc_read(struct tty *tp, struct uio *uio, int ioflag) { int error; tty_lock_assert(tp, MA_OWNED); if (uio->uio_resid == 0) return (0); if (CMP_FLAG(l, ICANON)) error = ttydisc_read_canonical(tp, uio, ioflag); else if (tp->t_termios.c_cc[VTIME] == 0) error = ttydisc_read_raw_no_timer(tp, uio, ioflag); else if (tp->t_termios.c_cc[VMIN] == 0) error = ttydisc_read_raw_read_timer(tp, uio, ioflag, uio->uio_resid); else error = ttydisc_read_raw_interbyte_timer(tp, uio, ioflag); if (ttyinq_bytesleft(&tp->t_inq) >= tp->t_inlow || ttyinq_bytescanonicalized(&tp->t_inq) == 0) { /* Unset the input watermark when we've got enough space. */ tty_hiwat_in_unblock(tp); } return (error); } static __inline unsigned int ttydisc_findchar(const char *obstart, unsigned int oblen) { const char *c = obstart; while (oblen--) { if (CTL_VALID(*c)) break; c++; } return (c - obstart); } static int ttydisc_write_oproc(struct tty *tp, char c) { unsigned int scnt, error; MPASS(CMP_FLAG(o, OPOST)); MPASS(CTL_VALID(c)); #define PRINT_NORMAL() ttyoutq_write_nofrag(&tp->t_outq, &c, 1) switch (c) { case CEOF: /* End-of-text dropping. */ if (CMP_FLAG(o, ONOEOT)) return (0); return PRINT_NORMAL(); case CERASE2: /* Handle backspace to fix tab expansion. */ if (PRINT_NORMAL() != 0) return (-1); if (tp->t_column > 0) tp->t_column--; return (0); case CTAB: /* Tab expansion. */ scnt = 8 - (tp->t_column & 7); if (CMP_FLAG(o, TAB3)) { error = ttyoutq_write_nofrag(&tp->t_outq, " ", scnt); } else { error = PRINT_NORMAL(); } if (error) return (-1); tp->t_column += scnt; MPASS((tp->t_column % 8) == 0); return (0); case CNL: /* Newline conversion. */ if (CMP_FLAG(o, ONLCR)) { /* Convert \n to \r\n. */ error = ttyoutq_write_nofrag(&tp->t_outq, "\r\n", 2); } else { error = PRINT_NORMAL(); } if (error) return (-1); if (CMP_FLAG(o, ONLCR|ONLRET)) { tp->t_column = tp->t_writepos = 0; ttyinq_reprintpos_set(&tp->t_inq); } return (0); case CCR: /* Carriage return to newline conversion. */ if (CMP_FLAG(o, OCRNL)) c = CNL; /* Omit carriage returns on column 0. */ if (CMP_FLAG(o, ONOCR) && tp->t_column == 0) return (0); if (PRINT_NORMAL() != 0) return (-1); tp->t_column = tp->t_writepos = 0; ttyinq_reprintpos_set(&tp->t_inq); return (0); } /* * Invisible control character. Print it, but don't * increase the column count. */ return PRINT_NORMAL(); #undef PRINT_NORMAL } /* * Just like the old TTY implementation, we need to copy data in chunks * into a temporary buffer. One of the reasons why we need to do this, * is because output processing (only TAB3 though) may allow the buffer * to grow eight times. */ int ttydisc_write(struct tty *tp, struct uio *uio, int ioflag) { char ob[TTY_STACKBUF]; char *obstart; int error = 0; unsigned int oblen = 0; tty_lock_assert(tp, MA_OWNED); if (tp->t_flags & TF_ZOMBIE) return (EIO); /* * We don't need to check whether the process is the foreground * process group or if we have a carrier. This is already done * in ttydev_write(). */ while (uio->uio_resid > 0) { unsigned int nlen; MPASS(oblen == 0); /* Step 1: read data. */ obstart = ob; nlen = MIN(uio->uio_resid, sizeof ob); tty_unlock(tp); error = uiomove(ob, nlen, uio); tty_lock(tp); if (error != 0) break; oblen = nlen; if (tty_gone(tp)) { error = ENXIO; break; } MPASS(oblen > 0); /* Step 2: process data. 
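The CTAB case above advances the column to the next multiple of eight; restated as a self-contained check (illustrative only):

#include <assert.h>

/*
 * Column advance for tab expansion; mirrors "scnt = 8 - (t_column & 7)"
 * in ttydisc_write_oproc(). With TAB3, this many spaces replace the tab.
 */
static unsigned int
tab_advance(unsigned int column)
{
        return (8 - (column & 7));
}

int
main(void)
{
        assert(tab_advance(0) == 8);
        assert(tab_advance(5) == 3);
        /* The resulting column is always a multiple of eight. */
        assert((5 + tab_advance(5)) % 8 == 0);
        return (0);
}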
*/ do { unsigned int plen, wlen; /* Search for special characters for post processing. */ if (CMP_FLAG(o, OPOST)) { plen = ttydisc_findchar(obstart, oblen); } else { plen = oblen; } if (plen == 0) { /* * We're going to process a character * that needs processing */ if (ttydisc_write_oproc(tp, *obstart) == 0) { obstart++; oblen--; tp->t_writepos = tp->t_column; ttyinq_reprintpos_set(&tp->t_inq); continue; } } else { /* We're going to write regular data. */ wlen = ttyoutq_write(&tp->t_outq, obstart, plen); obstart += wlen; oblen -= wlen; tp->t_column += wlen; tp->t_writepos = tp->t_column; ttyinq_reprintpos_set(&tp->t_inq); if (wlen == plen) continue; } /* Watermark reached. Try to sleep. */ tp->t_flags |= TF_HIWAT_OUT; if (ioflag & IO_NDELAY) { error = EWOULDBLOCK; goto done; } /* * The driver may write back the data * synchronously. Be sure to check the high * water mark before going to sleep. */ ttydevsw_outwakeup(tp); if ((tp->t_flags & TF_HIWAT_OUT) == 0) continue; error = tty_wait(tp, &tp->t_outwait); if (error) goto done; if (tp->t_flags & TF_ZOMBIE) { error = EIO; goto done; } } while (oblen > 0); } done: if (!tty_gone(tp)) ttydevsw_outwakeup(tp); /* * Add the amount of bytes that we didn't process back to the * uio counters. We need to do this to make sure write() doesn't * count the bytes we didn't store in the queue. */ uio->uio_resid += oblen; return (error); } void ttydisc_optimize(struct tty *tp) { tty_lock_assert(tp, MA_OWNED); if (ttyhook_hashook(tp, rint_bypass)) { tp->t_flags |= TF_BYPASS; } else if (ttyhook_hashook(tp, rint)) { tp->t_flags &= ~TF_BYPASS; } else if (!CMP_FLAG(i, ICRNL|IGNCR|IMAXBEL|INLCR|ISTRIP|IXON) && (!CMP_FLAG(i, BRKINT) || CMP_FLAG(i, IGNBRK)) && (!CMP_FLAG(i, PARMRK) || CMP_FLAG(i, IGNPAR|IGNBRK) == (IGNPAR|IGNBRK)) && !CMP_FLAG(l, ECHO|ICANON|IEXTEN|ISIG|PENDIN)) { tp->t_flags |= TF_BYPASS; } else { tp->t_flags &= ~TF_BYPASS; } } void ttydisc_modem(struct tty *tp, int open) { tty_lock_assert(tp, MA_OWNED); if (open) cv_broadcast(&tp->t_dcdwait); /* * Ignore modem status lines when CLOCAL is turned on, but don't * enter the zombie state when the TTY isn't opened, because * that would cause the TTY to be in zombie state after being * opened. */ if (!tty_opened(tp) || CMP_FLAG(c, CLOCAL)) return; if (open == 0) { /* * Lost carrier. */ tp->t_flags |= TF_ZOMBIE; tty_signal_sessleader(tp, SIGHUP); tty_flush(tp, FREAD|FWRITE); } else { /* * Carrier is back again. */ /* XXX: what should we do here? */ } } static int ttydisc_echo_force(struct tty *tp, char c, int quote) { if (CMP_FLAG(o, OPOST) && CTL_ECHO(c, quote)) { /* * Only perform postprocessing when OPOST is turned on * and the character is an unquoted BS/TB/NL/CR. */ return ttydisc_write_oproc(tp, c); } else if (CMP_FLAG(l, ECHOCTL) && CTL_PRINT(c, quote)) { /* * Only use ^X notation when ECHOCTL is turned on and * we've got an quoted control character. * * Print backspaces when echoing an end-of-file. */ char ob[4] = "^?\b\b"; /* Print ^X notation. */ if (c != 0x7f) ob[1] = c + 'A' - 1; if (!quote && CMP_CC(VEOF, c)) { return ttyoutq_write_nofrag(&tp->t_outq, ob, 4); } else { tp->t_column += 2; return ttyoutq_write_nofrag(&tp->t_outq, ob, 2); } } else { /* Can just be printed. */ tp->t_column++; return ttyoutq_write_nofrag(&tp->t_outq, &c, 1); } } static int ttydisc_echo(struct tty *tp, char c, int quote) { /* * Only echo characters when ECHO is turned on, or ECHONL when * the character is an unquoted newline. 
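The ECHOCTL branch in ttydisc_echo_force() prints control characters in ^X notation; the transform on its own, as an illustrative program:

#include <stdio.h>

/*
 * 0x7f echoes as "^?"; any other control byte c echoes as '^' followed
 * by c + 'A' - 1, matching "ob[1] = c + 'A' - 1" in the code above.
 */
static void
caret(unsigned char c, char ob[2])
{
        ob[0] = '^';
        ob[1] = (c == 0x7f) ? '?' : c + 'A' - 1;
}

int
main(void)
{
        char ob[2];

        caret(0x03, ob);        /* the usual SIGINT character */
        printf("%.2s\n", ob);   /* prints "^C" */
        return (0);
}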
*/ if (!CMP_FLAG(l, ECHO) && (!CMP_FLAG(l, ECHONL) || c != CNL || quote)) return (0); return ttydisc_echo_force(tp, c, quote); } static void ttydisc_reprint_char(void *d, char c, int quote) { struct tty *tp = d; ttydisc_echo(tp, c, quote); } static void ttydisc_reprint(struct tty *tp) { cc_t c; /* Print ^R\n, followed by the line. */ c = tp->t_termios.c_cc[VREPRINT]; if (c != _POSIX_VDISABLE) ttydisc_echo(tp, c, 0); ttydisc_echo(tp, CNL, 0); ttyinq_reprintpos_reset(&tp->t_inq); ttyinq_line_iterate_from_linestart(&tp->t_inq, ttydisc_reprint_char, tp); } struct ttydisc_recalc_length { struct tty *tp; unsigned int curlen; }; static void ttydisc_recalc_charlength(void *d, char c, int quote) { struct ttydisc_recalc_length *data = d; struct tty *tp = data->tp; if (CTL_PRINT(c, quote)) { if (CMP_FLAG(l, ECHOCTL)) data->curlen += 2; } else if (c == CTAB) { data->curlen += 8 - (data->curlen & 7); } else { data->curlen++; } } static unsigned int ttydisc_recalc_linelength(struct tty *tp) { struct ttydisc_recalc_length data = { tp, tp->t_writepos }; ttyinq_line_iterate_from_reprintpos(&tp->t_inq, ttydisc_recalc_charlength, &data); return (data.curlen); } static int ttydisc_rubchar(struct tty *tp) { char c; int quote; unsigned int prevpos, tablen; if (ttyinq_peekchar(&tp->t_inq, &c, "e) != 0) return (-1); ttyinq_unputchar(&tp->t_inq); if (CMP_FLAG(l, ECHO)) { /* * Remove the character from the screen. This is even * safe for characters that span multiple characters * (tabs, quoted, etc). */ if (tp->t_writepos >= tp->t_column) { /* Retype the sentence. */ ttydisc_reprint(tp); } else if (CMP_FLAG(l, ECHOE)) { if (CTL_PRINT(c, quote)) { /* Remove ^X formatted chars. */ if (CMP_FLAG(l, ECHOCTL)) { tp->t_column -= 2; ttyoutq_write_nofrag(&tp->t_outq, "\b\b \b\b", 6); } } else if (c == ' ') { /* Space character needs no rubbing. */ tp->t_column -= 1; ttyoutq_write_nofrag(&tp->t_outq, "\b", 1); } else if (c == CTAB) { /* * Making backspace work with tabs is * quite hard. Recalculate the length of * this character and remove it. * * Because terminal settings could be * changed while the line is being * inserted, the calculations don't have * to be correct. Make sure we keep the * tab length within proper bounds. */ prevpos = ttydisc_recalc_linelength(tp); if (prevpos >= tp->t_column) tablen = 1; else tablen = tp->t_column - prevpos; if (tablen > 8) tablen = 8; tp->t_column = prevpos; ttyoutq_write_nofrag(&tp->t_outq, "\b\b\b\b\b\b\b\b", tablen); return (0); } else { /* * Remove a regular character by * punching a space over it. */ tp->t_column -= 1; ttyoutq_write_nofrag(&tp->t_outq, "\b \b", 3); } } else { /* Don't print spaces. */ ttydisc_echo(tp, tp->t_termios.c_cc[VERASE], 0); } } return (0); } static void ttydisc_rubword(struct tty *tp) { char c; int quote, alnum; /* Strip whitespace first. */ for (;;) { if (ttyinq_peekchar(&tp->t_inq, &c, "e) != 0) return; if (!CTL_WHITE(c)) break; ttydisc_rubchar(tp); } /* * Record whether the last character from the previous iteration * was alphanumeric or not. We need this to implement ALTWERASE. 
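ALTWERASE changes what this loop treats as a word boundary: an alphanumeric/non-alphanumeric transition also stops the erase. A hypothetical configuration helper:

#include <termios.h>

static int
set_altwerase(int fd)
{
        struct termios t;

        if (tcgetattr(fd, &t) == -1)
                return (-1);
        t.c_lflag |= ALTWERASE;
        t.c_cc[VWERASE] = 0x17;         /* ^W triggers ttydisc_rubword() */
        return (tcsetattr(fd, TCSANOW, &t));
}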
*/ alnum = CTL_ALNUM(c); for (;;) { ttydisc_rubchar(tp); if (ttyinq_peekchar(&tp->t_inq, &c, "e) != 0) return; if (CTL_WHITE(c)) return; if (CMP_FLAG(l, ALTWERASE) && CTL_ALNUM(c) != alnum) return; } } int ttydisc_rint(struct tty *tp, char c, int flags) { int signal, quote = 0; char ob[3] = { 0xff, 0x00 }; size_t ol; tty_lock_assert(tp, MA_OWNED); atomic_add_long(&tty_nin, 1); if (ttyhook_hashook(tp, rint)) return ttyhook_rint(tp, c, flags); if (tp->t_flags & TF_BYPASS) goto processed; if (flags) { if (flags & TRE_BREAK) { if (CMP_FLAG(i, IGNBRK)) { /* Ignore break characters. */ return (0); } else if (CMP_FLAG(i, BRKINT)) { /* Generate SIGINT on break. */ tty_flush(tp, FREAD|FWRITE); tty_signal_pgrp(tp, SIGINT); return (0); } else { /* Just print it. */ goto parmrk; } } else if (flags & TRE_FRAMING || (flags & TRE_PARITY && CMP_FLAG(i, INPCK))) { if (CMP_FLAG(i, IGNPAR)) { /* Ignore bad characters. */ return (0); } else { /* Just print it. */ goto parmrk; } } } /* Allow any character to perform a wakeup. */ if (CMP_FLAG(i, IXANY)) tp->t_flags &= ~TF_STOPPED; /* Remove the top bit. */ if (CMP_FLAG(i, ISTRIP)) c &= ~0x80; /* Skip input processing when we want to print it literally. */ if (tp->t_flags & TF_LITERAL) { tp->t_flags &= ~TF_LITERAL; quote = 1; goto processed; } /* Special control characters that are implementation dependent. */ if (CMP_FLAG(l, IEXTEN)) { /* Accept the next character as literal. */ if (CMP_CC(VLNEXT, c)) { if (CMP_FLAG(l, ECHO)) { if (CMP_FLAG(l, ECHOE)) ttyoutq_write_nofrag(&tp->t_outq, "^\b", 2); else ttydisc_echo(tp, c, 0); } tp->t_flags |= TF_LITERAL; return (0); } } /* * Handle signal processing. */ if (CMP_FLAG(l, ISIG)) { if (CMP_FLAG(l, ICANON|IEXTEN) == (ICANON|IEXTEN)) { if (CMP_CC(VSTATUS, c)) { tty_signal_pgrp(tp, SIGINFO); return (0); } } /* * When compared to the old implementation, this * implementation also flushes the output queue. POSIX * is really brief about this, but does makes us assume * we have to do so. */ signal = 0; if (CMP_CC(VINTR, c)) { signal = SIGINT; } else if (CMP_CC(VQUIT, c)) { signal = SIGQUIT; } else if (CMP_CC(VSUSP, c)) { signal = SIGTSTP; } if (signal != 0) { /* * Echo the character before signalling the * processes. */ if (!CMP_FLAG(l, NOFLSH)) tty_flush(tp, FREAD|FWRITE); ttydisc_echo(tp, c, 0); tty_signal_pgrp(tp, signal); return (0); } } /* * Handle start/stop characters. */ if (CMP_FLAG(i, IXON)) { if (CMP_CC(VSTOP, c)) { /* Stop it if we aren't stopped yet. */ if ((tp->t_flags & TF_STOPPED) == 0) { tp->t_flags |= TF_STOPPED; return (0); } /* * Fallthrough: * When VSTART == VSTOP, we should make this key * toggle it. */ if (!CMP_CC(VSTART, c)) return (0); } if (CMP_CC(VSTART, c)) { tp->t_flags &= ~TF_STOPPED; return (0); } } /* Conversion of CR and NL. */ switch (c) { case CCR: if (CMP_FLAG(i, IGNCR)) return (0); if (CMP_FLAG(i, ICRNL)) c = CNL; break; case CNL: if (CMP_FLAG(i, INLCR)) c = CCR; break; } /* Canonical line editing. */ if (CMP_FLAG(l, ICANON)) { if (CMP_CC(VERASE, c) || CMP_CC(VERASE2, c)) { ttydisc_rubchar(tp); return (0); } else if (CMP_CC(VKILL, c)) { while (ttydisc_rubchar(tp) == 0); return (0); } else if (CMP_FLAG(l, IEXTEN)) { if (CMP_CC(VWERASE, c)) { ttydisc_rubword(tp); return (0); } else if (CMP_CC(VREPRINT, c)) { ttydisc_reprint(tp); return (0); } } } processed: if (CMP_FLAG(i, PARMRK) && (unsigned char)c == 0xff) { /* Print 0xff 0xff. */ ob[1] = 0xff; ol = 2; quote = 1; } else { ob[0] = c; ol = 1; } goto print; parmrk: if (CMP_FLAG(i, PARMRK)) { /* Prepend 0xff 0x00 0x.. 
*/ ob[2] = c; ol = 3; quote = 1; } else { ob[0] = c; ol = 1; } print: /* See if we can store this on the input queue. */ if (ttyinq_write_nofrag(&tp->t_inq, ob, ol, quote) != 0) { if (CMP_FLAG(i, IMAXBEL)) ttyoutq_write_nofrag(&tp->t_outq, "\a", 1); /* * Prevent a deadlock here. It may be possible that a * user has entered so much data, there is no data * available to read(), but the buffers are full anyway. * * Only enter the high watermark if the device driver * can actually transmit something. */ if (ttyinq_bytescanonicalized(&tp->t_inq) == 0) return (0); tty_hiwat_in_block(tp); return (-1); } /* * In raw mode, we canonicalize after receiving a single * character. Otherwise, we canonicalize when we receive a * newline, VEOL or VEOF, but only when it isn't quoted. */ if (!CMP_FLAG(l, ICANON) || (!quote && (c == CNL || CMP_CC(VEOL, c) || CMP_CC(VEOF, c)))) { ttyinq_canonicalize(&tp->t_inq); } ttydisc_echo(tp, c, quote); return (0); } size_t ttydisc_rint_simple(struct tty *tp, const void *buf, size_t len) { const char *cbuf; if (ttydisc_can_bypass(tp)) return (ttydisc_rint_bypass(tp, buf, len)); for (cbuf = buf; len-- > 0; cbuf++) { if (ttydisc_rint(tp, *cbuf, 0) != 0) break; } return (cbuf - (const char *)buf); } size_t ttydisc_rint_bypass(struct tty *tp, const void *buf, size_t len) { size_t ret; tty_lock_assert(tp, MA_OWNED); MPASS(tp->t_flags & TF_BYPASS); atomic_add_long(&tty_nin, len); if (ttyhook_hashook(tp, rint_bypass)) return ttyhook_rint_bypass(tp, buf, len); ret = ttyinq_write(&tp->t_inq, buf, len, 0); ttyinq_canonicalize(&tp->t_inq); if (ret < len) tty_hiwat_in_block(tp); return (ret); } void ttydisc_rint_done(struct tty *tp) { tty_lock_assert(tp, MA_OWNED); if (ttyhook_hashook(tp, rint_done)) ttyhook_rint_done(tp); /* Wake up readers. */ tty_wakeup(tp, FREAD); /* Wake up driver for echo. */ ttydevsw_outwakeup(tp); } size_t ttydisc_rint_poll(struct tty *tp) { size_t l; tty_lock_assert(tp, MA_OWNED); if (ttyhook_hashook(tp, rint_poll)) return ttyhook_rint_poll(tp); /* * XXX: Still allow character input when there's no space in the * buffers, but we haven't entered the high watermark. This is * to allow backspace characters to be inserted when in * canonical mode. */ l = ttyinq_bytesleft(&tp->t_inq); if (l == 0 && (tp->t_flags & TF_HIWAT_IN) == 0) return (1); return (l); } static void ttydisc_wakeup_watermark(struct tty *tp) { size_t c; c = ttyoutq_bytesleft(&tp->t_outq); if (tp->t_flags & TF_HIWAT_OUT) { /* Only allow us to run when we're below the watermark. */ if (c < tp->t_outlow) return; /* Reset the watermark. */ tp->t_flags &= ~TF_HIWAT_OUT; } else { /* Only run when we have data at all. */ if (c == 0) return; } tty_wakeup(tp, FWRITE); } size_t ttydisc_getc(struct tty *tp, void *buf, size_t len) { tty_lock_assert(tp, MA_OWNED); if (tp->t_flags & TF_STOPPED) return (0); if (ttyhook_hashook(tp, getc_inject)) return ttyhook_getc_inject(tp, buf, len); len = ttyoutq_read(&tp->t_outq, buf, len); if (ttyhook_hashook(tp, getc_capture)) ttyhook_getc_capture(tp, buf, len); ttydisc_wakeup_watermark(tp); atomic_add_long(&tty_nout, len); return (len); } int ttydisc_getc_uio(struct tty *tp, struct uio *uio) { int error = 0; ssize_t obytes = uio->uio_resid; size_t len; char buf[TTY_STACKBUF]; tty_lock_assert(tp, MA_OWNED); if (tp->t_flags & TF_STOPPED) return (0); /* * When a TTY hook is attached, we cannot perform unbuffered * copying to userspace. Just call ttydisc_getc() and * temporarily store data in a shadow buffer. 
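The tty_nin and tty_nout counters bumped throughout this file are exported read-only; a small observer program, with the sysctl names taken from the SYSCTL_ULONG declarations above:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        unsigned long nin, nout;
        size_t len;

        len = sizeof(nin);
        if (sysctlbyname("kern.tty_nin", &nin, &len, NULL, 0) == -1)
                return (1);
        len = sizeof(nout);
        if (sysctlbyname("kern.tty_nout", &nout, &len, NULL, 0) == -1)
                return (1);
        printf("received %lu, transmitted %lu\n", nin, nout);
        return (0);
}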
*/ if (ttyhook_hashook(tp, getc_capture) || ttyhook_hashook(tp, getc_inject)) { while (uio->uio_resid > 0) { /* Read to shadow buffer. */ len = ttydisc_getc(tp, buf, MIN(uio->uio_resid, sizeof buf)); if (len == 0) break; /* Copy to userspace. */ tty_unlock(tp); error = uiomove(buf, len, uio); tty_lock(tp); if (error != 0) break; } } else { error = ttyoutq_read_uio(&tp->t_outq, tp, uio); ttydisc_wakeup_watermark(tp); atomic_add_long(&tty_nout, obytes - uio->uio_resid); } return (error); } size_t ttydisc_getc_poll(struct tty *tp) { tty_lock_assert(tp, MA_OWNED); if (tp->t_flags & TF_STOPPED) return (0); if (ttyhook_hashook(tp, getc_poll)) return ttyhook_getc_poll(tp); return ttyoutq_bytesused(&tp->t_outq); } /* * XXX: not really related to the TTYDISC, but we'd better put * tty_putchar() here, because we need to perform proper output * processing. */ int tty_putchar(struct tty *tp, char c) { tty_lock_assert(tp, MA_OWNED); if (tty_gone(tp)) return (-1); ttydisc_echo_force(tp, c, 0); tp->t_writepos = tp->t_column; ttyinq_reprintpos_set(&tp->t_inq); ttydevsw_outwakeup(tp); return (0); } Index: head/sys/kern/uipc_accf.c =================================================================== --- head/sys/kern/uipc_accf.c (revision 326270) +++ head/sys/kern/uipc_accf.c (revision 326271) @@ -1,306 +1,308 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2000 Paycounter, Inc. * Copyright (c) 2005 Robert N. M. Watson * Author: Alfred Perlstein , * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #define ACCEPT_FILTER_MOD #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct mtx accept_filter_mtx; MTX_SYSINIT(accept_filter, &accept_filter_mtx, "accept_filter_mtx", MTX_DEF); #define ACCEPT_FILTER_LOCK() mtx_lock(&accept_filter_mtx) #define ACCEPT_FILTER_UNLOCK() mtx_unlock(&accept_filter_mtx) static SLIST_HEAD(, accept_filter) accept_filtlsthd = SLIST_HEAD_INITIALIZER(accept_filtlsthd); MALLOC_DEFINE(M_ACCF, "accf", "accept filter data"); static int unloadable = 0; SYSCTL_NODE(_net, OID_AUTO, accf, CTLFLAG_RW, 0, "Accept filters"); SYSCTL_INT(_net_accf, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0, "Allow unload of accept filters (not recommended)"); /* * Must be passed a malloc'd structure so we don't explode if the kld is * unloaded, we leak the struct on deallocation to deal with this, but if a * filter is loaded with the same name as a leaked one we re-use the entry. */ int accept_filt_add(struct accept_filter *filt) { struct accept_filter *p; ACCEPT_FILTER_LOCK(); SLIST_FOREACH(p, &accept_filtlsthd, accf_next) if (strcmp(p->accf_name, filt->accf_name) == 0) { if (p->accf_callback != NULL) { ACCEPT_FILTER_UNLOCK(); return (EEXIST); } else { p->accf_callback = filt->accf_callback; ACCEPT_FILTER_UNLOCK(); free(filt, M_ACCF); return (0); } } if (p == NULL) SLIST_INSERT_HEAD(&accept_filtlsthd, filt, accf_next); ACCEPT_FILTER_UNLOCK(); return (0); } int accept_filt_del(char *name) { struct accept_filter *p; p = accept_filt_get(name); if (p == NULL) return (ENOENT); p->accf_callback = NULL; return (0); } struct accept_filter * accept_filt_get(char *name) { struct accept_filter *p; ACCEPT_FILTER_LOCK(); SLIST_FOREACH(p, &accept_filtlsthd, accf_next) if (strcmp(p->accf_name, name) == 0) break; ACCEPT_FILTER_UNLOCK(); return (p); } int accept_filt_generic_mod_event(module_t mod, int event, void *data) { struct accept_filter *p; struct accept_filter *accfp = (struct accept_filter *) data; int error; switch (event) { case MOD_LOAD: p = malloc(sizeof(*p), M_ACCF, M_WAITOK); bcopy(accfp, p, sizeof(*p)); error = accept_filt_add(p); break; case MOD_UNLOAD: /* * Do not support unloading yet. we don't keep track of * refcounts and unloading an accept filter callback and then * having it called is a bad thing. A simple fix would be to * track the refcount in the struct accept_filter. */ if (unloadable != 0) { error = accept_filt_del(accfp->accf_name); } else error = EOPNOTSUPP; break; case MOD_SHUTDOWN: error = 0; break; default: error = EOPNOTSUPP; break; } return (error); } int accept_filt_getopt(struct socket *so, struct sockopt *sopt) { struct accept_filter_arg *afap; int error; error = 0; afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK | M_ZERO); SOCK_LOCK(so); if ((so->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto out; } if (so->sol_accept_filter == NULL) { error = EINVAL; goto out; } strcpy(afap->af_name, so->sol_accept_filter->accf_name); if (so->sol_accept_filter_str != NULL) strcpy(afap->af_arg, so->sol_accept_filter_str); out: SOCK_UNLOCK(so); if (error == 0) error = sooptcopyout(sopt, afap, sizeof(*afap)); free(afap, M_TEMP); return (error); } int accept_filt_setopt(struct socket *so, struct sockopt *sopt) { struct accept_filter_arg *afap; struct accept_filter *afp; char *accept_filter_str = NULL; void *accept_filter_arg = NULL; int error; /* * Handle the simple delete case first. 
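For reference, the userspace side of this option; attach_httpready() is a hypothetical helper, and struct accept_filter_arg comes from <sys/socket.h>. Passing a NULL argument to setsockopt() instead triggers the delete case handled below.

#include <sys/types.h>
#include <sys/socket.h>
#include <string.h>

/*
 * Attach the accf_http(9) filter to a socket that is already
 * listening; connections are then held until a full request arrives.
 */
static int
attach_httpready(int lfd)
{
        struct accept_filter_arg afa;

        memset(&afa, 0, sizeof(afa));
        strcpy(afa.af_name, "httpready");
        return (setsockopt(lfd, SOL_SOCKET, SO_ACCEPTFILTER,
            &afa, sizeof(afa)));
}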
*/ if (sopt == NULL || sopt->sopt_val == NULL) { struct socket *sp, *sp1; int wakeup; SOCK_LOCK(so); if ((so->so_options & SO_ACCEPTCONN) == 0) { SOCK_UNLOCK(so); return (EINVAL); } if (so->sol_accept_filter == NULL) { SOCK_UNLOCK(so); return (0); } if (so->sol_accept_filter->accf_destroy != NULL) so->sol_accept_filter->accf_destroy(so); if (so->sol_accept_filter_str != NULL) free(so->sol_accept_filter_str, M_ACCF); so->sol_accept_filter = NULL; so->sol_accept_filter_arg = NULL; so->sol_accept_filter_str = NULL; so->so_options &= ~SO_ACCEPTFILTER; /* * Move from incomplete queue to complete only those * connections, that are blocked by us. */ wakeup = 0; TAILQ_FOREACH_SAFE(sp, &so->sol_incomp, so_list, sp1) { SOCK_LOCK(sp); if (sp->so_options & SO_ACCEPTFILTER) { TAILQ_REMOVE(&so->sol_incomp, sp, so_list); TAILQ_INSERT_TAIL(&so->sol_comp, sp, so_list); sp->so_qstate = SQ_COMP; sp->so_options &= ~SO_ACCEPTFILTER; so->sol_incqlen--; so->sol_qlen++; wakeup = 1; } SOCK_UNLOCK(sp); } if (wakeup) solisten_wakeup(so); /* unlocks */ else SOLISTEN_UNLOCK(so); return (0); } /* * Pre-allocate any memory we may need later to avoid blocking at * untimely moments. This does not optimize for invalid arguments. */ afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK); error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap); afap->af_name[sizeof(afap->af_name)-1] = '\0'; afap->af_arg[sizeof(afap->af_arg)-1] = '\0'; if (error) { free(afap, M_TEMP); return (error); } afp = accept_filt_get(afap->af_name); if (afp == NULL) { free(afap, M_TEMP); return (ENOENT); } if (afp->accf_create != NULL && afap->af_name[0] != '\0') { size_t len = strlen(afap->af_name) + 1; accept_filter_str = malloc(len, M_ACCF, M_WAITOK); strcpy(accept_filter_str, afap->af_name); } /* * Require a listen socket; don't try to replace an existing filter * without first removing it. */ SOCK_LOCK(so); if ((so->so_options & SO_ACCEPTCONN) == 0 || so->sol_accept_filter != NULL) { error = EINVAL; goto out; } /* * Invoke the accf_create() method of the filter if required. The * socket mutex is held over this call, so create methods for filters * can't block. */ if (afp->accf_create != NULL) { accept_filter_arg = afp->accf_create(so, afap->af_arg); if (accept_filter_arg == NULL) { error = EINVAL; goto out; } } so->sol_accept_filter = afp; so->sol_accept_filter_arg = accept_filter_arg; so->sol_accept_filter_str = accept_filter_str; so->so_options |= SO_ACCEPTFILTER; out: SOCK_UNLOCK(so); if (accept_filter_str != NULL) free(accept_filter_str, M_ACCF); free(afap, M_TEMP); return (error); } Index: head/sys/kern/uipc_debug.c =================================================================== --- head/sys/kern/uipc_debug.c (revision 326270) +++ head/sys/kern/uipc_debug.c (revision 326271) @@ -1,534 +1,536 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2007 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Debugger routines relating to sockets, protocols, etc, for use in DDB. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #ifdef DDB #include static void db_print_sotype(short so_type) { switch (so_type) { case SOCK_STREAM: db_printf("SOCK_STREAM"); break; case SOCK_DGRAM: db_printf("SOCK_DGRAM"); break; case SOCK_RAW: db_printf("SOCK_RAW"); break; case SOCK_RDM: db_printf("SOCK_RDM"); break; case SOCK_SEQPACKET: db_printf("SOCK_SEQPACKET"); break; default: db_printf("unknown"); break; } } static void db_print_sooptions(short so_options) { int comma; comma = 0; if (so_options & SO_DEBUG) { db_printf("%sSO_DEBUG", comma ? ", " : ""); comma = 1; } if (so_options & SO_ACCEPTCONN) { db_printf("%sSO_ACCEPTCONN", comma ? ", " : ""); comma = 1; } if (so_options & SO_REUSEADDR) { db_printf("%sSO_REUSEADDR", comma ? ", " : ""); comma = 1; } if (so_options & SO_KEEPALIVE) { db_printf("%sSO_KEEPALIVE", comma ? ", " : ""); comma = 1; } if (so_options & SO_DONTROUTE) { db_printf("%sSO_DONTROUTE", comma ? ", " : ""); comma = 1; } if (so_options & SO_BROADCAST) { db_printf("%sSO_BROADCAST", comma ? ", " : ""); comma = 1; } if (so_options & SO_USELOOPBACK) { db_printf("%sSO_USELOOPBACK", comma ? ", " : ""); comma = 1; } if (so_options & SO_LINGER) { db_printf("%sSO_LINGER", comma ? ", " : ""); comma = 1; } if (so_options & SO_OOBINLINE) { db_printf("%sSO_OOBINLINE", comma ? ", " : ""); comma = 1; } if (so_options & SO_REUSEPORT) { db_printf("%sSO_REUSEPORT", comma ? ", " : ""); comma = 1; } if (so_options & SO_TIMESTAMP) { db_printf("%sSO_TIMESTAMP", comma ? ", " : ""); comma = 1; } if (so_options & SO_NOSIGPIPE) { db_printf("%sSO_NOSIGPIPE", comma ? ", " : ""); comma = 1; } if (so_options & SO_ACCEPTFILTER) { db_printf("%sSO_ACCEPTFILTER", comma ? ", " : ""); comma = 1; } if (so_options & SO_BINTIME) { db_printf("%sSO_BINTIME", comma ? ", " : ""); comma = 1; } if (so_options & SO_NO_OFFLOAD) { db_printf("%sSO_NO_OFFLOAD", comma ? ", " : ""); comma = 1; } if (so_options & SO_NO_DDP) { db_printf("%sSO_NO_DDP", comma ? ", " : ""); comma = 1; } } static void db_print_sostate(short so_state) { int comma; comma = 0; if (so_state & SS_NOFDREF) { db_printf("%sSS_NOFDREF", comma ? ", " : ""); comma = 1; } if (so_state & SS_ISCONNECTED) { db_printf("%sSS_ISCONNECTED", comma ? ", " : ""); comma = 1; } if (so_state & SS_ISCONNECTING) { db_printf("%sSS_ISCONNECTING", comma ? ", " : ""); comma = 1; } if (so_state & SS_ISDISCONNECTING) { db_printf("%sSS_ISDISCONNECTING", comma ? ", " : ""); comma = 1; } if (so_state & SS_NBIO) { db_printf("%sSS_NBIO", comma ? ", " : ""); comma = 1; } if (so_state & SS_ASYNC) { db_printf("%sSS_ASYNC", comma ? 
", " : ""); comma = 1; } if (so_state & SS_ISCONFIRMING) { db_printf("%sSS_ISCONFIRMING", comma ? ", " : ""); comma = 1; } if (so_state & SS_PROTOREF) { db_printf("%sSS_PROTOREF", comma ? ", " : ""); comma = 1; } } static void db_print_soqstate(int so_qstate) { int comma; comma = 0; if (so_qstate & SQ_INCOMP) { db_printf("%sSQ_INCOMP", comma ? ", " : ""); comma = 1; } if (so_qstate & SQ_COMP) { db_printf("%sSQ_COMP", comma ? ", " : ""); comma = 1; } } static void db_print_sbstate(short sb_state) { int comma; comma = 0; if (sb_state & SBS_CANTSENDMORE) { db_printf("%sSBS_CANTSENDMORE", comma ? ", " : ""); comma = 1; } if (sb_state & SBS_CANTRCVMORE) { db_printf("%sSBS_CANTRCVMORE", comma ? ", " : ""); comma = 1; } if (sb_state & SBS_RCVATMARK) { db_printf("%sSBS_RCVATMARK", comma ? ", " : ""); comma = 1; } } static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_domain(struct domain *d, const char *domain_name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", domain_name, d); indent += 2; db_print_indent(indent); db_printf("dom_family: %d ", d->dom_family); db_printf("dom_name: %s\n", d->dom_name); db_print_indent(indent); db_printf("dom_init: %p ", d->dom_init); db_printf("dom_externalize: %p ", d->dom_externalize); db_printf("dom_dispose: %p\n", d->dom_dispose); db_print_indent(indent); db_printf("dom_protosw: %p ", d->dom_protosw); db_printf("dom_next: %p\n", d->dom_next); db_print_indent(indent); db_printf("dom_rtattach: %p ", d->dom_rtattach); db_print_indent(indent); db_printf("dom_ifattach: %p ", d->dom_ifattach); db_printf("dom_ifdetach: %p\n", d->dom_ifdetach); } static void db_print_prflags(short pr_flags) { int comma; comma = 0; if (pr_flags & PR_ATOMIC) { db_printf("%sPR_ATOMIC", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_ADDR) { db_printf("%sPR_ADDR", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_CONNREQUIRED) { db_printf("%sPR_CONNREQUIRED", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_WANTRCVD) { db_printf("%sPR_WANTRCVD", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_RIGHTS) { db_printf("%sPR_RIGHTS", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_IMPLOPCL) { db_printf("%sPR_IMPLOPCL", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_LASTHDR) { db_printf("%sPR_LASTHDR", comma ? ", " : ""); comma = 1; } } static void db_print_protosw(struct protosw *pr, const char *prname, int indent) { db_print_indent(indent); db_printf("%s at %p\n", prname, pr); indent += 2; db_print_indent(indent); db_printf("pr_type: %d ", pr->pr_type); db_printf("pr_domain: %p\n", pr->pr_domain); if (pr->pr_domain != NULL) db_print_domain(pr->pr_domain, "pr_domain", indent); db_print_indent(indent); db_printf("pr_protocol: %d\n", pr->pr_protocol); db_print_indent(indent); db_printf("pr_flags: %d (", pr->pr_flags); db_print_prflags(pr->pr_flags); db_printf(")\n"); db_print_indent(indent); db_printf("pr_input: %p ", pr->pr_input); db_printf("pr_output: %p ", pr->pr_output); db_printf("pr_ctlinput: %p\n", pr->pr_ctlinput); db_print_indent(indent); db_printf("pr_ctloutput: %p ", pr->pr_ctloutput); db_printf("pr_init: %p\n", pr->pr_init); db_print_indent(indent); db_printf("pr_fasttimo: %p ", pr->pr_fasttimo); db_printf("pr_slowtimo: %p ", pr->pr_slowtimo); db_printf("pr_drain: %p\n", pr->pr_drain); } static void db_print_sbflags(short sb_flags) { int comma; comma = 0; if (sb_flags & SB_WAIT) { db_printf("%sSB_WAIT", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_SEL) { db_printf("%sSB_SEL", comma ? 
", " : ""); comma = 1; } if (sb_flags & SB_ASYNC) { db_printf("%sSB_ASYNC", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_UPCALL) { db_printf("%sSB_UPCALL", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_NOINTR) { db_printf("%sSB_NOINTR", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_AIO) { db_printf("%sSB_AIO", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_KNOTE) { db_printf("%sSB_KNOTE", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_AUTOSIZE) { db_printf("%sSB_AUTOSIZE", comma ? ", " : ""); comma = 1; } } static void db_print_sockbuf(struct sockbuf *sb, const char *sockbufname, int indent) { db_print_indent(indent); db_printf("%s at %p\n", sockbufname, sb); indent += 2; db_print_indent(indent); db_printf("sb_state: 0x%x (", sb->sb_state); db_print_sbstate(sb->sb_state); db_printf(")\n"); db_print_indent(indent); db_printf("sb_mb: %p ", sb->sb_mb); db_printf("sb_mbtail: %p ", sb->sb_mbtail); db_printf("sb_lastrecord: %p\n", sb->sb_lastrecord); db_print_indent(indent); db_printf("sb_sndptr: %p ", sb->sb_sndptr); db_printf("sb_sndptroff: %u\n", sb->sb_sndptroff); db_print_indent(indent); db_printf("sb_acc: %u ", sb->sb_acc); db_printf("sb_ccc: %u ", sb->sb_ccc); db_printf("sb_hiwat: %u ", sb->sb_hiwat); db_printf("sb_mbcnt: %u ", sb->sb_mbcnt); db_printf("sb_mbmax: %u\n", sb->sb_mbmax); db_print_indent(indent); db_printf("sb_mcnt: %u ", sb->sb_mcnt); db_printf("sb_ccnt: %u ", sb->sb_ccnt); db_printf("sb_ctl: %u ", sb->sb_ctl); db_printf("sb_lowat: %d ", sb->sb_lowat); db_printf("sb_timeo: %jd\n", sb->sb_timeo); db_print_indent(indent); db_printf("sb_flags: 0x%x (", sb->sb_flags); db_print_sbflags(sb->sb_flags); db_printf(")\n"); db_print_indent(indent); db_printf("sb_aiojobq first: %p\n", TAILQ_FIRST(&sb->sb_aiojobq)); } static void db_print_socket(struct socket *so, const char *socketname, int indent) { db_print_indent(indent); db_printf("%s at %p\n", socketname, so); indent += 2; db_print_indent(indent); db_printf("so_count: %d ", so->so_count); db_printf("so_type: %d (", so->so_type); db_print_sotype(so->so_type); db_printf(")\n"); db_print_indent(indent); db_printf("so_options: 0x%x (", so->so_options); db_print_sooptions(so->so_options); db_printf(")\n"); db_print_indent(indent); db_printf("so_linger: %d ", so->so_linger); db_printf("so_state: 0x%x (", so->so_state); db_print_sostate(so->so_state); db_printf(")\n"); db_print_indent(indent); db_printf("so_pcb: %p ", so->so_pcb); db_printf("so_proto: %p\n", so->so_proto); if (so->so_proto != NULL) db_print_protosw(so->so_proto, "so_proto", indent); db_print_indent(indent); if (so->so_options & SO_ACCEPTCONN) { db_printf("sol_incomp first: %p ", TAILQ_FIRST(&so->sol_incomp)); db_printf("sol_comp first: %p\n", TAILQ_FIRST(&so->sol_comp)); db_printf("sol_qlen: %d ", so->sol_qlen); db_printf("sol_incqlen: %d ", so->sol_incqlen); db_printf("sol_qlimit: %d ", so->sol_qlimit); } else { db_printf("so_qstate: 0x%x (", so->so_qstate); db_print_soqstate(so->so_qstate); db_printf(") "); db_printf("so_listen: %p ", so->so_listen); /* so_list skipped */ db_printf("so_timeo: %d ", so->so_timeo); db_printf("so_error: %d\n", so->so_error); db_print_indent(indent); db_printf("so_sigio: %p ", so->so_sigio); db_printf("so_oobmark: %lu\n", so->so_oobmark); db_print_sockbuf(&so->so_rcv, "so_rcv", indent); db_print_sockbuf(&so->so_snd, "so_snd", indent); } } DB_SHOW_COMMAND(socket, db_show_socket) { struct socket *so; if (!have_addr) { db_printf("usage: show socket \n"); return; } so = (struct socket *)addr; db_print_socket(so, 
"socket", 0); } DB_SHOW_COMMAND(sockbuf, db_show_sockbuf) { struct sockbuf *sb; if (!have_addr) { db_printf("usage: show sockbuf \n"); return; } sb = (struct sockbuf *)addr; db_print_sockbuf(sb, "sockbuf", 0); } DB_SHOW_COMMAND(protosw, db_show_protosw) { struct protosw *pr; if (!have_addr) { db_printf("usage: show protosw \n"); return; } pr = (struct protosw *)addr; db_print_protosw(pr, "protosw", 0); } DB_SHOW_COMMAND(domain, db_show_domain) { struct domain *d; if (!have_addr) { db_printf("usage: show protosw \n"); return; } d = (struct domain *)addr; db_print_domain(d, "domain", 0); } #endif Index: head/sys/kern/uipc_mqueue.c =================================================================== --- head/sys/kern/uipc_mqueue.c (revision 326270) +++ head/sys/kern/uipc_mqueue.c (revision 326271) @@ -1,2939 +1,2941 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 David Xu * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * POSIX message queue implementation. * * 1) A mqueue filesystem can be mounted, each message queue appears * in mounted directory, user can change queue's permission and * ownership, or remove a queue. Manually creating a file in the * directory causes a message queue to be created in the kernel with * default message queue attributes applied and same name used, this * method is not advocated since mq_open syscall allows user to specify * different attributes. Also the file system can be mounted multiple * times at different mount points but shows same contents. * * 2) Standard POSIX message queue API. The syscalls do not use vfs layer, * but directly operate on internal data structure, this allows user to * use the IPC facility without having to mount mqueue file system. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support"); /* * Limits and constants */ #define MQFS_NAMELEN NAME_MAX #define MQFS_DELEN (8 + MQFS_NAMELEN) /* node types */ typedef enum { mqfstype_none = 0, mqfstype_root, mqfstype_dir, mqfstype_this, mqfstype_parent, mqfstype_file, mqfstype_symlink, } mqfs_type_t; struct mqfs_node; /* * mqfs_info: describes a mqfs instance */ struct mqfs_info { struct sx mi_lock; struct mqfs_node *mi_root; struct unrhdr *mi_unrhdr; }; struct mqfs_vdata { LIST_ENTRY(mqfs_vdata) mv_link; struct mqfs_node *mv_node; struct vnode *mv_vnode; struct task mv_task; }; /* * mqfs_node: describes a node (file or directory) within a mqfs */ struct mqfs_node { char mn_name[MQFS_NAMELEN+1]; struct mqfs_info *mn_info; struct mqfs_node *mn_parent; LIST_HEAD(,mqfs_node) mn_children; LIST_ENTRY(mqfs_node) mn_sibling; LIST_HEAD(,mqfs_vdata) mn_vnodes; const void *mn_pr_root; int mn_refcount; mqfs_type_t mn_type; int mn_deleted; uint32_t mn_fileno; void *mn_data; struct timespec mn_birth; struct timespec mn_ctime; struct timespec mn_atime; struct timespec mn_mtime; uid_t mn_uid; gid_t mn_gid; int mn_mode; }; #define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node) #define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data)) #define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data)) #define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \ (fp)->f_data)->mn_data)) TAILQ_HEAD(msgq, mqueue_msg); struct mqueue; struct mqueue_notifier { LIST_ENTRY(mqueue_notifier) nt_link; struct sigevent nt_sigev; ksiginfo_t nt_ksi; struct proc *nt_proc; }; struct mqueue { struct mtx mq_mutex; int mq_flags; long mq_maxmsg; long mq_msgsize; long mq_curmsgs; long mq_totalbytes; struct msgq mq_msgq; int mq_receivers; int mq_senders; struct selinfo mq_rsel; struct selinfo mq_wsel; struct mqueue_notifier *mq_notifier; }; #define MQ_RSEL 0x01 #define MQ_WSEL 0x02 struct mqueue_msg { TAILQ_ENTRY(mqueue_msg) msg_link; unsigned int msg_prio; unsigned int msg_size; /* following real data... 
*/ }; static SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW, 0, "POSIX real time message queue"); static int default_maxmsg = 10; static int default_msgsize = 1024; static int maxmsg = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW, &maxmsg, 0, "Default maximum messages in queue"); static int maxmsgsize = 16384; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW, &maxmsgsize, 0, "Default maximum message size"); static int maxmq = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW, &maxmq, 0, "maximum message queues"); static int curmq = 0; SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW, &curmq, 0, "current message queue number"); static int unloadable = 0; static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data"); static eventhandler_tag exit_tag; /* Only one instance per system */ static struct mqfs_info mqfs_data; static uma_zone_t mqnode_zone; static uma_zone_t mqueue_zone; static uma_zone_t mvdata_zone; static uma_zone_t mqnoti_zone; static struct vop_vector mqfs_vnodeops; static struct fileops mqueueops; static unsigned mqfs_osd_jail_slot; /* * Directory structure construction and manipulation */ #ifdef notyet static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); #endif static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static int mqfs_destroy(struct mqfs_node *mn); static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn); static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn); static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn); static int mqfs_prison_remove(void *obj, void *data); /* * Message queue construction and manipulation */ static struct mqueue *mqueue_alloc(const struct mq_attr *attr); static void mqueue_free(struct mqueue *mq); static int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout); static int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout); static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo); static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo); static void mqueue_send_notification(struct mqueue *mq); static void mqueue_fdclose(struct thread *td, int fd, struct file *fp); static void mq_proc_exit(void *arg, struct proc *p); /* * kqueue filters */ static void filt_mqdetach(struct knote *kn); static int filt_mqread(struct knote *kn, long hint); static int filt_mqwrite(struct knote *kn, long hint); struct filterops mq_rfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqread, }; struct filterops mq_wfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqwrite, }; /* * Initialize fileno bitmap */ static void mqfs_fileno_init(struct mqfs_info *mi) { struct unrhdr *up; up = new_unrhdr(1, INT_MAX, NULL); mi->mi_unrhdr = up; } /* * Tear down fileno bitmap */ static void mqfs_fileno_uninit(struct mqfs_info *mi) { struct unrhdr *up; up = mi->mi_unrhdr; mi->mi_unrhdr = NULL; delete_unrhdr(up); } /* * Allocate a file number */ static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn) { /* make sure our parent has a file number */ if
(mn->mn_parent && !mn->mn_parent->mn_fileno) mqfs_fileno_alloc(mi, mn->mn_parent); switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: mn->mn_fileno = alloc_unr(mi->mi_unrhdr); break; case mqfstype_this: KASSERT(mn->mn_parent != NULL, ("mqfstype_this node has no parent")); mn->mn_fileno = mn->mn_parent->mn_fileno; break; case mqfstype_parent: KASSERT(mn->mn_parent != NULL, ("mqfstype_parent node has no parent")); if (mn->mn_parent == mi->mi_root) { mn->mn_fileno = mn->mn_parent->mn_fileno; break; } KASSERT(mn->mn_parent->mn_parent != NULL, ("mqfstype_parent node has no grandparent")); mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno; break; default: KASSERT(0, ("mqfs_fileno_alloc() called for unknown type node: %d", mn->mn_type)); break; } } /* * Release a file number */ static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn) { switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: free_unr(mi->mi_unrhdr, mn->mn_fileno); break; case mqfstype_this: case mqfstype_parent: /* ignore these, as they don't "own" their file number */ break; default: KASSERT(0, ("mqfs_fileno_free() called for unknown type node: %d", mn->mn_type)); break; } } static __inline struct mqfs_node * mqnode_alloc(void) { return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO); } static __inline void mqnode_free(struct mqfs_node *node) { uma_zfree(mqnode_zone, node); } static __inline void mqnode_addref(struct mqfs_node *node) { atomic_fetchadd_int(&node->mn_refcount, 1); } static __inline void mqnode_release(struct mqfs_node *node) { struct mqfs_info *mqfs; int old, exp; mqfs = node->mn_info; old = atomic_fetchadd_int(&node->mn_refcount, -1); if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) exp = 3; /* include . and .. */ else exp = 1; if (old == exp) { int locked = sx_xlocked(&mqfs->mi_lock); if (!locked) sx_xlock(&mqfs->mi_lock); mqfs_destroy(node); if (!locked) sx_xunlock(&mqfs->mi_lock); } } /* * Add a node to a directory */ static int mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node) { KASSERT(parent != NULL, ("%s(): parent is NULL", __func__)); KASSERT(parent->mn_info != NULL, ("%s(): parent has no mn_info", __func__)); KASSERT(parent->mn_type == mqfstype_dir || parent->mn_type == mqfstype_root, ("%s(): parent is not a directory", __func__)); node->mn_info = parent->mn_info; node->mn_parent = parent; LIST_INIT(&node->mn_children); LIST_INIT(&node->mn_vnodes); LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling); mqnode_addref(parent); return (0); } static struct mqfs_node * mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode, int nodetype) { struct mqfs_node *node; node = mqnode_alloc(); strncpy(node->mn_name, name, namelen); node->mn_pr_root = cred->cr_prison->pr_root; node->mn_type = nodetype; node->mn_refcount = 1; vfs_timestamp(&node->mn_birth); node->mn_ctime = node->mn_atime = node->mn_mtime = node->mn_birth; node->mn_uid = cred->cr_uid; node->mn_gid = cred->cr_gid; node->mn_mode = mode; return (node); } /* * Create a file */ static struct mqfs_node * mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } /* * Add . and .. 
to a directory */ static int mqfs_fixup_dir(struct mqfs_node *parent) { struct mqfs_node *dir; dir = mqnode_alloc(); dir->mn_name[0] = '.'; dir->mn_type = mqfstype_this; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } dir = mqnode_alloc(); dir->mn_name[0] = dir->mn_name[1] = '.'; dir->mn_type = mqfstype_parent; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } return (0); } #ifdef notyet /* * Create a directory */ static struct mqfs_node * mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } if (mqfs_fixup_dir(node) != 0) { mqfs_destroy(node); return (NULL); } return (node); } /* * Create a symlink */ static struct mqfs_node * mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } #endif /* * Destroy a node or a tree of nodes */ static int mqfs_destroy(struct mqfs_node *node) { struct mqfs_node *parent; KASSERT(node != NULL, ("%s(): node is NULL", __func__)); KASSERT(node->mn_info != NULL, ("%s(): node has no mn_info", __func__)); /* destroy children */ if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) while (! LIST_EMPTY(&node->mn_children)) mqfs_destroy(LIST_FIRST(&node->mn_children)); /* unlink from parent */ if ((parent = node->mn_parent) != NULL) { KASSERT(parent->mn_info == node->mn_info, ("%s(): parent has different mn_info", __func__)); LIST_REMOVE(node, mn_sibling); } if (node->mn_fileno != 0) mqfs_fileno_free(node->mn_info, node); if (node->mn_data != NULL) mqueue_free(node->mn_data); mqnode_free(node); return (0); } /* * Mount a mqfs instance */ static int mqfs_mount(struct mount *mp) { struct statfs *sbp; if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); mp->mnt_data = &mqfs_data; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); vfs_getnewfsid(mp); sbp = &mp->mnt_stat; vfs_mountedfrom(mp, "mqueue"); sbp->f_bsize = PAGE_SIZE; sbp->f_iosize = PAGE_SIZE; sbp->f_blocks = 1; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 1; sbp->f_ffree = 0; return (0); } /* * Unmount a mqfs instance */ static int mqfs_unmount(struct mount *mp, int mntflags) { int error; error = vflush(mp, 0, (mntflags & MNT_FORCE) ? 
FORCECLOSE : 0, curthread); return (error); } /* * Return a root vnode */ static int mqfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct mqfs_info *mqfs; int ret; mqfs = VFSTOMQFS(mp); ret = mqfs_allocv(mp, vpp, mqfs->mi_root); return (ret); } /* * Return filesystem stats */ static int mqfs_statfs(struct mount *mp, struct statfs *sbp) { /* XXX update statistics */ return (0); } /* * Initialize a mqfs instance */ static int mqfs_init(struct vfsconf *vfc) { struct mqfs_node *root; struct mqfs_info *mi; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_REMOVE] = mqfs_prison_remove, }; mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mvdata_zone = uma_zcreate("mvdata", sizeof(struct mqfs_vdata), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mi = &mqfs_data; sx_init(&mi->mi_lock, "mqfs lock"); /* set up the root directory */ root = mqfs_create_node("/", 1, curthread->td_ucred, 01777, mqfstype_root); root->mn_info = mi; LIST_INIT(&root->mn_children); LIST_INIT(&root->mn_vnodes); mi->mi_root = root; mqfs_fileno_init(mi); mqfs_fileno_alloc(mi, root); mqfs_fixup_dir(root); exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL, EVENTHANDLER_PRI_ANY); mq_fdclose = mqueue_fdclose; p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING); mqfs_osd_jail_slot = osd_jail_register(NULL, methods); return (0); } /* * Destroy a mqfs instance */ static int mqfs_uninit(struct vfsconf *vfc) { struct mqfs_info *mi; if (!unloadable) return (EOPNOTSUPP); osd_jail_deregister(mqfs_osd_jail_slot); EVENTHANDLER_DEREGISTER(process_exit, exit_tag); mi = &mqfs_data; mqfs_destroy(mi->mi_root); mi->mi_root = NULL; mqfs_fileno_uninit(mi); sx_destroy(&mi->mi_lock); uma_zdestroy(mqnode_zone); uma_zdestroy(mqueue_zone); uma_zdestroy(mvdata_zone); uma_zdestroy(mqnoti_zone); return (0); } /* * task routine */ static void do_recycle(void *context, int pending __unused) { struct vnode *vp = (struct vnode *)context; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vrecycle(vp); VOP_UNLOCK(vp, 0); vdrop(vp); } /* * Allocate a vnode */ static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn) { struct mqfs_vdata *vd; struct mqfs_info *mqfs; struct vnode *newvpp; int error; mqfs = pn->mn_info; *vpp = NULL; sx_xlock(&mqfs->mi_lock); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); break; } } if (vd != NULL) { found: *vpp = vd->mv_vnode; sx_xunlock(&mqfs->mi_lock); error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE, curthread); vdrop(*vpp); return (error); } sx_xunlock(&mqfs->mi_lock); error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp); if (error) return (error); vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY); error = insmntque(newvpp, mp); if (error != 0) return (error); sx_xlock(&mqfs->mi_lock); /* * Check if it has already been allocated * while we were blocked.
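 *
 * Illustration, not part of the original source: getnewvnode() and
 * insmntque() may sleep with mi_lock dropped, so another thread can
 * allocate a vnode for the same node in that window.  The recheck below
 * is the classic unlock/allocate/relock/recheck idiom; with lookup()
 * and alloc_may_sleep() as hypothetical stand-ins for the LIST_FOREACH
 * scan and getnewvnode() above, the shape is:
 *
 *	sx_xlock(&lock);
 *	if ((obj = lookup(key)) != NULL)
 *		return (obj);
 *	sx_xunlock(&lock);
 *	new = alloc_may_sleep();
 *	sx_xlock(&lock);
 *	if ((obj = lookup(key)) != NULL)
 *		discard(new);		(recheck: we lost the race)
 *	else
 *		publish(new);
 *	sx_xunlock(&lock);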
*/ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); sx_xunlock(&mqfs->mi_lock); vgone(newvpp); vput(newvpp); goto found; } } *vpp = newvpp; vd = uma_zalloc(mvdata_zone, M_WAITOK); (*vpp)->v_data = vd; vd->mv_vnode = *vpp; vd->mv_node = pn; TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp); LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link); mqnode_addref(pn); switch (pn->mn_type) { case mqfstype_root: (*vpp)->v_vflag = VV_ROOT; /* fall through */ case mqfstype_dir: case mqfstype_this: case mqfstype_parent: (*vpp)->v_type = VDIR; break; case mqfstype_file: (*vpp)->v_type = VREG; break; case mqfstype_symlink: (*vpp)->v_type = VLNK; break; case mqfstype_none: KASSERT(0, ("mqfs_allocf called for null node\n")); default: panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type); } sx_xunlock(&mqfs->mi_lock); return (0); } /* * Search a directory entry */ static struct mqfs_node * mqfs_search(struct mqfs_node *pd, const char *name, int len, struct ucred *cred) { struct mqfs_node *pn; const void *pr_root; sx_assert(&pd->mn_info->mi_lock, SX_LOCKED); pr_root = cred->cr_prison->pr_root; LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { /* Only match names within the same prison root directory */ if ((pn->mn_pr_root == NULL || pn->mn_pr_root == pr_root) && strncmp(pn->mn_name, name, len) == 0 && pn->mn_name[len] == '\0') return (pn); } return (NULL); } /* * Look up a file or directory. */ static int mqfs_lookupx(struct vop_cachedlookup_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqfs_info *mqfs; int nameiop, flags, error, namelen; char *pname; struct thread *td; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; namelen = cnp->cn_namelen; td = cnp->cn_thread; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; pd = VTON(dvp); pn = NULL; mqfs = pd->mn_info; *vpp = NULLVP; if (dvp->v_type != VDIR) return (ENOTDIR); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread); if (error) return (error); /* shortcut: check if the name is too long */ if (cnp->cn_namelen >= MQFS_NAMELEN) return (ENOENT); /* self */ if (namelen == 1 && pname[0] == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); pn = pd; *vpp = dvp; VREF(dvp); return (0); } /* parent */ if (cnp->cn_flags & ISDOTDOT) { if (dvp->v_vflag & VV_ROOT) return (EIO); if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); VOP_UNLOCK(dvp, 0); KASSERT(pd->mn_parent, ("non-root directory has no parent")); pn = pd->mn_parent; error = mqfs_allocv(dvp->v_mount, vpp, pn); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); return (error); } /* named node */ sx_xlock(&mqfs->mi_lock); pn = mqfs_search(pd, pname, namelen, cnp->cn_cred); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); /* found */ if (pn != NULL) { /* DELETE */ if (nameiop == DELETE && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) { mqnode_release(pn); return (error); } if (*vpp == dvp) { VREF(dvp); *vpp = dvp; mqnode_release(pn); return (0); } } /* allocate vnode */ error = mqfs_allocv(dvp->v_mount, vpp, pn); mqnode_release(pn); if (error == 0 && cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *vpp, cnp); return (error); } /* not found */ /* will create a new entry in the directory ? 
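 *
 * A note on the contract, not part of the original source: for a
 * CREATE or RENAME lookup of the last component, a missing name is not
 * an error; after checking VWRITE on the directory the routine returns
 * EJUSTRETURN, which tells namei() and the caller to go ahead with the
 * creation, roughly:
 *
 *	error = VOP_LOOKUP(dvp, &vp, cnp);
 *	if (error == EJUSTRETURN)
 *		error = VOP_CREATE(dvp, &vp, cnp, &vattr);
 *
 * (a simplified sketch of what the VFS layer arranges; SAVENAME keeps
 * the component name buffer alive for that later VOP_CREATE() call).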
*/ if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } #if 0 struct vop_lookup_args { struct vop_generic_args a_gen; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif /* * vnode lookup operation */ static int mqfs_lookup(struct vop_cachedlookup_args *ap) { int rc; rc = mqfs_lookupx(ap); return (rc); } #if 0 struct vop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif /* * vnode creation operation */ static int mqfs_create(struct vop_create_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqueue *mq; int error; pd = VTON(ap->a_dvp); if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); mq = mqueue_alloc(NULL); if (mq == NULL) return (EAGAIN); sx_xlock(&mqfs->mi_lock); if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, ap->a_vap->va_mode); if (pn == NULL) { sx_xunlock(&mqfs->mi_lock); error = ENOSPC; } else { mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); if (error) mqfs_destroy(pn); else pn->mn_data = mq; } if (error) mqueue_free(mq); return (error); } /* * Remove an entry */ static int do_unlink(struct mqfs_node *pn, struct ucred *ucred) { struct mqfs_node *parent; struct mqfs_vdata *vd; int error = 0; sx_assert(&pn->mn_info->mi_lock, SX_LOCKED); if (ucred->cr_uid != pn->mn_uid && (error = priv_check_cred(ucred, PRIV_MQ_ADMIN, 0)) != 0) error = EACCES; else if (!pn->mn_deleted) { parent = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { cache_purge(vd->mv_vnode); vhold(vd->mv_vnode); taskqueue_enqueue(taskqueue_thread, &vd->mv_task); } mqnode_release(pn); mqnode_release(parent); } else error = ENOENT; return (error); } #if 0 struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * vnode removal operation */ static int mqfs_remove(struct vop_remove_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn; int error; if (ap->a_vp->v_type == VDIR) return (EPERM); pn = VTON(ap->a_vp); sx_xlock(&mqfs->mi_lock); error = do_unlink(pn, ap->a_cnp->cn_cred); sx_xunlock(&mqfs->mi_lock); return (error); } #if 0 struct vop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_inactive(struct vop_inactive_args *ap) { struct mqfs_node *pn = VTON(ap->a_vp); if (pn->mn_deleted) vrecycle(ap->a_vp); return (0); } #if 0 struct vop_reclaim_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_reclaim(struct vop_reclaim_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount); struct vnode *vp = ap->a_vp; struct mqfs_node *pn; struct mqfs_vdata *vd; vd = vp->v_data; pn = vd->mv_node; sx_xlock(&mqfs->mi_lock); vp->v_data = NULL; LIST_REMOVE(vd, mv_link); uma_zfree(mvdata_zone, vd); mqnode_release(pn); sx_xunlock(&mqfs->mi_lock); return (0); } #if 0 struct vop_open_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; 
struct file *a_fp; }; #endif static int mqfs_open(struct vop_open_args *ap) { return (0); } #if 0 struct vop_close_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; #endif static int mqfs_close(struct vop_close_args *ap) { return (0); } #if 0 struct vop_access_args { struct vop_generic_args a_gen; struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Verify permissions */ static int mqfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct vattr vattr; int error; error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) return (error); error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid, ap->a_accmode, ap->a_cred, NULL); return (error); } #if 0 struct vop_getattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Get file attributes */ static int mqfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct mqfs_node *pn = VTON(vp); struct vattr *vap = ap->a_vap; int error = 0; vap->va_type = vp->v_type; vap->va_mode = pn->mn_mode; vap->va_nlink = 1; vap->va_uid = pn->mn_uid; vap->va_gid = pn->mn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_fileid = pn->mn_fileno; vap->va_size = 0; vap->va_blocksize = PAGE_SIZE; vap->va_bytes = vap->va_size = 0; vap->va_atime = pn->mn_atime; vap->va_mtime = pn->mn_mtime; vap->va_ctime = pn->mn_ctime; vap->va_birthtime = pn->mn_birth; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = NODEV; vap->va_bytes = 0; vap->va_filerev = 0; return (error); } #if 0 struct vop_setattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Set attributes */ static int mqfs_setattr(struct vop_setattr_args *ap) { struct mqfs_node *pn; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; td = curthread; vap = ap->a_vap; vp = ap->a_vp; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } pn = VTON(vp); error = c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = pn->mn_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = pn->mn_gid; else gid = vap->va_gid; if (uid != pn->mn_uid || gid != pn->mn_gid) { /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td))) return (error); /* * XXXRW: Why is there a privilege check here: shouldn't the * check in VOP_ACCESS() be enough? Also, are the group bits * below definitely right? */ if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid || (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) && (error = priv_check(td, PRIV_MQ_ADMIN)) != 0) return (error); pn->mn_uid = uid; pn->mn_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if ((ap->a_cred->cr_uid != pn->mn_uid) && (error = priv_check(td, PRIV_MQ_ADMIN))) return (error); pn->mn_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { /* See the comment in ufs_vnops::ufs_setattr(). 
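 *
 * Spelled out (not part of the original source), the rule checked below
 * is: changing timestamps requires either VADMIN, or, when
 * VA_UTIMES_NULL is set (the utimes(path, NULL) "set to now" case),
 * mere VWRITE access:
 *
 *	ok = (VOP_ACCESS(vp, VADMIN, cred, td) == 0) ||
 *	    ((vap->va_vaflags & VA_UTIMES_NULL) != 0 &&
 *	     VOP_ACCESS(vp, VWRITE, cred, td) == 0);
 *
 * which is what the short-circuit expression that follows encodes.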
*/ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) { pn->mn_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { pn->mn_mtime = vap->va_mtime; } c = 1; } if (c) { vfs_timestamp(&pn->mn_ctime); } return (0); } #if 0 struct vop_read_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif /* * Read from a file */ static int mqfs_read(struct vop_read_args *ap) { char buf[80]; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct mqfs_node *pn; struct mqueue *mq; int len, error; if (vp->v_type != VREG) return (EINVAL); pn = VTON(vp); mq = VTOMQ(vp); snprintf(buf, sizeof(buf), "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n", mq->mq_totalbytes, mq->mq_maxmsg, mq->mq_curmsgs, mq->mq_msgsize); buf[sizeof(buf)-1] = '\0'; len = strlen(buf); error = uiomove_frombuf(buf, len, uio); return (error); } #if 0 struct vop_readdir_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; }; #endif /* * Return directory entries. */ static int mqfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp; struct mqfs_info *mi; struct mqfs_node *pd; struct mqfs_node *pn; struct dirent entry; struct uio *uio; const void *pr_root; int *tmp_ncookies = NULL; off_t offset; int error, i; vp = ap->a_vp; mi = VFSTOMQFS(vp->v_mount); pd = VTON(vp); uio = ap->a_uio; if (vp->v_type != VDIR) return (ENOTDIR); if (uio->uio_offset < 0) return (EINVAL); if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } error = 0; offset = 0; pr_root = ap->a_cred->cr_prison->pr_root; sx_xlock(&mi->mi_lock); LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { entry.d_reclen = sizeof(entry); /* * Only show names within the same prison root directory * (or not associated with a prison, e.g. "." and ".."). */ if (pn->mn_pr_root != NULL && pn->mn_pr_root != pr_root) continue; if (!pn->mn_fileno) mqfs_fileno_alloc(mi, pn); entry.d_fileno = pn->mn_fileno; for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i) entry.d_name[i] = pn->mn_name[i]; entry.d_name[i] = 0; entry.d_namlen = i; switch (pn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_this: case mqfstype_parent: entry.d_type = DT_DIR; break; case mqfstype_file: entry.d_type = DT_REG; break; case mqfstype_symlink: entry.d_type = DT_LNK; break; default: panic("%s has unexpected node type: %d", pn->mn_name, pn->mn_type); } if (entry.d_reclen > uio->uio_resid) break; if (offset >= uio->uio_offset) { error = vfs_read_dirent(ap, &entry, offset); if (error) break; } offset += entry.d_reclen; } sx_xunlock(&mi->mi_lock); uio->uio_offset = offset; if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } #ifdef notyet #if 0 struct vop_mkdir_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif /* * Create a directory.
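 *
 * A note not in the original source: mqfs_mkdir() and mqfs_rmdir()
 * below are compiled out under "#ifdef notyet", and the vop_vector at
 * the end of this file maps vop_mkdir and vop_rmdir to VOP_EOPNOTSUPP,
 * so with a mounted mqueuefs (paths here are arbitrary):
 *
 *	mount -t mqueuefs null /mnt/mqueue
 *	mkdir /mnt/mqueue/sub		(fails: Operation not supported)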
*/ static int mqfs_mkdir(struct vop_mkdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd = VTON(ap->a_dvp); struct mqfs_node *pn; int error; if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen, ap->a_vap->cn_cred, ap->a_vap->va_mode); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); if (pn == NULL) { error = ENOSPC; } else { error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); } return (error); } #if 0 struct vop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * Remove a directory. */ static int mqfs_rmdir(struct vop_rmdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn = VTON(ap->a_vp); struct mqfs_node *pt; if (pn->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if (pn->mn_deleted) { sx_xunlock(&mqfs->mi_lock); return (ENOENT); } pt = LIST_FIRST(&pn->mn_children); pt = LIST_NEXT(pt, mn_sibling); pt = LIST_NEXT(pt, mn_sibling); if (pt != NULL) { sx_xunlock(&mqfs->mi_lock); return (ENOTEMPTY); } pt = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); mqnode_release(pn); mqnode_release(pt); sx_xunlock(&mqfs->mi_lock); cache_purge(ap->a_vp); return (0); } #endif /* notyet */ /* * See if this prison root is obsolete, and clean up associated queues if it is. */ static int mqfs_prison_remove(void *obj, void *data __unused) { const struct prison *pr = obj; const struct prison *tpr; struct mqfs_node *pn, *tpn; int found; found = 0; TAILQ_FOREACH(tpr, &allprison, pr_list) { if (tpr->pr_root == pr->pr_root && tpr != pr && tpr->pr_ref > 0) found = 1; } if (!found) { /* * No jails are rooted in this directory anymore, * so no queues should be either. 
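 *
 * (LIST_FOREACH_SAFE is required in the loop below because do_unlink()
 * performs LIST_REMOVE(pn, mn_sibling) on the node being visited; the
 * _SAFE variant caches the successor in tpn before pn is unlinked.)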
*/ sx_xlock(&mqfs_data.mi_lock); LIST_FOREACH_SAFE(pn, &mqfs_data.mi_root->mn_children, mn_sibling, tpn) { if (pn->mn_pr_root == pr->pr_root) (void)do_unlink(pn, curthread->td_ucred); } sx_xunlock(&mqfs_data.mi_lock); } return (0); } /* * Allocate a message queue */ static struct mqueue * mqueue_alloc(const struct mq_attr *attr) { struct mqueue *mq; if (curmq >= maxmq) return (NULL); mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO); TAILQ_INIT(&mq->mq_msgq); if (attr != NULL) { mq->mq_maxmsg = attr->mq_maxmsg; mq->mq_msgsize = attr->mq_msgsize; } else { mq->mq_maxmsg = default_maxmsg; mq->mq_msgsize = default_msgsize; } mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF); knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex); knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex); atomic_add_int(&curmq, 1); return (mq); } /* * Destroy a message queue */ static void mqueue_free(struct mqueue *mq) { struct mqueue_msg *msg; while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) { TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link); free(msg, M_MQUEUEDATA); } mtx_destroy(&mq->mq_mutex); seldrain(&mq->mq_rsel); seldrain(&mq->mq_wsel); knlist_destroy(&mq->mq_rsel.si_note); knlist_destroy(&mq->mq_wsel.si_note); uma_zfree(mqueue_zone, mq); atomic_add_int(&curmq, -1); } /* * Load a message from user space */ static struct mqueue_msg * mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio) { struct mqueue_msg *msg; size_t len; int error; len = sizeof(struct mqueue_msg) + msg_size; msg = malloc(len, M_MQUEUEDATA, M_WAITOK); error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg), msg_size); if (error) { free(msg, M_MQUEUEDATA); msg = NULL; } else { msg->msg_size = msg_size; msg->msg_prio = msg_prio; } return (msg); } /* * Save a message to user space */ static int mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio) { int error; error = copyout(((char *)msg) + sizeof(*msg), msg_ptr, msg->msg_size); if (error == 0 && msg_prio != NULL) error = copyout(&msg->msg_prio, msg_prio, sizeof(int)); return (error); } /* * Free a message's memory */ static __inline void mqueue_freemsg(struct mqueue_msg *msg) { free(msg, M_MQUEUEDATA); } /* * Send a message. If waitok is false, the thread will not be * blocked when there is no room in the queue; otherwise the * absolute timeout is checked.
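 *
 * A worked example of the loop below (not part of the original
 * source): with abs_timeout = now + 1.5s, each iteration recomputes
 *
 *	ts2 = *abs_timeout;
 *	getnanotime(&ts);		(ts = now)
 *	timespecsub(&ts2, &ts);		(ts2 = time remaining)
 *	TIMESPEC_TO_TIMEVAL(&tv, &ts2);
 *	timo = tvtohz(&tv);		(remaining time in ticks)
 *
 * so after every wakeup, including a spurious one, the deadline is
 * re-derived from the absolute time instead of accumulating drift;
 * once the remainder reaches zero the call fails with ETIMEDOUT.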
*/ int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_prio >= MQ_PRIO_MAX) return (EINVAL); if (msg_len > mq->mq_msgsize) return (EMSGSIZE); msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio); if (msg == NULL) return (EFAULT); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_send(mq, msg, -1); if (error) goto bad; return (0); } /* we allow a null timeout (wait forever) */ if (abs_timeout == NULL) { error = _mqueue_send(mq, msg, 0); if (error) goto bad; return (0); } /* send it before checking time */ error = _mqueue_send(mq, msg, -1); if (error == 0) return (0); if (error != EAGAIN) goto bad; if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; goto bad; } for (;;) { ts2 = *abs_timeout; getnanotime(&ts); timespecsub(&ts2, &ts); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; break; } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_send(mq, msg, tvtohz(&tv)); if (error != ETIMEDOUT) break; } if (error == 0) return (0); bad: mqueue_freemsg(msg); return (error); } /* * Common routine to send a message */ static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo) { struct mqueue_msg *msg2; int error = 0; mtx_lock(&mq->mq_mutex); while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_senders++; error = msleep(&mq->mq_senders, &mq->mq_mutex, PCATCH, "mqsend", timo); mq->mq_senders--; if (error == EAGAIN) error = ETIMEDOUT; } if (mq->mq_curmsgs >= mq->mq_maxmsg) { mtx_unlock(&mq->mq_mutex); return (error); } error = 0; if (TAILQ_EMPTY(&mq->mq_msgq)) { TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link); } else { if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) { TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link); } else { TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) { if (msg2->msg_prio < msg->msg_prio) break; } TAILQ_INSERT_BEFORE(msg2, msg, msg_link); } } mq->mq_curmsgs++; mq->mq_totalbytes += msg->msg_size; if (mq->mq_receivers) wakeup_one(&mq->mq_receivers); else if (mq->mq_notifier != NULL) mqueue_send_notification(mq); if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } KNOTE_LOCKED(&mq->mq_rsel.si_note, 0); mtx_unlock(&mq->mq_mutex); return (0); } /* * Send a realtime signal to the process which registered itself * successfully via mq_notify. */ static void mqueue_send_notification(struct mqueue *mq) { struct mqueue_notifier *nt; struct thread *td; struct proc *p; int error; mtx_assert(&mq->mq_mutex, MA_OWNED); nt = mq->mq_notifier; if (nt->nt_sigev.sigev_notify != SIGEV_NONE) { p = nt->nt_proc; error = sigev_findtd(p, &nt->nt_sigev, &td); if (error) { mq->mq_notifier = NULL; return; } if (!KSI_ONQ(&nt->nt_ksi)) { ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev); tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi); } PROC_UNLOCK(p); } mq->mq_notifier = NULL; } /* * Get a message. If waitok is false, the thread will not be * blocked when there is no data in the queue; otherwise the * absolute timeout is checked.
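 *
 * Illustrative userspace sketch (not part of the original source): the
 * receive buffer must be at least mq_msgsize bytes or the call fails
 * with EMSGSIZE, so callers usually size it from mq_getattr(); mqd is
 * assumed to come from a prior mq_open():
 *
 *	#include <mqueue.h>
 *	#include <stdlib.h>
 *	#include <time.h>
 *
 *	struct mq_attr a;
 *	unsigned prio;
 *	struct timespec abs;
 *	mq_getattr(mqd, &a);
 *	char *buf = malloc(a.mq_msgsize);
 *	clock_gettime(CLOCK_REALTIME, &abs);
 *	abs.tv_sec += 2;		(absolute, not relative, deadline)
 *	ssize_t n = mq_timedreceive(mqd, buf, a.mq_msgsize, &prio, &abs);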
*/ int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_len < mq->mq_msgsize) return (EMSGSIZE); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_recv(mq, &msg, -1); if (error) return (error); goto received; } /* we allow a null timeout (wait forever). */ if (abs_timeout == NULL) { error = _mqueue_recv(mq, &msg, 0); if (error) return (error); goto received; } /* try to get a message before checking time */ error = _mqueue_recv(mq, &msg, -1); if (error == 0) goto received; if (error != EAGAIN) return (error); if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; return (error); } for (;;) { ts2 = *abs_timeout; getnanotime(&ts); timespecsub(&ts2, &ts); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; return (error); } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_recv(mq, &msg, tvtohz(&tv)); if (error == 0) break; if (error != ETIMEDOUT) return (error); } received: error = mqueue_savemsg(msg, msg_ptr, msg_prio); if (error == 0) { curthread->td_retval[0] = msg->msg_size; curthread->td_retval[1] = 0; } mqueue_freemsg(msg); return (error); } /* * Common routine to receive a message */ static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo) { int error = 0; mtx_lock(&mq->mq_mutex); while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_receivers++; error = msleep(&mq->mq_receivers, &mq->mq_mutex, PCATCH, "mqrecv", timo); mq->mq_receivers--; if (error == EAGAIN) error = ETIMEDOUT; } if (*msg != NULL) { error = 0; TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link); mq->mq_curmsgs--; mq->mq_totalbytes -= (*msg)->msg_size; if (mq->mq_senders) wakeup_one(&mq->mq_senders); if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } KNOTE_LOCKED(&mq->mq_wsel.si_note, 0); } if (mq->mq_notifier != NULL && mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) { mqueue_send_notification(mq); } mtx_unlock(&mq->mq_mutex); return (error); } static __inline struct mqueue_notifier * notifier_alloc(void) { return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO)); } static __inline void notifier_free(struct mqueue_notifier *p) { uma_zfree(mqnoti_zone, p); } static struct mqueue_notifier * notifier_search(struct proc *p, int fd) { struct mqueue_notifier *nt; LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) { if (nt->nt_ksi.ksi_mqd == fd) break; } return (nt); } static __inline void notifier_insert(struct proc *p, struct mqueue_notifier *nt) { LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link); } static __inline void notifier_delete(struct proc *p, struct mqueue_notifier *nt) { LIST_REMOVE(nt, nt_link); notifier_free(nt); } static void notifier_remove(struct proc *p, struct mqueue *mq, int fd) { struct mqueue_notifier *nt; mtx_assert(&mq->mq_mutex, MA_OWNED); PROC_LOCK(p); nt = notifier_search(p, fd); if (nt != NULL) { if (mq->mq_notifier == nt) mq->mq_notifier = NULL; sigqueue_take(&nt->nt_ksi); notifier_delete(p, nt); } PROC_UNLOCK(p); } static int kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode, const struct mq_attr *attr) { char path[MQFS_NAMELEN + 1]; struct mqfs_node *pn; struct filedesc *fdp; struct file *fp; struct mqueue *mq; int fd, error, len, cmode; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); fdp = td->td_proc->p_fd; cmode = (((mode 
& ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT); mq = NULL; if ((flags & O_CREAT) != 0 && attr != NULL) { if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg) return (EINVAL); if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize) return (EINVAL); } error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); /* * The first character of name must be a slash (/) character * and the remaining characters of name cannot include any slash * characters. */ len = strlen(path); if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL) return (EINVAL); AUDIT_ARG_UPATH1_CANON(path); error = falloc(td, &fp, &fd, O_CLOEXEC); if (error) return (error); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred); if (pn == NULL) { if (!(flags & O_CREAT)) { error = ENOENT; } else { mq = mqueue_alloc(attr); if (mq == NULL) { error = ENFILE; } else { pn = mqfs_create_file(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred, cmode); if (pn == NULL) { error = ENOSPC; mqueue_free(mq); } } } if (error == 0) { pn->mn_data = mq; } } else { if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) { error = EEXIST; } else { accmode_t accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, accmode, td->td_ucred, NULL); } } if (error) { sx_xunlock(&mqfs_data.mi_lock); fdclose(td, fp, fd); fdrop(fp, td); return (error); } mqnode_addref(pn); sx_xunlock(&mqfs_data.mi_lock); finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn, &mqueueops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* * Syscall to open a message queue. */ int sys_kmq_open(struct thread *td, struct kmq_open_args *uap) { struct mq_attr attr; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error) return (error); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } /* * Syscall to unlink a message queue. 
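 *
 * The same name rule as kern_kmq_open() applies: a leading slash and no
 * other slash. For example (queue names hypothetical):
 *
 *	mq_unlink("/myq")	ok, if the queue exists
 *	mq_unlink("myq")	EINVAL, no leading slash
 *	mq_unlink("/a/b")	EINVAL, embedded slash
 *	mq_unlink("/absent")	ENOENT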
*/ int sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap) { char path[MQFS_NAMELEN+1]; struct mqfs_node *pn; int error, len; error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); len = strlen(path); if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL) return (EINVAL); AUDIT_ARG_UPATH1_CANON(path); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred); if (pn != NULL) error = do_unlink(pn, td->td_ucred); else error = ENOENT; sx_xunlock(&mqfs_data.mi_lock); return (error); } typedef int (*_fgetf)(struct thread *, int, cap_rights_t *, struct file **); /* * Get message queue by giving file slot */ static int _getmq(struct thread *td, int fd, cap_rights_t *rightsp, _fgetf func, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { struct mqfs_node *pn; int error; error = func(td, fd, rightsp, fpp); if (error) return (error); if (&mqueueops != (*fpp)->f_ops) { fdrop(*fpp, td); return (EBADF); } pn = (*fpp)->f_data; if (ppn) *ppn = pn; if (pmq) *pmq = pn->mn_data; return (0); } static __inline int getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { cap_rights_t rights; return _getmq(td, fd, cap_rights_init(&rights, CAP_EVENT), fget, fpp, ppn, pmq); } static __inline int getmq_read(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { cap_rights_t rights; return _getmq(td, fd, cap_rights_init(&rights, CAP_READ), fget_read, fpp, ppn, pmq); } static __inline int getmq_write(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { cap_rights_t rights; return _getmq(td, fd, cap_rights_init(&rights, CAP_WRITE), fget_write, fpp, ppn, pmq); } static int kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr, struct mq_attr *oattr) { struct mqueue *mq; struct file *fp; u_int oflag, flag; int error; AUDIT_ARG_FD(mqd); if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0) return (EINVAL); error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); oattr->mq_maxmsg = mq->mq_maxmsg; oattr->mq_msgsize = mq->mq_msgsize; oattr->mq_curmsgs = mq->mq_curmsgs; if (attr != NULL) { do { oflag = flag = fp->f_flag; flag &= ~O_NONBLOCK; flag |= (attr->mq_flags & O_NONBLOCK); } while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0); } else oflag = fp->f_flag; oattr->mq_flags = (O_NONBLOCK & oflag); fdrop(fp, td); return (error); } int sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap) { struct mq_attr attr, oattr; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error != 0) return (error); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? 
&attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { bzero(oattr.__reserved, sizeof(oattr.__reserved)); error = copyout(&oattr, uap->oattr, sizeof(oattr)); } return (error); } int sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error; int waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) return (error); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } int sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error, waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) return (error); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } static int kern_kmq_notify(struct thread *td, int mqd, struct sigevent *sigev) { #ifdef CAPABILITIES cap_rights_t rights; #endif struct filedesc *fdp; struct proc *p; struct mqueue *mq; struct file *fp, *fp2; struct mqueue_notifier *nt, *newnt = NULL; int error; AUDIT_ARG_FD(mqd); if (sigev != NULL) { if (sigev->sigev_notify != SIGEV_SIGNAL && sigev->sigev_notify != SIGEV_THREAD_ID && sigev->sigev_notify != SIGEV_NONE) return (EINVAL); if ((sigev->sigev_notify == SIGEV_SIGNAL || sigev->sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(sigev->sigev_signo)) return (EINVAL); } p = td->td_proc; fdp = td->td_proc->p_fd; error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); again: FILEDESC_SLOCK(fdp); fp2 = fget_locked(fdp, mqd); if (fp2 == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } #ifdef CAPABILITIES error = cap_check(cap_rights(fdp, mqd), cap_rights_init(&rights, CAP_EVENT)); if (error) { FILEDESC_SUNLOCK(fdp); goto out; } #endif if (fp2 != fp) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } mtx_lock(&mq->mq_mutex); FILEDESC_SUNLOCK(fdp); if (sigev != NULL) { if (mq->mq_notifier != NULL) { error = EBUSY; } else { PROC_LOCK(p); nt = notifier_search(p, mqd); if (nt == NULL) { if (newnt == NULL) { PROC_UNLOCK(p); mtx_unlock(&mq->mq_mutex); newnt = notifier_alloc(); goto again; } } if (nt != NULL) { sigqueue_take(&nt->nt_ksi); if (newnt != NULL) { notifier_free(newnt); newnt = NULL; } } else { nt = newnt; newnt = NULL; ksiginfo_init(&nt->nt_ksi); nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT; nt->nt_ksi.ksi_code = SI_MESGQ; nt->nt_proc = p; nt->nt_ksi.ksi_mqd = mqd; notifier_insert(p, nt); } nt->nt_sigev = *sigev; mq->mq_notifier = nt; PROC_UNLOCK(p); /* * If there are no receivers and the message queue * is not empty, we should send the notification * as soon as possible.
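 *
 * Illustrative userspace sketch (not part of the original source; mqd
 * is assumed to come from mq_open()): a process registers with
 * mq_notify(), and only one registration per queue is allowed, so a
 * second registration fails with EBUSY; passing NULL removes this
 * process's registration:
 *
 *	#include <err.h>
 *	#include <mqueue.h>
 *	#include <signal.h>
 *
 *	struct sigevent se = {
 *		.sigev_notify = SIGEV_SIGNAL,
 *		.sigev_signo = SIGUSR1,
 *	};
 *	if (mq_notify(mqd, &se) == -1)
 *		err(1, "mq_notify");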
*/ if (mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) mqueue_send_notification(mq); } } else { notifier_remove(p, mq, mqd); } mtx_unlock(&mq->mq_mutex); out: fdrop(fp, td); if (newnt != NULL) notifier_free(newnt); return (error); } int sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap) { struct sigevent ev, *evp; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev, sizeof(ev)); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static void mqueue_fdclose(struct thread *td, int fd, struct file *fp) { struct filedesc *fdp; struct mqueue *mq; fdp = td->td_proc->p_fd; FILEDESC_LOCK_ASSERT(fdp); if (fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(td->td_proc, mq, fd); /* have to wakeup thread in same process */ if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } mtx_unlock(&mq->mq_mutex); } } static void mq_proc_exit(void *arg __unused, struct proc *p) { struct filedesc *fdp; struct file *fp; struct mqueue *mq; int i; fdp = p->p_fd; FILEDESC_SLOCK(fdp); for (i = 0; i < fdp->fd_nfiles; ++i) { fp = fget_locked(fdp, i); if (fp != NULL && fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(p, FPTOMQ(fp), i); mtx_unlock(&mq->mq_mutex); } } FILEDESC_SUNLOCK(fdp); KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left")); } static int mqf_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct mqueue *mq = FPTOMQ(fp); int revents = 0; mtx_lock(&mq->mq_mutex); if (events & (POLLIN | POLLRDNORM)) { if (mq->mq_curmsgs) { revents |= events & (POLLIN | POLLRDNORM); } else { mq->mq_flags |= MQ_RSEL; selrecord(td, &mq->mq_rsel); } } if (events & POLLOUT) { if (mq->mq_curmsgs < mq->mq_maxmsg) revents |= POLLOUT; else { mq->mq_flags |= MQ_WSEL; selrecord(td, &mq->mq_wsel); } } mtx_unlock(&mq->mq_mutex); return (revents); } static int mqf_close(struct file *fp, struct thread *td) { struct mqfs_node *pn; fp->f_ops = &badfileops; pn = fp->f_data; fp->f_data = NULL; sx_xlock(&mqfs_data.mi_lock); mqnode_release(pn); sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn = fp->f_data; bzero(st, sizeof *st); sx_xlock(&mqfs_data.mi_lock); st->st_atim = pn->mn_atime; st->st_mtim = pn->mn_mtime; st->st_ctim = pn->mn_ctime; st->st_birthtim = pn->mn_birth; st->st_uid = pn->mn_uid; st->st_gid = pn->mn_gid; st->st_mode = S_IFIFO | pn->mn_mode; sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, VADMIN, active_cred, NULL); if (error != 0) goto out; pn->mn_mode = mode & ACCESSPERMS; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); if (uid == (uid_t)-1) uid = pn->mn_uid; if (gid == (gid_t)-1) gid = pn->mn_gid; if (((uid != pn->mn_uid && uid != active_cred->cr_uid) || (gid != pn->mn_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, 
PRIV_VFS_CHOWN, 0))) goto out; pn->mn_uid = uid; pn->mn_gid = gid; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_kqfilter(struct file *fp, struct knote *kn) { struct mqueue *mq = FPTOMQ(fp); int error = 0; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &mq_rfiltops; knlist_add(&mq->mq_rsel.si_note, kn, 0); } else if (kn->kn_filter == EVFILT_WRITE) { kn->kn_fop = &mq_wfiltops; knlist_add(&mq->mq_wsel.si_note, kn, 0); } else error = EINVAL; return (error); } static void filt_mqdetach(struct knote *kn) { struct mqueue *mq = FPTOMQ(kn->kn_fp); if (kn->kn_filter == EVFILT_READ) knlist_remove(&mq->mq_rsel.si_note, kn, 0); else if (kn->kn_filter == EVFILT_WRITE) knlist_remove(&mq->mq_wsel.si_note, kn, 0); else panic("filt_mqdetach"); } static int filt_mqread(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs != 0); } static int filt_mqwrite(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs < mq->mq_maxmsg); } static int mqf_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_MQUEUE; return (0); } static struct fileops mqueueops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = mqf_poll, .fo_kqfilter = mqf_kqfilter, .fo_stat = mqf_stat, .fo_close = mqf_close, .fo_chmod = mqf_chmod, .fo_chown = mqf_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = mqf_fill_kinfo, }; static struct vop_vector mqfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = mqfs_access, .vop_cachedlookup = mqfs_lookup, .vop_lookup = vfs_cache_lookup, .vop_reclaim = mqfs_reclaim, .vop_create = mqfs_create, .vop_remove = mqfs_remove, .vop_inactive = mqfs_inactive, .vop_open = mqfs_open, .vop_close = mqfs_close, .vop_getattr = mqfs_getattr, .vop_setattr = mqfs_setattr, .vop_read = mqfs_read, .vop_write = VOP_EOPNOTSUPP, .vop_readdir = mqfs_readdir, .vop_mkdir = VOP_EOPNOTSUPP, .vop_rmdir = VOP_EOPNOTSUPP }; static struct vfsops mqfs_vfsops = { .vfs_init = mqfs_init, .vfs_uninit = mqfs_uninit, .vfs_mount = mqfs_mount, .vfs_unmount = mqfs_unmount, .vfs_root = mqfs_root, .vfs_statfs = mqfs_statfs, }; static struct vfsconf mqueuefs_vfsconf = { .vfc_version = VFS_VERSION, .vfc_name = "mqueuefs", .vfc_vfsops = &mqfs_vfsops, .vfc_typenum = -1, .vfc_flags = VFCF_SYNTHETIC }; static struct syscall_helper_data mq_syscalls[] = { SYSCALL_INIT_HELPER(kmq_open), SYSCALL_INIT_HELPER_F(kmq_setattr, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedsend, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedreceive, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_notify, SYF_CAPENABLED), SYSCALL_INIT_HELPER(kmq_unlink), SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include static void mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } static void mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } int freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap) { struct mq_attr attr; struct mq_attr32 attr32; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return 
(EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error) return (error); mq_attr_from32(&attr32, &attr); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } int freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap) { struct mq_attr attr, oattr; struct mq_attr32 attr32, oattr32; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error != 0) return (error); mq_attr_from32(&attr32, &attr); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { mq_attr_to32(&oattr, &oattr32); bzero(oattr32.__reserved, sizeof(oattr32.__reserved)); error = copyout(&oattr32, uap->oattr, sizeof(oattr32)); } return (error); } int freebsd32_kmq_timedsend(struct thread *td, struct freebsd32_kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error; int waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) return (error); CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } int freebsd32_kmq_timedreceive(struct thread *td, struct freebsd32_kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error, waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) return (error); CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } int freebsd32_kmq_notify(struct thread *td, struct freebsd32_kmq_notify_args *uap) { struct sigevent ev, *evp; struct sigevent32 ev32; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev32, sizeof(ev32)); if (error != 0) return (error); error = convert_sigevent32(&ev32, &ev); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static struct syscall_helper_data mq32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_kmq_open), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_setattr, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedsend, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedreceive, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_notify, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink), SYSCALL_INIT_LAST }; #endif static int mqinit(void) { int error; error = syscall_helper_register(mq_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(mq32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int mqunload(void) { #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(mq32_syscalls); #endif syscall_helper_unregister(mq_syscalls); return (0); } static int 
mq_modload(struct module *module, int cmd, void *arg) { int error = 0; error = vfs_modevent(module, cmd, arg); if (error != 0) return (error); switch (cmd) { case MOD_LOAD: error = mqinit(); if (error != 0) mqunload(); break; case MOD_UNLOAD: error = mqunload(); break; default: break; } return (error); } static moduledata_t mqueuefs_mod = { "mqueuefs", mq_modload, &mqueuefs_vfsconf }; DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE); MODULE_VERSION(mqueuefs, 1); Index: head/sys/kern/uipc_sem.c =================================================================== --- head/sys/kern/uipc_sem.c (revision 326270) +++ head/sys/kern/uipc_sem.c (revision 326271) @@ -1,1111 +1,1113 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002 Alfred Perlstein * Copyright (c) 2003-2005 SPARTA, Inc. * Copyright (c) 2005, 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_posix.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(p1003_1b_semaphores, "POSIX P1003.1B semaphores support"); /* * TODO * * - Resource limits? * - Replace global sem_lock with mtx_pool locks? * - Add a MAC check_create() hook for creating new named semaphores. 
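 *
 * For orientation, a hedged sketch of the userland round trip these
 * syscalls ultimately serve; the name "/example" is an arbitrary
 * illustration, not anything this file defines:
 *
 *	#include <err.h>
 *	#include <fcntl.h>
 *	#include <semaphore.h>
 *
 *	int
 *	main(void)
 *	{
 *		sem_t *sem;
 *
 *		sem = sem_open("/example", O_CREAT, 0600, 1);
 *		if (sem == SEM_FAILED)
 *			err(1, "sem_open");
 *		if (sem_wait(sem) == -1)
 *			err(1, "sem_wait");
 *		if (sem_post(sem) == -1)
 *			err(1, "sem_post");
 *		sem_close(sem);
 *		sem_unlink("/example");
 *		return (0);
 *	}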
*/ #ifndef SEM_MAX #define SEM_MAX 30 #endif #ifdef SEM_DEBUG #define DP(x) printf x #else #define DP(x) #endif struct ksem_mapping { char *km_path; Fnv32_t km_fnv; struct ksem *km_ksem; LIST_ENTRY(ksem_mapping) km_link; }; static MALLOC_DEFINE(M_KSEM, "ksem", "semaphore file descriptor"); static LIST_HEAD(, ksem_mapping) *ksem_dictionary; static struct sx ksem_dict_lock; static struct mtx ksem_count_lock; static struct mtx sem_lock; static u_long ksem_hash; static int ksem_dead; #define KSEM_HASH(fnv) (&ksem_dictionary[(fnv) & ksem_hash]) static int nsems = 0; SYSCTL_DECL(_p1003_1b); SYSCTL_INT(_p1003_1b, OID_AUTO, nsems, CTLFLAG_RD, &nsems, 0, "Number of active kernel POSIX semaphores"); static int kern_sem_wait(struct thread *td, semid_t id, int tryflag, struct timespec *abstime); static int ksem_access(struct ksem *ks, struct ucred *ucred); static struct ksem *ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value); static int ksem_create(struct thread *td, const char *path, semid_t *semidp, mode_t mode, unsigned int value, int flags, int compat32); static void ksem_drop(struct ksem *ks); static int ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp, struct file **fpp); static struct ksem *ksem_hold(struct ksem *ks); static void ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks); static struct ksem *ksem_lookup(char *path, Fnv32_t fnv); static void ksem_module_destroy(void); static int ksem_module_init(void); static int ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred); static int sem_modload(struct module *module, int cmd, void *arg); static fo_stat_t ksem_stat; static fo_close_t ksem_closef; static fo_chmod_t ksem_chmod; static fo_chown_t ksem_chown; static fo_fill_kinfo_t ksem_fill_kinfo; /* File descriptor operations. */ static struct fileops ksem_ops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = invfo_poll, .fo_kqfilter = invfo_kqfilter, .fo_stat = ksem_stat, .fo_close = ksem_closef, .fo_chmod = ksem_chmod, .fo_chown = ksem_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = ksem_fill_kinfo, .fo_flags = DFLAG_PASSABLE }; FEATURE(posix_sem, "POSIX semaphores"); static int ksem_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct ksem *ks; #ifdef MAC int error; #endif ks = fp->f_data; #ifdef MAC error = mac_posixsem_check_stat(active_cred, fp->f_cred, ks); if (error) return (error); #endif /* * Attempt to return sanish values for fstat() on a semaphore * file descriptor. 
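 *
 * Only the fields set below carry meaning; st_size and st_blocks stay
 * zero.  A hedged caller sketch, relying on the fact that in this
 * implementation the semid_t copied out to userland is also a file
 * descriptor (the helper name is illustrative):
 *
 *	#include <sys/stat.h>
 *	#include <stdio.h>
 *
 *	static void
 *	show_sem_owner(int semid)
 *	{
 *		struct stat sb;
 *
 *		if (fstat(semid, &sb) == 0)
 *			printf("uid %d mode %o\n", (int)sb.st_uid,
 *			    (unsigned)(sb.st_mode & 0777));
 *	}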
*/ bzero(sb, sizeof(*sb)); mtx_lock(&sem_lock); sb->st_atim = ks->ks_atime; sb->st_ctim = ks->ks_ctime; sb->st_mtim = ks->ks_mtime; sb->st_birthtim = ks->ks_birthtime; sb->st_uid = ks->ks_uid; sb->st_gid = ks->ks_gid; sb->st_mode = S_IFREG | ks->ks_mode; /* XXX */ mtx_unlock(&sem_lock); return (0); } static int ksem_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct ksem *ks; int error; error = 0; ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_setmode(active_cred, ks, mode); if (error != 0) goto out; #endif error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VADMIN, active_cred, NULL); if (error != 0) goto out; ks->ks_mode = mode & ACCESSPERMS; out: mtx_unlock(&sem_lock); return (error); } static int ksem_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct ksem *ks; int error; error = 0; ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_setowner(active_cred, ks, uid, gid); if (error != 0) goto out; #endif if (uid == (uid_t)-1) uid = ks->ks_uid; if (gid == (gid_t)-1) gid = ks->ks_gid; if (((uid != ks->ks_uid && uid != active_cred->cr_uid) || (gid != ks->ks_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0))) goto out; ks->ks_uid = uid; ks->ks_gid = gid; out: mtx_unlock(&sem_lock); return (error); } static int ksem_closef(struct file *fp, struct thread *td) { struct ksem *ks; ks = fp->f_data; fp->f_data = NULL; ksem_drop(ks); return (0); } static int ksem_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { const char *path, *pr_path; struct ksem *ks; size_t pr_pathlen; kif->kf_type = KF_TYPE_SEM; ks = fp->f_data; mtx_lock(&sem_lock); kif->kf_un.kf_sem.kf_sem_value = ks->ks_value; kif->kf_un.kf_sem.kf_sem_mode = S_IFREG | ks->ks_mode; /* XXX */ mtx_unlock(&sem_lock); if (ks->ks_path != NULL) { sx_slock(&ksem_dict_lock); if (ks->ks_path != NULL) { path = ks->ks_path; pr_path = curthread->td_ucred->cr_prison->pr_path; if (strcmp(pr_path, "/") != 0) { /* Return the jail-rooted pathname. */ pr_pathlen = strlen(pr_path); if (strncmp(path, pr_path, pr_pathlen) == 0 && path[pr_pathlen] == '/') path += pr_pathlen; } strlcpy(kif->kf_path, path, sizeof(kif->kf_path)); } sx_sunlock(&ksem_dict_lock); } return (0); } /* * ksem object management including creation and reference counting * routines. */ static struct ksem * ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value) { struct ksem *ks; mtx_lock(&ksem_count_lock); if (nsems == p31b_getcfg(CTL_P1003_1B_SEM_NSEMS_MAX) || ksem_dead) { mtx_unlock(&ksem_count_lock); return (NULL); } nsems++; mtx_unlock(&ksem_count_lock); ks = malloc(sizeof(*ks), M_KSEM, M_WAITOK | M_ZERO); ks->ks_uid = ucred->cr_uid; ks->ks_gid = ucred->cr_gid; ks->ks_mode = mode; ks->ks_value = value; cv_init(&ks->ks_cv, "ksem"); vfs_timestamp(&ks->ks_birthtime); ks->ks_atime = ks->ks_mtime = ks->ks_ctime = ks->ks_birthtime; refcount_init(&ks->ks_ref, 1); #ifdef MAC mac_posixsem_init(ks); mac_posixsem_create(ucred, ks); #endif return (ks); } static struct ksem * ksem_hold(struct ksem *ks) { refcount_acquire(&ks->ks_ref); return (ks); } static void ksem_drop(struct ksem *ks) { if (refcount_release(&ks->ks_ref)) { #ifdef MAC mac_posixsem_destroy(ks); #endif cv_destroy(&ks->ks_cv); free(ks, M_KSEM); mtx_lock(&ksem_count_lock); nsems--; mtx_unlock(&ksem_count_lock); } } /* * Determine if the credentials have sufficient permissions for read * and write access. 
*/ static int ksem_access(struct ksem *ks, struct ucred *ucred) { int error; error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VREAD | VWRITE, ucred, NULL); if (error) error = priv_check_cred(ucred, PRIV_SEM_WRITE, 0); return (error); } /* * Dictionary management. We maintain an in-kernel dictionary to map * paths to semaphore objects. We use the FNV hash on the path to * store the mappings in a hash table. */ static struct ksem * ksem_lookup(char *path, Fnv32_t fnv) { struct ksem_mapping *map; LIST_FOREACH(map, KSEM_HASH(fnv), km_link) { if (map->km_fnv != fnv) continue; if (strcmp(map->km_path, path) == 0) return (map->km_ksem); } return (NULL); } static void ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks) { struct ksem_mapping *map; map = malloc(sizeof(struct ksem_mapping), M_KSEM, M_WAITOK); map->km_path = path; map->km_fnv = fnv; map->km_ksem = ksem_hold(ks); ks->ks_path = path; LIST_INSERT_HEAD(KSEM_HASH(fnv), map, km_link); } static int ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred) { struct ksem_mapping *map; int error; LIST_FOREACH(map, KSEM_HASH(fnv), km_link) { if (map->km_fnv != fnv) continue; if (strcmp(map->km_path, path) == 0) { #ifdef MAC error = mac_posixsem_check_unlink(ucred, map->km_ksem); if (error) return (error); #endif error = ksem_access(map->km_ksem, ucred); if (error) return (error); map->km_ksem->ks_path = NULL; LIST_REMOVE(map, km_link); ksem_drop(map->km_ksem); free(map->km_path, M_KSEM); free(map, M_KSEM); return (0); } } return (ENOENT); } static int ksem_create_copyout_semid(struct thread *td, semid_t *semidp, int fd, int compat32) { semid_t semid; #ifdef COMPAT_FREEBSD32 int32_t semid32; #endif void *ptr; size_t ptrs; #ifdef COMPAT_FREEBSD32 if (compat32) { semid32 = fd; ptr = &semid32; ptrs = sizeof(semid32); } else { #endif semid = fd; ptr = &semid; ptrs = sizeof(semid); compat32 = 0; /* silence gcc */ #ifdef COMPAT_FREEBSD32 } #endif return (copyout(ptr, semidp, ptrs)); } /* Other helper routines. */ static int ksem_create(struct thread *td, const char *name, semid_t *semidp, mode_t mode, unsigned int value, int flags, int compat32) { struct filedesc *fdp; struct ksem *ks; struct file *fp; char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; int error, fd; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); AUDIT_ARG_VALUE(value); if (value > SEM_VALUE_MAX) return (EINVAL); fdp = td->td_proc->p_fd; mode = (mode & ~fdp->fd_cmask) & ACCESSPERMS; error = falloc(td, &fp, &fd, O_CLOEXEC); if (error) { if (name == NULL) error = ENOSPC; return (error); } /* * Go ahead and copyout the file descriptor now. This is a bit * premature, but it is a lot easier to handle errors as opposed * to later when we've possibly created a new semaphore, etc. */ error = ksem_create_copyout_semid(td, semidp, fd, compat32); if (error) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } if (name == NULL) { /* Create an anonymous semaphore. */ ks = ksem_alloc(td->td_ucred, mode, value); if (ks == NULL) error = ENOSPC; else ks->ks_flags |= KS_ANONYMOUS; } else { path = malloc(MAXPATHLEN, M_KSEM, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; /* Construct a full pathname for jailed callers. */ pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(name, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); /* Require paths to start with a '/' character. 
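 *
 * The check runs after the jail prefix has been prepended, so jailed
 * and unjailed callers see the same rule.  Illustrative consequence
 * (hypothetical names):
 *
 *	sem_open("/worker", O_CREAT, 0600, 0);	succeeds
 *	sem_open("worker", O_CREAT, 0600, 0);	fails with EINVAL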
*/ if (error == 0 && path[pr_pathlen] != '/') error = EINVAL; if (error) { fdclose(td, fp, fd); fdrop(fp, td); free(path, M_KSEM); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&ksem_dict_lock); ks = ksem_lookup(path, fnv); if (ks == NULL) { /* Object does not exist, create it if requested. */ if (flags & O_CREAT) { ks = ksem_alloc(td->td_ucred, mode, value); if (ks == NULL) error = ENFILE; else { ksem_insert(path, fnv, ks); path = NULL; } } else error = ENOENT; } else { /* * Object already exists, obtain a new * reference if requested and permitted. */ if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; else { #ifdef MAC error = mac_posixsem_check_open(td->td_ucred, ks); if (error == 0) #endif error = ksem_access(ks, td->td_ucred); } if (error == 0) ksem_hold(ks); #ifdef INVARIANTS else ks = NULL; #endif } sx_xunlock(&ksem_dict_lock); if (path) free(path, M_KSEM); } if (error) { KASSERT(ks == NULL, ("ksem_create error with a ksem")); fdclose(td, fp, fd); fdrop(fp, td); return (error); } KASSERT(ks != NULL, ("ksem_create w/o a ksem")); finit(fp, FREAD | FWRITE, DTYPE_SEM, ks, &ksem_ops); fdrop(fp, td); return (0); } static int ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp, struct file **fpp) { struct ksem *ks; struct file *fp; int error; error = fget(td, id, rightsp, &fp); if (error) return (EINVAL); if (fp->f_type != DTYPE_SEM) { fdrop(fp, td); return (EINVAL); } ks = fp->f_data; if (ks->ks_flags & KS_DEAD) { fdrop(fp, td); return (EINVAL); } *fpp = fp; return (0); } /* System calls. */ #ifndef _SYS_SYSPROTO_H_ struct ksem_init_args { unsigned int value; semid_t *idp; }; #endif int sys_ksem_init(struct thread *td, struct ksem_init_args *uap) { return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value, 0, 0)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_open_args { char *name; int oflag; mode_t mode; unsigned int value; semid_t *idp; }; #endif int sys_ksem_open(struct thread *td, struct ksem_open_args *uap) { DP((">>> ksem_open start, pid=%d\n", (int)td->td_proc->p_pid)); if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0) return (EINVAL); return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value, uap->oflag, 0)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_unlink_args { char *name; }; #endif int sys_ksem_unlink(struct thread *td, struct ksem_unlink_args *uap) { char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; int error; path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(uap->name, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); if (error) { free(path, M_TEMP); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&ksem_dict_lock); error = ksem_remove(path, fnv, td->td_ucred); sx_xunlock(&ksem_dict_lock); free(path, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_close_args { semid_t id; }; #endif int sys_ksem_close(struct thread *td, struct ksem_close_args *uap) { cap_rights_t rights; struct ksem *ks; struct file *fp; int error; /* No capability rights required to close a semaphore. 
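 *
 * In Capsicum terms: close(2) works even on a descriptor whose rights
 * set has been emptied, while ksem_post() on the same descriptor would
 * fail for lack of CAP_SEM_POST.  A hedged sketch (the
 * cap_rights_limit(2) usage is illustrative):
 *
 *	cap_rights_t rights;
 *
 *	cap_rights_init(&rights);		empty rights set
 *	cap_rights_limit(semid, &rights);
 *	close(semid);				still succeeds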
*/ AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, cap_rights_init(&rights), &fp); if (error) return (error); ks = fp->f_data; if (ks->ks_flags & KS_ANONYMOUS) { fdrop(fp, td); return (EINVAL); } error = kern_close(td, uap->id); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_post_args { semid_t id; }; #endif int sys_ksem_post(struct thread *td, struct ksem_post_args *uap) { cap_rights_t rights; struct file *fp; struct ksem *ks; int error; AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, cap_rights_init(&rights, CAP_SEM_POST), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_post(td->td_ucred, fp->f_cred, ks); if (error) goto err; #endif if (ks->ks_value == SEM_VALUE_MAX) { error = EOVERFLOW; goto err; } ++ks->ks_value; if (ks->ks_waiters > 0) cv_signal(&ks->ks_cv); error = 0; vfs_timestamp(&ks->ks_ctime); err: mtx_unlock(&sem_lock); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_wait_args { semid_t id; }; #endif int sys_ksem_wait(struct thread *td, struct ksem_wait_args *uap) { return (kern_sem_wait(td, uap->id, 0, NULL)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_timedwait_args { semid_t id; const struct timespec *abstime; }; #endif int sys_ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap) { struct timespec abstime; struct timespec *ts; int error; /* * We allow a null timespec (wait forever). */ if (uap->abstime == NULL) ts = NULL; else { error = copyin(uap->abstime, &abstime, sizeof(abstime)); if (error != 0) return (error); if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0) return (EINVAL); ts = &abstime; } return (kern_sem_wait(td, uap->id, 0, ts)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_trywait_args { semid_t id; }; #endif int sys_ksem_trywait(struct thread *td, struct ksem_trywait_args *uap) { return (kern_sem_wait(td, uap->id, 1, NULL)); } static int kern_sem_wait(struct thread *td, semid_t id, int tryflag, struct timespec *abstime) { struct timespec ts1, ts2; struct timeval tv; cap_rights_t rights; struct file *fp; struct ksem *ks; int error; DP((">>> kern_sem_wait entered! pid=%d\n", (int)td->td_proc->p_pid)); AUDIT_ARG_FD(id); error = ksem_get(td, id, cap_rights_init(&rights, CAP_SEM_WAIT), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); DP((">>> kern_sem_wait critical section entered! 
pid=%d\n", (int)td->td_proc->p_pid)); #ifdef MAC error = mac_posixsem_check_wait(td->td_ucred, fp->f_cred, ks); if (error) { DP(("kern_sem_wait mac failed\n")); goto err; } #endif DP(("kern_sem_wait value = %d, tryflag %d\n", ks->ks_value, tryflag)); vfs_timestamp(&ks->ks_atime); while (ks->ks_value == 0) { ks->ks_waiters++; if (tryflag != 0) error = EAGAIN; else if (abstime == NULL) error = cv_wait_sig(&ks->ks_cv, &sem_lock); else { for (;;) { ts1 = *abstime; getnanotime(&ts2); timespecsub(&ts1, &ts2); TIMESPEC_TO_TIMEVAL(&tv, &ts1); if (tv.tv_sec < 0) { error = ETIMEDOUT; break; } error = cv_timedwait_sig(&ks->ks_cv, &sem_lock, tvtohz(&tv)); if (error != EWOULDBLOCK) break; } } ks->ks_waiters--; if (error) goto err; } ks->ks_value--; DP(("kern_sem_wait value post-decrement = %d\n", ks->ks_value)); error = 0; err: mtx_unlock(&sem_lock); fdrop(fp, td); DP(("<<< kern_sem_wait leaving, pid=%d, error = %d\n", (int)td->td_proc->p_pid, error)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_getvalue_args { semid_t id; int *val; }; #endif int sys_ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap) { cap_rights_t rights; struct file *fp; struct ksem *ks; int error, val; AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, cap_rights_init(&rights, CAP_SEM_GETVALUE), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_getvalue(td->td_ucred, fp->f_cred, ks); if (error) { mtx_unlock(&sem_lock); fdrop(fp, td); return (error); } #endif val = ks->ks_value; vfs_timestamp(&ks->ks_atime); mtx_unlock(&sem_lock); fdrop(fp, td); error = copyout(&val, uap->val, sizeof(val)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_destroy_args { semid_t id; }; #endif int sys_ksem_destroy(struct thread *td, struct ksem_destroy_args *uap) { cap_rights_t rights; struct file *fp; struct ksem *ks; int error; /* No capability rights required to close a semaphore. */ AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, cap_rights_init(&rights), &fp); if (error) return (error); ks = fp->f_data; if (!(ks->ks_flags & KS_ANONYMOUS)) { fdrop(fp, td); return (EINVAL); } mtx_lock(&sem_lock); if (ks->ks_waiters != 0) { mtx_unlock(&sem_lock); error = EBUSY; goto err; } ks->ks_flags |= KS_DEAD; mtx_unlock(&sem_lock); error = kern_close(td, uap->id); err: fdrop(fp, td); return (error); } static struct syscall_helper_data ksem_syscalls[] = { SYSCALL_INIT_HELPER(ksem_init), SYSCALL_INIT_HELPER(ksem_open), SYSCALL_INIT_HELPER(ksem_unlink), SYSCALL_INIT_HELPER(ksem_close), SYSCALL_INIT_HELPER(ksem_post), SYSCALL_INIT_HELPER(ksem_wait), SYSCALL_INIT_HELPER(ksem_timedwait), SYSCALL_INIT_HELPER(ksem_trywait), SYSCALL_INIT_HELPER(ksem_getvalue), SYSCALL_INIT_HELPER(ksem_destroy), SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include int freebsd32_ksem_init(struct thread *td, struct freebsd32_ksem_init_args *uap) { return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value, 0, 1)); } int freebsd32_ksem_open(struct thread *td, struct freebsd32_ksem_open_args *uap) { if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0) return (EINVAL); return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value, uap->oflag, 1)); } int freebsd32_ksem_timedwait(struct thread *td, struct freebsd32_ksem_timedwait_args *uap) { struct timespec32 abstime32; struct timespec *ts, abstime; int error; /* * We allow a null timespec (wait forever). 
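 *
 * The timeout is an absolute wall-clock deadline, as with
 * sem_timedwait(3).  A hedged userland sketch of constructing one;
 * the two-second budget is an arbitrary example:
 *
 *	#include <err.h>
 *	#include <errno.h>
 *	#include <semaphore.h>
 *	#include <time.h>
 *
 *	struct timespec abstime;
 *
 *	clock_gettime(CLOCK_REALTIME, &abstime);
 *	abstime.tv_sec += 2;
 *	if (sem_timedwait(sem, &abstime) == -1 && errno == ETIMEDOUT)
 *		warnx("semaphore wait timed out");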
*/ if (uap->abstime == NULL) ts = NULL; else { error = copyin(uap->abstime, &abstime32, sizeof(abstime32)); if (error != 0) return (error); CP(abstime32, abstime, tv_sec); CP(abstime32, abstime, tv_nsec); if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0) return (EINVAL); ts = &abstime; } return (kern_sem_wait(td, uap->id, 0, ts)); } static struct syscall_helper_data ksem32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_ksem_init), SYSCALL32_INIT_HELPER(freebsd32_ksem_open), SYSCALL32_INIT_HELPER_COMPAT(ksem_unlink), SYSCALL32_INIT_HELPER_COMPAT(ksem_close), SYSCALL32_INIT_HELPER_COMPAT(ksem_post), SYSCALL32_INIT_HELPER_COMPAT(ksem_wait), SYSCALL32_INIT_HELPER(freebsd32_ksem_timedwait), SYSCALL32_INIT_HELPER_COMPAT(ksem_trywait), SYSCALL32_INIT_HELPER_COMPAT(ksem_getvalue), SYSCALL32_INIT_HELPER_COMPAT(ksem_destroy), SYSCALL_INIT_LAST }; #endif static int ksem_module_init(void) { int error; mtx_init(&sem_lock, "sem", NULL, MTX_DEF); mtx_init(&ksem_count_lock, "ksem count", NULL, MTX_DEF); sx_init(&ksem_dict_lock, "ksem dictionary"); ksem_dictionary = hashinit(1024, M_KSEM, &ksem_hash); p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 200112L); p31b_setcfg(CTL_P1003_1B_SEM_NSEMS_MAX, SEM_MAX); p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX); error = syscall_helper_register(ksem_syscalls, SY_THR_STATIC_KLD); if (error) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(ksem32_syscalls, SY_THR_STATIC_KLD); if (error) return (error); #endif return (0); } static void ksem_module_destroy(void) { #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(ksem32_syscalls); #endif syscall_helper_unregister(ksem_syscalls); p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 0); hashdestroy(ksem_dictionary, M_KSEM, ksem_hash); sx_destroy(&ksem_dict_lock); mtx_destroy(&ksem_count_lock); mtx_destroy(&sem_lock); p31b_unsetcfg(CTL_P1003_1B_SEM_VALUE_MAX); p31b_unsetcfg(CTL_P1003_1B_SEM_NSEMS_MAX); } static int sem_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = ksem_module_init(); if (error) ksem_module_destroy(); break; case MOD_UNLOAD: mtx_lock(&ksem_count_lock); if (nsems != 0) { error = EOPNOTSUPP; mtx_unlock(&ksem_count_lock); break; } ksem_dead = 1; mtx_unlock(&ksem_count_lock); ksem_module_destroy(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sem_mod = { "sem", &sem_modload, NULL }; DECLARE_MODULE(sem, sem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST); MODULE_VERSION(sem, 1); Index: head/sys/kern/uipc_shm.c =================================================================== --- head/sys/kern/uipc_shm.c (revision 326270) +++ head/sys/kern/uipc_shm.c (revision 326271) @@ -1,1119 +1,1121 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Support for shared swap-backed anonymous memory objects via * shm_open(2) and shm_unlink(2). While most of the implementation is * here, vm_mmap.c contains mapping logic changes. * * TODO: * * (1) Need to export data to a userland tool via a sysctl. Should ipcs(1) * and ipcrm(1) be expanded or should new tools to manage both POSIX * kernel semaphores and POSIX shared memory be written? * * (2) Add support for this file type to fstat(1). * * (3) Resource limits? Does this need its own resource limits or are the * existing limits in mmap(2) sufficient? */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct shm_mapping { char *sm_path; Fnv32_t sm_fnv; struct shmfd *sm_shmfd; LIST_ENTRY(shm_mapping) sm_link; }; static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor"); static LIST_HEAD(, shm_mapping) *shm_dictionary; static struct sx shm_dict_lock; static struct mtx shm_timestamp_lock; static u_long shm_hash; static struct unrhdr *shm_ino_unr; static dev_t shm_dev_ino; #define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash]) static void shm_init(void *arg); static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); static fo_rdwr_t shm_read; static fo_rdwr_t shm_write; static fo_truncate_t shm_truncate; static fo_stat_t shm_stat; static fo_close_t shm_close; static fo_chmod_t shm_chmod; static fo_chown_t shm_chown; static fo_seek_t shm_seek; static fo_fill_kinfo_t shm_fill_kinfo; static fo_mmap_t shm_mmap; /* File descriptor operations. 
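 *
 * A hedged userland sketch of the lifecycle these operations serve;
 * the object name and 4096-byte size are arbitrary examples:
 *
 *	#include <sys/mman.h>
 *	#include <err.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		char *p;
 *		int fd;
 *
 *		fd = shm_open("/example", O_RDWR | O_CREAT, 0600);
 *		if (fd == -1)
 *			err(1, "shm_open");
 *		if (ftruncate(fd, 4096) == -1)
 *			err(1, "ftruncate");
 *		p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		    fd, 0);
 *		if (p == MAP_FAILED)
 *			err(1, "mmap");
 *		p[0] = 1;
 *		munmap(p, 4096);
 *		close(fd);
 *		shm_unlink("/example");
 *		return (0);
 *	}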
*/ struct fileops shm_ops = { .fo_read = shm_read, .fo_write = shm_write, .fo_truncate = shm_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = invfo_poll, .fo_kqfilter = invfo_kqfilter, .fo_stat = shm_stat, .fo_close = shm_close, .fo_chmod = shm_chmod, .fo_chown = shm_chown, .fo_sendfile = vn_sendfile, .fo_seek = shm_seek, .fo_fill_kinfo = shm_fill_kinfo, .fo_mmap = shm_mmap, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; FEATURE(posix_shm, "POSIX shared memory"); static int uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) { vm_page_t m; vm_pindex_t idx; size_t tlen; int error, offset, rv; idx = OFF_TO_IDX(uio->uio_offset); offset = uio->uio_offset & PAGE_MASK; tlen = MIN(PAGE_SIZE - offset, len); VM_OBJECT_WLOCK(obj); /* * Read I/O without either a corresponding resident page or swap * page: use zero_region. This is intended to avoid instantiating * pages on read from a sparse region. */ if (uio->uio_rw == UIO_READ && vm_page_lookup(obj, idx) == NULL && !vm_pager_has_page(obj, idx, NULL, NULL)) { VM_OBJECT_WUNLOCK(obj); return (uiomove(__DECONST(void *, zero_region), tlen, uio)); } /* * Parallel reads of the page content from disk are prevented * by exclusive busy. * * Although the tmpfs vnode lock is held here, it is * nonetheless safe to sleep waiting for a free page. The * pageout daemon does not need to acquire the tmpfs vnode * lock to page out tobj's pages because tobj is a OBJT_SWAP * type object. */ m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); if (m->valid != VM_PAGE_BITS_ALL) { vm_page_xbusy(m); if (vm_pager_has_page(obj, idx, NULL, NULL)) { rv = vm_pager_get_pages(obj, &m, 1, NULL, NULL); if (rv != VM_PAGER_OK) { printf( "uiomove_object: vm_obj %p idx %jd valid %x pager error %d\n", obj, idx, m->valid, rv); vm_page_lock(m); vm_page_free(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(obj); return (EIO); } } else vm_page_zero_invalid(m, TRUE); vm_page_xunbusy(m); } vm_page_lock(m); vm_page_hold(m); if (vm_page_active(m)) vm_page_reference(m); else vm_page_activate(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(obj); error = uiomove_fromphys(&m, offset, tlen, uio); if (uio->uio_rw == UIO_WRITE && error == 0) { VM_OBJECT_WLOCK(obj); vm_page_dirty(m); vm_pager_page_unswapped(m); VM_OBJECT_WUNLOCK(obj); } vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); return (error); } int uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio) { ssize_t resid; size_t len; int error; error = 0; while ((resid = uio->uio_resid) > 0) { if (obj_size <= uio->uio_offset) break; len = MIN(obj_size - uio->uio_offset, resid); if (len == 0) break; error = uiomove_object_page(obj, len, uio); if (error != 0 || resid == uio->uio_resid) break; } return (error); } static int shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) { struct shmfd *shmfd; off_t foffset; int error; shmfd = fp->f_data; foffset = foffset_lock(fp, 0); error = 0; switch (whence) { case L_INCR: if (foffset < 0 || (offset > 0 && foffset > OFF_MAX - offset)) { error = EOVERFLOW; break; } offset += foffset; break; case L_XTND: if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) { error = EOVERFLOW; break; } offset += shmfd->shm_size; break; case L_SET: break; default: error = EINVAL; } if (error == 0) { if (offset < 0 || offset > shmfd->shm_size) error = EINVAL; else td->td_uretoff.tdu_off = offset; } foffset_unlock(fp, offset, error != 0 ? 
FOF_NOUPDATE : 0); return (error); } static int shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); if ((flags & FOF_OFFSET) == 0) { rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); } else { rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); } error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif return (shm_dotruncate(shmfd, length)); } static int shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif /* * Attempt to return sanish values for fstat() on a memory file * descriptor. */ bzero(sb, sizeof(*sb)); sb->st_blksize = PAGE_SIZE; sb->st_size = shmfd->shm_size; sb->st_blocks = howmany(sb->st_size, sb->st_blksize); mtx_lock(&shm_timestamp_lock); sb->st_atim = shmfd->shm_atime; sb->st_ctim = shmfd->shm_ctime; sb->st_mtim = shmfd->shm_mtime; sb->st_birthtim = shmfd->shm_birthtime; sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */ sb->st_uid = shmfd->shm_uid; sb->st_gid = shmfd->shm_gid; mtx_unlock(&shm_timestamp_lock); sb->st_dev = shm_dev_ino; sb->st_ino = shmfd->shm_ino; return (0); } static int shm_close(struct file *fp, struct thread *td) { struct shmfd *shmfd; shmfd = fp->f_data; fp->f_data = NULL; shm_drop(shmfd); return (0); } int shm_dotruncate(struct shmfd *shmfd, off_t length) { vm_object_t object; vm_page_t m; vm_pindex_t idx, nobjsize; vm_ooffset_t delta; int base, rv; KASSERT(length >= 0, ("shm_dotruncate: length < 0")); object = shmfd->shm_object; VM_OBJECT_WLOCK(object); if (length == shmfd->shm_size) { VM_OBJECT_WUNLOCK(object); return (0); } nobjsize = OFF_TO_IDX(length + PAGE_MASK); /* Are we shrinking? If so, trim the end. */ if (length < shmfd->shm_size) { /* * Disallow any requests to shrink the size if this * object is mapped into the kernel. */ if (shmfd->shm_kmappings > 0) { VM_OBJECT_WUNLOCK(object); return (EBUSY); } /* * Zero the truncated part of the last page. 
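 *
 * The zeroing matters because a later ftruncate(2) that grows the
 * object again must expose zeros rather than stale data, e.g.
 * (illustrative sizes, assuming 4096-byte pages):
 *
 *	ftruncate(fd, 10);	tail of the first page is zeroed here
 *	ftruncate(fd, 4096);	bytes 10..4095 must now read back as 0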
*/ base = length & PAGE_MASK; if (base != 0) { idx = OFF_TO_IDX(length); retry: m = vm_page_lookup(object, idx); if (m != NULL) { if (vm_page_sleep_if_busy(m, "shmtrc")) goto retry; } else if (vm_pager_has_page(object, idx, NULL, NULL)) { m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL); if (m == NULL) goto retry; rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); vm_page_lock(m); if (rv == VM_PAGER_OK) { /* * Since the page was not resident, * and therefore not recently * accessed, immediately enqueue it * for asynchronous laundering. The * current operation is not regarded * as an access. */ vm_page_launder(m); vm_page_unlock(m); vm_page_xunbusy(m); } else { vm_page_free(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); return (EIO); } } if (m != NULL) { pmap_zero_page_area(m, base, PAGE_SIZE - base); KASSERT(m->valid == VM_PAGE_BITS_ALL, ("shm_dotruncate: page %p is invalid", m)); vm_page_dirty(m); vm_pager_page_unswapped(m); } } delta = IDX_TO_OFF(object->size - nobjsize); /* Toss in memory pages. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, 0); /* Toss pages from swap. */ if (object->type == OBJT_SWAP) swap_pager_freespace(object, nobjsize, delta); /* Free the swap accounted for shm */ swap_release_by_cred(delta, object->cred); object->charge -= delta; } else { /* Try to reserve additional swap space. */ delta = IDX_TO_OFF(nobjsize - object->size); if (!swap_reserve_by_cred(delta, object->cred)) { VM_OBJECT_WUNLOCK(object); return (ENOMEM); } object->charge += delta; } shmfd->shm_size = length; mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_ctime); shmfd->shm_mtime = shmfd->shm_ctime; mtx_unlock(&shm_timestamp_lock); object->size = nobjsize; VM_OBJECT_WUNLOCK(object); return (0); } /* * shmfd object management including creation and reference counting * routines. */ struct shmfd * shm_alloc(struct ucred *ucred, mode_t mode) { struct shmfd *shmfd; int ino; shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO); shmfd->shm_size = 0; shmfd->shm_uid = ucred->cr_uid; shmfd->shm_gid = ucred->cr_gid; shmfd->shm_mode = mode; shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL, shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); shmfd->shm_object->pg_color = 0; VM_OBJECT_WLOCK(shmfd->shm_object); vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING); vm_object_set_flag(shmfd->shm_object, OBJ_COLORED | OBJ_NOSPLIT); VM_OBJECT_WUNLOCK(shmfd->shm_object); vfs_timestamp(&shmfd->shm_birthtime); shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = shmfd->shm_birthtime; ino = alloc_unr(shm_ino_unr); if (ino == -1) shmfd->shm_ino = 0; else shmfd->shm_ino = ino; refcount_init(&shmfd->shm_refs, 1); mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF); rangelock_init(&shmfd->shm_rl); #ifdef MAC mac_posixshm_init(shmfd); mac_posixshm_create(ucred, shmfd); #endif return (shmfd); } struct shmfd * shm_hold(struct shmfd *shmfd) { refcount_acquire(&shmfd->shm_refs); return (shmfd); } void shm_drop(struct shmfd *shmfd) { if (refcount_release(&shmfd->shm_refs)) { #ifdef MAC mac_posixshm_destroy(shmfd); #endif rangelock_destroy(&shmfd->shm_rl); mtx_destroy(&shmfd->shm_mtx); vm_object_deallocate(shmfd->shm_object); if (shmfd->shm_ino != 0) free_unr(shm_ino_unr, shmfd->shm_ino); free(shmfd, M_SHMFD); } } /* * Determine if the credentials have sufficient permissions for a * specified combination of FREAD and FWRITE. 
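 *
 * Illustrative userland mapping of open flags onto this check: an
 * O_RDONLY shm_open(2) requests FREAD only, so read permission on the
 * object suffices, while O_RDWR demands both ("/example" is a
 * placeholder name):
 *
 *	shm_open("/example", O_RDONLY, 0);	needs r-- on the object
 *	shm_open("/example", O_RDWR, 0);	needs rw-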
*/ int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) { accmode_t accmode; int error; accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; mtx_lock(&shm_timestamp_lock); error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, accmode, ucred, NULL); mtx_unlock(&shm_timestamp_lock); return (error); } /* * Dictionary management. We maintain an in-kernel dictionary to map * paths to shmfd objects. We use the FNV hash on the path to store * the mappings in a hash table. */ static void shm_init(void *arg) { mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); sx_init(&shm_dict_lock, "shm dictionary"); shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash); shm_ino_unr = new_unrhdr(1, INT32_MAX, NULL); KASSERT(shm_ino_unr != NULL, ("shm fake inodes not initialized")); shm_dev_ino = devfs_alloc_cdp_inode(); KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized")); } SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL); static struct shmfd * shm_lookup(char *path, Fnv32_t fnv) { struct shm_mapping *map; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) return (map->sm_shmfd); } return (NULL); } static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd) { struct shm_mapping *map; map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK); map->sm_path = path; map->sm_fnv = fnv; map->sm_shmfd = shm_hold(shmfd); shmfd->shm_path = path; LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link); } static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred) { struct shm_mapping *map; int error; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) { #ifdef MAC error = mac_posixshm_check_unlink(ucred, map->sm_shmfd); if (error) return (error); #endif error = shm_access(map->sm_shmfd, ucred, FREAD | FWRITE); if (error) return (error); map->sm_shmfd->shm_path = NULL; LIST_REMOVE(map, sm_link); shm_drop(map->sm_shmfd); free(map->sm_path, M_SHMFD); free(map, M_SHMFD); return (0); } } return (ENOENT); } int kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode, struct filecaps *fcaps) { struct filedesc *fdp; struct shmfd *shmfd; struct file *fp; char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; mode_t cmode; int fd, error; #ifdef CAPABILITY_MODE /* * shm_open(2) is only allowed for anonymous objects. */ if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON)) return (ECAPMODE); #endif AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR) return (EINVAL); if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) return (EINVAL); fdp = td->td_proc->p_fd; cmode = (mode & ~fdp->fd_cmask) & ACCESSPERMS; error = falloc_caps(td, &fp, &fd, O_CLOEXEC, fcaps); if (error) return (error); /* A SHM_ANON path pointer creates an anonymous object. */ if (userpath == SHM_ANON) { /* A read-only anonymous object is pointless. */ if ((flags & O_ACCMODE) == O_RDONLY) { fdclose(td, fp, fd); fdrop(fp, td); return (EINVAL); } shmfd = shm_alloc(td->td_ucred, cmode); } else { path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; /* Construct a full pathname for jailed callers. */ pr_pathlen = strcmp(pr_path, "/") == 0 ? 
0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(userpath, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); #ifdef KTRACE if (error == 0 && KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif /* Require paths to start with a '/' character. */ if (error == 0 && path[pr_pathlen] != '/') error = EINVAL; if (error) { fdclose(td, fp, fd); fdrop(fp, td); free(path, M_SHMFD); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); shmfd = shm_lookup(path, fnv); if (shmfd == NULL) { /* Object does not yet exist, create it if requested. */ if (flags & O_CREAT) { #ifdef MAC error = mac_posixshm_check_create(td->td_ucred, path); if (error == 0) { #endif shmfd = shm_alloc(td->td_ucred, cmode); shm_insert(path, fnv, shmfd); #ifdef MAC } #endif } else { free(path, M_SHMFD); error = ENOENT; } } else { /* * Object already exists, obtain a new * reference if requested and permitted. */ free(path, M_SHMFD); if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; else { #ifdef MAC error = mac_posixshm_check_open(td->td_ucred, shmfd, FFLAGS(flags & O_ACCMODE)); if (error == 0) #endif error = shm_access(shmfd, td->td_ucred, FFLAGS(flags & O_ACCMODE)); } /* * Truncate the file back to zero length if * O_TRUNC was specified and the object was * opened with read/write. */ if (error == 0 && (flags & (O_ACCMODE | O_TRUNC)) == (O_RDWR | O_TRUNC)) { #ifdef MAC error = mac_posixshm_check_truncate( td->td_ucred, fp->f_cred, shmfd); if (error == 0) #endif shm_dotruncate(shmfd, 0); } if (error == 0) shm_hold(shmfd); } sx_xunlock(&shm_dict_lock); if (error) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } } finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* System calls. */ int sys_shm_open(struct thread *td, struct shm_open_args *uap) { return (kern_shm_open(td, uap->path, uap->flags, uap->mode, NULL)); } int sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap) { char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; int error; path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(uap->path, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); if (error) { free(path, M_TEMP); return (error); } #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); error = shm_remove(path, fnv, td->td_ucred); sx_xunlock(&shm_dict_lock); free(path, M_TEMP); return (error); } int shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct shmfd *shmfd; vm_prot_t maxprot; int error; shmfd = fp->f_data; maxprot = VM_PROT_NONE; /* FREAD should always be set. */ if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; /* Don't permit shared writable mappings on read-only descriptors. */ if ((flags & MAP_SHARED) != 0 && (maxprot & VM_PROT_WRITE) == 0 && (prot & VM_PROT_WRITE) != 0) return (EACCES); maxprot &= cap_maxprot; /* See comment in vn_mmap(). 
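 *
 * The EACCES case above is the one userland most often trips over
 * (hedged example; "/example" and len are placeholders):
 *
 *	fd = shm_open("/example", O_RDONLY, 0);
 *	mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);	succeeds
 *	mmap(NULL, len, PROT_WRITE, MAP_SHARED, fd, 0);	fails, EACCES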
*/ if ( #ifdef _LP64 objsize > OFF_MAX || #endif foff < 0 || foff > OFF_MAX - objsize) return (EINVAL); #ifdef MAC error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags); if (error != 0) return (error); #endif mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_atime); mtx_unlock(&shm_timestamp_lock); vm_object_reference(shmfd->shm_object); error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, shmfd->shm_object, foff, FALSE, td); if (error != 0) vm_object_deallocate(shmfd->shm_object); return (error); } static int shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); /* * SUSv4 says that x bits of permission need not be affected. * Be consistent with our shm_open there. */ #ifdef MAC error = mac_posixshm_check_setmode(active_cred, shmfd, mode); if (error != 0) goto out; #endif error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, VADMIN, active_cred, NULL); if (error != 0) goto out; shmfd->shm_mode = mode & ACCESSPERMS; out: mtx_unlock(&shm_timestamp_lock); return (error); } static int shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); #ifdef MAC error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid); if (error != 0) goto out; #endif if (uid == (uid_t)-1) uid = shmfd->shm_uid; if (gid == (gid_t)-1) gid = shmfd->shm_gid; if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) || (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0))) goto out; shmfd->shm_uid = uid; shmfd->shm_gid = gid; out: mtx_unlock(&shm_timestamp_lock); return (error); } /* * Helper routines to allow the backing object of a shared memory file * descriptor to be mapped in the kernel. */ int shm_map(struct file *fp, size_t size, off_t offset, void **memp) { struct shmfd *shmfd; vm_offset_t kva, ofs; vm_object_t obj; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; obj = shmfd->shm_object; VM_OBJECT_WLOCK(obj); /* * XXXRW: This validation is probably insufficient, and subject to * sign errors. It should be fixed. */ if (offset >= shmfd->shm_size || offset + size > round_page(shmfd->shm_size)) { VM_OBJECT_WUNLOCK(obj); return (EINVAL); } shmfd->shm_kmappings++; vm_object_reference_locked(obj); VM_OBJECT_WUNLOCK(obj); /* Map the object into the kernel_map and wire it. */ kva = vm_map_min(kernel_map); ofs = offset & PAGE_MASK; offset = trunc_page(offset); size = round_page(size + ofs); rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0, VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0); if (rv == KERN_SUCCESS) { rv = vm_map_wire(kernel_map, kva, kva + size, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); if (rv == KERN_SUCCESS) { *memp = (void *)(kva + ofs); return (0); } vm_map_remove(kernel_map, kva, kva + size); } else vm_object_deallocate(obj); /* On failure, drop our mapping reference. */ VM_OBJECT_WLOCK(obj); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (vm_mmap_to_errno(rv)); } /* * We require the caller to unmap the entire entry. This allows us to * safely decrement shm_kmappings when a mapping is removed. 
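 *
 * A hedged sketch of the expected kernel-side pairing; the consumer
 * code is hypothetical:
 *
 *	void *mem;
 *	int error;
 *
 *	error = shm_map(fp, size, offset, &mem);
 *	if (error == 0) {
 *		memset(mem, 0, size);
 *		error = shm_unmap(fp, mem, size);	same mem and size
 *	}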
*/ int shm_unmap(struct file *fp, void *mem, size_t size) { struct shmfd *shmfd; vm_map_entry_t entry; vm_offset_t kva, ofs; vm_object_t obj; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; vm_map_t map; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; kva = (vm_offset_t)mem; ofs = kva & PAGE_MASK; kva = trunc_page(kva); size = round_page(size + ofs); map = kernel_map; rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry, &obj, &pindex, &prot, &wired); if (rv != KERN_SUCCESS) return (EINVAL); if (entry->start != kva || entry->end != kva + size) { vm_map_lookup_done(map, entry); return (EINVAL); } vm_map_lookup_done(map, entry); if (obj != shmfd->shm_object) return (EINVAL); vm_map_remove(map, kva, kva + size); VM_OBJECT_WLOCK(obj); KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped")); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (0); } static int shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { const char *path, *pr_path; struct shmfd *shmfd; size_t pr_pathlen; kif->kf_type = KF_TYPE_SHM; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode; /* XXX */ mtx_unlock(&shm_timestamp_lock); kif->kf_un.kf_file.kf_file_size = shmfd->shm_size; if (shmfd->shm_path != NULL) { sx_slock(&shm_dict_lock); if (shmfd->shm_path != NULL) { path = shmfd->shm_path; pr_path = curthread->td_ucred->cr_prison->pr_path; if (strcmp(pr_path, "/") != 0) { /* Return the jail-rooted pathname. */ pr_pathlen = strlen(pr_path); if (strncmp(path, pr_path, pr_pathlen) == 0 && path[pr_pathlen] == '/') path += pr_pathlen; } strlcpy(kif->kf_path, path, sizeof(kif->kf_path)); } sx_sunlock(&shm_dict_lock); } return (0); } Index: head/sys/kern/vfs_acl.c =================================================================== --- head/sys/kern/vfs_acl.c (revision 326270) +++ head/sys/kern/vfs_acl.c (revision 326271) @@ -1,588 +1,590 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1999-2006, 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Developed by the TrustedBSD Project. * * ACL system calls and other functions common across different ACL types. * Type-specific routines go into subr_acl_.c. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include CTASSERT(ACL_MAX_ENTRIES >= OLDACL_MAX_ENTRIES); MALLOC_DEFINE(M_ACL, "acl", "Access Control Lists"); static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); int acl_copy_oldacl_into_acl(const struct oldacl *source, struct acl *dest) { int i; if (source->acl_cnt < 0 || source->acl_cnt > OLDACL_MAX_ENTRIES) return (EINVAL); bzero(dest, sizeof(*dest)); dest->acl_cnt = source->acl_cnt; dest->acl_maxcnt = ACL_MAX_ENTRIES; for (i = 0; i < dest->acl_cnt; i++) { dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag; dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id; dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm; } return (0); } int acl_copy_acl_into_oldacl(const struct acl *source, struct oldacl *dest) { int i; if (source->acl_cnt > OLDACL_MAX_ENTRIES) return (EINVAL); bzero(dest, sizeof(*dest)); dest->acl_cnt = source->acl_cnt; for (i = 0; i < dest->acl_cnt; i++) { dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag; dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id; dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm; } return (0); } /* * At one time, "struct ACL" was extended in order to add support for NFSv4 * ACLs. Instead of creating compatibility versions of all the ACL-related * syscalls, they were left intact. It's possible to find out what the code * calling these syscalls (libc) expects basing on "type" argument - if it's * either ACL_TYPE_ACCESS_OLD or ACL_TYPE_DEFAULT_OLD (which previously were * known as ACL_TYPE_ACCESS and ACL_TYPE_DEFAULT), then it's the "struct * oldacl". If it's something else, then it's the new "struct acl". In the * latter case, the routines below just copyin/copyout the contents. In the * former case, they copyin the "struct oldacl" and convert it to the new * format. 
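 *
 * For illustration, how the conversion behaves for the old format (a
 * sketch with arbitrary values; the field names are exactly those
 * copied by acl_copy_oldacl_into_acl() above):
 */
#if 0
	struct oldacl old;
	struct acl new;
	int error;

	bzero(&old, sizeof(old));
	old.acl_cnt = 2;
	old.acl_entry[0].ae_tag = ACL_USER_OBJ;
	old.acl_entry[0].ae_perm = ACL_READ | ACL_WRITE;
	old.acl_entry[1].ae_tag = ACL_OTHER;
	old.acl_entry[1].ae_perm = 0;
	error = acl_copy_oldacl_into_acl(&old, &new);
	/* On success, new.acl_maxcnt == ACL_MAX_ENTRIES and both
	 * entries have been copied field by field. */
#endif
/*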
*/ static int acl_copyin(void *user_acl, struct acl *kernel_acl, acl_type_t type) { int error; struct oldacl old; switch (type) { case ACL_TYPE_ACCESS_OLD: case ACL_TYPE_DEFAULT_OLD: error = copyin(user_acl, &old, sizeof(old)); if (error != 0) break; acl_copy_oldacl_into_acl(&old, kernel_acl); break; default: error = copyin(user_acl, kernel_acl, sizeof(*kernel_acl)); if (kernel_acl->acl_maxcnt != ACL_MAX_ENTRIES) return (EINVAL); } return (error); } static int acl_copyout(struct acl *kernel_acl, void *user_acl, acl_type_t type) { uint32_t am; int error; struct oldacl old; switch (type) { case ACL_TYPE_ACCESS_OLD: case ACL_TYPE_DEFAULT_OLD: error = acl_copy_acl_into_oldacl(kernel_acl, &old); if (error != 0) break; error = copyout(&old, user_acl, sizeof(old)); break; default: error = fueword32((char *)user_acl + offsetof(struct acl, acl_maxcnt), &am); if (error == -1) return (EFAULT); if (am != ACL_MAX_ENTRIES) return (EINVAL); error = copyout(kernel_acl, user_acl, sizeof(*kernel_acl)); } return (error); } /* * Convert "old" type - ACL_TYPE_{ACCESS,DEFAULT}_OLD - into its "new" * counterpart. It's required for old (pre-NFSv4 ACLs) libc to work * with new kernel. Fixing 'type' for old binaries with new libc * is being done in lib/libc/posix1e/acl_support.c:_acl_type_unold(). */ static int acl_type_unold(int type) { switch (type) { case ACL_TYPE_ACCESS_OLD: return (ACL_TYPE_ACCESS); case ACL_TYPE_DEFAULT_OLD: return (ACL_TYPE_DEFAULT); default: return (type); } } /* * These calls wrap the real vnode operations, and are called by the syscall * code once the syscall has converted the path or file descriptor to a vnode * (unlocked). The aclp pointer is assumed still to point to userland, so * this should not be consumed within the kernel except by syscall code. * Other code should directly invoke VOP_{SET,GET}ACL. */ /* * Given a vnode, set its ACL. */ static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl *inkernelacl; struct mount *mp; int error; AUDIT_ARG_VALUE(type); inkernelacl = acl_alloc(M_WAITOK); error = acl_copyin(aclp, inkernelacl, type); if (error != 0) goto out; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) goto out; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); #ifdef MAC error = mac_vnode_check_setacl(td->td_ucred, vp, type, inkernelacl); if (error != 0) goto out_unlock; #endif error = VOP_SETACL(vp, acl_type_unold(type), inkernelacl, td->td_ucred, td); #ifdef MAC out_unlock: #endif VOP_UNLOCK(vp, 0); vn_finished_write(mp); out: acl_free(inkernelacl); return (error); } /* * Given a vnode, get its ACL. */ static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl *inkernelacl; int error; AUDIT_ARG_VALUE(type); inkernelacl = acl_alloc(M_WAITOK | M_ZERO); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); #ifdef MAC error = mac_vnode_check_getacl(td->td_ucred, vp, type); if (error != 0) goto out; #endif error = VOP_GETACL(vp, acl_type_unold(type), inkernelacl, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp, 0); if (error == 0) error = acl_copyout(inkernelacl, aclp, type); acl_free(inkernelacl); return (error); } /* * Given a vnode, delete its ACL. 
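 *
 * Deletion is expressed as VOP_SETACL() with a NULL ACL pointer, so a
 * filesystem's setacl VOP is expected to distinguish, schematically (a
 * sketch; the examplefs_* helpers are hypothetical):
 */
#if 0
static int
examplefs_setacl(struct vop_setacl_args *ap)
{

	if (ap->a_aclp == NULL)
		return (examplefs_acl_remove(ap->a_vp, ap->a_type));
	return (examplefs_acl_write(ap->a_vp, ap->a_type, ap->a_aclp));
}
#endif
/*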
*/ static int vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) { struct mount *mp; int error; AUDIT_ARG_VALUE(type); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); #ifdef MAC error = mac_vnode_check_deleteacl(td->td_ucred, vp, type); if (error != 0) goto out; #endif error = VOP_SETACL(vp, acl_type_unold(type), 0, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp, 0); vn_finished_write(mp); return (error); } /* * Given a vnode, check whether an ACL is appropriate for it * * XXXRW: No vnode lock held so can't audit vnode state...? */ static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl *inkernelacl; int error; inkernelacl = acl_alloc(M_WAITOK); error = acl_copyin(aclp, inkernelacl, type); if (error != 0) goto out; error = VOP_ACLCHECK(vp, acl_type_unold(type), inkernelacl, td->td_ucred, td); out: acl_free(inkernelacl); return (error); } /* * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. Don't * need to lock, as the vacl_ code will get/release any locks required. */ /* * Given a file path, get an ACL for it */ int sys___acl_get_file(struct thread *td, struct __acl_get_file_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error == 0) { error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } return (error); } /* * Given a file path, get an ACL for it; don't follow links. */ int sys___acl_get_link(struct thread *td, struct __acl_get_link_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error == 0) { error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } return (error); } /* * Given a file path, set an ACL for it. */ int sys___acl_set_file(struct thread *td, struct __acl_set_file_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error == 0) { error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } return (error); } /* * Given a file path, set an ACL for it; don't follow links. */ int sys___acl_set_link(struct thread *td, struct __acl_set_link_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error == 0) { error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } return (error); } /* * Given a file descriptor, get an ACL for it. */ int sys___acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->filedes); error = getvnode(td, uap->filedes, cap_rights_init(&rights, CAP_ACL_GET), &fp); if (error == 0) { error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); } return (error); } /* * Given a file descriptor, set an ACL for it. */ int sys___acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->filedes); error = getvnode(td, uap->filedes, cap_rights_init(&rights, CAP_ACL_SET), &fp); if (error == 0) { error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); } return (error); } /* * Given a file path, delete an ACL from it. 
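 *
 * Userland normally reaches these syscalls through the POSIX.1e
 * wrappers in libc rather than directly; for illustration (a sketch,
 * error handling abbreviated):
 */
#if 0
#include <sys/types.h>
#include <sys/acl.h>
#include <stdio.h>

/* Print a file's access ACL, then remove its default ACL. */
static int
show_and_strip(const char *path)
{
	acl_t acl;
	char *text;

	acl = acl_get_file(path, ACL_TYPE_ACCESS);	/* __acl_get_file() */
	if (acl == NULL)
		return (-1);
	text = acl_to_text(acl, NULL);
	if (text != NULL) {
		printf("%s\n", text);
		acl_free(text);
	}
	acl_free(acl);
	return (acl_delete_def_file(path));		/* __acl_delete_file() */
}
#endif
/*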
*/ int sys___acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error == 0) { error = vacl_delete(td, nd.ni_vp, uap->type); NDFREE(&nd, 0); } return (error); } /* * Given a file path, delete an ACL from it; don't follow links. */ int sys___acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error == 0) { error = vacl_delete(td, nd.ni_vp, uap->type); NDFREE(&nd, 0); } return (error); } /* * Given a file descriptor, delete an ACL from it. */ int sys___acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->filedes); error = getvnode(td, uap->filedes, cap_rights_init(&rights, CAP_ACL_DELETE), &fp); if (error == 0) { error = vacl_delete(td, fp->f_vnode, uap->type); fdrop(fp, td); } return (error); } /* * Given a file path, check an ACL for it. */ int sys___acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error == 0) { error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } return (error); } /* * Given a file path, check an ACL for it; don't follow links. */ int sys___acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error == 0) { error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } return (error); } /* * Given a file descriptor, check an ACL for it. */ int sys___acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->filedes); error = getvnode(td, uap->filedes, cap_rights_init(&rights, CAP_ACL_CHECK), &fp); if (error == 0) { error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); } return (error); } struct acl * acl_alloc(int flags) { struct acl *aclp; aclp = malloc(sizeof(*aclp), M_ACL, flags); if (aclp == NULL) return (NULL); aclp->acl_maxcnt = ACL_MAX_ENTRIES; return (aclp); } void acl_free(struct acl *aclp) { free(aclp, M_ACL); } Index: head/sys/kern/vfs_aio.c =================================================================== --- head/sys/kern/vfs_aio.c (revision 326270) +++ head/sys/kern/vfs_aio.c (revision 326271) @@ -1,3003 +1,3005 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1997 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. John S. Dyson's name may not be used to endorse or promote products * derived from this software without specific prior written permission. * * DISCLAIMER: This code isn't warranted to do anything useful. Anything * bad that happens because of using this software isn't the responsibility * of the author. This software is distributed AS-IS. */ /* * This file contains support for the POSIX 1003.1B AIO/LIO facility.
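 *
 * For orientation, the minimal userland sequence this facility
 * services (a sketch; a real consumer would check every return and
 * might use sigevent notification instead of blocking):
 */
#if 0
#include <aio.h>
#include <string.h>

static char buf[512];

static ssize_t
read_async(int fd)
{
	struct aiocb cb;
	const struct aiocb *list[1];

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof(buf);
	cb.aio_offset = 0;
	if (aio_read(&cb) == -1)	/* aio_aqueue(..., LIO_READ, ...) */
		return (-1);
	list[0] = &cb;
	aio_suspend(list, 1, NULL);	/* kern_aio_suspend() */
	if (aio_error(&cb) != 0)	/* kern_aio_error() */
		return (-1);
	return (aio_return(&cb));	/* kern_aio_return() */
}
#endif
/*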
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Counter for allocating reference ids to new jobs. Wrapped to 1 on * overflow. (XXX will be removed soon.) */ static u_long jobrefid; /* * Counter for aio_fsync. */ static uint64_t jobseqno; #ifndef MAX_AIO_PER_PROC #define MAX_AIO_PER_PROC 32 #endif #ifndef MAX_AIO_QUEUE_PER_PROC #define MAX_AIO_QUEUE_PER_PROC 256 #endif #ifndef MAX_AIO_QUEUE #define MAX_AIO_QUEUE 1024 /* Bigger than MAX_AIO_QUEUE_PER_PROC */ #endif #ifndef MAX_BUF_AIO #define MAX_BUF_AIO 16 #endif FEATURE(aio, "Asynchronous I/O"); SYSCTL_DECL(_p1003_1b); static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list"); static MALLOC_DEFINE(M_AIOS, "aios", "aio_suspend aio control block list"); static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management"); static int enable_aio_unsafe = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0, "Permit asynchronous IO on all file types, not just known-safe types"); static unsigned int unsafe_warningcnt = 1; SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW, &unsafe_warningcnt, 0, "Warnings that will be triggered upon failed IO requests on unsafe files"); static int max_aio_procs = MAX_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0, "Maximum number of kernel processes to use for handling async IO "); static int num_aio_procs = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0, "Number of presently active kernel processes for async IO"); /* * The code will adjust the actual number of AIO processes towards this * number when it gets a chance. 
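 *
 * These knobs are plain sysctls; for illustration, a userland sketch
 * that inspects two of them by the names declared in this file:
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int target, running;
	size_t len;

	len = sizeof(target);
	sysctlbyname("vfs.aio.target_aio_procs", &target, &len, NULL, 0);
	len = sizeof(running);
	sysctlbyname("vfs.aio.num_aio_procs", &running, &len, NULL, 0);
	printf("target %d, running %d\n", target, running);
	return (0);
}
#endif
/*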
*/ static int target_aio_procs = TARGET_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, 0, "Preferred number of ready kernel processes for async IO"); static int max_queue_count = MAX_AIO_QUEUE; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, "Maximum number of aio requests to queue, globally"); static int num_queue_count = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, "Number of queued aio requests"); static int num_buf_aio = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0, "Number of aio requests presently handled by the buf subsystem"); /* Number of async I/O processes in the process of being started */ /* XXX This should be local to aio_aqueue() */ static int num_aio_resv_start = 0; static int aiod_lifetime; SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, "Maximum lifetime for idle aiod"); static int max_aio_per_proc = MAX_AIO_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, 0, "Maximum active aio requests per process (stored in the process)"); static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, &max_aio_queue_per_proc, 0, "Maximum queued aio requests per process (stored in the process)"); static int max_buf_aio = MAX_BUF_AIO; SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, "Maximum buf aio requests per process (stored in the process)"); /* * Though redundant with vfs.aio.max_aio_queue_per_proc, POSIX requires * sysconf(3) to support AIO_LISTIO_MAX, and we implement that with * vfs.aio.aio_listio_max. */ SYSCTL_INT(_p1003_1b, CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max, CTLFLAG_RD | CTLFLAG_CAPRD, &max_aio_queue_per_proc, 0, "Maximum aio requests for a single lio_listio call"); #ifdef COMPAT_FREEBSD6 typedef struct oaiocb { int aio_fildes; /* File descriptor */ off_t aio_offset; /* File offset for I/O */ volatile void *aio_buf; /* I/O buffer in process space */ size_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private _aiocb_private; } oaiocb_t; #endif /* * Below is a key of locks used to protect each member of struct kaiocb * aioliojob and kaioinfo and any backends. * * * - need not protected * a - locked by kaioinfo lock * b - locked by backend lock, the backend lock can be null in some cases, * for example, BIO belongs to this type, in this case, proc lock is * reused. * c - locked by aio_job_mtx, the lock for the generic file I/O backend. */ /* * If the routine that services an AIO request blocks while running in an * AIO kernel process it can starve other I/O requests. BIO requests * queued via aio_qphysio() complete in GEOM and do not use AIO kernel * processes at all. Socket I/O requests use a separate pool of * kprocs and also force non-blocking I/O. Other file I/O requests * use the generic fo_read/fo_write operations which can block. The * fsync and mlock operations can also block while executing. Ideally * none of these requests would block while executing. * * Note that the service routines cannot toggle O_NONBLOCK in the file * structure directly while handling a request due to races with * userland threads. 
*/ /* jobflags */ #define KAIOCB_QUEUEING 0x01 #define KAIOCB_CANCELLED 0x02 #define KAIOCB_CANCELLING 0x04 #define KAIOCB_CHECKSYNC 0x08 #define KAIOCB_CLEARED 0x10 #define KAIOCB_FINISHED 0x20 /* * AIO process info */ #define AIOP_FREE 0x1 /* proc on free queue */ struct aioproc { int aioprocflags; /* (c) AIO proc flags */ TAILQ_ENTRY(aioproc) list; /* (c) list of processes */ struct proc *aioproc; /* (*) the AIO proc */ }; /* * data-structure for lio signal management */ struct aioliojob { int lioj_flags; /* (a) listio flags */ int lioj_count; /* (a) listio reference count */ int lioj_finished_count; /* (a) listio finished count */ struct sigevent lioj_signal; /* (a) signal on all I/O done */ TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */ struct knlist klist; /* (a) list of knotes */ ksiginfo_t lioj_ksi; /* (a) Realtime signal info */ }; #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ #define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */ /* * per process aio data structure */ struct kaioinfo { struct mtx kaio_mtx; /* the lock to protect this struct */ int kaio_flags; /* (a) per process kaio flags */ int kaio_maxactive_count; /* (*) maximum number of AIOs */ int kaio_active_count; /* (c) number of currently used AIOs */ int kaio_qallowed_count; /* (*) maximum size of AIO queue */ int kaio_count; /* (a) size of AIO queue */ int kaio_ballowed_count; /* (*) maximum number of buffers */ int kaio_buffer_count; /* (a) number of physio buffers */ TAILQ_HEAD(,kaiocb) kaio_all; /* (a) all AIOs in a process */ TAILQ_HEAD(,kaiocb) kaio_done; /* (a) done queue for process */ TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */ TAILQ_HEAD(,kaiocb) kaio_jobqueue; /* (a) job queue for process */ TAILQ_HEAD(,kaiocb) kaio_syncqueue; /* (a) queue for aio_fsync */ TAILQ_HEAD(,kaiocb) kaio_syncready; /* (a) second q for aio_fsync */ struct task kaio_task; /* (*) task to kick aio processes */ struct task kaio_sync_task; /* (*) task to schedule fsync jobs */ }; #define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx) #define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx) #define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f)) #define AIO_MTX(ki) (&(ki)->kaio_mtx) #define KAIO_RUNDOWN 0x1 /* process is being run down */ #define KAIO_WAKEUP 0x2 /* wakeup process when AIO completes */ /* * Operations used to interact with userland aio control blocks. * Different ABIs provide their own operations.
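 *
 * For illustration, the shape of the table a compat ABI supplies (cf.
 * aiocb_ops_osigevent later in this file); the aiocb32_* names here
 * are illustrative, standing in for routines that translate a foreign
 * struct aiocb layout:
 */
#if 0
static struct aiocb_ops aiocb32_ops_example = {
	.copyin = aiocb32_copyin,		/* translates the 32-bit layout */
	.fetch_status = aiocb32_fetch_status,
	.fetch_error = aiocb32_fetch_error,
	.store_status = aiocb32_store_status,
	.store_error = aiocb32_store_error,
	.store_kernelinfo = aiocb32_store_kernelinfo,
	.store_aiocb = aiocb32_store_aiocb,
};
#endif
/*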
*/ struct aiocb_ops { int (*copyin)(struct aiocb *ujob, struct aiocb *kjob); long (*fetch_status)(struct aiocb *ujob); long (*fetch_error)(struct aiocb *ujob); int (*store_status)(struct aiocb *ujob, long status); int (*store_error)(struct aiocb *ujob, long error); int (*store_kernelinfo)(struct aiocb *ujob, long jobref); int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob); }; static TAILQ_HEAD(,aioproc) aio_freeproc; /* (c) Idle daemons */ static struct sema aio_newproc_sem; static struct mtx aio_job_mtx; static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */ static struct unrhdr *aiod_unr; void aio_init_aioinfo(struct proc *p); static int aio_onceonly(void); static int aio_free_entry(struct kaiocb *job); static void aio_process_rw(struct kaiocb *job); static void aio_process_sync(struct kaiocb *job); static void aio_process_mlock(struct kaiocb *job); static void aio_schedule_fsync(void *context, int pending); static int aio_newproc(int *); int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lio, int type, struct aiocb_ops *ops); static int aio_queue_file(struct file *fp, struct kaiocb *job); static void aio_physwakeup(struct bio *bp); static void aio_proc_rundown(void *arg, struct proc *p); static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp); static int aio_qphysio(struct proc *p, struct kaiocb *job); static void aio_daemon(void *param); static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job); static bool aio_clear_cancel_function_locked(struct kaiocb *job); static int aio_kick(struct proc *userp); static void aio_kick_nowait(struct proc *userp); static void aio_kick_helper(void *context, int pending); static int filt_aioattach(struct knote *kn); static void filt_aiodetach(struct knote *kn); static int filt_aio(struct knote *kn, long hint); static int filt_lioattach(struct knote *kn); static void filt_liodetach(struct knote *kn); static int filt_lio(struct knote *kn, long hint); /* * Zones for: * kaio Per process async io info * aiop async io process data * aiocb async io jobs * aiolio list io jobs */ static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiolio_zone; /* kqueue filters for aio */ static struct filterops aio_filtops = { .f_isfd = 0, .f_attach = filt_aioattach, .f_detach = filt_aiodetach, .f_event = filt_aio, }; static struct filterops lio_filtops = { .f_isfd = 0, .f_attach = filt_lioattach, .f_detach = filt_liodetach, .f_event = filt_lio }; static eventhandler_tag exit_tag, exec_tag; TASKQUEUE_DEFINE_THREAD(aiod_kick); /* * Main operations function for use as a kernel module. 
*/ static int aio_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: aio_onceonly(); break; case MOD_SHUTDOWN: break; default: error = EOPNOTSUPP; break; } return (error); } static moduledata_t aio_mod = { "aio", &aio_modload, NULL }; DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY); MODULE_VERSION(aio, 1); /* * Startup initialization */ static int aio_onceonly(void) { exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL, EVENTHANDLER_PRI_ANY); exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL, EVENTHANDLER_PRI_ANY); kqueue_add_filteropts(EVFILT_AIO, &aio_filtops); kqueue_add_filteropts(EVFILT_LIO, &lio_filtops); TAILQ_INIT(&aio_freeproc); sema_init(&aio_newproc_sem, 0, "aio_new_proc"); mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF); TAILQ_INIT(&aio_jobs); aiod_unr = new_unrhdr(1, INT_MAX, NULL); kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiod_lifetime = AIOD_LIFETIME_DEFAULT; jobrefid = 1; p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO); p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE); p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0); return (0); } /* * Init the per-process aioinfo structure. The aioinfo limits are set * per-process for user limit (resource) management. */ void aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; ki = uma_zalloc(kaio_zone, M_WAITOK); mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW); ki->kaio_flags = 0; ki->kaio_maxactive_count = max_aio_per_proc; ki->kaio_active_count = 0; ki->kaio_qallowed_count = max_aio_queue_per_proc; ki->kaio_count = 0; ki->kaio_ballowed_count = max_buf_aio; ki->kaio_buffer_count = 0; TAILQ_INIT(&ki->kaio_all); TAILQ_INIT(&ki->kaio_done); TAILQ_INIT(&ki->kaio_jobqueue); TAILQ_INIT(&ki->kaio_liojoblist); TAILQ_INIT(&ki->kaio_syncqueue); TAILQ_INIT(&ki->kaio_syncready); TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p); TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki); PROC_LOCK(p); if (p->p_aioinfo == NULL) { p->p_aioinfo = ki; PROC_UNLOCK(p); } else { PROC_UNLOCK(p); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); } while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) aio_newproc(NULL); } static int aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi) { struct thread *td; int error; error = sigev_findtd(p, sigev, &td); if (error) return (error); if (!KSI_ONQ(ksi)) { ksiginfo_set_sigev(ksi, sigev); ksi->ksi_code = SI_ASYNCIO; ksi->ksi_flags |= KSI_EXT | KSI_INS; tdsendsignal(p, td, ksi->ksi_signo, ksi); } PROC_UNLOCK(p); return (error); } /* * Free a job entry. Wait for completion if it is currently active, but don't * delay forever. If we delay, we return a flag that says that we have to * restart the queue scan. 
*/ static int aio_free_entry(struct kaiocb *job) { struct kaioinfo *ki; struct aioliojob *lj; struct proc *p; p = job->userproc; MPASS(curproc == p); ki = p->p_aioinfo; MPASS(ki != NULL); AIO_LOCK_ASSERT(ki, MA_OWNED); MPASS(job->jobflags & KAIOCB_FINISHED); atomic_subtract_int(&num_queue_count, 1); ki->kaio_count--; MPASS(ki->kaio_count >= 0); TAILQ_REMOVE(&ki->kaio_done, job, plist); TAILQ_REMOVE(&ki->kaio_all, job, allist); lj = job->lio; if (lj) { lj->lioj_count--; lj->lioj_finished_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); /* lio is going away, we need to destroy any knotes */ knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } } /* job is going away, we need to destroy any knotes */ knlist_delete(&job->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&job->ksi); PROC_UNLOCK(p); AIO_UNLOCK(ki); /* * The thread argument here is used to find the owning process * and is also passed to fo_close() which may pass it to various * places such as devsw close() routines. Because of that, we * need a thread pointer from the process owning the job that is * persistent and won't disappear out from under us or move to * another process. * * Currently, all the callers of this function call it to remove * a kaiocb from the current process' job list either via a * syscall or due to the current process calling exit() or * execve(). Thus, we know that p == curproc. We also know that * curthread can't exit since we are curthread. * * Therefore, we use curthread as the thread to pass to * knlist_delete(). This does mean that it is possible for the * thread pointer at close time to differ from the thread pointer * at open time, but this is already true of file descriptors in * a multithreaded process. */ if (job->fd_file) fdrop(job->fd_file, curthread); crfree(job->cred); uma_zfree(aiocb_zone, job); AIO_LOCK(ki); return (0); } static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused) { aio_proc_rundown(arg, p); } static int aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job) { aio_cancel_fn_t *func; int cancelled; AIO_LOCK_ASSERT(ki, MA_OWNED); if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED)) return (0); MPASS((job->jobflags & KAIOCB_CANCELLING) == 0); job->jobflags |= KAIOCB_CANCELLED; func = job->cancel_fn; /* * If there is no cancel routine, just leave the job marked as * cancelled. The job should be in active use by a caller who * should complete it normally or when it fails to install a * cancel routine. */ if (func == NULL) return (0); /* * Set the CANCELLING flag so that aio_complete() will defer * completions of this job. This prevents the job from being * freed out from under the cancel callback. After the * callback any deferred completion (whether from the callback * or any other source) will be completed. */ job->jobflags |= KAIOCB_CANCELLING; AIO_UNLOCK(ki); func(job); AIO_LOCK(ki); job->jobflags &= ~KAIOCB_CANCELLING; if (job->jobflags & KAIOCB_FINISHED) { cancelled = job->uaiocb._aiocb_private.error == ECANCELED; TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); aio_bio_done_notify(p, job); } else { /* * The cancel callback might have scheduled an * operation to cancel this request, but it is * only counted as cancelled if the request is * cancelled when the callback returns. */ cancelled = 0; } return (cancelled); } /* * Rundown the jobs for a given process. 
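 *
 * Rundown cancels each pending job through aio_cancel_job(), which in
 * turn invokes the backend's cancel routine.  Such routines follow a
 * fixed pattern (cf. aio_cancel_daemon_job() and aio_cancel_sync()
 * later in this file); schematically, with my_queue/my_queue_lock
 * standing in for a backend's own state:
 */
#if 0
static void
example_cancel(struct kaiocb *job)
{

	mtx_lock(&my_queue_lock);
	if (!aio_cancel_cleared(job))
		TAILQ_REMOVE(&my_queue, job, list);
	mtx_unlock(&my_queue_lock);
	aio_cancel(job);	/* completes the job with ECANCELED */
}
#endif
/*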
*/ static void aio_proc_rundown(void *arg, struct proc *p) { struct kaioinfo *ki; struct aioliojob *lj; struct kaiocb *job, *jobn; KASSERT(curthread->td_proc == p, ("%s: called on non-curproc", __func__)); ki = p->p_aioinfo; if (ki == NULL) return; AIO_LOCK(ki); ki->kaio_flags |= KAIO_RUNDOWN; restart: /* * Try to cancel all pending requests. This code simulates * aio_cancel on all pending I/O requests. */ TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { aio_cancel_job(p, ki, job); } /* Wait for all running I/O to be finished */ if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) { ki->kaio_flags |= KAIO_WAKEUP; msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz); goto restart; } /* Free all completed I/O requests. */ while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL) aio_free_entry(job); while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) { if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } else { panic("LIO job not cleaned up: C:%d, FC:%d\n", lj->lioj_count, lj->lioj_finished_count); } } AIO_UNLOCK(ki); taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task); taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); p->p_aioinfo = NULL; } /* * Select a job to run (called by an AIO daemon). */ static struct kaiocb * aio_selectjob(struct aioproc *aiop) { struct kaiocb *job; struct kaioinfo *ki; struct proc *userp; mtx_assert(&aio_job_mtx, MA_OWNED); restart: TAILQ_FOREACH(job, &aio_jobs, list) { userp = job->userproc; ki = userp->p_aioinfo; if (ki->kaio_active_count < ki->kaio_maxactive_count) { TAILQ_REMOVE(&aio_jobs, job, list); if (!aio_clear_cancel_function(job)) goto restart; /* Account for currently active jobs. */ ki->kaio_active_count++; break; } } return (job); } /* * Move all data to a permanent storage device. This code * simulates the fsync syscall. */ static int aio_fsync_vnode(struct thread *td, struct vnode *vp) { struct mount *mp; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto drop; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0); vn_finished_write(mp); drop: return (error); } /* * The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that * does the I/O request for the non-physio version of the operations. The * normal vn operations are used, and this code should work in all instances * for every type of file, including pipes, sockets, fifos, and regular files. * * XXX I don't think it works well for socket, pipe, and fifo. 
*/ static void aio_process_rw(struct kaiocb *job) { struct ucred *td_savedcred; struct thread *td; struct aiocb *cb; struct file *fp; struct uio auio; struct iovec aiov; ssize_t cnt; long msgsnd_st, msgsnd_end; long msgrcv_st, msgrcv_end; long oublock_st, oublock_end; long inblock_st, inblock_end; int error; KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ || job->uaiocb.aio_lio_opcode == LIO_WRITE, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); aio_switch_vmspace(job); td = curthread; td_savedcred = td->td_ucred; td->td_ucred = job->cred; cb = &job->uaiocb; fp = job->fd_file; aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; aiov.iov_len = cb->aio_nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = cb->aio_offset; auio.uio_resid = cb->aio_nbytes; cnt = cb->aio_nbytes; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; msgrcv_st = td->td_ru.ru_msgrcv; msgsnd_st = td->td_ru.ru_msgsnd; inblock_st = td->td_ru.ru_inblock; oublock_st = td->td_ru.ru_oublock; /* * aio_aqueue() acquires a reference to the file that is * released in aio_free_entry(). */ if (cb->aio_lio_opcode == LIO_READ) { auio.uio_rw = UIO_READ; if (auio.uio_resid == 0) error = 0; else error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td); } else { if (fp->f_type == DTYPE_VNODE) bwillwrite(); auio.uio_rw = UIO_WRITE; error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td); } msgrcv_end = td->td_ru.ru_msgrcv; msgsnd_end = td->td_ru.ru_msgsnd; inblock_end = td->td_ru.ru_inblock; oublock_end = td->td_ru.ru_oublock; job->msgrcv = msgrcv_end - msgrcv_st; job->msgsnd = msgsnd_end - msgsnd_st; job->inblock = inblock_end - inblock_st; job->outblock = oublock_end - oublock_st; if ((error) && (auio.uio_resid != cnt)) { if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) error = 0; if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) { PROC_LOCK(job->userproc); kern_psignal(job->userproc, SIGPIPE); PROC_UNLOCK(job->userproc); } } cnt -= auio.uio_resid; td->td_ucred = td_savedcred; if (error) aio_complete(job, -1, error); else aio_complete(job, cnt, 0); } static void aio_process_sync(struct kaiocb *job) { struct thread *td = curthread; struct ucred *td_savedcred = td->td_ucred; struct file *fp = job->fd_file; int error = 0; KASSERT(job->uaiocb.aio_lio_opcode == LIO_SYNC, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); td->td_ucred = job->cred; if (fp->f_vnode != NULL) error = aio_fsync_vnode(td, fp->f_vnode); td->td_ucred = td_savedcred; if (error) aio_complete(job, -1, error); else aio_complete(job, 0, 0); } static void aio_process_mlock(struct kaiocb *job) { struct aiocb *cb = &job->uaiocb; int error; KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); aio_switch_vmspace(job); error = kern_mlock(job->userproc, job->cred, __DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes); aio_complete(job, error != 0 ? 
-1 : 0, error); } static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job) { struct aioliojob *lj; struct kaioinfo *ki; struct kaiocb *sjob, *sjobn; int lj_done; bool schedule_fsync; ki = userp->p_aioinfo; AIO_LOCK_ASSERT(ki, MA_OWNED); lj = job->lio; lj_done = 0; if (lj) { lj->lioj_finished_count++; if (lj->lioj_count == lj->lioj_finished_count) lj_done = 1; } TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist); MPASS(job->jobflags & KAIOCB_FINISHED); if (ki->kaio_flags & KAIO_RUNDOWN) goto notification_done; if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi); KNOTE_LOCKED(&job->klist, 1); if (lj_done) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } notification_done: if (job->jobflags & KAIOCB_CHECKSYNC) { schedule_fsync = false; TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) { if (job->fd_file != sjob->fd_file || job->seqno >= sjob->seqno) continue; if (--sjob->pending > 0) continue; TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list); if (!aio_clear_cancel_function_locked(sjob)) continue; TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list); schedule_fsync = true; } if (schedule_fsync) taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_sync_task); } if (ki->kaio_flags & KAIO_WAKEUP) { ki->kaio_flags &= ~KAIO_WAKEUP; wakeup(&userp->p_aioinfo); } } static void aio_schedule_fsync(void *context, int pending) { struct kaioinfo *ki; struct kaiocb *job; ki = context; AIO_LOCK(ki); while (!TAILQ_EMPTY(&ki->kaio_syncready)) { job = TAILQ_FIRST(&ki->kaio_syncready); TAILQ_REMOVE(&ki->kaio_syncready, job, list); AIO_UNLOCK(ki); aio_schedule(job, aio_process_sync); AIO_LOCK(ki); } AIO_UNLOCK(ki); } bool aio_cancel_cleared(struct kaiocb *job) { struct kaioinfo *ki; /* * The caller should hold the same queue lock held when * aio_clear_cancel_function() was called and set this flag * ensuring this check sees an up-to-date value. However, * there is no way to assert that. 
*/ ki = job->userproc->p_aioinfo; return ((job->jobflags & KAIOCB_CLEARED) != 0); } static bool aio_clear_cancel_function_locked(struct kaiocb *job) { AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); MPASS(job->cancel_fn != NULL); if (job->jobflags & KAIOCB_CANCELLING) { job->jobflags |= KAIOCB_CLEARED; return (false); } job->cancel_fn = NULL; return (true); } bool aio_clear_cancel_function(struct kaiocb *job) { struct kaioinfo *ki; bool ret; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); ret = aio_clear_cancel_function_locked(job); AIO_UNLOCK(ki); return (ret); } static bool aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func) { AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); if (job->jobflags & KAIOCB_CANCELLED) return (false); job->cancel_fn = func; return (true); } bool aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func) { struct kaioinfo *ki; bool ret; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); ret = aio_set_cancel_function_locked(job, func); AIO_UNLOCK(ki); return (ret); } void aio_complete(struct kaiocb *job, long status, int error) { struct kaioinfo *ki; struct proc *userp; job->uaiocb._aiocb_private.error = error; job->uaiocb._aiocb_private.status = status; userp = job->userproc; ki = userp->p_aioinfo; AIO_LOCK(ki); KASSERT(!(job->jobflags & KAIOCB_FINISHED), ("duplicate aio_complete")); job->jobflags |= KAIOCB_FINISHED; if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) { TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); aio_bio_done_notify(userp, job); } AIO_UNLOCK(ki); } void aio_cancel(struct kaiocb *job) { aio_complete(job, -1, ECANCELED); } void aio_switch_vmspace(struct kaiocb *job) { vmspace_switch_aio(job->userproc->p_vmspace); } /* * The AIO daemon, most of the actual work is done in aio_process_*, * but the setup (and address space mgmt) is done in this routine. */ static void aio_daemon(void *_id) { struct kaiocb *job; struct aioproc *aiop; struct kaioinfo *ki; struct proc *p; struct vmspace *myvm; struct thread *td = curthread; int id = (intptr_t)_id; /* * Grab an extra reference on the daemon's vmspace so that it * doesn't get freed by jobs that switch to a different * vmspace. */ p = td->td_proc; myvm = vmspace_acquire_ref(p); KASSERT(p->p_textvp == NULL, ("kthread has a textvp")); /* * Allocate and ready the aio control info. There is one aiop structure * per daemon. */ aiop = uma_zalloc(aiop_zone, M_WAITOK); aiop->aioproc = p; aiop->aioprocflags = 0; /* * Wakeup parent process. (Parent sleeps to keep from blasting away * and creating too many daemons.) */ sema_post(&aio_newproc_sem); mtx_lock(&aio_job_mtx); for (;;) { /* * Take daemon off of free queue */ if (aiop->aioprocflags & AIOP_FREE) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; } /* * Check for jobs. */ while ((job = aio_selectjob(aiop)) != NULL) { mtx_unlock(&aio_job_mtx); ki = job->userproc->p_aioinfo; job->handle_fn(job); mtx_lock(&aio_job_mtx); /* Decrement the active job count. */ ki->kaio_active_count--; } /* * Disconnect from user address space. */ if (p->p_vmspace != myvm) { mtx_unlock(&aio_job_mtx); vmspace_switch_aio(myvm); mtx_lock(&aio_job_mtx); /* * We have to restart to avoid race, we only sleep if * no job can be selected. */ continue; } mtx_assert(&aio_job_mtx, MA_OWNED); TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); aiop->aioprocflags |= AIOP_FREE; /* * If daemon is inactive for a long time, allow it to exit, * thereby freeing resources. 
*/ if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy", aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) && (aiop->aioprocflags & AIOP_FREE) && num_aio_procs > target_aio_procs) break; } TAILQ_REMOVE(&aio_freeproc, aiop, list); num_aio_procs--; mtx_unlock(&aio_job_mtx); uma_zfree(aiop_zone, aiop); free_unr(aiod_unr, id); vmspace_free(myvm); KASSERT(p->p_vmspace == myvm, ("AIOD: bad vmspace for exiting daemon")); KASSERT(myvm->vm_refcnt > 1, ("AIOD: bad vm refcnt for exiting daemon: %d", myvm->vm_refcnt)); kproc_exit(0); } /* * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The * AIO daemon modifies its environment itself. */ static int aio_newproc(int *start) { int error; struct proc *p; int id; id = alloc_unr(aiod_unr); error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p, RFNOWAIT, 0, "aiod%d", id); if (error == 0) { /* * Wait until daemon is started. */ sema_wait(&aio_newproc_sem); mtx_lock(&aio_job_mtx); num_aio_procs++; if (start != NULL) (*start)--; mtx_unlock(&aio_job_mtx); } else { free_unr(aiod_unr, id); } return (error); } /* * Try the high-performance, low-overhead physio method for eligible * VCHR devices. This method doesn't use an aio helper thread, and * thus has very low overhead. * * Assumes that the caller, aio_aqueue(), has incremented the file * structure's reference count, preventing its deallocation for the * duration of this call. */ static int aio_qphysio(struct proc *p, struct kaiocb *job) { struct aiocb *cb; struct file *fp; struct bio *bp; struct buf *pbuf; struct vnode *vp; struct cdevsw *csw; struct cdev *dev; struct kaioinfo *ki; int error, ref, poff; vm_prot_t prot; cb = &job->uaiocb; fp = job->fd_file; if (fp == NULL || fp->f_type != DTYPE_VNODE) return (-1); vp = fp->f_vnode; if (vp->v_type != VCHR) return (-1); if (vp->v_bufobj.bo_bsize == 0) return (-1); if (cb->aio_nbytes % vp->v_bufobj.bo_bsize) return (-1); ref = 0; csw = devvn_refthread(vp, &dev, &ref); if (csw == NULL) return (ENXIO); if ((csw->d_flags & D_DISK) == 0) { error = -1; goto unref; } if (cb->aio_nbytes > dev->si_iosize_max) { error = -1; goto unref; } ki = p->p_aioinfo; poff = (vm_offset_t)cb->aio_buf & PAGE_MASK; if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) { if (cb->aio_nbytes > MAXPHYS) { error = -1; goto unref; } pbuf = NULL; } else { if (cb->aio_nbytes > MAXPHYS - poff) { error = -1; goto unref; } if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) { error = -1; goto unref; } job->pbuf = pbuf = (struct buf *)getpbuf(NULL); BUF_KERNPROC(pbuf); AIO_LOCK(ki); ki->kaio_buffer_count++; AIO_UNLOCK(ki); } job->bp = bp = g_alloc_bio(); bp->bio_length = cb->aio_nbytes; bp->bio_bcount = cb->aio_nbytes; bp->bio_done = aio_physwakeup; bp->bio_data = (void *)(uintptr_t)cb->aio_buf; bp->bio_offset = cb->aio_offset; bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? 
BIO_WRITE : BIO_READ; bp->bio_dev = dev; bp->bio_caller1 = (void *)job; prot = VM_PROT_READ; if (cb->aio_lio_opcode == LIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)bp->bio_data, bp->bio_length, prot, job->pages, nitems(job->pages)); if (job->npages < 0) { error = EFAULT; goto doerror; } if (pbuf != NULL) { pmap_qenter((vm_offset_t)pbuf->b_data, job->pages, job->npages); bp->bio_data = pbuf->b_data + poff; atomic_add_int(&num_buf_aio, 1); } else { bp->bio_ma = job->pages; bp->bio_ma_n = job->npages; bp->bio_ma_offset = poff; bp->bio_data = unmapped_buf; bp->bio_flags |= BIO_UNMAPPED; } /* Perform transfer. */ csw->d_strategy(bp); dev_relthread(dev, ref); return (0); doerror: if (pbuf != NULL) { AIO_LOCK(ki); ki->kaio_buffer_count--; AIO_UNLOCK(ki); relpbuf(pbuf, NULL); job->pbuf = NULL; } g_destroy_bio(bp); job->bp = NULL; unref: dev_relthread(dev, ref); return (error); } #ifdef COMPAT_FREEBSD6 static int convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig) { /* * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are * supported by AIO with the old sigevent structure. */ nsig->sigev_notify = osig->sigev_notify; switch (nsig->sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: nsig->sigev_signo = osig->__sigev_u.__sigev_signo; break; case SIGEV_KEVENT: nsig->sigev_notify_kqueue = osig->__sigev_u.__sigev_notify_kqueue; nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr; break; default: return (EINVAL); } return (0); } static int aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) { struct oaiocb *ojob; int error; bzero(kjob, sizeof(struct aiocb)); error = copyin(ujob, kjob, sizeof(struct oaiocb)); if (error) return (error); ojob = (struct oaiocb *)kjob; return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent)); } #endif static int aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob) { return (copyin(ujob, kjob, sizeof(struct aiocb))); } static long aiocb_fetch_status(struct aiocb *ujob) { return (fuword(&ujob->_aiocb_private.status)); } static long aiocb_fetch_error(struct aiocb *ujob) { return (fuword(&ujob->_aiocb_private.error)); } static int aiocb_store_status(struct aiocb *ujob, long status) { return (suword(&ujob->_aiocb_private.status, status)); } static int aiocb_store_error(struct aiocb *ujob, long error) { return (suword(&ujob->_aiocb_private.error, error)); } static int aiocb_store_kernelinfo(struct aiocb *ujob, long jobref) { return (suword(&ujob->_aiocb_private.kernelinfo, jobref)); } static int aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) { return (suword(ujobp, (long)ujob)); } static struct aiocb_ops aiocb_ops = { .copyin = aiocb_copyin, .fetch_status = aiocb_fetch_status, .fetch_error = aiocb_fetch_error, .store_status = aiocb_store_status, .store_error = aiocb_store_error, .store_kernelinfo = aiocb_store_kernelinfo, .store_aiocb = aiocb_store_aiocb, }; #ifdef COMPAT_FREEBSD6 static struct aiocb_ops aiocb_ops_osigevent = { .copyin = aiocb_copyin_old_sigevent, .fetch_status = aiocb_fetch_status, .fetch_error = aiocb_fetch_error, .store_status = aiocb_store_status, .store_error = aiocb_store_error, .store_kernelinfo = aiocb_store_kernelinfo, .store_aiocb = aiocb_store_aiocb, }; #endif /* * Queue a new AIO request. Choosing either the threaded or direct physio VCHR * technique is done in this code. 
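 *
 * For illustration, the SIGEV_KEVENT path that aio_aqueue() registers
 * below, seen from userland (a sketch; error handling abbreviated):
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <aio.h>
#include <string.h>

static ssize_t
read_kq(int kq, int fd, void *buf, size_t len)
{
	struct aiocb cb;
	struct kevent ev;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = len;
	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
	cb.aio_sigevent.sigev_notify_kqueue = kq;
	cb.aio_sigevent.sigev_value.sival_ptr = &cb;
	if (aio_read(&cb) == -1)
		return (-1);
	if (kevent(kq, NULL, 0, &ev, 1, NULL) != 1)	/* EVFILT_AIO */
		return (-1);
	return (aio_return(ev.udata));
}
#endif
/*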
*/ int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj, int type, struct aiocb_ops *ops) { struct proc *p = td->td_proc; cap_rights_t rights; struct file *fp; struct kaiocb *job; struct kaioinfo *ki; struct kevent kev; int opcode; int error; int fd, kqfd; int jid; u_short evflags; if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; ops->store_status(ujob, -1); ops->store_error(ujob, 0); ops->store_kernelinfo(ujob, -1); if (num_queue_count >= max_queue_count || ki->kaio_count >= ki->kaio_qallowed_count) { ops->store_error(ujob, EAGAIN); return (EAGAIN); } job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO); knlist_init_mtx(&job->klist, AIO_MTX(ki)); error = ops->copyin(ujob, &job->uaiocb); if (error) { ops->store_error(ujob, error); uma_zfree(aiocb_zone, job); return (error); } if (job->uaiocb.aio_nbytes > IOSIZE_MAX) { uma_zfree(aiocb_zone, job); return (EINVAL); } if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) { ops->store_error(ujob, EINVAL); uma_zfree(aiocb_zone, job); return (EINVAL); } if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) { uma_zfree(aiocb_zone, job); return (EINVAL); } ksiginfo_init(&job->ksi); /* Save userspace address of the job info. */ job->ujob = ujob; /* Get the opcode. */ if (type != LIO_NOP) job->uaiocb.aio_lio_opcode = type; opcode = job->uaiocb.aio_lio_opcode; /* * Validate the opcode and fetch the file object for the specified * file descriptor. * * XXXRW: Moved the opcode validation up here so that we don't * retrieve a file descriptor without knowing what the capability * should be.
*/ fd = job->uaiocb.aio_fildes; switch (opcode) { case LIO_WRITE: error = fget_write(td, fd, cap_rights_init(&rights, CAP_PWRITE), &fp); break; case LIO_READ: error = fget_read(td, fd, cap_rights_init(&rights, CAP_PREAD), &fp); break; case LIO_SYNC: error = fget(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp); break; case LIO_MLOCK: fp = NULL; break; case LIO_NOP: error = fget(td, fd, cap_rights_init(&rights), &fp); break; default: error = EINVAL; } if (error) { uma_zfree(aiocb_zone, job); ops->store_error(ujob, error); return (error); } if (opcode == LIO_SYNC && fp->f_vnode == NULL) { error = EINVAL; goto aqueue_fail; } if ((opcode == LIO_READ || opcode == LIO_WRITE) && job->uaiocb.aio_offset < 0 && (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) { error = EINVAL; goto aqueue_fail; } job->fd_file = fp; mtx_lock(&aio_job_mtx); jid = jobrefid++; job->seqno = jobseqno++; mtx_unlock(&aio_job_mtx); error = ops->store_kernelinfo(ujob, jid); if (error) { error = EINVAL; goto aqueue_fail; } job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid; if (opcode == LIO_NOP) { fdrop(fp, td); uma_zfree(aiocb_zone, job); return (0); } if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT) goto no_kqueue; evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags; if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) { error = EINVAL; goto aqueue_fail; } kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue; kev.ident = (uintptr_t)job->ujob; kev.filter = EVFILT_AIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags; kev.data = (intptr_t)job; kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr; error = kqfd_register(kqfd, &kev, td, 1); if (error) goto aqueue_fail; no_kqueue: ops->store_error(ujob, EINPROGRESS); job->uaiocb._aiocb_private.error = EINPROGRESS; job->userproc = p; job->cred = crhold(td->td_ucred); job->jobflags = KAIOCB_QUEUEING; job->lio = lj; if (opcode == LIO_MLOCK) { aio_schedule(job, aio_process_mlock); error = 0; } else if (fp->f_ops->fo_aio_queue == NULL) error = aio_queue_file(fp, job); else error = fo_aio_queue(fp, job); if (error) goto aqueue_fail; AIO_LOCK(ki); job->jobflags &= ~KAIOCB_QUEUEING; TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist); ki->kaio_count++; if (lj) lj->lioj_count++; atomic_add_int(&num_queue_count, 1); if (job->jobflags & KAIOCB_FINISHED) { /* * The queue callback completed the request synchronously. * The bulk of the completion is deferred in that case * until this point. 
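 *
 * For illustration, a backend whose fo_aio_queue() finishes the
 * request before returning (a sketch; example_do_io() is
 * hypothetical).  aio_complete() only marks the job KAIOCB_FINISHED
 * in that case, because KAIOCB_QUEUEING is still set, leaving the
 * notification to the code below:
 */
#if 0
static int
example_fo_aio_queue(struct file *fp, struct kaiocb *job)
{
	long done;

	done = example_do_io(fp, job);
	aio_complete(job, done, 0);
	return (0);
}
#endif
/*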
*/ aio_bio_done_notify(p, job); } else TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist); AIO_UNLOCK(ki); return (0); aqueue_fail: knlist_delete(&job->klist, curthread, 0); if (fp) fdrop(fp, td); uma_zfree(aiocb_zone, job); ops->store_error(ujob, error); return (error); } static void aio_cancel_daemon_job(struct kaiocb *job) { mtx_lock(&aio_job_mtx); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&aio_jobs, job, list); mtx_unlock(&aio_job_mtx); aio_cancel(job); } void aio_schedule(struct kaiocb *job, aio_handle_fn_t *func) { mtx_lock(&aio_job_mtx); if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) { mtx_unlock(&aio_job_mtx); aio_cancel(job); return; } job->handle_fn = func; TAILQ_INSERT_TAIL(&aio_jobs, job, list); aio_kick_nowait(job->userproc); mtx_unlock(&aio_job_mtx); } static void aio_cancel_sync(struct kaiocb *job) { struct kaioinfo *ki; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&ki->kaio_syncqueue, job, list); AIO_UNLOCK(ki); aio_cancel(job); } int aio_queue_file(struct file *fp, struct kaiocb *job) { struct aioliojob *lj; struct kaioinfo *ki; struct kaiocb *job2; struct vnode *vp; struct mount *mp; int error, opcode; bool safe; lj = job->lio; ki = job->userproc->p_aioinfo; opcode = job->uaiocb.aio_lio_opcode; if (opcode == LIO_SYNC) goto queueit; if ((error = aio_qphysio(job->userproc, job)) == 0) goto done; #if 0 /* * XXX: This means qphysio() failed with EFAULT. The current * behavior is to retry the operation via fo_read/fo_write. * Wouldn't it be better to just complete the request with an * error here? */ if (error > 0) goto done; #endif queueit: safe = false; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VREG || vp->v_type == VDIR) { mp = fp->f_vnode->v_mount; if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0) safe = true; } } if (!(safe || enable_aio_unsafe)) { counted_warning(&unsafe_warningcnt, "is attempting to use unsafe AIO requests"); return (EOPNOTSUPP); } if (opcode == LIO_SYNC) { AIO_LOCK(ki); TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) { if (job2->fd_file == job->fd_file && job2->uaiocb.aio_lio_opcode != LIO_SYNC && job2->seqno < job->seqno) { job2->jobflags |= KAIOCB_CHECKSYNC; job->pending++; } } if (job->pending != 0) { if (!aio_set_cancel_function_locked(job, aio_cancel_sync)) { AIO_UNLOCK(ki); aio_cancel(job); return (0); } TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list); AIO_UNLOCK(ki); return (0); } AIO_UNLOCK(ki); } switch (opcode) { case LIO_READ: case LIO_WRITE: aio_schedule(job, aio_process_rw); error = 0; break; case LIO_SYNC: aio_schedule(job, aio_process_sync); error = 0; break; default: error = EINVAL; } done: return (error); } static void aio_kick_nowait(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aioproc *aiop; mtx_assert(&aio_job_mtx, MA_OWNED); if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; wakeup(aiop->aioproc); } else if (num_aio_resv_start + num_aio_procs < max_aio_procs && ki->kaio_active_count + num_aio_resv_start < ki->kaio_maxactive_count) { taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task); } } static int aio_kick(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aioproc *aiop; int error, ret = 0; mtx_assert(&aio_job_mtx, MA_OWNED); retryproc: if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; wakeup(aiop->aioproc); } else if (num_aio_resv_start + num_aio_procs < 
max_aio_procs && ki->kaio_active_count + num_aio_resv_start < ki->kaio_maxactive_count) { num_aio_resv_start++; mtx_unlock(&aio_job_mtx); error = aio_newproc(&num_aio_resv_start); mtx_lock(&aio_job_mtx); if (error) { num_aio_resv_start--; goto retryproc; } } else { ret = -1; } return (ret); } static void aio_kick_helper(void *context, int pending) { struct proc *userp = context; mtx_lock(&aio_job_mtx); while (--pending >= 0) { if (aio_kick(userp)) break; } mtx_unlock(&aio_job_mtx); } /* * Support the aio_return system call, as a side-effect, kernel resources are * released. */ static int kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct kaiocb *job; struct kaioinfo *ki; long status, error; ki = p->p_aioinfo; if (ki == NULL) return (EINVAL); AIO_LOCK(ki); TAILQ_FOREACH(job, &ki->kaio_done, plist) { if (job->ujob == ujob) break; } if (job != NULL) { MPASS(job->jobflags & KAIOCB_FINISHED); status = job->uaiocb._aiocb_private.status; error = job->uaiocb._aiocb_private.error; td->td_retval[0] = status; td->td_ru.ru_oublock += job->outblock; td->td_ru.ru_inblock += job->inblock; td->td_ru.ru_msgsnd += job->msgsnd; td->td_ru.ru_msgrcv += job->msgrcv; aio_free_entry(job); AIO_UNLOCK(ki); ops->store_error(ujob, error); ops->store_status(ujob, status); } else { error = EINVAL; AIO_UNLOCK(ki); } return (error); } int sys_aio_return(struct thread *td, struct aio_return_args *uap) { return (kern_aio_return(td, uap->aiocbp, &aiocb_ops)); } /* * Allow a process to wakeup when any of the I/O requests are completed. */ static int kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist, struct timespec *ts) { struct proc *p = td->td_proc; struct timeval atv; struct kaioinfo *ki; struct kaiocb *firstjob, *job; int error, i, timo; timo = 0; if (ts) { if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } ki = p->p_aioinfo; if (ki == NULL) return (EAGAIN); if (njoblist == 0) return (0); AIO_LOCK(ki); for (;;) { firstjob = NULL; error = 0; TAILQ_FOREACH(job, &ki->kaio_all, allist) { for (i = 0; i < njoblist; i++) { if (job->ujob == ujoblist[i]) { if (firstjob == NULL) firstjob = job; if (job->jobflags & KAIOCB_FINISHED) goto RETURN; } } } /* All tasks were finished. */ if (firstjob == NULL) break; ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", timo); if (error == ERESTART) error = EINTR; if (error) break; } RETURN: AIO_UNLOCK(ki); return (error); } int sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap) { struct timespec ts, *tsp; struct aiocb **ujoblist; int error; if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc) return (EINVAL); if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) return (error); tsp = &ts; } else tsp = NULL; ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK); error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0])); if (error == 0) error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); free(ujoblist, M_AIOS); return (error); } /* * aio_cancel cancels any non-physio aio operations not currently in * progress. 
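
/*
 * [Editor's note]  The kern_aio_suspend()/kern_aio_return() pair above is
 * easiest to see from userland.  A minimal sketch, not part of this file;
 * the path is hypothetical and error handling is trimmed:
 */
#include <aio.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static char rbuf[4096];

int
example_aio_read(void)
{
	struct aiocb cb;
	const struct aiocb *list[1];
	int fd;

	fd = open("/tmp/example.dat", O_RDONLY);	/* hypothetical file */
	if (fd < 0)
		return (-1);
	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = rbuf;
	cb.aio_nbytes = sizeof(rbuf);
	cb.aio_offset = 0;
	if (aio_read(&cb) != 0)
		return (-1);
	list[0] = &cb;
	/* Sleeps in kern_aio_suspend() until the job finishes. */
	if (aio_suspend(list, 1, NULL) != 0)
		return (-1);
	/* aio_return() reaps the job and releases the kernel resources. */
	printf("error %d, returned %zd\n", aio_error(&cb), aio_return(&cb));
	close(fd);
	return (0);
}
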
*/ int sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap) { struct proc *p = td->td_proc; struct kaioinfo *ki; struct kaiocb *job, *jobn; struct file *fp; cap_rights_t rights; int error; int cancelled = 0; int notcancelled = 0; struct vnode *vp; /* Lookup file object. */ error = fget(td, uap->fd, cap_rights_init(&rights), &fp); if (error) return (error); ki = p->p_aioinfo; if (ki == NULL) goto done; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vn_isdisk(vp, &error)) { fdrop(fp, td); td->td_retval[0] = AIO_NOTCANCELED; return (0); } } AIO_LOCK(ki); TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { if ((uap->fd == job->uaiocb.aio_fildes) && ((uap->aiocbp == NULL) || (uap->aiocbp == job->ujob))) { if (aio_cancel_job(p, ki, job)) { cancelled++; } else { notcancelled++; } if (uap->aiocbp != NULL) break; } } AIO_UNLOCK(ki); done: fdrop(fp, td); if (uap->aiocbp != NULL) { if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } } if (notcancelled) { td->td_retval[0] = AIO_NOTCANCELED; return (0); } if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } td->td_retval[0] = AIO_ALLDONE; return (0); } /* * aio_error is implemented in the kernel level for compatibility purposes * only. For a user mode async implementation, it would be best to do it in * a userland subroutine. */ static int kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct kaiocb *job; struct kaioinfo *ki; int status; ki = p->p_aioinfo; if (ki == NULL) { td->td_retval[0] = EINVAL; return (0); } AIO_LOCK(ki); TAILQ_FOREACH(job, &ki->kaio_all, allist) { if (job->ujob == ujob) { if (job->jobflags & KAIOCB_FINISHED) td->td_retval[0] = job->uaiocb._aiocb_private.error; else td->td_retval[0] = EINPROGRESS; AIO_UNLOCK(ki); return (0); } } AIO_UNLOCK(ki); /* * Hack for failure of aio_aqueue. 
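
/*
 * [Editor's note]  sys_aio_cancel() above reports one of three outcomes.
 * A hedged sketch of interpreting them from userland; fd/cb come from an
 * earlier aio_read() and are illustrative only:
 */
#include <aio.h>
#include <stdio.h>

void
example_aio_cancel(int fd, struct aiocb *cb)
{
	switch (aio_cancel(fd, cb)) {
	case AIO_CANCELED:
		/* Dequeued before an aio daemon picked it up. */
		printf("canceled\n");
		break;
	case AIO_NOTCANCELED:
		/* In progress (e.g. raw-disk I/O); poll aio_error(). */
		printf("still running\n");
		break;
	case AIO_ALLDONE:
		/* Already finished; reap it with aio_return() as usual. */
		printf("already done\n");
		break;
	default:
		perror("aio_cancel");
	}
}
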
*/ status = ops->fetch_status(ujob); if (status == -1) { td->td_retval[0] = ops->fetch_error(ujob); return (0); } td->td_retval[0] = EINVAL; return (0); } int sys_aio_error(struct thread *td, struct aio_error_args *uap) { return (kern_aio_error(td, uap->aiocbp, &aiocb_ops)); } /* syscall - asynchronous read from a file (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb_ops_osigevent)); } #endif int sys_aio_read(struct thread *td, struct aio_read_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops)); } /* syscall - asynchronous write to a file (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops_osigevent)); } #endif int sys_aio_write(struct thread *td, struct aio_write_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops)); } int sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops)); } static int kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list, struct aiocb **acb_list, int nent, struct sigevent *sig, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct aiocb *job; struct kaioinfo *ki; struct aioliojob *lj; struct kevent kev; int error; int nerror; int i; if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT)) return (EINVAL); if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; lj = uma_zalloc(aiolio_zone, M_WAITOK); lj->lioj_flags = 0; lj->lioj_count = 0; lj->lioj_finished_count = 0; knlist_init_mtx(&lj->klist, AIO_MTX(ki)); ksiginfo_init(&lj->lioj_ksi); /* * Setup signal. */ if (sig && (mode == LIO_NOWAIT)) { bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal)); if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { /* Assume only new style KEVENT */ kev.filter = EVFILT_LIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; kev.ident = (uintptr_t)uacb_list; /* something unique */ kev.data = (intptr_t)lj; /* pass user defined sigval data */ kev.udata = lj->lioj_signal.sigev_value.sival_ptr; error = kqfd_register( lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1); if (error) { uma_zfree(aiolio_zone, lj); return (error); } } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) { ; } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) { if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { uma_zfree(aiolio_zone, lj); return EINVAL; } lj->lioj_flags |= LIOJ_SIGNAL; } else { uma_zfree(aiolio_zone, lj); return EINVAL; } } AIO_LOCK(ki); TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); /* * Add extra aiocb count to avoid the lio to be freed * by other threads doing aio_waitcomplete or aio_return, * and prevent event from being sent until we have queued * all tasks. */ lj->lioj_count = 1; AIO_UNLOCK(ki); /* * Get pointers to the list of I/O requests. 
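
/*
 * [Editor's note]  kern_lio_listio() above backs lio_listio(2).  A minimal
 * batched-submit sketch under LIO_WAIT; two reads from one descriptor,
 * error handling trimmed, all names local to this example:
 */
#include <sys/types.h>
#include <aio.h>
#include <string.h>

int
example_lio_batch(int fd, char *a, char *b, size_t len)
{
	struct aiocb cb0, cb1;
	struct aiocb *list[2] = { &cb0, &cb1 };

	memset(&cb0, 0, sizeof(cb0));
	cb0.aio_fildes = fd;
	cb0.aio_buf = a;
	cb0.aio_nbytes = len;
	cb0.aio_offset = 0;
	cb0.aio_lio_opcode = LIO_READ;
	cb1 = cb0;
	cb1.aio_buf = b;
	cb1.aio_offset = (off_t)len;
	/* LIO_WAIT blocks in kern_lio_listio() until both jobs finish. */
	return (lio_listio(LIO_WAIT, list, 2, NULL));
}
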
*/ nerror = 0; for (i = 0; i < nent; i++) { job = acb_list[i]; if (job != NULL) { error = aio_aqueue(td, job, lj, LIO_NOP, ops); if (error != 0) nerror++; } } error = 0; AIO_LOCK(ki); if (mode == LIO_WAIT) { while (lj->lioj_count - 1 != lj->lioj_finished_count) { ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", 0); if (error == ERESTART) error = EINTR; if (error) break; } } else { if (lj->lioj_count - 1 == lj->lioj_finished_count) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(p, &lj->lioj_signal, &lj->lioj_ksi); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } } lj->lioj_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); AIO_UNLOCK(ki); uma_zfree(aiolio_zone, lj); } else AIO_UNLOCK(ki); if (nerror) return (EIO); return (error); } /* syscall - list directed I/O (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct osigevent osig; int error, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &osig, sizeof(osig)); if (error) return (error); error = convert_old_sigevent(&osig, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); if (error == 0) error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb_ops_osigevent); free(acb_list, M_LIO); return (error); } #endif /* syscall - list directed I/O (REALTIME) */ int sys_lio_listio(struct thread *td, struct lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; int error, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &sig, sizeof(sig)); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); if (error == 0) error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list, nent, sigp, &aiocb_ops); free(acb_list, M_LIO); return (error); } static void aio_physwakeup(struct bio *bp) { struct kaiocb *job = (struct kaiocb *)bp->bio_caller1; struct proc *userp; struct kaioinfo *ki; size_t nbytes; int error, nblks; /* Release mapping into kernel space. 
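
/*
 * [Editor's note]  With LIO_NOWAIT and SIGEV_KEVENT, completion of the
 * whole list is delivered through the EVFILT_LIO knote registered above.
 * A hedged sketch; kq is an existing kqueue and list/nent are prepared as
 * in the previous example:
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <aio.h>
#include <string.h>

int
example_lio_kevent(int kq, struct aiocb **list, int nent)
{
	struct sigevent sev;
	struct kevent ev;

	memset(&sev, 0, sizeof(sev));
	sev.sigev_notify = SIGEV_KEVENT;
	sev.sigev_notify_kqueue = kq;
	sev.sigev_value.sival_ptr = list;	/* arbitrary cookie */
	if (lio_listio(LIO_NOWAIT, list, nent, &sev) != 0)
		return (-1);
	/* A single EVFILT_LIO event fires once the entire list is done. */
	if (kevent(kq, NULL, 0, &ev, 1, NULL) != 1)
		return (-1);
	return (ev.filter == EVFILT_LIO ? 0 : -1);
}
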
*/ userp = job->userproc; ki = userp->p_aioinfo; if (job->pbuf) { pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages); relpbuf(job->pbuf, NULL); job->pbuf = NULL; atomic_subtract_int(&num_buf_aio, 1); AIO_LOCK(ki); ki->kaio_buffer_count--; AIO_UNLOCK(ki); } vm_page_unhold_pages(job->pages, job->npages); bp = job->bp; job->bp = NULL; nbytes = job->uaiocb.aio_nbytes - bp->bio_resid; error = 0; if (bp->bio_flags & BIO_ERROR) error = bp->bio_error; nblks = btodb(nbytes); if (job->uaiocb.aio_lio_opcode == LIO_WRITE) job->outblock += nblks; else job->inblock += nblks; if (error) aio_complete(job, -1, error); else aio_complete(job, nbytes, 0); g_destroy_bio(bp); } /* syscall - wait for the next completion of an aio request */ static int kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp, struct timespec *ts, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct timeval atv; struct kaioinfo *ki; struct kaiocb *job; struct aiocb *ujob; long error, status; int timo; ops->store_aiocb(ujobp, NULL); if (ts == NULL) { timo = 0; } else if (ts->tv_sec == 0 && ts->tv_nsec == 0) { timo = -1; } else { if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000)) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; error = 0; job = NULL; AIO_LOCK(ki); while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) { if (timo == -1) { error = EWOULDBLOCK; break; } ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiowc", timo); if (timo && error == ERESTART) error = EINTR; if (error) break; } if (job != NULL) { MPASS(job->jobflags & KAIOCB_FINISHED); ujob = job->ujob; status = job->uaiocb._aiocb_private.status; error = job->uaiocb._aiocb_private.error; td->td_retval[0] = status; td->td_ru.ru_oublock += job->outblock; td->td_ru.ru_inblock += job->inblock; td->td_ru.ru_msgsnd += job->msgsnd; td->td_ru.ru_msgrcv += job->msgrcv; aio_free_entry(job); AIO_UNLOCK(ki); ops->store_aiocb(ujobp, ujob); ops->store_error(ujob, error); ops->store_status(ujob, status); } else AIO_UNLOCK(ki); return (error); } int sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) { struct timespec ts, *tsp; int error; if (uap->timeout) { /* Get timespec struct. */ error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops)); } static int kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob, struct aiocb_ops *ops) { if (op != O_SYNC) /* XXX lack of O_DSYNC */ return (EINVAL); return (aio_aqueue(td, ujob, NULL, LIO_SYNC, ops)); } int sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap) { return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops)); } /* kqueue attach function */ static int filt_aioattach(struct knote *kn) { struct kaiocb *job; job = (struct kaiocb *)(uintptr_t)kn->kn_sdata; /* * The job pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. 
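
/*
 * [Editor's note]  aio_waitcomplete(2) is the FreeBSD-specific "reap any
 * finished job" interface implemented by kern_aio_waitcomplete() above.
 * A minimal consumer loop; the submission side is not shown:
 */
#include <sys/types.h>
#include <aio.h>
#include <stdio.h>

void
example_reap_loop(int njobs)
{
	struct aiocb *done;
	ssize_t len;

	while (njobs-- > 0) {
		/* NULL timeout: sleep until some queued job finishes. */
		len = aio_waitcomplete(&done, NULL);
		if (len < 0) {
			perror("aio_waitcomplete");
			break;
		}
		printf("job %p done, %zd bytes\n", (void *)done, len);
	}
}
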
*/ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_ptr.p_aio = job; kn->kn_flags &= ~EV_FLAG1; knlist_add(&job->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_aiodetach(struct knote *kn) { struct knlist *knl; knl = &kn->kn_ptr.p_aio->klist; knl->kl_lock(knl->kl_lockarg); if (!knlist_empty(knl)) knlist_remove(knl, kn, 1); knl->kl_unlock(knl->kl_lockarg); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_aio(struct knote *kn, long hint) { struct kaiocb *job = kn->kn_ptr.p_aio; kn->kn_data = job->uaiocb._aiocb_private.error; if (!(job->jobflags & KAIOCB_FINISHED)) return (0); kn->kn_flags |= EV_EOF; return (1); } /* kqueue attach function */ static int filt_lioattach(struct knote *kn) { struct aioliojob *lj; lj = (struct aioliojob *)(uintptr_t)kn->kn_sdata; /* * The aioliojob pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_ptr.p_lio = lj; kn->kn_flags &= ~EV_FLAG1; knlist_add(&lj->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_liodetach(struct knote *kn) { struct knlist *knl; knl = &kn->kn_ptr.p_lio->klist; knl->kl_lock(knl->kl_lockarg); if (!knlist_empty(knl)) knlist_remove(knl, kn, 1); knl->kl_unlock(knl->kl_lockarg); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_lio(struct knote *kn, long hint) { struct aioliojob * lj = kn->kn_ptr.p_lio; return (lj->lioj_flags & LIOJ_KEVENT_POSTED); } #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include #include struct __aiocb_private32 { int32_t status; int32_t error; uint32_t kernelinfo; }; #ifdef COMPAT_FREEBSD6 typedef struct oaiocb32 { int aio_fildes; /* File descriptor */ uint64_t aio_offset __packed; /* File offset for I/O */ uint32_t aio_buf; /* I/O buffer in process space */ uint32_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent32 aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private32 _aiocb_private; } oaiocb32_t; #endif typedef struct aiocb32 { int32_t aio_fildes; /* File descriptor */ uint64_t aio_offset __packed; /* File offset for I/O */ uint32_t aio_buf; /* I/O buffer in process space */ uint32_t aio_nbytes; /* Number of bytes for I/O */ int __spare__[2]; uint32_t __spare2__; int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private32 _aiocb_private; struct sigevent32 aio_sigevent; /* Signal to deliver */ } aiocb32_t; #ifdef COMPAT_FREEBSD6 static int convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig) { /* * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are * supported by AIO with the old sigevent structure. 
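
/*
 * [Editor's note]  A single aiocb can likewise deliver completion as an
 * EVFILT_AIO knote (the filt_aio filter above).  Hedged sketch: userland
 * only fills in the sigevent; the kernel registers the knote itself and
 * passes the aiocb pointer back as the event ident:
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <aio.h>
#include <string.h>

ssize_t
example_aio_kevent(int kq, int fd, char *buf, size_t len)
{
	struct aiocb cb;
	struct kevent ev;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = len;
	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
	cb.aio_sigevent.sigev_notify_kqueue = kq;
	if (aio_read(&cb) != 0)
		return (-1);
	if (kevent(kq, NULL, 0, &ev, 1, NULL) != 1)
		return (-1);
	return (aio_return((struct aiocb *)ev.ident));
}
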
*/ CP(*osig, *nsig, sigev_notify); switch (nsig->sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: nsig->sigev_signo = osig->__sigev_u.__sigev_signo; break; case SIGEV_KEVENT: nsig->sigev_notify_kqueue = osig->__sigev_u.__sigev_notify_kqueue; PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr); break; default: return (EINVAL); } return (0); } static int aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) { struct oaiocb32 job32; int error; bzero(kjob, sizeof(struct aiocb)); error = copyin(ujob, &job32, sizeof(job32)); if (error) return (error); CP(job32, *kjob, aio_fildes); CP(job32, *kjob, aio_offset); PTRIN_CP(job32, *kjob, aio_buf); CP(job32, *kjob, aio_nbytes); CP(job32, *kjob, aio_lio_opcode); CP(job32, *kjob, aio_reqprio); CP(job32, *kjob, _aiocb_private.status); CP(job32, *kjob, _aiocb_private.error); PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); return (convert_old_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent)); } #endif static int aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob) { struct aiocb32 job32; int error; error = copyin(ujob, &job32, sizeof(job32)); if (error) return (error); CP(job32, *kjob, aio_fildes); CP(job32, *kjob, aio_offset); PTRIN_CP(job32, *kjob, aio_buf); CP(job32, *kjob, aio_nbytes); CP(job32, *kjob, aio_lio_opcode); CP(job32, *kjob, aio_reqprio); CP(job32, *kjob, _aiocb_private.status); CP(job32, *kjob, _aiocb_private.error); PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent)); } static long aiocb32_fetch_status(struct aiocb *ujob) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (fuword32(&ujob32->_aiocb_private.status)); } static long aiocb32_fetch_error(struct aiocb *ujob) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (fuword32(&ujob32->_aiocb_private.error)); } static int aiocb32_store_status(struct aiocb *ujob, long status) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.status, status)); } static int aiocb32_store_error(struct aiocb *ujob, long error) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.error, error)); } static int aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref)); } static int aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) { return (suword32(ujobp, (long)ujob)); } static struct aiocb_ops aiocb32_ops = { .copyin = aiocb32_copyin, .fetch_status = aiocb32_fetch_status, .fetch_error = aiocb32_fetch_error, .store_status = aiocb32_store_status, .store_error = aiocb32_store_error, .store_kernelinfo = aiocb32_store_kernelinfo, .store_aiocb = aiocb32_store_aiocb, }; #ifdef COMPAT_FREEBSD6 static struct aiocb_ops aiocb32_ops_osigevent = { .copyin = aiocb32_copyin_old_sigevent, .fetch_status = aiocb32_fetch_status, .fetch_error = aiocb32_fetch_error, .store_status = aiocb32_store_status, .store_error = aiocb32_store_error, .store_kernelinfo = aiocb32_store_kernelinfo, .store_aiocb = aiocb32_store_aiocb, }; #endif int freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap) { return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } int freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap) { struct timespec32 ts32; struct timespec ts, *tsp; struct aiocb **ujoblist; uint32_t *ujoblist32; int error, i; if 
(uap->nent < 0 || uap->nent > max_aio_queue_per_proc) return (EINVAL); if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0) return (error); CP(ts32, ts, tv_sec); CP(ts32, ts, tv_nsec); tsp = &ts; } else tsp = NULL; ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK); ujoblist32 = (uint32_t *)ujoblist; error = copyin(uap->aiocbp, ujoblist32, uap->nent * sizeof(ujoblist32[0])); if (error == 0) { for (i = uap->nent - 1; i >= 0; i--) ujoblist[i] = PTRIN(ujoblist32[i]); error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); } free(ujoblist, M_AIOS); return (error); } int freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap) { return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_aio_read(struct thread *td, struct freebsd6_freebsd32_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb32_ops_osigevent)); } #endif int freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_aio_write(struct thread *td, struct freebsd6_freebsd32_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb32_ops_osigevent)); } #endif int freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb32_ops)); } int freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK, &aiocb32_ops)); } int freebsd32_aio_waitcomplete(struct thread *td, struct freebsd32_aio_waitcomplete_args *uap) { struct timespec32 ts32; struct timespec ts, *tsp; int error; if (uap->timeout) { /* Get timespec struct. 
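
/*
 * [Editor's note]  The aiocb32 thunks above all follow one pattern: copy in
 * the 32-bit layout, then widen field by field (PTRIN() turns a 32-bit user
 * pointer into a native one).  A self-contained illustration of the idiom
 * with hypothetical structures:
 */
#include <stdint.h>
#include <string.h>

struct cb32 {				/* compat (wire) layout */
	int32_t		fd;
	uint64_t	offset;
	uint32_t	buf;		/* 32-bit user pointer */
	uint32_t	nbytes;
};

struct cb {				/* native layout */
	int		fd;
	uint64_t	offset;
	void		*buf;
	size_t		nbytes;
};

static void
cb_widen(const struct cb32 *c32, struct cb *c)
{
	memset(c, 0, sizeof(*c));
	c->fd = c32->fd;			/* CP() */
	c->offset = c32->offset;		/* CP() */
	c->buf = (void *)(uintptr_t)c32->buf;	/* PTRIN() */
	c->nbytes = c32->nbytes;		/* CP() */
}
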
*/ error = copyin(uap->timeout, &ts32, sizeof(ts32)); if (error) return (error); CP(ts32, ts, tv_sec); CP(ts32, ts, tv_nsec); tsp = &ts; } else tsp = NULL; return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp, &aiocb32_ops)); } int freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap) { return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_lio_listio(struct thread *td, struct freebsd6_freebsd32_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct osigevent32 osig; uint32_t *acb_list32; int error, i, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &osig, sizeof(osig)); if (error) return (error); error = convert_old_sigevent32(&osig, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); if (error) { free(acb_list32, M_LIO); return (error); } acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); for (i = 0; i < nent; i++) acb_list[i] = PTRIN(acb_list32[i]); free(acb_list32, M_LIO); error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb32_ops_osigevent); free(acb_list, M_LIO); return (error); } #endif int freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct sigevent32 sig32; uint32_t *acb_list32; int error, i, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &sig32, sizeof(sig32)); if (error) return (error); error = convert_sigevent32(&sig32, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); if (error) { free(acb_list32, M_LIO); return (error); } acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); for (i = 0; i < nent; i++) acb_list[i] = PTRIN(acb_list32[i]); free(acb_list32, M_LIO); error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb32_ops); free(acb_list, M_LIO); return (error); } #endif Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c (revision 326270) +++ head/sys/kern/vfs_bio.c (revision 326271) @@ -1,5053 +1,5055 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1994,1997 John S. Dyson * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_compat.h" #include "opt_swap.h" static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { .bop_name = "buf_ops_bio", .bop_write = bufwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; static struct buf *buf; /* buffer header pool */ extern struct buf *swbuf; /* Swap buffer header pool. 
*/ caddr_t unmapped_buf; /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ struct proc *bufdaemonproc; struct proc *bufspacedaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, int newbsize); static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_clean_pages_dirty_buf(struct buf *bp); static void vfs_setdirty_locked_object(struct buf *bp); static void vfs_vmio_invalidate(struct buf *bp); static void vfs_vmio_truncate(struct buf *bp, int npages); static void vfs_vmio_extend(struct buf *bp, int npages, int size); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int, void (*)(struct buf *)); static int buf_flush(struct vnode *vp, int); static int buf_recycle(bool); static int buf_scan(bool); static int flushbufqueues(struct vnode *, int, int); static void buf_daemon(void); static void bremfreel(struct buf *bp); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); static void bufkva_reclaim(vmem_t *, int); static void bufkva_free(struct buf *); static int buf_import(void *, void **, int, int); static void buf_release(void *, void **, int); static void maxbcachebuf_adjust(void); #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); #endif int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); long runningbufspace; SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); static long bufspace; #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers"); #else SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, "Physical memory used for buffers"); #endif static long bufkvaspace; SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0, "Kernel virtual memory used for buffers"); static long maxbufspace; SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0, "Maximum allowed value of bufspace (including metadata)"); static long bufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static long maxbufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static long lobufspace; SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0, "Minimum amount of buffers we want to have"); long hibufspace; SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0, "Maximum allowed value of bufspace (excluding metadata)"); long bufspacethresh; SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh, 0, "Bufspace consumed before waking the daemon to free some"); static int buffreekvacnt; SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, "Number of times we 
have freed the KVA space from some buffer"); static int bufdefragcnt; SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, "Number of times we have had to repeat buffer allocation to defragment"); static long lorunningspace; SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L", "Minimum preferred space used for in-progress I/O"); static long hirunningspace; SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L", "Maximum amount of space to use for in-progress I/O"); int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); int bdwriteskip; SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int numdirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, "When the number of dirty buffers is considered severe"); int dirtybufthresh; SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh, 0, "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, "Target number of free buffers"); static int hifreebuffers; SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, "Threshold for clean buffer recycling"); static int getnewbufcalls; SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, "Number of calls to getnewbuf"); static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, "Number of times getnewbuf has had to restart a buffer acquisition"); static int mappingrestarts; SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); static int numbufallocfails; SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0, "Number of times buffer allocations failed"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); static long notbufdflushes; SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, 0, "Number of dirty buffer flushes done by the bufdaemon helpers"); static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0, "Number of barrier writes"); SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, 
CTLFLAG_RD, &unmapped_buf_allowed, 0, "Permit the use of the unmapped i/o"); int maxbcachebuf = MAXBCACHEBUF; SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0, "Maximum size of a buffer cache block"); /* * This lock synchronizes access to bd_request. */ static struct mtx_padalign __exclusive_cache_line bdlock; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx_padalign __exclusive_cache_line rbreqlock; /* * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. */ static struct rwlock_padalign __exclusive_cache_line nblock; /* * Lock that protects bdirtywait. */ static struct mtx_padalign __exclusive_cache_line bdirtylock; /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * Request/wakeup point for the bufspace daemon. */ static int bufspace_request; /* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. This may be necessary to push out excess dependencies or * defragment the address space where a simple count of the number of dirty * buffers is insufficient to characterize the demand for flushing them. */ static int bd_speedupreq; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. * Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * Synchronization (sleep/wakeup) variable for buffer requests. * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done * by and/or. * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(), * getnewbuf(), and getblk(). */ static volatile int needsbuffer; /* * Synchronization for bwillwrite() waiters. */ static int bdirtywait; /* * Definitions for the buffer free lists. */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_EMPTY 1 /* empty buffer headers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ #define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */ /* Maximum number of clean buffer queues. */ #define CLEAN_QUEUES 16 /* Configured number of clean queues. */ static int clean_queues; /* Maximum number of buffer queues. */ #define BUFFER_QUEUES (QUEUE_CLEAN + CLEAN_QUEUES) /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; #ifdef INVARIANTS static int bq_len[BUFFER_QUEUES]; #endif /* * Lock for each bufqueue */ static struct mtx_padalign __exclusive_cache_line bqlocks[BUFFER_QUEUES]; /* * per-cpu empty buffer cache. */ uma_zone_t buf_zone; /* * Single global constant for BUF_WMESG, to avoid getting multiple references. * buf_wmesg is referred from macros. 
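
/*
 * [Editor's note]  Every knob declared above is visible through sysctl(3).
 * A small userland observer for two of the counters; the names are taken
 * straight from the SYSCTL_LONG() declarations:
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
example_bufstats(void)
{
	long bufspace, runningbufspace;
	size_t len;

	len = sizeof(bufspace);
	if (sysctlbyname("vfs.bufspace", &bufspace, &len, NULL, 0) != 0)
		return (-1);
	len = sizeof(runningbufspace);
	if (sysctlbyname("vfs.runningbufspace", &runningbufspace, &len,
	    NULL, 0) != 0)
		return (-1);
	printf("bufspace %ld, runningbufspace %ld\n", bufspace,
	    runningbufspace);
	return (0);
}
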
*/ const char *buf_wmesg = BUF_WMESG; static int sysctl_runningspace(SYSCTL_HANDLER_ARGS) { long value; int error; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&rbreqlock); if (arg1 == &hirunningspace) { if (value < lorunningspace) error = EINVAL; else hirunningspace = value; } else { KASSERT(arg1 == &lorunningspace, ("%s: unknown arg1", __func__)); if (value > hirunningspace) error = EINVAL; else lorunningspace = value; } mtx_unlock(&rbreqlock); return (error); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int ivalue; if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) return (sysctl_handle_long(oidp, arg1, arg2, req)); lvalue = *(long *)arg1; if (lvalue > INT_MAX) /* On overflow, still write out a long to trigger ENOMEM. */ return (sysctl_handle_long(oidp, &lvalue, 0, req)); ivalue = lvalue; return (sysctl_handle_int(oidp, &ivalue, 0, req)); } #endif static int bqcleanq(void) { static int nextq; return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN); } static int bqisclean(int qindex) { return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES); } /* * bqlock: * * Return the appropriate queue lock based on the index. */ static inline struct mtx * bqlock(int qindex) { return (struct mtx *)&bqlocks[qindex]; } /* * bdirtywakeup: * * Wakeup any bwillwrite() waiters. */ static void bdirtywakeup(void) { mtx_lock(&bdirtylock); if (bdirtywait) { bdirtywait = 0; wakeup(&bdirtywait); } mtx_unlock(&bdirtylock); } /* * bdirtysub: * * Decrement the numdirtybuffers count by one and wakeup any * threads blocked in bwillwrite(). */ static void bdirtysub(void) { if (atomic_fetchadd_int(&numdirtybuffers, -1) == (lodirtybuffers + hidirtybuffers) / 2) bdirtywakeup(); } /* * bdirtyadd: * * Increment the numdirtybuffers count by one and wakeup the buf * daemon if needed. */ static void bdirtyadd(void) { /* * Only do the wakeup once as we cross the boundary. The * buf daemon will keep running until the condition clears. */ if (atomic_fetchadd_int(&numdirtybuffers, 1) == (lodirtybuffers + hidirtybuffers) / 2) bd_wakeup(); } /* * bufspace_wakeup: * * Called when buffer space is potentially available for recovery. * getnewbuf() will block on this flag when it is unable to free * sufficient buffer space. Buffer space becomes recoverable when * bp's get placed back in the queues. */ static void bufspace_wakeup(void) { /* * If someone is waiting for bufspace, wake them up. * * Since needsbuffer is set prior to doing an additional queue * scan it is safe to check for the flag prior to acquiring the * lock. The thread that is preparing to scan again before * blocking would discover the buf we released. */ if (needsbuffer) { rw_rlock(&nblock); if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1) wakeup(__DEVOLATILE(void *, &needsbuffer)); rw_runlock(&nblock); } } /* * bufspace_daemonwakeup: * * Wakeup the daemon responsible for freeing clean bufs. */ static void bufspace_daemonwakeup(void) { rw_rlock(&nblock); if (bufspace_request == 0) { bufspace_request = 1; wakeup(&bufspace_request); } rw_runlock(&nblock); } /* * bufspace_adjust: * * Adjust the reported bufspace for a KVA managed buffer, possibly * waking any waiters. 
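
/*
 * [Editor's note]  bdirtyadd()/bdirtysub() above wake sleepers only when
 * the counter crosses (lodirtybuffers + hidirtybuffers) / 2, not on every
 * change.  The same idiom in portable C11; the thresholds and the wakeup
 * are stand-ins:
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int dirty_count;
static int lo_thresh = 10, hi_thresh = 30;

static void
wake_waiters(void)
{
	printf("wakeup at midpoint %d\n", (lo_thresh + hi_thresh) / 2);
}

static void
dirty_add(void)
{
	/*
	 * fetch-add returns the OLD value, so the comparison is true for
	 * exactly one caller per crossing of the midpoint.
	 */
	if (atomic_fetch_add(&dirty_count, 1) == (lo_thresh + hi_thresh) / 2)
		wake_waiters();
}

static void
dirty_sub(void)
{
	if (atomic_fetch_add(&dirty_count, -1) == (lo_thresh + hi_thresh) / 2)
		wake_waiters();
}
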
*/ static void bufspace_adjust(struct buf *bp, int bufsize) { long space; int diff; KASSERT((bp->b_flags & B_MALLOC) == 0, ("bufspace_adjust: malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) { atomic_subtract_long(&bufspace, -diff); bufspace_wakeup(); } else { space = atomic_fetchadd_long(&bufspace, diff); /* Wake up the daemon on the transition. */ if (space < bufspacethresh && space + diff >= bufspacethresh) bufspace_daemonwakeup(); } bp->b_bufsize = bufsize; } /* * bufspace_reserve: * * Reserve bufspace before calling allocbuf(). metadata has a * different space limit than data. */ static int bufspace_reserve(int size, bool metadata) { long limit; long space; if (metadata) limit = maxbufspace; else limit = hibufspace; do { space = bufspace; if (space + size > limit) return (ENOSPC); } while (atomic_cmpset_long(&bufspace, space, space + size) == 0); /* Wake up the daemon on the transition. */ if (space < bufspacethresh && space + size >= bufspacethresh) bufspace_daemonwakeup(); return (0); } /* * bufspace_release: * * Release reserved bufspace after bufspace_adjust() has consumed it. */ static void bufspace_release(int size) { atomic_subtract_long(&bufspace, size); bufspace_wakeup(); } /* * bufspace_wait: * * Wait for bufspace, acting as the buf daemon if a locked vnode is * supplied. needsbuffer must be set in a safe fashion prior to * polling for space. The operation must be re-tried on return. */ static void bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo) { struct thread *td; int error, fl, norunbuf; if ((gbflags & GB_NOWAIT_BD) != 0) return; td = curthread; rw_wlock(&nblock); while (needsbuffer != 0) { if (vp != NULL && vp->v_type != VCHR && (td->td_pflags & TDP_BUFNEED) == 0) { rw_wunlock(&nblock); /* * getblk() is called with a vnode locked, and * some majority of the dirty buffers may as * well belong to the vnode. Flushing the * buffers there would make a progress that * cannot be achieved by the buf_daemon, that * cannot lock the vnode. */ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | (td->td_pflags & TDP_NORUNNINGBUF); /* * Play bufdaemon. The getnewbuf() function * may be called while the thread owns lock * for another dirty buffer for the same * vnode, which makes it impossible to use * VOP_FSYNC() there, due to the buffer lock * recursion. */ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; fl = buf_flush(vp, flushbufqtarget); td->td_pflags &= norunbuf; rw_wlock(&nblock); if (fl != 0) continue; if (needsbuffer == 0) break; } error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, (PRIBIO + 4) | slpflag, "newbuf", slptimeo); if (error != 0) break; } rw_wunlock(&nblock); } /* * bufspace_daemon: * * buffer space management daemon. Tries to maintain some marginal * amount of free buffer space so that requesting processes neither * block nor work to reclaim buffers. */ static void bufspace_daemon(void) { for (;;) { kproc_suspend_check(bufspacedaemonproc); /* * Free buffers from the clean queue until we meet our * targets. * * Theory of operation: The buffer cache is most efficient * when some free buffer headers and space are always * available to getnewbuf(). This daemon attempts to prevent * the excessive blocking and synchronization associated * with shortfall. It goes through three phases according * demand: * * 1) The daemon wakes up voluntarily once per-second * during idle periods when the counters are below * the wakeup thresholds (bufspacethresh, lofreebuffers). 
* * 2) The daemon wakes up as we cross the thresholds * ahead of any potential blocking. This may bounce * slightly according to the rate of consumption and * release. * * 3) The daemon and consumers are starved for working * clean buffers. This is the 'bufspace' sleep below * which will inefficiently trade bufs with bqrelse * until we return to condition 2. */ while (bufspace > lobufspace || numfreebuffers < hifreebuffers) { if (buf_recycle(false) != 0) { atomic_set_int(&needsbuffer, 1); if (buf_recycle(false) != 0) { rw_wlock(&nblock); if (needsbuffer) rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, PRIBIO|PDROP, "bufspace", hz/10); else rw_wunlock(&nblock); } } maybe_yield(); } /* * Re-check our limits under the exclusive nblock. */ rw_wlock(&nblock); if (bufspace < bufspacethresh && numfreebuffers > lofreebuffers) { bufspace_request = 0; rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP, "-", hz); } else rw_wunlock(&nblock); } } static struct kproc_desc bufspace_kp = { "bufspacedaemon", bufspace_daemon, &bufspacedaemonproc }; SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &bufspace_kp); /* * bufmallocadjust: * * Adjust the reported bufspace for a malloc managed buffer, possibly * waking any waiters. */ static void bufmallocadjust(struct buf *bp, int bufsize) { int diff; KASSERT((bp->b_flags & B_MALLOC) != 0, ("bufmallocadjust: non-malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) atomic_subtract_long(&bufmallocspace, -diff); else atomic_add_long(&bufmallocspace, diff); bp->b_bufsize = bufsize; } /* * runningwakeup: * * Wake up processes that are waiting on asynchronous writes to fall * below lorunningspace. */ static void runningwakeup(void) { mtx_lock(&rbreqlock); if (runningbufreq) { runningbufreq = 0; wakeup(&runningbufreq); } mtx_unlock(&rbreqlock); } /* * runningbufwakeup: * * Decrement the outstanding write count according. */ void runningbufwakeup(struct buf *bp) { long space, bspace; bspace = bp->b_runningbufspace; if (bspace == 0) return; space = atomic_fetchadd_long(&runningbufspace, -bspace); KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld", space, bspace)); bp->b_runningbufspace = 0; /* * Only acquire the lock and wakeup on the transition from exceeding * the threshold to falling below it. */ if (space < lorunningspace) return; if (space - bspace > lorunningspace) return; runningwakeup(); } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ void waitrunningbufspace(void) { mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { runningbufreq = 1; msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); } mtx_unlock(&rbreqlock); } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. 
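
/*
 * [Editor's note]  bufspace_reserve() above claims space with a lock-free
 * compare-and-set loop.  The same pattern in portable C11; the limit is a
 * stand-in value:
 */
#include <stdatomic.h>
#include <errno.h>

static atomic_long space_used;
static long space_limit = 1L << 20;

static int
example_space_reserve(long size)
{
	long cur;

	do {
		cur = atomic_load(&space_used);
		if (cur + size > space_limit)
			return (ENOSPC);	/* would exceed the limit */
	} while (!atomic_compare_exchange_weak(&space_used, &cur,
	    cur + size));
	return (0);
}
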
 */
static __inline void
vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off,
    vm_offset_t size, vm_page_t m)
{

	VM_OBJECT_ASSERT_LOCKED(m->object);
	if (bp->b_flags & B_CACHE) {
		int base = (foff + off) & PAGE_MASK;
		if (vm_page_is_valid(m, base, size) == 0)
			bp->b_flags &= ~B_CACHE;
	}
}

/* Wake up the buffer daemon if necessary */
static __inline void
bd_wakeup(void)
{

	mtx_lock(&bdlock);
	if (bd_request == 0) {
		bd_request = 1;
		wakeup(&bd_request);
	}
	mtx_unlock(&bdlock);
}

/*
 * Adjust the maxbcachebuf tunable.
 */
static void
maxbcachebuf_adjust(void)
{
	int i;

	/*
	 * maxbcachebuf must be a power of 2 >= MAXBSIZE.
	 */
	i = 2;
	while (i * 2 <= maxbcachebuf)
		i *= 2;
	maxbcachebuf = i;
	if (maxbcachebuf < MAXBSIZE)
		maxbcachebuf = MAXBSIZE;
	if (maxbcachebuf > MAXPHYS)
		maxbcachebuf = MAXPHYS;
	if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF)
		printf("maxbcachebuf=%d\n", maxbcachebuf);
}

/*
 * bd_speedup - speedup the buffer cache flushing code
 */
void
bd_speedup(void)
{
	int needwake;

	mtx_lock(&bdlock);
	needwake = 0;
	if (bd_speedupreq == 0 || bd_request == 0)
		needwake = 1;
	bd_speedupreq = 1;
	bd_request = 1;
	if (needwake)
		wakeup(&bd_request);
	mtx_unlock(&bdlock);
}

#ifndef NSWBUF_MIN
#define	NSWBUF_MIN	16
#endif

#ifdef __i386__
#define	TRANSIENT_DENOM	5
#else
#define	TRANSIENT_DENOM	10
#endif

/*
 * Calculate buffer cache scaling values and reserve space for buffer
 * headers.  This is called during low-level kernel initialization and
 * may be called more than once.  We CANNOT write to the memory area
 * being reserved at this time.
 */
caddr_t
kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
{
	int tuned_nbuf;
	long maxbuf, maxbuf_sz, buf_sz, biotmap_sz;

	/*
	 * physmem_est is in pages.  Convert it to kilobytes (assumes
	 * PAGE_SIZE is >= 1K)
	 */
	physmem_est = physmem_est * (PAGE_SIZE / 1024);

	maxbcachebuf_adjust();

	/*
	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
	 * For the first 64MB of ram nominally allocate sufficient buffers to
	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
	 * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
	 * the buffer cache we limit the eventual kva reservation to
	 * maxbcache bytes.
	 *
	 * factor represents the 1/4 x ram conversion.
	 */
	if (nbuf == 0) {
		int factor = 4 * BKVASIZE / 1024;

		nbuf = 50;
		if (physmem_est > 4096)
			nbuf += min((physmem_est - 4096) / factor,
			    65536 / factor);
		if (physmem_est > 65536)
			nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
			    32 * 1024 * 1024 / (factor * 5));

		if (maxbcache && nbuf > maxbcache / BKVASIZE)
			nbuf = maxbcache / BKVASIZE;
		tuned_nbuf = 1;
	} else
		tuned_nbuf = 0;

	/* XXX Avoid unsigned long overflows later on with maxbufspace. */
	maxbuf = (LONG_MAX / 3) / BKVASIZE;
	if (nbuf > maxbuf) {
		if (!tuned_nbuf)
			printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
			    maxbuf);
		nbuf = maxbuf;
	}

	/*
	 * Ideal allocation size for the transient bio submap is 10%
	 * of the maximal space buffer map.  This roughly corresponds
	 * to the amount of the buffer mapped for typical UFS load.
	 *
	 * Clip the buffer map to reserve space for the transient
	 * BIOs, if its extent is bigger than 90% (80% on i386) of the
	 * maximum buffer map extent on the platform.
	 *
	 * Falling back to maxbuf when maxbcache is unset allows us to
	 * leave the buffer KVA untrimmed on architectures with ample
	 * KVA space.
	 */
	if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
		maxbuf_sz = maxbcache != 0 ?
maxbcache : maxbuf * BKVASIZE; buf_sz = (long)nbuf * BKVASIZE; if (buf_sz < maxbuf_sz / TRANSIENT_DENOM * (TRANSIENT_DENOM - 1)) { /* * There is more KVA than memory. Do not * adjust buffer map size, and assign the rest * of maxbuf to transient map. */ biotmap_sz = maxbuf_sz - buf_sz; } else { /* * Buffer map spans all KVA we could afford on * this platform. Give 10% (20% on i386) of * the buffer map to the transient bio map. */ biotmap_sz = buf_sz / TRANSIENT_DENOM; buf_sz -= biotmap_sz; } if (biotmap_sz / INT_MAX > MAXPHYS) bio_transient_maxcnt = INT_MAX; else bio_transient_maxcnt = biotmap_sz / MAXPHYS; /* * Artificially limit to 1024 simultaneous in-flight I/Os * using the transient mapping. */ if (bio_transient_maxcnt > 1024) bio_transient_maxcnt = 1024; if (tuned_nbuf) nbuf = buf_sz / BKVASIZE; } /* * swbufs are used as temporary holders for I/O, such as paging I/O. * We have no less then 16 and no more then 256. */ nswbuf = min(nbuf / 4, 256); TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf); if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; /* * Reserve space for the buffer cache buffers */ swbuf = (void *)v; v = (caddr_t)(swbuf + nswbuf); buf = (void *)v; v = (caddr_t)(buf + nbuf); return(v); } /* Initialize the buffer subsystem. Called before use of any buffers. */ void bufinit(void) { struct buf *bp; int i; KASSERT(maxbcachebuf >= MAXBSIZE, ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf, MAXBSIZE)); mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF); mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF); for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++) mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); rw_init(&nblock, "needsbuffer lock"); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); unmapped_buf = (caddr_t)kva_alloc(MAXPHYS); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_xflags = 0; bp->b_data = bp->b_kvabase = unmapped_buf; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); #ifdef INVARIANTS bq_len[QUEUE_EMPTY]++; #endif } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by metadata. hibufspace is the nominal maximum * used by most other requests. The differential is required to * ensure that metadata deadlocks don't occur. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. XXX This is less true with vmem. We could use * PAGE_SIZE. */ maxbufspace = (long)nbuf * BKVASIZE; hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10); lobufspace = (hibufspace / 20) * 19; /* 95% */ bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2; /* * Note: The 16 MiB upper limit for hirunningspace was chosen * arbitrarily and may need further tuning. It corresponds to * 128 outstanding write IO requests (if IO size is 128 KiB), * which fits with many RAID controllers' tagged queuing limits. 
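
/*
 * [Editor's note]  The nbuf auto-tuning in kern_vfs_bio_buffer_alloc()
 * above is plain integer arithmetic.  This stand-alone copy reproduces it
 * so the scaling is easy to check; BKVASIZE is assumed to be 16 KiB here:
 */
#include <stdio.h>

#define	BKVASIZE_ASSUMED	16384

static long
lmin_(long a, long b)
{
	return (a < b ? a : b);
}

static long
tune_nbuf(long physmem_kb)
{
	long factor = 4 * BKVASIZE_ASSUMED / 1024;
	long nbuf = 50;

	if (physmem_kb > 4096)		/* 1/4 of the first 64 MB of RAM */
		nbuf += lmin_((physmem_kb - 4096) / factor, 65536 / factor);
	if (physmem_kb > 65536)		/* 1/10 of RAM beyond 64 MB */
		nbuf += lmin_((physmem_kb - 65536) * 2 / (factor * 5),
		    32 * 1024 * 1024 / (factor * 5));
	return (nbuf);
}

int
main(void)
{
	printf("1 GiB of RAM -> nbuf = %ld\n", tune_nbuf(1024 * 1024));
	return (0);
}
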
* The lower 1 MiB limit is the historical upper limit for * hirunningspace. */ hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf), 16 * 1024 * 1024), 1024 * 1024); lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf); /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on * average (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occurring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; numdirtybuffers = 0; /* * To support extreme low-memory systems, make sure hidirtybuffers * cannot eat up all available buffer space. This occurs when our * minimum cannot be met. We try to size hidirtybuffers to 3/4 our * buffer space assuming BKVASIZE'd buffers. */ while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * lofreebuffers should be sufficient to avoid stalling waiting on * buf headers under heavy utilization. The bufs in per-cpu caches * are counted as free but will be unavailable to threads executing * on other cpus. * * hifreebuffers is the free target for the bufspace daemon. This * should be set appropriately to limit work per-iteration. */ lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus); hifreebuffers = (3 * lofreebuffers) / 2; numfreebuffers = nbuf; /* Setup the kva and free list allocators. */ vmem_set_reclaim(buffer_arena, bufkva_reclaim); buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf), NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0); /* * Size the clean queue according to the amount of buffer space. * One queue per-256mb up to the max. More queues gives better * concurrency but less accurate LRU. */ clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES); } #ifdef INVARIANTS static inline void vfs_buf_check_mapped(struct buf *bp) { KASSERT(bp->b_kvabase != unmapped_buf, ("mapped buf: b_kvabase was not updated %p", bp)); KASSERT(bp->b_data != unmapped_buf, ("mapped buf: b_data was not updated %p", bp)); KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf + MAXPHYS, ("b_data + b_offset unmapped %p", bp)); } static inline void vfs_buf_check_unmapped(struct buf *bp) { KASSERT(bp->b_data == unmapped_buf, ("unmapped buf: corrupted b_data %p", bp)); } #define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) #define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) #else #define BUF_CHECK_MAPPED(bp) do {} while (0) #define BUF_CHECK_UNMAPPED(bp) do {} while (0) #endif static int isbufbusy(struct buf *bp) { if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) || ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)) return (1); return (0); } /* * Shutdown the system cleanly to prepare for reboot, halt, or power off. */ void bufshutdown(int show_busybufs) { static int first_buf_printf = 1; struct buf *bp; int iter, nbusy, pbusy; #ifndef PREEMPTION int subiter; #endif /* * Sync filesystems for shutdown */ wdog_kern_pat(WD_LASTVAL); sys_sync(curthread, NULL); /* * With soft updates, some buffers that are * written will be remarked as dirty until other * buffers are written. 
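
/*
 * [Editor's note]  The watermarks bufinit() derives above are likewise
 * simple arithmetic.  A stand-alone rendition for a sample nbuf; the
 * constants are assumptions matching the previous sketch:
 */
#include <stdio.h>

#define	BKVASIZE_ASSUMED	16384
#define	MAXBCACHEBUF_ASSUMED	65536

int
main(void)
{
	long nbuf = 7218;		/* sample value */
	long maxbufspace, hibufspace, lobufspace, bufspacethresh;
	long hidirtybuffers;

	maxbufspace = nbuf * BKVASIZE_ASSUMED;
	hibufspace = 3 * maxbufspace / 4;	/* lmax() of the two terms */
	if (maxbufspace - MAXBCACHEBUF_ASSUMED * 10L > hibufspace)
		hibufspace = maxbufspace - MAXBCACHEBUF_ASSUMED * 10L;
	lobufspace = (hibufspace / 20) * 19;	/* 95% */
	bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
	hidirtybuffers = nbuf / 4 + 20;
	/* Keep dirty buffers from consuming all buffer space. */
	while (hidirtybuffers * BKVASIZE_ASSUMED > 3 * hibufspace / 4)
		hidirtybuffers >>= 1;
	printf("hi %ld lo %ld thresh %ld hidirty %ld\n",
	    hibufspace, lobufspace, bufspacethresh, hidirtybuffers);
	return (0);
}
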
*/ for (iter = pbusy = 0; iter < 20; iter++) { nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) if (isbufbusy(bp)) nbusy++; if (nbusy == 0) { if (first_buf_printf) printf("All buffers synced."); break; } if (first_buf_printf) { printf("Syncing disks, buffers remaining... "); first_buf_printf = 0; } printf("%d ", nbusy); if (nbusy < pbusy) iter = 0; pbusy = nbusy; wdog_kern_pat(WD_LASTVAL); sys_sync(curthread, NULL); #ifdef PREEMPTION /* * Drop Giant and spin for a while to allow * interrupt threads to run. */ DROP_GIANT(); DELAY(50000 * iter); PICKUP_GIANT(); #else /* * Drop Giant and context switch several times to * allow interrupt threads to run. */ DROP_GIANT(); for (subiter = 0; subiter < 50 * iter; subiter++) { thread_lock(curthread); mi_switch(SW_VOL, NULL); thread_unlock(curthread); DELAY(1000); } PICKUP_GIANT(); #endif } printf("\n"); /* * Count only busy local buffers to prevent forcing * a fsck if we're just a client of a wedged NFS server */ nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) { if (isbufbusy(bp)) { #if 0 /* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */ if (bp->b_dev == NULL) { TAILQ_REMOVE(&mountlist, bp->b_vp->v_mount, mnt_list); continue; } #endif nbusy++; if (show_busybufs > 0) { printf( "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:", nbusy, bp, bp->b_vp, bp->b_flags, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); BUF_LOCKPRINTINFO(bp); if (show_busybufs > 1) vn_printf(bp->b_vp, "vnode content: "); } } } if (nbusy) { /* * Failed to sync all blocks. Indicate this and don't * unmount filesystems (thus forcing an fsck on reboot). */ printf("Giving up on %d buffers\n", nbusy); DELAY(5000000); /* 5 seconds */ } else { if (!first_buf_printf) printf("Final sync complete\n"); /* * Unmount filesystems */ if (panicstr == NULL) vfs_unmountall(); } swapoff_all(); DELAY(100000); /* wait for console output to finish */ } static void bpmap_qenter(struct buf *bp) { BUF_CHECK_MAPPED(bp); /* * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } /* * binsfree: * * Insert the buffer into the appropriate free list. */ static void binsfree(struct buf *bp, int qindex) { struct mtx *olock, *nlock; if (qindex != QUEUE_EMPTY) { BUF_ASSERT_XLOCKED(bp); } /* * Stick to the same clean queue for the lifetime of the buf to * limit locking below. Otherwise pick one sequentially. */ if (qindex == QUEUE_CLEAN) { if (bqisclean(bp->b_qindex)) qindex = bp->b_qindex; else qindex = bqcleanq(); } /* * Handle delayed bremfree() processing. */ nlock = bqlock(qindex); if (bp->b_flags & B_REMFREE) { olock = bqlock(bp->b_qindex); mtx_lock(olock); bremfreel(bp); if (olock != nlock) { mtx_unlock(olock); mtx_lock(nlock); } } else mtx_lock(nlock); if (bp->b_qindex != QUEUE_NONE) panic("binsfree: free buffer onto another queue???"); bp->b_qindex = qindex; if (bp->b_flags & B_AGE) TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); else TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); #ifdef INVARIANTS bq_len[bp->b_qindex]++; #endif mtx_unlock(nlock); } /* * buf_free: * * Free a buffer to the buf zone once it no longer has valid contents.
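 * (The sketch below restates the two-lock handoff performed by
 * binsfree() above.)
 */

/*
 * Minimal sketch of binsfree()'s lock handoff for a delayed
 * bremfree() that targets a different queue: lock the old queue,
 * unlink, then swap to the new queue's lock before inserting.
 * queue_lock(), unlink_item() and insert_item() are hypothetical
 * helpers for illustration only, not kernel interfaces.
 */
#if 0	/* illustrative sketch, not compiled */
static void
requeue_item(struct item *it, int newq)
{
	struct mtx *olock, *nlock;

	nlock = queue_lock(newq);
	if (it->onqueue) {
		olock = queue_lock(it->qindex);
		mtx_lock(olock);
		unlink_item(it);
		if (olock != nlock) {
			mtx_unlock(olock);
			mtx_lock(nlock);
		}
	} else
		mtx_lock(nlock);
	insert_item(newq, it);
	mtx_unlock(nlock);
}
#endif

/*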
*/ static void buf_free(struct buf *bp) { if (bp->b_flags & B_REMFREE) bremfreef(bp); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); bufkva_free(bp); BUF_UNLOCK(bp); uma_zfree(buf_zone, bp); atomic_add_int(&numfreebuffers, 1); bufspace_wakeup(); } /* * buf_import: * * Import bufs into the uma cache from the buf list. The system still * expects a static array of bufs and much of the synchronization * around bufs assumes type stable storage. As a result, UMA is used * only as a per-cpu cache of bufs still maintained on a global list. */ static int buf_import(void *arg, void **store, int cnt, int flags) { struct buf *bp; int i; mtx_lock(&bqlocks[QUEUE_EMPTY]); for (i = 0; i < cnt; i++) { bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); if (bp == NULL) break; bremfreel(bp); store[i] = bp; } mtx_unlock(&bqlocks[QUEUE_EMPTY]); return (i); } /* * buf_release: * * Release bufs from the uma cache back to the buffer queues. */ static void buf_release(void *arg, void **store, int cnt) { int i; for (i = 0; i < cnt; i++) binsfree(store[i], QUEUE_EMPTY); } /* * buf_alloc: * * Allocate an empty buffer header. */ static struct buf * buf_alloc(void) { struct buf *bp; bp = uma_zalloc(buf_zone, M_NOWAIT); if (bp == NULL) { bufspace_daemonwakeup(); atomic_add_int(&numbufallocfails, 1); return (NULL); } /* * Wake-up the bufspace daemon on transition. */ if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers) bufspace_daemonwakeup(); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) panic("getnewbuf_empty: Locked buf %p on free queue.", bp); KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.", bp, bp->b_vp)); KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0, ("invalid buffer %p flags %#x", bp, bp->b_flags)); KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); KASSERT(bp->b_npages == 0, ("bp: %p still has %d vm pages\n", bp, bp->b_npages)); KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; bp->b_vflags = 0; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_bufobj = NULL; bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_fsprivate1 = NULL; bp->b_fsprivate2 = NULL; bp->b_fsprivate3 = NULL; LIST_INIT(&bp->b_dep); return (bp); } /* * buf_qrecycle: * * Free a buffer from the given bufqueue. kva controls whether the * freed buf must own some kva resources. This is used for * defragmenting. */ static int buf_qrecycle(int qindex, bool kva) { struct buf *bp, *nbp; if (kva) atomic_add_int(&bufdefragcnt, 1); nbp = NULL; mtx_lock(&bqlocks[qindex]); nbp = TAILQ_FIRST(&bufqueues[qindex]); /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { /* * Calculate next bp (we can only use it if we do not * release the bqlock). */ nbp = TAILQ_NEXT(bp, b_freelist); /* * If we are defragging then we need a buffer with * some kva to reclaim. */ if (kva && bp->b_kvasize == 0) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; /* * Skip buffers with background writes in progress. 
*/ if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { BUF_UNLOCK(bp); continue; } KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. */ bremfreel(bp); mtx_unlock(&bqlocks[qindex]); /* * Requeue the background write buffer with error and * restart the scan. */ if ((bp->b_vflags & BV_BKGRDERR) != 0) { bqrelse(bp); mtx_lock(&bqlocks[qindex]); nbp = TAILQ_FIRST(&bufqueues[qindex]); continue; } bp->b_flags |= B_INVAL; brelse(bp); return (0); } mtx_unlock(&bqlocks[qindex]); return (ENOBUFS); } /* * buf_recycle: * * Iterate through all clean queues until we find a buf to recycle or * exhaust the search. */ static int buf_recycle(bool kva) { int qindex, first_qindex; qindex = first_qindex = bqcleanq(); do { if (buf_qrecycle(qindex, kva) == 0) return (0); if (++qindex == QUEUE_CLEAN + clean_queues) qindex = QUEUE_CLEAN; } while (qindex != first_qindex); return (ENOBUFS); } /* * buf_scan: * * Scan the clean queues looking for a buffer to recycle. needsbuffer * is set on failure so that the caller may optionally bufspace_wait() * in a race-free fashion. */ static int buf_scan(bool defrag) { int error; /* * To avoid heavy synchronization and wakeup races we set * needsbuffer and re-poll before failing. This ensures that * no frees can be missed between an unsuccessful poll and * going to sleep in a synchronized fashion. */ if ((error = buf_recycle(defrag)) != 0) { atomic_set_int(&needsbuffer, 1); bufspace_daemonwakeup(); error = buf_recycle(defrag); } if (error == 0) atomic_add_int(&getnewbufrestarts, 1); return (error); } /* * bremfree: * * Mark the buffer for removal from the appropriate free list. * */ void bremfree(struct buf *bp) { CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT((bp->b_flags & B_REMFREE) == 0, ("bremfree: buffer %p already marked for delayed removal.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfree: buffer %p not on a queue.", bp)); BUF_ASSERT_XLOCKED(bp); bp->b_flags |= B_REMFREE; } /* * bremfreef: * * Force an immediate removal from a free list. Used only in nfs when * it abuses the b_freelist pointer. */ void bremfreef(struct buf *bp) { struct mtx *qlock; qlock = bqlock(bp->b_qindex); mtx_lock(qlock); bremfreel(bp); mtx_unlock(qlock); } /* * bremfreel: * * Removes a buffer from the free list, must be called with the * correct qlock held. */ static void bremfreel(struct buf *bp) { CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfreel: buffer %p not on a queue.", bp)); if (bp->b_qindex != QUEUE_EMPTY) { BUF_ASSERT_XLOCKED(bp); } mtx_assert(bqlock(bp->b_qindex), MA_OWNED); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); #ifdef INVARIANTS KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow", bp->b_qindex)); bq_len[bp->b_qindex]--; #endif bp->b_qindex = QUEUE_NONE; bp->b_flags &= ~B_REMFREE; } /* * bufkva_free: * * Free the kva allocation for a buffer. 
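 * (The round-robin walk used by buf_recycle() above is restated in
 * the sketch below.)
 */

/*
 * Minimal sketch of buf_recycle()'s round-robin scan: start from an
 * arbitrary queue and wrap around until a victim is found or every
 * queue has been tried. try_queue() is a hypothetical callback
 * standing in for buf_qrecycle(); illustration only.
 */
#if 0	/* illustrative sketch, not compiled */
static int
round_robin_scan(int nqueues, int start, int (*try_queue)(int))
{
	int q;

	q = start;
	do {
		if (try_queue(q) == 0)
			return (0);		/* found a victim */
		if (++q == nqueues)
			q = 0;			/* wrap around */
	} while (q != start);
	return (ENOBUFS);			/* every queue exhausted */
}
#endif

/*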
* */ static void bufkva_free(struct buf *bp) { #ifdef INVARIANTS if (bp->b_kvasize == 0) { KASSERT(bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf, ("Leaked KVA space on %p", bp)); } else if (buf_mapped(bp)) BUF_CHECK_MAPPED(bp); else BUF_CHECK_UNMAPPED(bp); #endif if (bp->b_kvasize == 0) return; vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize); atomic_subtract_long(&bufkvaspace, bp->b_kvasize); atomic_add_int(&buffreekvacnt, 1); bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_kvasize = 0; } /* * bufkva_alloc: * * Allocate the buffer KVA and set b_kvasize and b_kvabase. */ static int bufkva_alloc(struct buf *bp, int maxsize, int gbflags) { vm_offset_t addr; int error; KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0, ("Invalid gbflags 0x%x in %s", gbflags, __func__)); bufkva_free(bp); addr = 0; error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr); if (error != 0) { /* * Buffer map is too fragmented. Request the caller * to defragment the map. */ return (error); } bp->b_kvabase = (caddr_t)addr; bp->b_kvasize = maxsize; atomic_add_long(&bufkvaspace, bp->b_kvasize); if ((gbflags & GB_UNMAPPED) != 0) { bp->b_data = unmapped_buf; BUF_CHECK_UNMAPPED(bp); } else { bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); } return (0); } /* * bufkva_reclaim: * * Reclaim buffer kva by freeing buffers holding kva. This is a vmem * callback that fires to avoid returning failure. */ static void bufkva_reclaim(vmem_t *vmem, int flags) { int i; for (i = 0; i < 5; i++) if (buf_scan(true) != 0) break; return; } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. We must * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, * the buffer is valid and we do not have to do anything. */ static void breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt, struct ucred * cred, int flags, void (*ckhashfunc)(struct buf *)) { struct buf *rabp; int i; for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) != 0) { brelse(rabp); continue; } if (!TD_IS_IDLETHREAD(curthread)) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, rabp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; } rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; if ((flags & GB_CKHASH) != 0) { rabp->b_flags |= B_CKHASH; rabp->b_ckhashcalc = ckhashfunc; } rabp->b_ioflags &= ~BIO_ERROR; rabp->b_iocmd = BIO_READ; if (rabp->b_rcred == NOCRED && cred != NOCRED) rabp->b_rcred = crhold(cred); vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); rabp->b_iooffset = dbtob(rabp->b_blkno); bstrategy(rabp); } } /* * Entry point for bread() and breadn() via #defines in sys/buf.h. * * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything, see * getblk(). Also starts asynchronous I/O on read-ahead blocks. * * Always return a NULL buffer pointer (in bpp) when returning an error. */ int breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags, void (*ckhashfunc)(struct buf *), struct buf **bpp) { struct buf *bp; int readwait, rv; CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); /* * Can only return NULL if GB_LOCK_NOWAIT flag is specified. 
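 *
 * For reference, a typical bread() caller looks schematically like
 * the following (illustrative only, error handling elided):
 *
 *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
 *	if (error == 0) {
 *		... examine bp->b_data ...
 *		brelse(bp);
 *	}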
*/ *bpp = bp = getblk(vp, blkno, size, 0, 0, flags); if (bp == NULL) return (EBUSY); /* * If not found in cache, do some I/O */ readwait = 0; if ((bp->b_flags & B_CACHE) == 0) { if (!TD_IS_IDLETHREAD(curthread)) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; } bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; if ((flags & GB_CKHASH) != 0) { bp->b_flags |= B_CKHASH; bp->b_ckhashcalc = ckhashfunc; } bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); ++readwait; } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. */ breada(vp, rablkno, rabsize, cnt, cred, flags, ckhashfunc); rv = 0; if (readwait) { rv = bufwait(bp); if (rv != 0) { brelse(bp); *bpp = NULL; } } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bufwrite(struct buf *bp) { int oldflags; struct vnode *vp; long space; int vp_md; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) { bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_CACHE; brelse(bp); return (ENXIO); } if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (bp->b_flags & B_BARRIER) barrierwrites++; oldflags = bp->b_flags; BUF_ASSERT_HELD(bp); KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), ("FFS background buffer should not get here %p", bp)); vp = bp->b_vp; if (vp) vp_md = vp->v_vflag & VV_MD; else vp_md = 0; /* * Mark the buffer clean. Increment the bufobj write count * before bundirty() call, to prevent other thread from seeing * empty dirty list and zero counter for writes in progress, * falsely indicating that the bufobj is clean. */ bufobj_wref(bp->b_bufobj); bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; vfs_busy_pages(bp, 1); /* * Normal bwrites pipeline writes */ bp->b_runningbufspace = bp->b_bufsize; space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); if (!TD_IS_IDLETHREAD(curthread)) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 1); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_oublock++; } if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); buf_track(bp, __func__); bstrategy(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); brelse(bp); return (rtval); } else if (space > hirunningspace) { /* * don't allow the async write to saturate the I/O * system. We will not deadlock here because * we are blocking waiting for I/O that is already in-progress * to complete. We do not block here if it is the update * or syncer daemon trying to clean up as that can lead * to deadlock. 
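 *
 * As an illustrative example (hypothetical numbers): with
 * hirunningspace at 16 MiB and 128 KiB writes, roughly 128
 * asynchronous writes can be in flight before the initiating
 * thread is made to wait here.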
*/ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) waitrunningbufspace(); } return (0); } void bufbdflush(struct bufobj *bo, struct buf *bp) { struct buf *nbp; if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); altbufferflushes++; } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* Don't call buf_countdeps() with the bo lock held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather than in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf *bp) { struct thread *td = curthread; struct vnode *vp; struct bufobj *bo; CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT((bp->b_flags & B_BARRIER) == 0, ("Barrier request in delayed write %p", bp)); BUF_ASSERT_HELD(bp); if (bp->b_flags & B_INVAL) { brelse(bp); return; } /* * If we have too many dirty buffers, don't create any more. * If we are wildly over our limit, then force a complete * cleanup. Otherwise, just keep the situation from getting * out of control. Note that we have to avoid a recursive * disaster and not try to clean up after our own cleanup! */ vp = bp->b_vp; bo = bp->b_bufobj; if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { td->td_pflags |= TDP_INBDFLUSH; BO_BDFLUSH(bo, bp); td->td_pflags &= ~TDP_INBDFLUSH; } else recursiveflushes++; bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } buf_track(bp, __func__); /* * Set the *dirty* buffer range based upon the VM system dirty * pages. * * Mark the buffer pages as clean. We need to do this here to * satisfy the vnode_pager and the pageout daemon, so that it * thinks that the pages have been "cleaned". Note that since * the pages are in a delayed write buffer -- the VFS layer * "will" see that the pages get written out on the next sync, * or perhaps the cluster will be completed. */ vfs_clean_pages_dirty_buf(bp); bqrelse(bp); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ } /* * bdirty: * * Turn buffer into delayed write request. We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI.
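 * (A schematic caller of the delayed-write path follows.)
 */

/*
 * Sketch of how a filesystem typically uses bdwrite(): read the
 * block, modify it in place, then mark it for delayed write. This
 * is illustrative only; lblkno, bsize and off are assumed inputs
 * and error handling is minimal.
 */
#if 0	/* illustrative sketch, not compiled */
static int
update_block_byte(struct vnode *vp, daddr_t lblkno, int bsize, int off,
    char val)
{
	struct buf *bp;
	int error;

	error = bread(vp, lblkno, bsize, NOCRED, &bp);
	if (error != 0)
		return (error);
	((char *)bp->b_data)[off] = val;	/* modify in place */
	bdwrite(bp);		/* mark dirty; the write happens later */
	return (0);
}
#endif

/*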
We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bdirty(struct buf *bp) { CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); BUF_ASSERT_HELD(bp); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; reassignbuf(bp); bdirtyadd(); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bundirty(struct buf *bp) { CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); BUF_ASSERT_HELD(bp); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); bdirtysub(); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf *bp) { bp->b_flags |= B_ASYNC; (void) bwrite(bp); } /* * babarrierwrite: * * Asynchronous barrier write. Start output on a buffer, but do not * wait for it to complete. Place a write barrier after this write so * that this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ void babarrierwrite(struct buf *bp) { bp->b_flags |= B_ASYNC | B_BARRIER; (void) bwrite(bp); } /* * bbarrierwrite: * * Synchronous barrier write. Start output on a buffer and wait for * it to complete. Place a write barrier after this write so that * this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ int bbarrierwrite(struct buf *bp) { bp->b_flags |= B_BARRIER; return (bwrite(bp)); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. 
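 * (A schematic caller follows.)
 */

/*
 * Sketch of bwillwrite() usage by a write path: throttle against the
 * dirty buffer high water mark before any vnode lock is acquired.
 * The body of the write is elided; illustration only.
 */
#if 0	/* illustrative sketch, not compiled */
static int
write_path(struct vnode *vp)
{

	bwillwrite();		/* may sleep; no locks held yet */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/* ... dirty buffers via bdwrite()/bawrite() ... */
	VOP_UNLOCK(vp, 0);
	return (0);
}
#endif

/*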
*/ void bwillwrite(void) { if (numdirtybuffers >= hidirtybuffers) { mtx_lock(&bdirtylock); while (numdirtybuffers >= hidirtybuffers) { bdirtywait = 1; msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), "flswai", 0); } mtx_unlock(&bdirtylock); } } /* * Return true if we have too many dirty buffers. */ int buf_dirty_count_severe(void) { return(numdirtybuffers >= hidirtybuffers); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf *bp) { int qindex; /* * Many functions erroneously call brelse with a NULL bp under rare * error conditions. Simply return when called with a NULL bp. */ if (bp == NULL) return; CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0, ("brelse: non-VMIO buffer marked NOREUSE")); if (BUF_LOCKRECURSED(bp)) { /* * Do not process, in particular, do not handle the * B_INVAL/B_RELBUF and do not release to free list. */ BUF_UNLOCK(bp); return; } if (bp->b_flags & B_MANAGED) { bqrelse(bp); return; } if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); bdirty(bp); } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && (bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. All errors except ENXIO (which * means the device is gone) are treated as being * transient. * * XXX Treating EIO as transient is not correct; the * contract with the local storage device drivers is that * they will only return EIO once the I/O is no longer * retriable. Network I/O also respects this through the * guarantees of TCP and/or the internal retries of NFS. * ENOMEM might be transient, but we also have no way of * knowing when it's ok to retry/reschedule. In general, * this entire case should be made obsolete through better * error handling/recovery and resource scheduling. * * Do this also for buffers that failed with ENXIO, but have * non-empty dependencies - the soft updates code might need * to access the buffer to untangle them. * * Must clear BIO_ERROR to prevent pages from being scrapped. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { /* * Either a failed read I/O, or we were asked to free or not * cache the buffer, or we failed to write to a device that's * no longer present. */ bp->b_flags |= B_INVAL; if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) bdirtysub(); bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_truncate() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_truncate(), even * if B_DELWRI is set. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags affect this.
If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. */ if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE || (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) && !(bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI))) { vfs_vmio_invalidate(bp); allocbuf(bp, 0); } if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 || (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) { allocbuf(bp, 0); bp->b_flags &= ~B_NOREUSE; if (bp->b_vp != NULL) brelvp(bp); } /* * If the buffer has junk contents, signal it and eventually * clean up B_DELWRI and disassociate the vnode so that gbincore() * doesn't find it. */ if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 || (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0) bp->b_flags |= B_INVAL; if (bp->b_flags & B_INVAL) { if (bp->b_flags & B_DELWRI) bundirty(bp); if (bp->b_vp) brelvp(bp); } buf_track(bp, __func__); /* buffers with no memory */ if (bp->b_bufsize == 0) { buf_free(bp); return; } /* buffers with junk contents */ if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); qindex = QUEUE_CLEAN; bp->b_flags |= B_AGE; /* remaining buffers */ } else if (bp->b_flags & B_DELWRI) qindex = QUEUE_DIRTY; else qindex = QUEUE_CLEAN; binsfree(bp, qindex); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); /* unlock */ BUF_UNLOCK(bp); if (qindex == QUEUE_CLEAN) bufspace_wakeup(); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. * * XXX we should be able to leave the B_RELBUF hint set on completion.
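 * (The sketch below contrasts the two release primitives.)
 */

/*
 * Illustrative contrast of brelse() and bqrelse(): keep the contents
 * queued for reuse when they are likely to be needed again, otherwise
 * hint that the backing store may be released. reuse_likely is a
 * hypothetical predicate, not a kernel flag.
 */
#if 0	/* illustrative sketch, not compiled */
static void
release_buf(struct buf *bp, bool reuse_likely)
{

	if (reuse_likely)
		bqrelse(bp);	/* keep contents, expect reuse soon */
	else {
		bp->b_flags |= B_RELBUF;
		brelse(bp);	/* allow backing store to be released */
	}
}
#endif

/*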
*/ void bqrelse(struct buf *bp) { int qindex; CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); qindex = QUEUE_NONE; if (BUF_LOCKRECURSED(bp)) { /* do not release to free list */ BUF_UNLOCK(bp); return; } bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); if (bp->b_flags & B_MANAGED) { if (bp->b_flags & B_REMFREE) bremfreef(bp); goto out; } /* buffers with stale but valid contents */ if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); qindex = QUEUE_DIRTY; } else { if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); if ((bp->b_flags & B_NOREUSE) != 0) { brelse(bp); return; } qindex = QUEUE_CLEAN; } binsfree(bp, qindex); out: buf_track(bp, __func__); /* unlock */ BUF_UNLOCK(bp); if (qindex == QUEUE_CLEAN) bufspace_wakeup(); } /* * Complete I/O to a VMIO backed page. Validate the pages as appropriate, * restore bogus pages. */ static void vfs_vmio_iodone(struct buf *bp) { vm_ooffset_t foff; vm_page_t m; vm_object_t obj; struct vnode *vp; int i, iosize, resid; bool bogus; obj = bp->b_bufobj->bo_object; KASSERT(obj->paging_in_progress >= bp->b_npages, ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)", obj->paging_in_progress, bp->b_npages)); vp = bp->b_vp; KASSERT(vp->v_holdcnt > 0, ("vfs_vmio_iodone: vnode %p has zero hold count", vp)); KASSERT(vp->v_object != NULL, ("vfs_vmio_iodone: vnode %p has no vm_object", vp)); foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_vmio_iodone: bp %p has no buffer offset", bp)); bogus = false; iosize = bp->b_bcount - bp->b_resid; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogus = true; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; } else if ((bp->b_iocmd == BIO_READ) && resid > 0) { /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK, resid)) == 0, ("vfs_vmio_iodone: page %p " "has unexpected dirty bits", m)); vfs_page_set_valid(bp, foff, m); } KASSERT(OFF_TO_IDX(foff) == m->pindex, ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch", (intmax_t)foff, (uintmax_t)m->pindex)); vm_page_sunbusy(m); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } vm_object_pip_wakeupn(obj, bp->b_npages); VM_OBJECT_WUNLOCK(obj); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * Unwire a page held by a buf and place it on the appropriate vm queue. */ static void vfs_vmio_unwire(struct buf *bp, vm_page_t m) { bool freed; vm_page_lock(m); if (vm_page_unwire(m, PQ_NONE)) { /* * Determine if the page should be freed before adding * it to the inactive queue. */ if (m->valid == 0) { freed = !vm_page_busied(m); if (freed) vm_page_free(m); } else if ((bp->b_flags & B_DIRECT) != 0) freed = vm_page_try_to_free(m); else freed = false; if (!freed) { /* * If the page is unlikely to be reused, let the * VM know. 
Otherwise, maintain LRU page * ordering and put the page at the tail of the * inactive queue. */ if ((bp->b_flags & B_NOREUSE) != 0) vm_page_deactivate_noreuse(m); else vm_page_deactivate(m); } } vm_page_unlock(m); } /* * Perform page invalidation when a buffer is released. The fully invalid * pages will be reclaimed later in vfs_vmio_truncate(). */ static void vfs_vmio_invalidate(struct buf *bp) { vm_object_t obj; vm_page_t m; int i, resid, poffset, presid; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); /* * Get the base offset and length of the buffer. Note that * in the VMIO case if the buffer block size is not * page-aligned then the b_data pointer may not be page-aligned. * But our b_pages[] array *IS* page aligned. * * block sizes less than DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ obj = bp->b_bufobj->bo_object; resid = bp->b_bufsize; poffset = bp->b_offset & PAGE_MASK; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) panic("vfs_vmio_invalidate: Unexpected bogus page."); bp->b_pages[i] = NULL; presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); while (vm_page_xbusied(m)) { vm_page_lock(m); VM_OBJECT_WUNLOCK(obj); vm_page_busy_sleep(m, "mbncsh", true); VM_OBJECT_WLOCK(obj); } if (pmap_page_wired_mappings(m) == 0) vm_page_set_invalid(m, poffset, presid); vfs_vmio_unwire(bp, m); resid -= presid; poffset = 0; } VM_OBJECT_WUNLOCK(obj); bp->b_npages = 0; } /* * Page-granular truncation of an existing VMIO buffer. */ static void vfs_vmio_truncate(struct buf *bp, int desiredpages) { vm_object_t obj; vm_page_t m; int i; if (bp->b_npages == desiredpages) return; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages); } else BUF_CHECK_UNMAPPED(bp); obj = bp->b_bufobj->bo_object; if (obj != NULL) VM_OBJECT_WLOCK(obj); for (i = desiredpages; i < bp->b_npages; i++) { m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); bp->b_pages[i] = NULL; vfs_vmio_unwire(bp, m); } if (obj != NULL) VM_OBJECT_WUNLOCK(obj); bp->b_npages = desiredpages; } /* * Byte granular extension of VMIO buffers. */ static void vfs_vmio_extend(struct buf *bp, int desiredpages, int size) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; vm_page_t m; /* * Step 1, bring in the VM pages from the object, allocating * them if necessary. We must clear B_CACHE if these pages * are not valid for the range covered by the buffer. */ obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); if (bp->b_npages < desiredpages) { /* * We must allocate system pages since blocking * here could interfere with paging I/O, no * matter which process we are. * * Only exclusive busy can be tested here. * Blocking on shared busy might lead to * deadlocks once allocbuf() is called after * pages are vfs_busy_pages(). */ (void)vm_page_grab_pages(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages, VM_ALLOC_SYSTEM | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED, &bp->b_pages[bp->b_npages], desiredpages - bp->b_npages); bp->b_npages = desiredpages; } /* * Step 2.
We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), not the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; m = bp->b_pages[pi]; vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m); toff += tinc; tinc = PAGE_SIZE; } VM_OBJECT_WUNLOCK(obj); /* * Step 3, fixup the KVA pmap. */ if (buf_mapped(bp)) bpmap_qenter(bp); else BUF_CHECK_UNMAPPED(bp); } /* * Check to see if a block at a particular lbn is available for a clustered * write. */ static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) { struct buf *bpa; int match; match = 0; /* If the buf isn't in core skip it */ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) return (0); /* If the buf is busy we don't want to wait for it */ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) return (0); /* Only cluster with valid clusterable delayed write buffers */ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != (B_DELWRI | B_CLUSTEROK)) goto done; if (bpa->b_bufsize != size) goto done; /* * Check to see if it is in the expected place on disk and that the * block has been mapped. */ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) match = 1; done: BUF_UNLOCK(bpa); return (match); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better than the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf *bp) { struct bufobj *bo; int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int ncl; int nwritten; int size; int maxcl; int gbflags; bo = &vp->v_bufobj; gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0; /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather than at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; BO_RLOCK(bo); for (i = 1; i < maxcl; i++) if (vfs_bio_clcheck(vp, size, lblkno + i, bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) break; for (j = 1; i + j <= maxcl && j <= lblkno; j++) if (vfs_bio_clcheck(vp, size, lblkno - j, bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) break; BO_RUNLOCK(bo); --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, gbflags); return (nwritten); } } bremfree(bp); bp->b_flags |= B_ASYNC; /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten?
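 *
 * As an illustrative example (hypothetical numbers): with MAXPHYS at
 * 128 KiB and f_iosize at 16 KiB, maxcl above would be 8, so at most
 * 8 logically contiguous delayed-write blocks can be gathered into a
 * single cluster write.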
*/ nwritten = bp->b_bufsize; (void) bwrite(bp); return (nwritten); } /* * getnewbuf_kva: * * Allocate KVA for an empty buf header according to gbflags. */ static int getnewbuf_kva(struct buf *bp, int gbflags, int maxsize) { if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) { /* * In order to keep fragmentation sane we only allocate kva * in BKVASIZE chunks. XXX with vmem we can do page size. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize && bufkva_alloc(bp, maxsize, gbflags)) return (ENOSPC); } return (0); } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_arena is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) * * The caller is responsible for releasing the reserved bufspace after * allocbuf() is called. */ static struct buf * getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags) { struct buf *bp; bool metadata, reserved; bp = NULL; KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); if (!unmapped_buf_allowed) gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || vp->v_type == VCHR) metadata = true; else metadata = false; atomic_add_int(&getnewbufcalls, 1); reserved = false; do { if (reserved == false && bufspace_reserve(maxsize, metadata) != 0) continue; reserved = true; if ((bp = buf_alloc()) == NULL) continue; if (getnewbuf_kva(bp, gbflags, maxsize) == 0) return (bp); break; } while(buf_scan(false) == 0); if (reserved) atomic_subtract_long(&bufspace, maxsize); if (bp != NULL) { bp->b_flags |= B_INVAL; brelse(bp); } bufspace_wait(vp, gbflags, slpflag, slptimeo); return (NULL); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); static int buf_flush(struct vnode *vp, int target) { int flushed; flushed = flushbufqueues(vp, target, 0); if (flushed == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ if (vp != NULL && target > 2) target /= 2; flushbufqueues(vp, target, 1); } return (flushed); } static void buf_daemon() { int lodirty; /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, SHUTDOWN_PRI_LAST); /* * This process is allowed to take the buffer cache to the limit */ curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; mtx_lock(&bdlock); for (;;) { bd_request = 0; mtx_unlock(&bdlock); kproc_suspend_check(bufdaemonproc); lodirty = lodirtybuffers; if (bd_speedupreq) { lodirty = numdirtybuffers / 2; bd_speedupreq = 0; } /* * Do the flush. Limit the amount of in-transit I/O we * allow to build up, otherwise we would completely saturate * the I/O system. */ while (numdirtybuffers > lodirty) { if (buf_flush(NULL, numdirtybuffers - lodirty) == 0) break; kern_yield(PRI_USER); } /* * Only clear bd_request if we have reached our low water * mark. 
The buf_daemon normally waits 1 second and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep for a short period * to avoid endless loops on unlockable buffers. */ mtx_lock(&bdlock); if (numdirtybuffers <= lodirtybuffers) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; /* * Do an extra wakeup in case dirty threshold * changed via sysctl and the explicit transition * out of shortfall was missed. */ bdirtywakeup(); if (runningbufspace <= lorunningspace) runningwakeup(); msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, we * have to sleep and try again. (rare) */ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); } } } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of writing them, which NFS is * particularly sensitive to. */ static int flushwithdeps = 0; SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 0, "Number of buffers flushed with dependencies that require rollbacks"); static int flushbufqueues(struct vnode *lvp, int target, int flushdeps) { struct buf *sentinel; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; int flushed; int queue; int error; bool unlock; flushed = 0; queue = QUEUE_DIRTY; bp = NULL; sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); sentinel->b_qindex = QUEUE_SENTINEL; mtx_lock(&bqlocks[queue]); TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist); mtx_unlock(&bqlocks[queue]); while (flushed != target) { maybe_yield(); mtx_lock(&bqlocks[queue]); bp = TAILQ_NEXT(sentinel, b_freelist); if (bp != NULL) { TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel, b_freelist); } else { mtx_unlock(&bqlocks[queue]); break; } /* * Skip sentinels inserted by other invocations of the * flushbufqueues(), taking care to not reorder them. * * Only flush the buffers that belong to the * vnode locked by the curthread. */ if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && bp->b_vp != lvp)) { mtx_unlock(&bqlocks[queue]); continue; } error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); mtx_unlock(&bqlocks[queue]); if (error != 0) continue; /* * BKGRDINPROG can only be set with the buf and bufobj * locks both held. We tolerate a race to clear it here. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || (bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); continue; } if (bp->b_flags & B_INVAL) { bremfreef(bp); brelse(bp); flushed++; continue; } if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { if (flushdeps == 0) { BUF_UNLOCK(bp); continue; } hasdeps = 1; } else hasdeps = 0; /* * We must hold the lock on a vnode before writing * one of its buffers. Otherwise we may confuse, or * in the case of a snapshot vnode, deadlock the * system. * * The lock order here is the reverse of the normal * order of vnode followed by buf lock. This is ok because * the NOWAIT will prevent deadlock. */ vp = bp->b_vp; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { BUF_UNLOCK(bp); continue; } if (lvp == NULL) { unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); } else { ASSERT_VOP_LOCKED(vp, "getbuf"); unlock = false; error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ?
0 : vn_lock(vp, LK_TRYUPGRADE); } if (error == 0) { CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (curproc == bufdaemonproc) { vfs_bio_awrite(bp); } else { bremfree(bp); bwrite(bp); notbufdflushes++; } vn_finished_write(mp); if (unlock) VOP_UNLOCK(vp, 0); flushwithdeps += hasdeps; flushed++; /* * Sleeping on runningbufspace while holding * vnode lock leads to deadlock. */ if (curproc == bufdaemonproc && runningbufspace > hirunningspace) waitrunningbufspace(); continue; } vn_finished_write(mp); BUF_UNLOCK(bp); } mtx_lock(&bqlocks[queue]); TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); mtx_unlock(&bqlocks[queue]); free(sentinel, M_TEMP); return (flushed); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct bufobj *bo, daddr_t blkno) { struct buf *bp; BO_RLOCK(bo); bp = gbincore(bo, blkno); BO_RUNLOCK(bo); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ static int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; ASSERT_VOP_LOCKED(vp, "inmem"); if (incore(&vp->v_bufobj, blkno)) return 1; if (vp->v_mount == NULL) return 0; obj = vp->v_object; if (obj == NULL) return (0); size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; VM_OBJECT_RLOCK(obj); for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) goto notinmem; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) goto notinmem; } VM_OBJECT_RUNLOCK(obj); return 1; notinmem: VM_OBJECT_RUNLOCK(obj); return (0); } /* * Set the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. The range is limited * to the size of the buffer. * * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intervention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. */ static void vfs_clean_pages_dirty_buf(struct buf *bp) { vm_ooffset_t foff, noff, eoff; vm_page_t m; int i; if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0) return; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages_dirty_buf: no buffer offset")); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); vfs_drain_busy_pages(bp); vfs_setdirty_locked_object(bp); for (i = 0; i < bp->b_npages; i++) { noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; m = bp->b_pages[i]; vfs_page_set_validclean(bp, foff, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } static void vfs_setdirty_locked_object(struct buf *bp) { vm_object_t object; int i; object = bp->b_bufobj->bo_object; VM_OBJECT_ASSERT_WLOCKED(object); /* * We qualify the scan for modified pages on whether the * object has been flushed yet.
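 *
 * The arithmetic below is in byte offsets relative to the start of
 * the buffer. For example (hypothetical values): with
 * (b_offset & PAGE_MASK) == 512 and the first dirty page at index 1,
 * boffset works out to (1 << PAGE_SHIFT) - 512, i.e. 3584 bytes with
 * 4 KiB pages.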
*/ if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) { vm_offset_t boffset; vm_offset_t eoffset; /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } } /* * Allocate the KVA mapping for an existing buffer. * If an unmapped buffer is provided but a mapped buffer is requested, also * take care to properly set up mappings between pages and KVA. */ static void bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) { int bsize, maxsize, need_mapping, need_kva; off_t offset; need_mapping = bp->b_data == unmapped_buf && (gbflags & GB_UNMAPPED) == 0; need_kva = bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf && (gbflags & GB_KVAALLOC) != 0; if (!need_mapping && !need_kva) return; BUF_CHECK_UNMAPPED(bp); if (need_mapping && bp->b_kvabase != unmapped_buf) { /* * Buffer is not mapped, but the KVA was already * reserved at the time of the instantiation. Use the * allocated space. */ goto has_addr; } /* * Calculate the amount of the address space we would reserve * if the buffer was mapped. */ bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; maxsize = size + (offset & PAGE_MASK); maxsize = imax(maxsize, bsize); while (bufkva_alloc(bp, maxsize, gbflags) != 0) { if ((gbflags & GB_NOWAIT_BD) != 0) { /* * XXXKIB: defragmentation cannot * succeed, not sure what else to do. */ panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); } atomic_add_int(&mappingrestarts, 1); bufspace_wait(bp->b_vp, gbflags, 0, 0); } has_addr: if (need_mapping) { /* b_offset is handled by bpmap_qenter. */ bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); bpmap_qenter(bp); } } /* * getblk: * * Get a block given a specified block and offset into a file/device. * The buffer's B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a bwrite() for any B_DELWRI buffer whose * B_CACHE bit is clear.
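 * (A schematic caller implementing the protocol described here
 * follows.)
 */

/*
 * Sketch of the canonical getblk()/B_CACHE read protocol: get the
 * buffer and only initiate I/O when the contents are not already
 * valid. Illustrative only; error handling is elided and the logic
 * mirrors what breadn_flags() does above.
 */
#if 0	/* illustrative sketch, not compiled */
static int
read_via_getblk(struct vnode *vp, daddr_t blkno, int size, struct buf **bpp)
{
	struct buf *bp;

	*bpp = bp = getblk(vp, blkno, size, 0, 0, 0);
	if ((bp->b_flags & B_CACHE) != 0)
		return (0);		/* cache hit, data already valid */
	bp->b_flags &= ~B_INVAL;
	bp->b_ioflags &= ~BIO_ERROR;
	bp->b_iocmd = BIO_READ;
	vfs_busy_pages(bp, 0);
	bp->b_iooffset = dbtob(bp->b_blkno);
	bstrategy(bp);
	return (bufwait(bp));	/* caller brelse()s on error */
}
#endif

/*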
* * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successful read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ struct buf * getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; struct bufobj *bo; int bsize, error, maxsize, vmio; off_t offset; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); ASSERT_VOP_LOCKED(vp, "getblk"); if (size > maxbcachebuf) panic("getblk: size(%d) > maxbcachebuf(%d)\n", size, maxbcachebuf); if (!unmapped_buf_allowed) flags &= ~(GB_UNMAPPED | GB_KVAALLOC); bo = &vp->v_bufobj; loop: BO_RLOCK(bo); bp = gbincore(bo, blkno); if (bp != NULL) { int lockflags; /* * Buffer is in-core. If the buffer is not busy nor managed, * it must be on a queue. */ lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; if (flags & GB_LOCK_NOWAIT) lockflags |= LK_NOWAIT; error = BUF_TIMELOCK(bp, lockflags, BO_LOCKPTR(bo), "getblk", slpflag, slptimeo); /* * If we slept and got the lock we have to restart in case * the buffer changed identities. */ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ else if (error) return (NULL); /* If recursed, assume caller knows the rules. */ else if (BUF_LOCKRECURSED(bp)) goto end; /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; if (bp->b_flags & B_MANAGED) MPASS(bp->b_qindex == QUEUE_NONE); else bremfree(bp); /* * check for size inconsistencies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); } else { if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; bwrite(bp); } } goto loop; } } /* * Handle the case of unmapped buffer which should * become mapped, or the buffer for which KVA * reservation is requested. */ bp_unmapped_get_kva(bp, blkno, size, flags); /* * If the size is inconsistent in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. 
		 *
		 * Most callers, including NFS and FFS, need this to
		 * operate properly either because they assume they
		 * can issue a read if B_CACHE is not set, or because
		 * ( for example ) an uncached B_DELWRI might loop due
		 * to softupdates re-dirtying the buffer.  In the latter
		 * case, B_CACHE is set after the first write completes,
		 * preventing further loops.
		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
		 * above while extending the buffer, we cannot allow the
		 * buffer to remain with B_CACHE set after the write
		 * completes or it will represent a corrupt state.  To
		 * deal with this we set B_NOCACHE to scrap the buffer
		 * after the write.
		 *
		 * We might be able to do something fancy, like setting
		 * B_CACHE in bwrite() except if B_DELWRI is already set,
		 * so the below call doesn't set B_CACHE, but that gets real
		 * confusing.  This is much easier.
		 */
		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
			bp->b_flags |= B_NOCACHE;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags &= ~B_DONE;
	} else {
		/*
		 * Buffer is not in-core, create new buffer.  The buffer
		 * returned by getnewbuf() is locked.  Note that the returned
		 * buffer is also considered valid (not marked B_INVAL).
		 */
		BO_RUNLOCK(bo);
		/*
		 * If the user does not want us to create the buffer, bail out
		 * here.
		 */
		if (flags & GB_NOCREAT)
			return NULL;
		if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
			return NULL;

		bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
		KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
		offset = blkno * bsize;
		vmio = vp->v_object != NULL;
		if (vmio) {
			maxsize = size + (offset & PAGE_MASK);
		} else {
			maxsize = size;
			/* Do not allow non-VMIO unmapped buffers. */
			flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
		}
		maxsize = imax(maxsize, bsize);

		bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags);
		if (bp == NULL) {
			if (slpflag || slptimeo)
				return NULL;
			/*
			 * XXX This is here until the sleep path is diagnosed
			 * enough to work under very low memory conditions.
			 *
			 * There's an issue on low memory, 4BSD+non-preempt
			 * systems (eg MIPS routers with 32MB RAM) where buffer
			 * exhaustion occurs without sleeping for buffer
			 * reclamation.  This just sticks in a loop and
			 * constantly attempts to allocate a buffer, which
			 * hits exhaustion and tries to wakeup bufdaemon.
			 * This never happens because we never yield.
			 *
			 * The real solution is to identify and fix these cases
			 * so we aren't effectively busy-waiting in a loop
			 * until the reclamation path has cycles to run.
			 */
			kern_yield(PRI_USER);
			goto loop;
		}

		/*
		 * This code is used to make sure that a buffer is not
		 * created while the getnewbuf routine is blocked.
		 * This can be a problem whether the vnode is locked or not.
		 * If the buffer is created out from under us, we have to
		 * throw away the one we just created.
		 *
		 * Note: this must occur before we associate the buffer
		 * with the vp especially considering limitations in
		 * the splay tree implementation when dealing with duplicate
		 * lblkno's.
		 */
		BO_LOCK(bo);
		if (gbincore(bo, blkno)) {
			BO_UNLOCK(bo);
			bp->b_flags |= B_INVAL;
			brelse(bp);
			bufspace_release(maxsize);
			goto loop;
		}

		/*
		 * Insert the buffer into the hash, so that it can
		 * be found by incore.
		 */
		bp->b_blkno = bp->b_lblkno = blkno;
		bp->b_offset = offset;
		bgetvp(vp, bp);
		BO_UNLOCK(bo);

		/*
		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
		 * buffer size starts out as 0, B_CACHE will be set by
		 * allocbuf() for the VMIO case prior to it testing the
		 * backing store for validity.
		 */
		if (vmio) {
			bp->b_flags |= B_VMIO;
			KASSERT(vp->v_object == bp->b_bufobj->bo_object,
			    ("ARGH!
different b_bufobj->bo_object %p %p %p\n", bp, vp->v_object, bp->b_bufobj->bo_object)); } else { bp->b_flags &= ~B_VMIO; KASSERT(bp->b_bufobj->bo_object == NULL, ("ARGH! has b_bufobj->bo_object %p %p\n", bp, bp->b_bufobj->bo_object)); BUF_CHECK_MAPPED(bp); } allocbuf(bp, size); bufspace_release(maxsize); bp->b_flags &= ~B_DONE; } CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); BUF_ASSERT_HELD(bp); end: buf_track(bp, __func__); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); return (bp); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. */ struct buf * geteblk(int size, int flags) { struct buf *bp; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) { if ((flags & GB_NOWAIT_BD) && (curthread->td_pflags & TDP_BUFNEED) != 0) return (NULL); } allocbuf(bp, size); bufspace_release(maxsize); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ BUF_ASSERT_HELD(bp); return (bp); } /* * Truncate the backing store for a non-vmio buffer. */ static void vfs_nonvmio_truncate(struct buf *bp, int newbsize) { if (bp->b_flags & B_MALLOC) { /* * malloced buffers are not shrunk */ if (newbsize == 0) { bufmallocadjust(bp, 0); free(bp->b_data, M_BIOBUF); bp->b_data = bp->b_kvabase; bp->b_flags &= ~B_MALLOC; } return; } vm_hold_free_pages(bp, newbsize); bufspace_adjust(bp, newbsize); } /* * Extend the backing for a non-VMIO buffer. */ static void vfs_nonvmio_extend(struct buf *bp, int newbsize) { caddr_t origbuf; int origbufsize; /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer * grows. * * There is a potential smp race here that could lead * to bufmallocspace slightly passing the max. It * is probably extremely rare and not worth worrying * over. */ if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 && bufmallocspace < maxbufmallocspace) { bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK); bp->b_flags |= B_MALLOC; bufmallocadjust(bp, newbsize); return; } /* * If the buffer is growing on its other-than-first * allocation then we revert to the page-allocation * scheme. */ origbuf = NULL; origbufsize = 0; if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; bufmallocadjust(bp, 0); bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); if (origbuf != NULL) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } bufspace_adjust(bp, newbsize); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistent data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. 
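 *
 * A hedged illustration (hypothetical sizes): allocbuf(bp, 8192) on a
 * 4096-byte VMIO buffer rounds the request to a DEV_BSIZE multiple,
 * recomputes the desired page count from b_offset, and extends the
 * backing pages; whether B_CACHE remains set then depends on the
 * validity of the newly covered backing store.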
*/ int allocbuf(struct buf *bp, int size) { int newbsize; BUF_ASSERT_HELD(bp); if (bp->b_bcount == size) return (1); if (bp->b_kvasize != 0 && bp->b_kvasize < size) panic("allocbuf: buffer too small"); newbsize = roundup2(size, DEV_BSIZE); if ((bp->b_flags & B_VMIO) == 0) { if ((bp->b_flags & B_MALLOC) == 0) newbsize = round_page(newbsize); /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ if (newbsize < bp->b_bufsize) vfs_nonvmio_truncate(bp, newbsize); else if (newbsize > bp->b_bufsize) vfs_nonvmio_extend(bp, newbsize); } else { int desiredpages; desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) vfs_vmio_truncate(bp, desiredpages); /* XXX This looks as if it should be newbsize > b_bufsize */ else if (size > bp->b_bcount) vfs_vmio_extend(bp, desiredpages, size); bufspace_adjust(bp, newbsize); } bp->b_bcount = size; /* requested buffer size. */ return (1); } extern int inflight_transient_maps; void biodone(struct bio *bp) { struct mtx *mtxp; void (*done)(struct bio *); vm_offset_t start, end; biotrack(bp, __func__); if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { bp->bio_flags &= ~BIO_TRANSIENT_MAPPING; bp->bio_flags |= BIO_UNMAPPED; start = trunc_page((vm_offset_t)bp->bio_data); end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); bp->bio_data = unmapped_buf; pmap_qremove(start, atop(end - start)); vmem_free(transient_arena, start, end - start); atomic_add_int(&inflight_transient_maps, -1); } done = bp->bio_done; if (done == NULL) { mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->bio_flags |= BIO_DONE; wakeup(bp); mtx_unlock(mtxp); } else done(bp); } /* * Wait for a BIO to finish. */ int biowait(struct bio *bp, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, mtxp, PRIBIO, wchan, 0); mtx_unlock(mtxp); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) void biotrack_buf(struct bio *bp, const char *location) { buf_track(bp->bio_track_bp, location); } #endif /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(struct buf *bp) { if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occurred, or if the op was a write. 
B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * biodone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existence * in the biodone routine. */ void bufdone(struct buf *bp) { struct bufobj *dropobj; void (*biodone)(struct buf *); buf_track(bp, __func__); CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); dropobj = NULL; KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); BUF_ASSERT_HELD(bp); runningbufwakeup(bp); if (bp->b_iocmd == BIO_WRITE) dropobj = bp->b_bufobj; else if ((bp->b_flags & B_CKHASH) != 0) { KASSERT(buf_mapped(bp), ("biodone: bp %p not mapped", bp)); (*bp->b_ckhashcalc)(bp); } /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); if (dropobj) bufobj_wdrop(dropobj); return; } bufdone_finish(bp); if (dropobj) bufobj_wdrop(dropobj); } void bufdone_finish(struct buf *bp) { BUF_ASSERT_HELD(bp); if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); if (bp->b_flags & B_VMIO) { /* * Set B_CACHE if the op was a normal read and no error * occurred. B_CACHE is set for writes in the b*write() * routines. */ if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) bp->b_flags |= B_CACHE; vfs_vmio_iodone(bp); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else bdone(bp); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistent. */ void vfs_unbusy_pages(struct buf *bp) { int i; vm_object_t obj; vm_page_t m; runningbufwakeup(bp); if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); } vm_page_sunbusy(m); } vm_object_pip_wakeupn(obj, bp->b_npages); VM_OBJECT_WUNLOCK(obj); } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t eoff; /* * Compute the end offset, eoff, such that [off, eoff) does not span a * page boundary and eoff is not greater than the end of the buffer. * The end of the buffer, in this case, is our file EOF, not the * allocation size of the buffer. */ eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > off) vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off); } /* * vfs_page_set_validclean: * * Set the valid bits and clear the dirty bits in a page based on the * supplied offset. The range is restricted to the buffer's size. 
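 *
 * Worked example (illustrative values, 4 KB pages): for off = 0x1a00
 * with the buffer extending past this page, eoff rounds up to 0x2000,
 * so the code below reduces to vm_page_set_validclean(m, 0xa00, 0x600).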
*/ static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t soff, eoff; /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundary or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * Ensure that all buffer pages are not exclusive busied. If any page is * exclusive busy, drain it. */ void vfs_drain_busy_pages(struct buf *bp) { vm_page_t m; int i, last_busied; VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object); last_busied = 0; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (vm_page_xbusied(m)) { for (; last_busied < i; last_busied++) vm_page_sbusy(bp->b_pages[last_busied]); while (vm_page_xbusied(m)) { vm_page_lock(m); VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); vm_page_busy_sleep(m, "vbpage", true); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); } } } for (i = 0; i < last_busied; i++) vm_page_sunbusy(bp->b_pages[i]); } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being exclusive busy. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistent. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistent state * and should be ignored. */ void vfs_busy_pages(struct buf *bp, int clear_modify) { vm_object_t obj; vm_ooffset_t foff; vm_page_t m; int i; bool bogus; if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); VM_OBJECT_WLOCK(obj); vfs_drain_busy_pages(bp); if (bp->b_bufsize != 0) vfs_setdirty_locked_object(bp); bogus = false; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, 1); vm_page_sbusy(m); } /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ if (clear_modify) { pmap_remove_write(m); vfs_page_set_validclean(bp, foff, m); } else if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus = true; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } VM_OBJECT_WUNLOCK(obj); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * vfs_bio_set_valid: * * Set the range within the buffer to valid. The range is * relative to the beginning of the buffer, b_offset. 
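 *
 * Worked example (illustrative values): if (b_offset & PAGE_MASK) is
 * 0x200 and the caller passes base = 0x100, the fixup below starts at
 * page 0 with an in-page offset of 0x300 and an initial span of
 * n = PAGE_SIZE - 0x300 bytes.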
Note that * b_offset itself may be offset from the beginning of the first * page. */ void vfs_bio_set_valid(struct buf *bp, int base, int size) { int i, n; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_valid_range(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } /* * vfs_bio_clrbuf: * * If the specified buffer is a non-VMIO buffer, clear the entire * buffer. If the specified buffer is a VMIO buffer, clear and * validate only the previously invalid portions of the buffer. * This routine essentially fakes an I/O, so we need to clear * BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. */ void vfs_bio_clrbuf(struct buf *bp) { int i, j, mask, sa, ea, slide; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); return; } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { if (bp->b_pages[0] == bogus_page) goto unlock; mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object); if ((bp->b_pages[0]->valid & mask) == mask) goto unlock; if ((bp->b_pages[0]->valid & mask) == 0) { pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize); bp->b_pages[0]->valid |= mask; goto unlock; } } sa = bp->b_offset & PAGE_MASK; slide = 0; for (i = 0; i < bp->b_npages; i++, sa = 0) { slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize); ea = slide & PAGE_MASK; if (ea == 0) ea = PAGE_SIZE; if (bp->b_pages[i] == bogus_page) continue; j = sa / DEV_BSIZE; mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object); if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); else { for (; sa < ea; sa += DEV_BSIZE, j++) { if ((bp->b_pages[i]->valid & (1 << j)) == 0) { pmap_zero_page_area(bp->b_pages[i], sa, DEV_BSIZE); } } } bp->b_pages[i]->valid |= mask; } unlock: VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); bp->b_resid = 0; } void vfs_bio_bzero_buf(struct buf *bp, int base, int size) { vm_page_t m; int i, n; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); bzero(bp->b_data + base, size); } else { BUF_CHECK_UNMAPPED(bp); n = PAGE_SIZE - (base & PAGE_MASK); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; pmap_zero_page_area(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } } } /* * Update buffer flags based on I/O request parameters, optionally releasing the * buffer. If it's VMIO or direct I/O, the buffer pages are released to the VM, * where they may be placed on a page queue (VMIO) or freed immediately (direct * I/O). Otherwise the buffer is released to the cache. 
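 *
 * One illustrative case (not exhaustive): a read issued with IO_DIRECT
 * marks the buffer B_DIRECT and, having no dependencies, B_RELBUF as
 * well, so a release through this path discards the buffer rather than
 * caching it.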
*/ static void b_io_dismiss(struct buf *bp, int ioflag, bool release) { KASSERT((ioflag & IO_NOREUSE) == 0 || (ioflag & IO_VMIO) != 0, ("buf %p non-VMIO noreuse", bp)); if ((ioflag & IO_DIRECT) != 0) bp->b_flags |= B_DIRECT; if ((ioflag & (IO_VMIO | IO_DIRECT)) != 0 && LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; if ((ioflag & IO_NOREUSE) != 0) bp->b_flags |= B_NOREUSE; if (release) brelse(bp); } else if (release) bqrelse(bp); } void vfs_bio_brelse(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, true); } void vfs_bio_set_flags(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, false); } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; BUF_CHECK_MAPPED(bp); to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { /* * note: must allocate system pages since blocking here * could interfere with paging I/O, no matter which * process we are. */ p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT) | VM_ALLOC_WAITOK); pmap_qenter(pg, &p, 1); bp->b_pages[index] = p; } bp->b_npages = index; } /* Return pages associated with this buf to the vm system */ static void vm_hold_free_pages(struct buf *bp, int newbsize) { vm_offset_t from; vm_page_t p; int index, newnpages; BUF_CHECK_MAPPED(bp); from = round_page((vm_offset_t)bp->b_data + newbsize); newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; if (bp->b_npages > newnpages) pmap_qremove(from, bp->b_npages - newnpages); for (index = newnpages; index < bp->b_npages; index++) { p = bp->b_pages[index]; bp->b_pages[index] = NULL; p->wire_count--; vm_page_free(p); } atomic_subtract_int(&vm_cnt.v_wire_count, bp->b_npages - newnpages); bp->b_npages = newnpages; } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. * * Note that even if the caller determines that the address space should * be valid, a race or a smaller-file mapped into a larger space may * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST * check the return value. * * This function only works with pager buffers. */ int vmapbuf(struct buf *bp, int mapbuf) { vm_prot_t prot; int pidx; if (bp->b_bufsize < 0) return (-1); prot = VM_PROT_READ; if (bp->b_iocmd == BIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages, btoc(MAXPHYS))) < 0) return (-1); bp->b_npages = pidx; bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK; if (mapbuf || !unmapped_buf_allowed) { pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx); bp->b_data = bp->b_kvabase + bp->b_offset; } else bp->b_data = unmapped_buf; return(0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. * * This function only works with pager buffers. 
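 *
 * Sketch of the usual physio-style pairing (illustrative only):
 *
 *	if (vmapbuf(bp, 1) < 0)
 *		return (EFAULT);
 *	... start the transfer and wait for completion ...
 *	vunmapbuf(bp);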
*/ void vunmapbuf(struct buf *bp) { int npages; npages = bp->b_npages; if (buf_mapped(bp)) pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_unhold_pages(bp->b_pages, npages); bp->b_data = unmapped_buf; } void bdone(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(mtxp); } void bwait(struct buf *bp, u_char pri, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->b_flags & B_DONE) == 0) msleep(bp, mtxp, pri, wchan, 0); mtx_unlock(mtxp); } int bufsync(struct bufobj *bo, int waitfor) { return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread)); } void bufstrategy(struct bufobj *bo, struct buf *bp) { int i = 0; struct vnode *vp; vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } void bufobj_wrefl(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); ASSERT_BO_WLOCKED(bo); bo->bo_numoutput++; } void bufobj_wref(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); BO_LOCK(bo); bo->bo_numoutput++; BO_UNLOCK(bo); } void bufobj_wdrop(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); BO_LOCK(bo); KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { bo->bo_flag &= ~BO_WWAIT; wakeup(&bo->bo_numoutput); } BO_UNLOCK(bo); } int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) { int error; KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); ASSERT_BO_WLOCKED(bo); error = 0; while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), slpflag | (PRIBIO + 1), "bo_wwait", timeo); if (error) break; } return (error); } /* * Set bio_data or bio_ma for struct bio from the struct buf. */ void bdata2bio(struct buf *bp, struct bio *bip) { if (!buf_mapped(bp)) { KASSERT(unmapped_buf_allowed, ("unmapped")); bip->bio_ma = bp->b_pages; bip->bio_ma_n = bp->b_npages; bip->bio_data = unmapped_buf; bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bip->bio_flags |= BIO_UNMAPPED; KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / PAGE_SIZE == bp->b_npages, ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, (long long)bip->bio_length, bip->bio_ma_n)); } else { bip->bio_data = bp->b_data; bip->bio_ma = NULL; } } /* * The MIPS pmap code currently doesn't handle aliased pages. * The VIPT caches may not handle page aliasing themselves, leading * to data corruption. * * As such, this code makes a system extremely unhappy if said * system doesn't support unaliasing the above situation in hardware. * Some "recent" systems (eg some mips24k/mips74k cores) don't enable * this feature at build time, so it has to be handled in software. * * Once the MIPS pmap/cache code grows to support this function on * earlier chips, it should be flipped back off. */ #ifdef __mips__ static int buf_pager_relbuf = 1; #else static int buf_pager_relbuf = 0; #endif SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN, &buf_pager_relbuf, 0, "Make buffer pager release buffers after reading"); /* * The buffer pager. It uses buffer reads to validate pages. 
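 *
 * A filesystem typically reaches this function from its VOP_GETPAGES
 * implementation; a minimal sketch, with hypothetical foo_* callbacks
 * supplying the logical block number and block size, is:
 *
 *	static int
 *	foo_getpages(struct vop_getpages_args *ap)
 *	{
 *		return (vfs_bio_getpages(ap->a_vp, ap->a_m, ap->a_count,
 *		    ap->a_rbehind, ap->a_rahead, foo_gbp_getblkno,
 *		    foo_gbp_getblksz));
 *	}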
* * In contrast to the generic local pager from vm/vnode_pager.c, this * pager correctly and easily handles volumes where the underlying * device block size is greater than the machine page size. The * buffer cache transparently extends the requested page run to be * aligned at the block boundary, and does the necessary bogus page * replacements in the addends to avoid obliterating already valid * pages. * * The only non-trivial issue is that the exclusive busy state for * pages, which is assumed by the vm_pager_getpages() interface, is * incompatible with the VMIO buffer cache's desire to share-busy the * pages. This function performs a trivial downgrade of the pages' * state before reading buffers, and a less trivial upgrade from the * shared-busy to excl-busy state after the read. */ int vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, vbg_get_blksize_t get_blksize) { vm_page_t m; vm_object_t object; struct buf *bp; struct mount *mp; daddr_t lbn, lbnp; vm_ooffset_t la, lb, poff, poffe; long bsize; int bo_bs, br_flags, error, i, pgsin, pgsin_a, pgsin_b; bool redo, lpart; object = vp->v_object; mp = vp->v_mount; la = IDX_TO_OFF(ma[count - 1]->pindex); if (la >= object->un_pager.vnp.vnp_size) return (VM_PAGER_BAD); lpart = la + PAGE_SIZE > object->un_pager.vnp.vnp_size; bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex))); /* * Calculate read-ahead, behind and total pages. */ pgsin = count; lb = IDX_TO_OFF(ma[0]->pindex); pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs)); pgsin += pgsin_b; if (rbehind != NULL) *rbehind = pgsin_b; pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la); if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size) pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size, PAGE_SIZE) - la); pgsin += pgsin_a; if (rahead != NULL) *rahead = pgsin_a; VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, pgsin); br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) ? GB_UNMAPPED : 0; VM_OBJECT_WLOCK(object); again: for (i = 0; i < count; i++) vm_page_busy_downgrade(ma[i]); VM_OBJECT_WUNLOCK(object); lbnp = -1; for (i = 0; i < count; i++) { m = ma[i]; /* * Pages are shared busy and the object lock is not * owned, which together allow for the pages' * invalidation. The racy test for validity avoids * useless creation of the buffer for the most typical * case when invalidation is not used in redo or for * parallel read. The shared->excl upgrade loop at * the end of the function catches the race in a * reliable way (protected by the object lock). */ if (m->valid == VM_PAGE_BITS_ALL) continue; poff = IDX_TO_OFF(m->pindex); poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size); for (; poff < poffe; poff += bsize) { lbn = get_lblkno(vp, poff); if (lbn == lbnp) goto next_page; lbnp = lbn; bsize = get_blksize(vp, lbn); error = bread_gb(vp, lbn, bsize, curthread->td_ucred, br_flags, &bp); if (error != 0) goto end_pages; if (LIST_EMPTY(&bp->b_dep)) { /* * Invalidation clears m->valid, but * may leave B_CACHE flag if the * buffer existed at the invalidation * time. In this case, recycle the * buffer to do real read on next * bread() after redo. * * Otherwise B_RELBUF is not strictly * necessary, enable to reduce buf * cache pressure. 
*/ if (buf_pager_relbuf || m->valid != VM_PAGE_BITS_ALL) bp->b_flags |= B_RELBUF; bp->b_flags &= ~B_NOCACHE; brelse(bp); } else { bqrelse(bp); } } KASSERT(1 /* racy, enable for debugging */ || m->valid == VM_PAGE_BITS_ALL || i == count - 1, ("buf %d %p invalid", i, m)); if (i == count - 1 && lpart) { VM_OBJECT_WLOCK(object); if (m->valid != 0 && m->valid != VM_PAGE_BITS_ALL) vm_page_zero_invalid(m, TRUE); VM_OBJECT_WUNLOCK(object); } next_page:; } end_pages: VM_OBJECT_WLOCK(object); redo = false; for (i = 0; i < count; i++) { vm_page_sunbusy(ma[i]); ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL); /* * Since the pages were only sbusy while neither the * buffer nor the object lock was held by us, or * reallocated while vm_page_grab() slept for busy * relinguish, they could have been invalidated. * Recheck the valid bits and re-read as needed. * * Note that the last page is made fully valid in the * read loop, and partial validity for the page at * index count - 1 could mean that the page was * invalidated or removed, so we must restart for * safety as well. */ if (ma[i]->valid != VM_PAGE_BITS_ALL) redo = true; } if (redo && error == 0) goto again; VM_OBJECT_WUNLOCK(object); return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } #include "opt_ddb.h" #ifdef DDB #include /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; #ifdef FULL_BUF_TRACKING uint32_t i, j; #endif if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("buf at %p\n", bp); db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, " "b_dep = %p\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno, bp->b_dep.lh_first); db_printf("b_kvabase = %p, b_kvasize = %d\n", bp->b_kvabase, bp->b_kvasize); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; if (m != NULL) db_printf("(%p, 0x%lx, 0x%lx)", m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); else db_printf("( ??? 
)"); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } #if defined(FULL_BUF_TRACKING) db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt); i = bp->b_io_tcnt % BUF_TRACKING_SIZE; for (j = 1; j <= BUF_TRACKING_SIZE; j++) { if (bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)] == NULL) continue; db_printf(" %2u: %s\n", j, bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]); } #elif defined(BUF_TRACKING) db_printf("b_io_tracking: %s\n", bp->b_io_tracking); #endif db_printf(" "); BUF_LOCKPRINTINFO(bp); } DB_SHOW_COMMAND(lockedbufs, lockedbufs) { struct buf *bp; int i; for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (BUF_ISLOCKED(bp)) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); if (db_pager_quit) break; } } } DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) { struct vnode *vp; struct buf *bp; if (!have_addr) { db_printf("usage: show vnodebufs \n"); return; } vp = (struct vnode *)addr; db_printf("Clean buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } db_printf("Dirty buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } DB_COMMAND(countfreebufs, db_coundfreebufs) { struct buf *bp; int i, used = 0, nfree = 0; if (have_addr) { db_printf("usage: countfreebufs\n"); return; } for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (bp->b_qindex == QUEUE_EMPTY) nfree++; else used++; } db_printf("Counted %d free, %d used (%d tot)\n", nfree, used, nfree + used); db_printf("numfreebuffers is %d\n", numfreebuffers); } #endif /* DDB */ Index: head/sys/kern/vfs_extattr.c =================================================================== --- head/sys/kern/vfs_extattr.c (revision 326270) +++ head/sys/kern/vfs_extattr.c (revision 326271) @@ -1,765 +1,767 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1999-2001 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Syscall to push extended attribute configuration information into the VFS. * Accepts a path, which it converts to a mountpoint, as well as a command * (int cmd), and attribute name and misc data. * * Currently this is used only by UFS1 extended attributes. */ int sys_extattrctl(td, uap) struct thread *td; struct extattrctl_args /* { const char *path; int cmd; const char *filename; int attrnamespace; const char *attrname; } */ *uap; { struct vnode *filename_vp; struct nameidata nd; struct mount *mp, *mp_writable; char attrname[EXTATTR_MAXNAMELEN]; int error; AUDIT_ARG_CMD(uap->cmd); AUDIT_ARG_VALUE(uap->attrnamespace); /* * uap->attrname is not always defined. We check again later when we * invoke the VFS call so as to pass in NULL there if needed. */ if (uap->attrname != NULL) { error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); } AUDIT_ARG_TEXT(attrname); mp = NULL; filename_vp = NULL; if (uap->filename != NULL) { NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE2, UIO_USERSPACE, uap->filename, td); error = namei(&nd); if (error) return (error); filename_vp = nd.ni_vp; NDFREE(&nd, NDF_NO_VP_RELE); } /* uap->path is always defined. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) goto out; mp = nd.ni_vp->v_mount; error = vfs_busy(mp, 0); if (error) { NDFREE(&nd, 0); mp = NULL; goto out; } VOP_UNLOCK(nd.ni_vp, 0); error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH); NDFREE(&nd, NDF_NO_VP_UNLOCK); if (error) goto out; if (filename_vp != NULL) { /* * uap->filename is not always defined. If it is, * grab a vnode lock, which VFS_EXTATTRCTL() will * later release. */ error = vn_lock(filename_vp, LK_EXCLUSIVE); if (error) { vn_finished_write(mp_writable); goto out; } } error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, uap->attrnamespace, uap->attrname != NULL ? attrname : NULL); vn_finished_write(mp_writable); out: if (mp != NULL) vfs_unbusy(mp); /* * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, filename_vp, * so vrele it if it is defined. */ if (filename_vp != NULL) vrele(filename_vp); return (error); } /*- * Set a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", userspace buffer * pointer "data", buffer length "nbytes", thread "td". 
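 * Example:	(illustrative only) reached from userspace via, e.g.,
 *		extattr_set_file(path, EXTATTR_NAMESPACE_USER, "tag",
 *		buf, buflen)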
* Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname, void *data, size_t nbytes, struct thread *td) { struct mount *mp; struct uio auio; struct iovec aiov; ssize_t cnt; int error; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; if (nbytes > IOSIZE_MAX) { error = EINVAL; goto done; } auio.uio_resid = nbytes; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; cnt = nbytes; #ifdef MAC error = mac_vnode_check_setextattr(td->td_ucred, vp, attrnamespace, attrname); if (error) goto done; #endif error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, td->td_ucred, td); cnt -= auio.uio_resid; td->td_retval[0] = cnt; done: VOP_UNLOCK(vp, 0); vn_finished_write(mp); return (error); } int sys_extattr_set_fd(td, uap) struct thread *td; struct extattr_set_fd_args /* { int fd; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG_TEXT(attrname); error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_EXTATTR_SET), &fp); if (error) return (error); error = extattr_set_vp(fp->f_vnode, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); fdrop(fp, td); return (error); } int sys_extattr_set_file(td, uap) struct thread *td; struct extattr_set_file_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG_TEXT(attrname); NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } int sys_extattr_set_link(td, uap) struct thread *td; struct extattr_set_link_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG_TEXT(attrname); NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } /*- * Get a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", userspace buffer * pointer "data", buffer length "nbytes", thread "td". 
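 * Example:	(illustrative only) extattr_get_file() and relatives land
 *		here; passing a NULL "data" pointer returns only the
 *		attribute's size in td_retval[0]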
* Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname, void *data, size_t nbytes, struct thread *td) { struct uio auio, *auiop; struct iovec aiov; ssize_t cnt; size_t size, *sizep; int error; vn_lock(vp, LK_SHARED | LK_RETRY); /* * Slightly unusual semantics: if the user provides a NULL data * pointer, they don't want to receive the data, just the maximum * read length. */ auiop = NULL; sizep = NULL; cnt = 0; if (data != NULL) { aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; if (nbytes > IOSIZE_MAX) { error = EINVAL; goto done; } auio.uio_resid = nbytes; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auiop = &auio; cnt = nbytes; } else sizep = &size; #ifdef MAC error = mac_vnode_check_getextattr(td->td_ucred, vp, attrnamespace, attrname); if (error) goto done; #endif error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep, td->td_ucred, td); if (auiop != NULL) { cnt -= auio.uio_resid; td->td_retval[0] = cnt; } else td->td_retval[0] = size; done: VOP_UNLOCK(vp, 0); return (error); } int sys_extattr_get_fd(td, uap) struct thread *td; struct extattr_get_fd_args /* { int fd; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG_TEXT(attrname); error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_EXTATTR_GET), &fp); if (error) return (error); error = extattr_get_vp(fp->f_vnode, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); fdrop(fp, td); return (error); } int sys_extattr_get_file(td, uap) struct thread *td; struct extattr_get_file_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG_TEXT(attrname); NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } int sys_extattr_get_link(td, uap) struct thread *td; struct extattr_get_link_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG_TEXT(attrname); NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } /* * extattr_delete_vp(): Delete a named extended attribute on a file or * directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", proc "p" * Returns: 0 on success, an 
error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_vnode_check_deleteextattr(td->td_ucred, vp, attrnamespace, attrname); if (error) goto done; #endif error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, td->td_ucred, td); if (error == EOPNOTSUPP) error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, td->td_ucred, td); #ifdef MAC done: #endif VOP_UNLOCK(vp, 0); vn_finished_write(mp); return (error); } int sys_extattr_delete_fd(td, uap) struct thread *td; struct extattr_delete_fd_args /* { int fd; int attrnamespace; const char *attrname; } */ *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG_TEXT(attrname); error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_EXTATTR_DELETE), &fp); if (error) return (error); error = extattr_delete_vp(fp->f_vnode, uap->attrnamespace, attrname, td); fdrop(fp, td); return (error); } int sys_extattr_delete_file(td, uap) struct thread *td; struct extattr_delete_file_args /* { const char *path; int attrnamespace; const char *attrname; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return(error); AUDIT_ARG_TEXT(attrname); NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return(error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td); vrele(nd.ni_vp); return(error); } int sys_extattr_delete_link(td, uap) struct thread *td; struct extattr_delete_link_args /* { const char *path; int attrnamespace; const char *attrname; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return(error); AUDIT_ARG_TEXT(attrname); NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return(error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td); vrele(nd.ni_vp); return(error); } /*- * Retrieve a list of extended attributes on a file or directory. * * Arguments: unlocked vnode "vp", attribute namespace 'attrnamespace", * userspace buffer pointer "data", buffer length "nbytes", * thread "td". 
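 * Example:	(illustrative only) extattr_list_file() lands here; the
 *		returned buffer holds length-prefixed entries, one length
 *		byte followed by that many name bytes per attribute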
* Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_list_vp(struct vnode *vp, int attrnamespace, void *data, size_t nbytes, struct thread *td) { struct uio auio, *auiop; size_t size, *sizep; struct iovec aiov; ssize_t cnt; int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); auiop = NULL; sizep = NULL; cnt = 0; if (data != NULL) { aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; if (nbytes > IOSIZE_MAX) { error = EINVAL; goto done; } auio.uio_resid = nbytes; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auiop = &auio; cnt = nbytes; } else sizep = &size; #ifdef MAC error = mac_vnode_check_listextattr(td->td_ucred, vp, attrnamespace); if (error) goto done; #endif error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep, td->td_ucred, td); if (auiop != NULL) { cnt -= auio.uio_resid; td->td_retval[0] = cnt; } else td->td_retval[0] = size; done: VOP_UNLOCK(vp, 0); return (error); } int sys_extattr_list_fd(td, uap) struct thread *td; struct extattr_list_fd_args /* { int fd; int attrnamespace; void *data; size_t nbytes; } */ *uap; { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_VALUE(uap->attrnamespace); error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_EXTATTR_LIST), &fp); if (error) return (error); error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data, uap->nbytes, td); fdrop(fp, td); return (error); } int sys_extattr_list_file(td, uap) struct thread*td; struct extattr_list_file_args /* { const char *path; int attrnamespace; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; int error; AUDIT_ARG_VALUE(uap->attrnamespace); NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } int sys_extattr_list_link(td, uap) struct thread*td; struct extattr_list_link_args /* { const char *path; int attrnamespace; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; int error; AUDIT_ARG_VALUE(uap->attrnamespace); NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } Index: head/sys/kern/vfs_hash.c =================================================================== --- head/sys/kern/vfs_hash.c (revision 326270) +++ head/sys/kern/vfs_hash.c (revision 326271) @@ -1,232 +1,234 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include static MALLOC_DEFINE(M_VFS_HASH, "vfs_hash", "VFS hash table"); static LIST_HEAD(vfs_hash_head, vnode) *vfs_hash_tbl; static LIST_HEAD(,vnode) vfs_hash_side; static u_long vfs_hash_mask; static struct rwlock vfs_hash_lock; static void vfs_hashinit(void *dummy __unused) { vfs_hash_tbl = hashinit(desiredvnodes, M_VFS_HASH, &vfs_hash_mask); rw_init(&vfs_hash_lock, "vfs hash"); LIST_INIT(&vfs_hash_side); } /* Must be SI_ORDER_SECOND so desiredvnodes is available */ SYSINIT(vfs_hash, SI_SUB_VFS, SI_ORDER_SECOND, vfs_hashinit, NULL); u_int vfs_hash_index(struct vnode *vp) { return (vp->v_hash + vp->v_mount->mnt_hashseed); } static struct vfs_hash_head * vfs_hash_bucket(const struct mount *mp, u_int hash) { return (&vfs_hash_tbl[(hash + mp->mnt_hashseed) & vfs_hash_mask]); } int vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg) { struct vnode *vp; int error; while (1) { rw_rlock(&vfs_hash_lock); LIST_FOREACH(vp, vfs_hash_bucket(mp, hash), v_hashlist) { if (vp->v_hash != hash) continue; if (vp->v_mount != mp) continue; if (fn != NULL && fn(vp, arg)) continue; vhold(vp); rw_runlock(&vfs_hash_lock); error = vget(vp, flags | LK_VNHELD, td); if (error == ENOENT && (flags & LK_NOWAIT) == 0) break; if (error) return (error); *vpp = vp; return (0); } if (vp == NULL) { rw_runlock(&vfs_hash_lock); *vpp = NULL; return (0); } } } void vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg) { struct vnode *vp; while (1) { rw_rlock(&vfs_hash_lock); LIST_FOREACH(vp, vfs_hash_bucket(mp, hash), v_hashlist) { if (vp->v_hash != hash) continue; if (vp->v_mount != mp) continue; if (fn != NULL && fn(vp, arg)) continue; vhold(vp); rw_runlock(&vfs_hash_lock); vref(vp); vdrop(vp); *vpp = vp; return; } if (vp == NULL) { rw_runlock(&vfs_hash_lock); *vpp = NULL; return; } } } void vfs_hash_remove(struct vnode *vp) { rw_wlock(&vfs_hash_lock); LIST_REMOVE(vp, v_hashlist); rw_wunlock(&vfs_hash_lock); } int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg) { struct vnode *vp2; int error; *vpp = NULL; while (1) { rw_wlock(&vfs_hash_lock); LIST_FOREACH(vp2, vfs_hash_bucket(vp->v_mount, hash), v_hashlist) { if (vp2->v_hash != hash) continue; if (vp2->v_mount != vp->v_mount) continue; if (fn != NULL && fn(vp2, arg)) continue; vhold(vp2); rw_wunlock(&vfs_hash_lock); error = vget(vp2, flags | LK_VNHELD, td); if (error == ENOENT && (flags & LK_NOWAIT) == 0) break; rw_wlock(&vfs_hash_lock); LIST_INSERT_HEAD(&vfs_hash_side, vp, v_hashlist); rw_wunlock(&vfs_hash_lock); 
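			/*
			 * A racing thread hashed a vnode with the same
			 * identity while we slept in vget().  Our new vnode
			 * is now redundant; parking it on the side list
			 * keeps it on a valid hash list, so that a later
			 * vfs_hash_remove() from the reclaim path can
			 * LIST_REMOVE() it once it is released below.
			 */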
vput(vp); if (!error) *vpp = vp2; return (error); } if (vp2 == NULL) break; } vp->v_hash = hash; LIST_INSERT_HEAD(vfs_hash_bucket(vp->v_mount, hash), vp, v_hashlist); rw_wunlock(&vfs_hash_lock); return (0); } void vfs_hash_rehash(struct vnode *vp, u_int hash) { rw_wlock(&vfs_hash_lock); LIST_REMOVE(vp, v_hashlist); LIST_INSERT_HEAD(vfs_hash_bucket(vp->v_mount, hash), vp, v_hashlist); vp->v_hash = hash; rw_wunlock(&vfs_hash_lock); } void vfs_hash_changesize(int newmaxvnodes) { struct vfs_hash_head *vfs_hash_newtbl, *vfs_hash_oldtbl; u_long vfs_hash_newmask, vfs_hash_oldmask; struct vnode *vp; int i; vfs_hash_newtbl = hashinit(newmaxvnodes, M_VFS_HASH, &vfs_hash_newmask); /* If same hash table size, nothing to do */ if (vfs_hash_mask == vfs_hash_newmask) { free(vfs_hash_newtbl, M_VFS_HASH); return; } /* * Move everything from the old hash table to the new table. * None of the vnodes in the table can be recycled because to * do so, they have to be removed from the hash table. */ rw_wlock(&vfs_hash_lock); vfs_hash_oldtbl = vfs_hash_tbl; vfs_hash_oldmask = vfs_hash_mask; vfs_hash_tbl = vfs_hash_newtbl; vfs_hash_mask = vfs_hash_newmask; for (i = 0; i <= vfs_hash_oldmask; i++) { while ((vp = LIST_FIRST(&vfs_hash_oldtbl[i])) != NULL) { LIST_REMOVE(vp, v_hashlist); LIST_INSERT_HEAD( vfs_hash_bucket(vp->v_mount, vp->v_hash), vp, v_hashlist); } } rw_wunlock(&vfs_hash_lock); free(vfs_hash_oldtbl, M_VFS_HASH); } Index: head/sys/libkern/arm/aeabi_unwind.c =================================================================== --- head/sys/libkern/arm/aeabi_unwind.c (revision 326270) +++ head/sys/libkern/arm/aeabi_unwind.c (revision 326271) @@ -1,59 +1,61 @@ -/* +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (C) 2013 Andrew Turner * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include __FBSDID("$FreeBSD$"); #include #ifdef _KERNEL #include #else #define panic(x) (void)0 #endif /* We need to provide these functions never call them */ void __aeabi_unwind_cpp_pr0(void); void __aeabi_unwind_cpp_pr1(void); void __aeabi_unwind_cpp_pr2(void); void __aeabi_unwind_cpp_pr0(void) { panic("__aeabi_unwind_cpp_pr0"); } void __aeabi_unwind_cpp_pr1(void) { panic("__aeabi_unwind_cpp_pr1"); } void __aeabi_unwind_cpp_pr2(void) { panic("__aeabi_unwind_cpp_pr2"); } Index: head/sys/libkern/arm/ldivmod_helper.c =================================================================== --- head/sys/libkern/arm/ldivmod_helper.c (revision 326270) +++ head/sys/libkern/arm/ldivmod_helper.c (revision 326271) @@ -1,49 +1,51 @@ /* + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (C) 2012 Andrew Turner * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include /* * Helper for __aeabi_ldivmod. * TODO: __divdi3 calls __qdivrem. We should do the same and use the * remainder value rather than re-calculating it. */ long long __kern_ldivmod(long long, long long, long long *); long long __kern_ldivmod(long long n, long long m, long long *rem) { long long q; q = __divdi3(n, m); /* q = n / m */ *rem = n - m * q; return q; } Index: head/sys/libkern/iconv.c =================================================================== --- head/sys/libkern/iconv.c (revision 326270) +++ head/sys/libkern/iconv.c (revision 326271) @@ -1,576 +1,578 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2000-2001 Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
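
[Editorial aside, not part of the commit: the identity __kern_ldivmod() relies on, shown standalone. Plain C division stands in for the compiler's __divdi3 helper; since C99 division truncates toward zero, the remainder n - m*q carries the sign of n.]

#include <assert.h>

static long long
ldivmod_sketch(long long n, long long m, long long *rem)
{
	long long q;

	q = n / m;		/* what __divdi3 computes */
	*rem = n - m * q;	/* recovered without a second division */
	return (q);
}

int
main(void)
{
	long long r;

	assert(ldivmod_sketch(7, 3, &r) == 2 && r == 1);
	assert(ldivmod_sketch(-7, 3, &r) == -2 && r == -1);
	return (0);
}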
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include "iconv_converter_if.h" SYSCTL_DECL(_kern_iconv); SYSCTL_NODE(_kern, OID_AUTO, iconv, CTLFLAG_RW, NULL, "kernel iconv interface"); MALLOC_DEFINE(M_ICONV, "iconv", "ICONV structures"); static MALLOC_DEFINE(M_ICONVDATA, "iconv_data", "ICONV data"); MODULE_VERSION(libiconv, 2); static struct sx iconv_lock; #ifdef notnow /* * iconv converter instance */ struct iconv_converter { KOBJ_FIELDS; void * c_data; }; #endif struct sysctl_oid *iconv_oid_hook = &sysctl___kern_iconv; /* * List of loaded converters */ static TAILQ_HEAD(iconv_converter_list, iconv_converter_class) iconv_converters = TAILQ_HEAD_INITIALIZER(iconv_converters); /* * List of supported/loaded charsets pairs */ static TAILQ_HEAD(, iconv_cspair) iconv_cslist = TAILQ_HEAD_INITIALIZER(iconv_cslist); static int iconv_csid = 1; static char iconv_unicode_string[] = "unicode"; /* save eight bytes when possible */ static void iconv_unregister_cspair(struct iconv_cspair *csp); static int iconv_mod_unload(void) { struct iconv_cspair *csp; sx_xlock(&iconv_lock); TAILQ_FOREACH(csp, &iconv_cslist, cp_link) { if (csp->cp_refcount) { sx_xunlock(&iconv_lock); return EBUSY; } } while ((csp = TAILQ_FIRST(&iconv_cslist)) != NULL) iconv_unregister_cspair(csp); sx_xunlock(&iconv_lock); sx_destroy(&iconv_lock); return 0; } static int iconv_mod_handler(module_t mod, int type, void *data) { int error; switch (type) { case MOD_LOAD: error = 0; sx_init(&iconv_lock, "iconv"); break; case MOD_UNLOAD: error = iconv_mod_unload(); break; default: error = EINVAL; } return error; } static moduledata_t iconv_mod = { "iconv", iconv_mod_handler, NULL }; DECLARE_MODULE(iconv, iconv_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND); static int iconv_register_converter(struct iconv_converter_class *dcp) { kobj_class_compile((struct kobj_class*)dcp); dcp->refs++; TAILQ_INSERT_TAIL(&iconv_converters, dcp, cc_link); return 0; } static int iconv_unregister_converter(struct iconv_converter_class *dcp) { dcp->refs--; if (dcp->refs > 1) { ICDEBUG("converter has %d references left\n", dcp->refs); return EBUSY; } TAILQ_REMOVE(&iconv_converters, dcp, cc_link); kobj_class_free((struct kobj_class*)dcp); return 0; } static int iconv_lookupconv(const char *name, struct iconv_converter_class **dcpp) { struct iconv_converter_class *dcp; TAILQ_FOREACH(dcp, &iconv_converters, cc_link) { if (name == NULL) continue; if (strcmp(name, ICONV_CONVERTER_NAME(dcp)) == 0) { if (dcpp) *dcpp = dcp; return 0; } } return ENOENT; } static int iconv_lookupcs(const char *to, const char *from, struct iconv_cspair **cspp) { struct iconv_cspair *csp; TAILQ_FOREACH(csp, &iconv_cslist, cp_link) { if (strcasecmp(csp->cp_to, to) == 0 && strcasecmp(csp->cp_from, 
from) == 0) { if (cspp) *cspp = csp; return 0; } } return ENOENT; } static int iconv_register_cspair(const char *to, const char *from, struct iconv_converter_class *dcp, void *data, struct iconv_cspair **cspp) { struct iconv_cspair *csp; char *cp; int csize, ucsto, ucsfrom; if (iconv_lookupcs(to, from, NULL) == 0) return EEXIST; csize = sizeof(*csp); ucsto = strcmp(to, iconv_unicode_string) == 0; if (!ucsto) csize += strlen(to) + 1; ucsfrom = strcmp(from, iconv_unicode_string) == 0; if (!ucsfrom) csize += strlen(from) + 1; csp = malloc(csize, M_ICONV, M_WAITOK); bzero(csp, csize); csp->cp_id = iconv_csid++; csp->cp_dcp = dcp; cp = (char*)(csp + 1); if (!ucsto) { strcpy(cp, to); csp->cp_to = cp; cp += strlen(cp) + 1; } else csp->cp_to = iconv_unicode_string; if (!ucsfrom) { strcpy(cp, from); csp->cp_from = cp; } else csp->cp_from = iconv_unicode_string; csp->cp_data = data; TAILQ_INSERT_TAIL(&iconv_cslist, csp, cp_link); *cspp = csp; return 0; } static void iconv_unregister_cspair(struct iconv_cspair *csp) { TAILQ_REMOVE(&iconv_cslist, csp, cp_link); if (csp->cp_data) free(csp->cp_data, M_ICONVDATA); free(csp, M_ICONV); } /* * Lookup and create an instance of converter. * Currently this layer didn't have associated 'instance' structure * to avoid unnesessary memory allocation. */ int iconv_open(const char *to, const char *from, void **handle) { struct iconv_cspair *csp, *cspfrom, *cspto; struct iconv_converter_class *dcp; const char *cnvname; int error; /* * First, lookup fully qualified cspairs */ error = iconv_lookupcs(to, from, &csp); if (error == 0) return ICONV_CONVERTER_OPEN(csp->cp_dcp, csp, NULL, handle); /* * Well, nothing found. Now try to construct a composite conversion * ToDo: add a 'capability' field to converter */ TAILQ_FOREACH(dcp, &iconv_converters, cc_link) { cnvname = ICONV_CONVERTER_NAME(dcp); if (cnvname == NULL) continue; error = iconv_lookupcs(cnvname, from, &cspfrom); if (error) continue; error = iconv_lookupcs(to, cnvname, &cspto); if (error) continue; /* * Fine, we're found a pair which can be combined together */ return ICONV_CONVERTER_OPEN(dcp, cspto, cspfrom, handle); } return ENOENT; } int iconv_close(void *handle) { return ICONV_CONVERTER_CLOSE(handle); } int iconv_conv(void *handle, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { return ICONV_CONVERTER_CONV(handle, inbuf, inbytesleft, outbuf, outbytesleft, 0, 0); } int iconv_conv_case(void *handle, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, int casetype) { return ICONV_CONVERTER_CONV(handle, inbuf, inbytesleft, outbuf, outbytesleft, 0, casetype); } int iconv_convchr(void *handle, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { return ICONV_CONVERTER_CONV(handle, inbuf, inbytesleft, outbuf, outbytesleft, 1, 0); } int iconv_convchr_case(void *handle, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, int casetype) { return ICONV_CONVERTER_CONV(handle, inbuf, inbytesleft, outbuf, outbytesleft, 1, casetype); } int towlower(int c, void *handle) { return ICONV_CONVERTER_TOLOWER(handle, c); } int towupper(int c, void *handle) { return ICONV_CONVERTER_TOUPPER(handle, c); } /* * Give a list of loaded converters. Each name terminated with 0. * An empty string terminates the list. 
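
[Editorial aside, not part of the commit: iconv_open() above first looks for a directly registered cspair and otherwise chains two pairs through an intermediate converter. A hedged sketch of a kernel-side caller, assuming both charsets have already been registered (via iconv_add() or the sysctl interface below):]

static int
convert_name_sketch(const char *in, size_t inlen, char *out, size_t outlen)
{
	void *handle;
	int error;

	error = iconv_open("ISO8859-1", "UTF-8", &handle);
	if (error != 0)
		return (error);	/* no direct or composite pair found */
	error = iconv_conv(handle, &in, &inlen, &out, &outlen);
	iconv_close(handle);
	return (error);
}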
*/ static int iconv_sysctl_drvlist(SYSCTL_HANDLER_ARGS) { struct iconv_converter_class *dcp; const char *name; char spc; int error; error = 0; sx_slock(&iconv_lock); TAILQ_FOREACH(dcp, &iconv_converters, cc_link) { name = ICONV_CONVERTER_NAME(dcp); if (name == NULL) continue; error = SYSCTL_OUT(req, name, strlen(name) + 1); if (error) break; } sx_sunlock(&iconv_lock); if (error) return error; spc = 0; error = SYSCTL_OUT(req, &spc, sizeof(spc)); return error; } SYSCTL_PROC(_kern_iconv, OID_AUTO, drvlist, CTLFLAG_RD | CTLTYPE_OPAQUE, NULL, 0, iconv_sysctl_drvlist, "S,xlat", "registered converters"); /* * List all available charset pairs. */ static int iconv_sysctl_cslist(SYSCTL_HANDLER_ARGS) { struct iconv_cspair *csp; struct iconv_cspair_info csi; int error; error = 0; bzero(&csi, sizeof(csi)); csi.cs_version = ICONV_CSPAIR_INFO_VER; sx_slock(&iconv_lock); TAILQ_FOREACH(csp, &iconv_cslist, cp_link) { csi.cs_id = csp->cp_id; csi.cs_refcount = csp->cp_refcount; csi.cs_base = csp->cp_base ? csp->cp_base->cp_id : 0; strcpy(csi.cs_to, csp->cp_to); strcpy(csi.cs_from, csp->cp_from); error = SYSCTL_OUT(req, &csi, sizeof(csi)); if (error) break; } sx_sunlock(&iconv_lock); return error; } SYSCTL_PROC(_kern_iconv, OID_AUTO, cslist, CTLFLAG_RD | CTLTYPE_OPAQUE, NULL, 0, iconv_sysctl_cslist, "S,xlat", "registered charset pairs"); int iconv_add(const char *converter, const char *to, const char *from) { struct iconv_converter_class *dcp; struct iconv_cspair *csp; if (iconv_lookupconv(converter, &dcp) != 0) return EINVAL; return iconv_register_cspair(to, from, dcp, NULL, &csp); } /* * Add new charset pair */ static int iconv_sysctl_add(SYSCTL_HANDLER_ARGS) { struct iconv_converter_class *dcp; struct iconv_cspair *csp; struct iconv_add_in din; struct iconv_add_out dout; int error; error = SYSCTL_IN(req, &din, sizeof(din)); if (error) return error; if (din.ia_version != ICONV_ADD_VER) return EINVAL; if (din.ia_datalen > ICONV_CSMAXDATALEN) return EINVAL; if (strlen(din.ia_from) >= ICONV_CSNMAXLEN) return EINVAL; if (strlen(din.ia_to) >= ICONV_CSNMAXLEN) return EINVAL; if (strlen(din.ia_converter) >= ICONV_CNVNMAXLEN) return EINVAL; if (iconv_lookupconv(din.ia_converter, &dcp) != 0) return EINVAL; sx_xlock(&iconv_lock); error = iconv_register_cspair(din.ia_to, din.ia_from, dcp, NULL, &csp); if (error) { sx_xunlock(&iconv_lock); return error; } if (din.ia_datalen) { csp->cp_data = malloc(din.ia_datalen, M_ICONVDATA, M_WAITOK); error = copyin(din.ia_data, csp->cp_data, din.ia_datalen); if (error) goto bad; } dout.ia_csid = csp->cp_id; error = SYSCTL_OUT(req, &dout, sizeof(dout)); if (error) goto bad; sx_xunlock(&iconv_lock); ICDEBUG("%s => %s, %d bytes\n",din.ia_from, din.ia_to, din.ia_datalen); return 0; bad: iconv_unregister_cspair(csp); sx_xunlock(&iconv_lock); return error; } SYSCTL_PROC(_kern_iconv, OID_AUTO, add, CTLFLAG_RW | CTLTYPE_OPAQUE, NULL, 0, iconv_sysctl_add, "S,xlat", "register charset pair"); /* * Default stubs for converters */ int iconv_converter_initstub(struct iconv_converter_class *dp) { return 0; } int iconv_converter_donestub(struct iconv_converter_class *dp) { return 0; } int iconv_converter_tolowerstub(int c, void *handle) { return (c); } int iconv_converter_handler(module_t mod, int type, void *data) { struct iconv_converter_class *dcp = data; int error; switch (type) { case MOD_LOAD: sx_xlock(&iconv_lock); error = iconv_register_converter(dcp); if (error) { sx_xunlock(&iconv_lock); break; } error = ICONV_CONVERTER_INIT(dcp); if (error) iconv_unregister_converter(dcp); 
sx_xunlock(&iconv_lock); break; case MOD_UNLOAD: sx_xlock(&iconv_lock); ICONV_CONVERTER_DONE(dcp); error = iconv_unregister_converter(dcp); sx_xunlock(&iconv_lock); break; default: error = EINVAL; } return error; } /* * Common used functions (don't use with unicode) */ char * iconv_convstr(void *handle, char *dst, const char *src) { char *p = dst; size_t inlen, outlen; int error; if (handle == NULL) { strcpy(dst, src); return dst; } inlen = outlen = strlen(src); error = iconv_conv(handle, NULL, NULL, &p, &outlen); if (error) return NULL; error = iconv_conv(handle, &src, &inlen, &p, &outlen); if (error) return NULL; *p = 0; return dst; } void * iconv_convmem(void *handle, void *dst, const void *src, int size) { const char *s = src; char *d = dst; size_t inlen, outlen; int error; if (size == 0) return dst; if (handle == NULL) { memcpy(dst, src, size); return dst; } inlen = outlen = size; error = iconv_conv(handle, NULL, NULL, &d, &outlen); if (error) return NULL; error = iconv_conv(handle, &s, &inlen, &d, &outlen); if (error) return NULL; return dst; } int iconv_lookupcp(char **cpp, const char *s) { if (cpp == NULL) { ICDEBUG("warning a NULL list passed\n", ""); return ENOENT; } for (; *cpp; cpp++) if (strcmp(*cpp, s) == 0) return 0; return ENOENT; } /* * Return if fsname is in use of not */ int iconv_vfs_refcount(const char *fsname) { struct vfsconf *vfsp; vfsp = vfs_byname(fsname); if (vfsp != NULL && vfsp->vfc_refcount > 0) return (EBUSY); return (0); } Index: head/sys/libkern/iconv_ucs.c =================================================================== --- head/sys/libkern/iconv_ucs.c (revision 326270) +++ head/sys/libkern/iconv_ucs.c (revision 326271) @@ -1,538 +1,540 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003, 2005 Ryuichiro Imura * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
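
[Editorial aside, not part of the commit: a hedged userland sketch of feeding iconv_sysctl_add() through the kern.iconv.add sysctl it is attached to. The struct iconv_add_in/iconv_add_out definitions are assumed to come from <sys/iconv.h>; the table-less xlat16 registration is illustrative only — real registrations normally pass a translation table via ia_data/ia_datalen.]

#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/iconv.h>
#include <string.h>

static int
register_pair_sketch(const char *to, const char *from)
{
	struct iconv_add_in din;
	struct iconv_add_out dout;
	size_t olen = sizeof(dout);

	memset(&din, 0, sizeof(din));
	din.ia_version = ICONV_ADD_VER;
	strlcpy(din.ia_converter, "xlat16", sizeof(din.ia_converter));
	strlcpy(din.ia_to, to, sizeof(din.ia_to));
	strlcpy(din.ia_from, from, sizeof(din.ia_from));
	din.ia_datalen = 0;	/* illustrative: no translation table */
	return (sysctlbyname("kern.iconv.add", &dout, &olen,
	    &din, sizeof(din)));
}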
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "iconv_converter_if.h" /* * "UCS" converter */ #define KICONV_UCS_COMBINE 0x1 #define KICONV_UCS_FROM_UTF8 0x2 #define KICONV_UCS_TO_UTF8 0x4 #define KICONV_UCS_FROM_LE 0x8 #define KICONV_UCS_TO_LE 0x10 #define KICONV_UCS_FROM_UTF16 0x20 #define KICONV_UCS_TO_UTF16 0x40 #define KICONV_UCS_UCS4 0x80 #define ENCODING_UTF16 "UTF-16BE" #define ENCODING_UTF8 "UTF-8" static struct { const char *name; int from_flag, to_flag; } unicode_family[] = { { "UTF-8", KICONV_UCS_FROM_UTF8, KICONV_UCS_TO_UTF8 }, { "UCS-2LE", KICONV_UCS_FROM_LE, KICONV_UCS_TO_LE }, { "UTF-16BE", KICONV_UCS_FROM_UTF16, KICONV_UCS_TO_UTF16 }, { "UTF-16LE", KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE, KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE }, { NULL, 0, 0 } }; static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen); static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen); static uint32_t encode_surrogate(uint32_t code); static uint32_t decode_surrogate(const u_char *ucs); #ifdef MODULE_DEPEND MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2); #endif /* * UCS converter instance */ struct iconv_ucs { KOBJ_FIELDS; int convtype; struct iconv_cspair * d_csp; struct iconv_cspair * d_cspf; void * f_ctp; void * t_ctp; void * ctype; }; static int iconv_ucs_open(struct iconv_converter_class *dcp, struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp) { struct iconv_ucs *dp; int i; const char *from, *to; dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK); to = csp->cp_to; from = cspf ? cspf->cp_from : csp->cp_from; dp->convtype = 0; if (cspf) dp->convtype |= KICONV_UCS_COMBINE; for (i = 0; unicode_family[i].name; i++) { if (strcasecmp(from, unicode_family[i].name) == 0) dp->convtype |= unicode_family[i].from_flag; if (strcasecmp(to, unicode_family[i].name) == 0) dp->convtype |= unicode_family[i].to_flag; } if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0) dp->convtype |= KICONV_UCS_UCS4; else dp->convtype &= ~KICONV_UCS_UCS4; dp->f_ctp = dp->t_ctp = NULL; if (dp->convtype & KICONV_UCS_COMBINE) { if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 && (dp->convtype & KICONV_UCS_FROM_LE) == 0) { iconv_open(ENCODING_UNICODE, from, &dp->f_ctp); } if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 && (dp->convtype & KICONV_UCS_TO_LE) == 0) { iconv_open(to, ENCODING_UNICODE, &dp->t_ctp); } } dp->ctype = NULL; if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8)) iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype); dp->d_csp = csp; if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) { if (cspf) { dp->d_cspf = cspf; cspf->cp_refcount++; } else csp->cp_refcount++; } if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE)) csp->cp_refcount++; *dpp = (void*)dp; return 0; } static int iconv_ucs_close(void *data) { struct iconv_ucs *dp = data; if (dp->f_ctp) iconv_close(dp->f_ctp); if (dp->t_ctp) iconv_close(dp->t_ctp); if (dp->ctype) iconv_close(dp->ctype); if (dp->d_cspf) dp->d_cspf->cp_refcount--; else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) dp->d_csp->cp_refcount--; if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE)) dp->d_csp->cp_refcount--; kobj_delete((struct kobj*)data, M_ICONV); return 0; } static int iconv_ucs_conv(void *d2p, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, int convchar, int casetype) { struct iconv_ucs *dp = (struct iconv_ucs*)d2p; int ret = 0, i; size_t in, on, ir, or, 
inlen, outlen, ucslen; const char *src, *p; char *dst; u_char ucs[4], *q; uint32_t code; if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL) return 0; ir = in = *inbytesleft; or = on = *outbytesleft; src = *inbuf; dst = *outbuf; while (ir > 0 && or > 0) { /* * The first half of conversion. * (convert any code into ENCODING_UNICODE) */ code = 0; p = src; if (dp->convtype & KICONV_UCS_FROM_UTF8) { /* convert UTF-8 to ENCODING_UNICODE */ inlen = 0; code = utf8_to_ucs4(p, &inlen, ir); if (code == 0) { ret = -1; break; } if (casetype == KICONV_FROM_LOWER && dp->ctype) { code = towlower(code, dp->ctype); } else if (casetype == KICONV_FROM_UPPER && dp->ctype) { code = towupper(code, dp->ctype); } if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) { /* reserved for utf-16 surrogate pair */ /* invalid unicode */ ret = -1; break; } if (inlen == 4) { if (dp->convtype & KICONV_UCS_UCS4) { ucslen = 4; code = encode_surrogate(code); } else { /* can't handle with ucs-2 */ ret = -1; break; } } else { ucslen = 2; } /* save UCS-4 into ucs[] */ for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--) *q++ = (code >> (i << 3)) & 0xff; } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) { /* convert local code to ENCODING_UNICODE */ ucslen = 4; inlen = ir; q = ucs; ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q, &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER)); if (ret) break; inlen = ir - inlen; ucslen = 4 - ucslen; } else { /* src code is a proper subset of ENCODING_UNICODE */ q = ucs; if (dp->convtype & KICONV_UCS_FROM_LE) { *q = *(p + 1); *(q + 1) = *p; p += 2; } else { *q = *p++; *(q + 1) = *p++; } if ((*q & 0xfc) == 0xd8) { if (dp->convtype & KICONV_UCS_UCS4 && dp->convtype & KICONV_UCS_FROM_UTF16) { inlen = ucslen = 4; } else { /* invalid unicode */ ret = -1; break; } } else { inlen = ucslen = 2; } if (ir < inlen) { ret = -1; break; } if (ucslen == 4) { q += 2; if (dp->convtype & KICONV_UCS_FROM_LE) { *q = *(p + 1); *(q + 1) = *p; } else { *q = *p++; *(q + 1) = *p; } if ((*q & 0xfc) != 0xdc) { /* invalid unicode */ ret = -1; break; } } } /* * The second half of conversion. 
* (convert ENCODING_UNICODE into any code) */ p = ucs; if (dp->convtype & KICONV_UCS_TO_UTF8) { q = (u_char *)dst; if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) { /* decode surrogate pair */ code = decode_surrogate(p); } else { code = (ucs[0] << 8) | ucs[1]; } if (casetype == KICONV_LOWER && dp->ctype) { code = towlower(code, dp->ctype); } else if (casetype == KICONV_UPPER && dp->ctype) { code = towupper(code, dp->ctype); } outlen = 0; if (ucs4_to_utf8(code, q, &outlen, or) == NULL) { ret = -1; break; } src += inlen; ir -= inlen; dst += outlen; or -= outlen; } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) { ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst, &or, casetype & (KICONV_LOWER | KICONV_UPPER)); if (ret) break; src += inlen; ir -= inlen; } else { /* dst code is a proper subset of ENCODING_UNICODE */ if (or < ucslen) { ret = -1; break; } src += inlen; ir -= inlen; or -= ucslen; if (dp->convtype & KICONV_UCS_TO_LE) { *dst++ = *(p + 1); *dst++ = *p; p += 2; } else { *dst++ = *p++; *dst++ = *p++; } if (ucslen == 4) { if ((dp->convtype & KICONV_UCS_UCS4) == 0 || (dp->convtype & KICONV_UCS_TO_UTF16) == 0) { ret = -1; break; } if (dp->convtype & KICONV_UCS_TO_LE) { *dst++ = *(p + 1); *dst++ = *p; } else { *dst++ = *p++; *dst++ = *p; } } } if (convchar == 1) break; } *inbuf += in - ir; *outbuf += on - or; *inbytesleft -= in - ir; *outbytesleft -= on - or; return (ret); } static int iconv_ucs_init(struct iconv_converter_class *dcp) { int error; error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8); if (error) return (error); error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE); if (error) return (error); return (0); } static int iconv_ucs_done(struct iconv_converter_class *dcp) { return (0); } static const char * iconv_ucs_name(struct iconv_converter_class *dcp) { return (ENCODING_UNICODE); } static kobj_method_t iconv_ucs_methods[] = { KOBJMETHOD(iconv_converter_open, iconv_ucs_open), KOBJMETHOD(iconv_converter_close, iconv_ucs_close), KOBJMETHOD(iconv_converter_conv, iconv_ucs_conv), KOBJMETHOD(iconv_converter_init, iconv_ucs_init), KOBJMETHOD(iconv_converter_done, iconv_ucs_done), KOBJMETHOD(iconv_converter_name, iconv_ucs_name), {0, 0} }; KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs)); static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen) { size_t i, w = 0; uint32_t ucs4 = 0; /* * get leading 1 byte from utf-8 */ if ((*src & 0x80) == 0) { /* * leading 1 bit is "0" * utf-8: 0xxxxxxx * ucs-4: 00000000 00000000 00000000 0xxxxxxx */ w = 1; /* get trailing 7 bits */ ucs4 = *src & 0x7f; } else if ((*src & 0xe0) == 0xc0) { /* * leading 3 bits are "110" * utf-8: 110xxxxx 10yyyyyy * ucs-4: 00000000 00000000 00000xxx xxyyyyyy */ w = 2; /* get trailing 5 bits */ ucs4 = *src & 0x1f; } else if ((*src & 0xf0) == 0xe0) { /* * leading 4 bits are "1110" * utf-8: 1110xxxx 10yyyyyy 10zzzzzz * ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz */ w = 3; /* get trailing 4 bits */ ucs4 = *src & 0x0f; } else if ((*src & 0xf8) == 0xf0) { /* * leading 5 bits are "11110" * utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz * ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz */ w = 4; /* get trailing 3 bits */ ucs4 = *src & 0x07; } else { /* out of utf-16 range or having illegal bits */ return (0); } if (srclen < w) return (0); /* * get left parts from utf-8 */ for (i = 1 ; i < w ; i++) { if ((*(src + i) & 0xc0) != 0x80) { /* invalid: leading 2 bits are not "10" */ return (0); } /* concatenate trailing 6 bits into ucs4 */ ucs4 <<= 6; ucs4 |= *(src + i) & 
0x3f; } *utf8width = w; return (ucs4); } static u_char * ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen) { u_char lead, *p; size_t i, w; /* * determine utf-8 width and leading bits */ if (ucs4 < 0x80) { w = 1; lead = 0; /* "0" */ } else if (ucs4 < 0x800) { w = 2; lead = 0xc0; /* "11" */ } else if (ucs4 < 0x10000) { w = 3; lead = 0xe0; /* "111" */ } else if (ucs4 < 0x200000) { w = 4; lead = 0xf0; /* "1111" */ } else { return (NULL); } if (dstlen < w) return (NULL); /* * construct utf-8 */ p = dst; for (i = w - 1 ; i >= 1 ; i--) { /* get trailing 6 bits and put it with leading bit as "1" */ *(p + i) = (ucs4 & 0x3f) | 0x80; ucs4 >>= 6; } *p = ucs4 | lead; *utf8width = w; return (p); } static uint32_t encode_surrogate(uint32_t code) { return ((((code - 0x10000) << 6) & 0x3ff0000) | ((code - 0x10000) & 0x3ff) | 0xd800dc00); } static uint32_t decode_surrogate(const u_char *ucs) { return ((((ucs[0] & 0x3) << 18) | (ucs[1] << 10) | ((ucs[2] & 0x3) << 8) | ucs[3]) + 0x10000); } Index: head/sys/libkern/iconv_xlat.c =================================================================== --- head/sys/libkern/iconv_xlat.c (revision 326270) +++ head/sys/libkern/iconv_xlat.c (revision 326271) @@ -1,126 +1,128 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2000-2001 Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
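
[Editorial aside, not part of the commit: the surrogate arithmetic used by encode_surrogate()/decode_surrogate() above, shown in isolation. A code point beyond the BMP loses its 0x10000 bias, the upper and lower 10 bits become the high (0xD800-based) and low (0xDC00-based) UTF-16 units, and the pair is packed big-endian into one 32-bit word.]

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint32_t code = 0x1F600;		/* U+1F600, outside the BMP */
	uint32_t v = code - 0x10000;		/* 20 significant bits */
	uint16_t hi = 0xD800 | (v >> 10);	/* upper 10 bits */
	uint16_t lo = 0xDC00 | (v & 0x3FF);	/* lower 10 bits */

	assert(hi == 0xD83D && lo == 0xDE00);
	/* Same packing encode_surrogate() produces in one word: */
	assert((((uint32_t)hi << 16) | lo) ==
	    ((((code - 0x10000) << 6) & 0x3ff0000) |
	    ((code - 0x10000) & 0x3ff) | 0xd800dc00));
	return (0);
}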
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "iconv_converter_if.h" /* * "XLAT" converter */ #ifdef MODULE_DEPEND MODULE_DEPEND(iconv_xlat, libiconv, 2, 2, 2); #endif /* * XLAT converter instance */ struct iconv_xlat { KOBJ_FIELDS; u_char * d_table; struct iconv_cspair * d_csp; }; static int iconv_xlat_open(struct iconv_converter_class *dcp, struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp) { struct iconv_xlat *dp; dp = (struct iconv_xlat *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK); dp->d_table = csp->cp_data; dp->d_csp = csp; csp->cp_refcount++; *dpp = (void*)dp; return 0; } static int iconv_xlat_close(void *data) { struct iconv_xlat *dp = data; dp->d_csp->cp_refcount--; kobj_delete((struct kobj*)data, M_ICONV); return 0; } static int iconv_xlat_conv(void *d2p, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, int convchar, int casetype) { struct iconv_xlat *dp = (struct iconv_xlat*)d2p; const char *src; char *dst; int n, r; if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL) return 0; if (casetype != 0) return -1; if (convchar == 1) r = n = 1; else r = n = min(*inbytesleft, *outbytesleft); src = *inbuf; dst = *outbuf; while(r--) *dst++ = dp->d_table[(u_char)*src++]; *inbuf += n; *outbuf += n; *inbytesleft -= n; *outbytesleft -= n; return 0; } static const char * iconv_xlat_name(struct iconv_converter_class *dcp) { return "xlat"; } static kobj_method_t iconv_xlat_methods[] = { KOBJMETHOD(iconv_converter_open, iconv_xlat_open), KOBJMETHOD(iconv_converter_close, iconv_xlat_close), KOBJMETHOD(iconv_converter_conv, iconv_xlat_conv), #if 0 KOBJMETHOD(iconv_converter_init, iconv_xlat_init), KOBJMETHOD(iconv_converter_done, iconv_xlat_done), #endif KOBJMETHOD(iconv_converter_name, iconv_xlat_name), {0, 0} }; KICONV_CONVERTER(xlat, sizeof(struct iconv_xlat)); Index: head/sys/libkern/iconv_xlat16.c =================================================================== --- head/sys/libkern/iconv_xlat16.c (revision 326270) +++ head/sys/libkern/iconv_xlat16.c (revision 326271) @@ -1,363 +1,365 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003, 2005 Ryuichiro Imura * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
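
[Editorial aside, not part of the commit: a minimal userland model of the "xlat" converter's inner loop above — a 256-entry byte table applied one byte at a time. The uppercasing payload is just an example; real tables arrive through the cspair's cp_data.]

#include <stdio.h>

int
main(void)
{
	unsigned char table[256], buf[] = "hello, xlat";
	int i;

	for (i = 0; i < 256; i++)	/* identity ... */
		table[i] = i;
	for (i = 'a'; i <= 'z'; i++)	/* ... except a-z -> A-Z */
		table[i] = i - 'a' + 'A';

	for (i = 0; buf[i] != '\0'; i++)	/* iconv_xlat_conv()'s loop */
		buf[i] = table[buf[i]];
	printf("%s\n", buf);	/* prints: HELLO, XLAT */
	return (0);
}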
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "iconv_converter_if.h" /* * "XLAT16" converter */ #ifdef MODULE_DEPEND MODULE_DEPEND(iconv_xlat16, libiconv, 2, 2, 2); #endif #define C2I1(c) ((c) & 0x8000 ? ((c) & 0xff) | 0x100 : (c) & 0xff) #define C2I2(c) ((c) & 0x8000 ? ((c) >> 8) & 0x7f : ((c) >> 8) & 0xff) /* * XLAT16 converter instance */ struct iconv_xlat16 { KOBJ_FIELDS; uint32_t * d_table[0x200]; void * f_ctp; void * t_ctp; struct iconv_cspair * d_csp; }; static int iconv_xlat16_open(struct iconv_converter_class *dcp, struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp) { struct iconv_xlat16 *dp; uint32_t *headp, **idxp; int i; dp = (struct iconv_xlat16 *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK); headp = (uint32_t *)((caddr_t)csp->cp_data + sizeof(dp->d_table)); idxp = (uint32_t **)csp->cp_data; for (i = 0 ; i < 0x200 ; i++) { if (*idxp) { dp->d_table[i] = headp; headp += 0x80; } else { dp->d_table[i] = NULL; } idxp++; } if (strcmp(csp->cp_to, KICONV_WCTYPE_NAME) != 0) { if (iconv_open(KICONV_WCTYPE_NAME, csp->cp_from, &dp->f_ctp) != 0) dp->f_ctp = NULL; if (iconv_open(KICONV_WCTYPE_NAME, csp->cp_to, &dp->t_ctp) != 0) dp->t_ctp = NULL; } else { dp->f_ctp = dp->t_ctp = dp; } dp->d_csp = csp; csp->cp_refcount++; *dpp = (void*)dp; return (0); } static int iconv_xlat16_close(void *data) { struct iconv_xlat16 *dp = data; if (dp->f_ctp && dp->f_ctp != data) iconv_close(dp->f_ctp); if (dp->t_ctp && dp->t_ctp != data) iconv_close(dp->t_ctp); dp->d_csp->cp_refcount--; kobj_delete((struct kobj*)data, M_ICONV); return (0); } static int iconv_xlat16_conv(void *d2p, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, int convchar, int casetype) { struct iconv_xlat16 *dp = (struct iconv_xlat16*)d2p; const char *src; char *dst; int nullin, ret = 0; size_t in, on, ir, or, inlen; uint32_t code; u_char u, l; uint16_t c1, c2, ctmp; if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL) return (0); ir = in = *inbytesleft; or = on = *outbytesleft; src = *inbuf; dst = *outbuf; while(ir > 0 && or > 0) { inlen = 0; code = 0; c1 = ir > 1 ? *(src+1) & 0xff : 0; c2 = *src & 0xff; ctmp = 0; c1 = c2 & 0x80 ? c1 | 0x100 : c1; c2 = c2 & 0x80 ? c2 & 0x7f : c2; if (ir > 1 && dp->d_table[c1] && dp->d_table[c1][c2]) { /* * inbuf char is a double byte char */ inlen = 2; /* toupper,tolower */ if (casetype == KICONV_FROM_LOWER && dp->f_ctp) ctmp = towlower(((u_char)*src << 8) | (u_char)*(src + 1), dp->f_ctp); else if (casetype == KICONV_FROM_UPPER && dp->f_ctp) ctmp = towupper(((u_char)*src << 8) | (u_char)*(src + 1), dp->f_ctp); if (ctmp) { c1 = C2I1(ctmp); c2 = C2I2(ctmp); } } if (inlen == 0) { c1 &= 0xff00; if (!dp->d_table[c1]) { ret = -1; break; } /* * inbuf char is a single byte char */ inlen = 1; if (casetype & (KICONV_FROM_LOWER|KICONV_FROM_UPPER)) code = dp->d_table[c1][c2]; if (casetype == KICONV_FROM_LOWER) { if (dp->f_ctp) ctmp = towlower((u_char)*src, dp->f_ctp); else if (code & XLAT16_HAS_FROM_LOWER_CASE) ctmp = (u_char)(code >> 16); } else if (casetype == KICONV_FROM_UPPER) { if (dp->f_ctp) ctmp = towupper((u_char)*src, dp->f_ctp); else if (code & XLAT16_HAS_FROM_UPPER_CASE) ctmp = (u_char)(code >> 16); } if (ctmp) { c1 = C2I1(ctmp << 8); c2 = C2I2(ctmp << 8); } } code = dp->d_table[c1][c2]; if (!code) { ret = -1; break; } nullin = (code & XLAT16_ACCEPT_NULL_IN) ? 
1 : 0; if (inlen == 1 && nullin) { /* * XLAT16_ACCEPT_NULL_IN requires inbuf has 2byte */ ret = -1; break; } /* * now start translation */ u = (u_char)(code >> 8); l = (u_char)code; #ifdef XLAT16_ACCEPT_3BYTE_CHR if (code & XLAT16_IS_3BYTE_CHR) { if (or < 3) { ret = -1; break; } *dst++ = u; *dst++ = l; *dst++ = (u_char)(code >> 16); or -= 3; } else #endif if (u || code & XLAT16_ACCEPT_NULL_OUT) { if (or < 2) { ret = -1; break; } /* toupper,tolower */ if (casetype == KICONV_LOWER && dp->t_ctp) { code = towlower((uint16_t)code, dp->t_ctp); u = (u_char)(code >> 8); l = (u_char)code; } if (casetype == KICONV_UPPER && dp->t_ctp) { code = towupper((uint16_t)code, dp->t_ctp); u = (u_char)(code >> 8); l = (u_char)code; } *dst++ = u; *dst++ = l; or -= 2; } else { /* toupper,tolower */ if (casetype == KICONV_LOWER) { if (dp->t_ctp) l = (u_char)towlower(l, dp->t_ctp); else if (code & XLAT16_HAS_LOWER_CASE) l = (u_char)(code >> 16); } if (casetype == KICONV_UPPER) { if (dp->t_ctp) l = (u_char)towupper(l, dp->t_ctp); else if (code & XLAT16_HAS_UPPER_CASE) l = (u_char)(code >> 16); } *dst++ = l; or--; } if (inlen == 2) { /* * there is a case that inbuf char is a single * byte char while inlen == 2 */ if ((u_char)*(src+1) == '\0' && !nullin ) { src++; ir--; } else { src += 2; ir -= 2; } } else { src++; ir--; } if (convchar == 1) break; } *inbuf += in - ir; *outbuf += on - or; *inbytesleft -= in - ir; *outbytesleft -= on - or; return (ret); } static const char * iconv_xlat16_name(struct iconv_converter_class *dcp) { return ("xlat16"); } static int iconv_xlat16_tolower(void *d2p, int c) { struct iconv_xlat16 *dp = (struct iconv_xlat16*)d2p; int c1, c2, out; if (c < 0x100) { c1 = C2I1(c << 8); c2 = C2I2(c << 8); } else if (c < 0x10000) { c1 = C2I1(c); c2 = C2I2(c); } else return (c); if (dp->d_table[c1] && dp->d_table[c1][c2] & XLAT16_HAS_LOWER_CASE) { /*return (int)(dp->d_table[c1][c2] & 0xffff);*/ out = dp->d_table[c1][c2] & 0xffff; if ((out & 0xff) == 0) out = (out >> 8) & 0xff; return (out); } else return (c); } static int iconv_xlat16_toupper(void *d2p, int c) { struct iconv_xlat16 *dp = (struct iconv_xlat16*)d2p; int c1, c2, out; if (c < 0x100) { c1 = C2I1(c << 8); c2 = C2I2(c << 8); } else if (c < 0x10000) { c1 = C2I1(c); c2 = C2I2(c); } else return (c); if (dp->d_table[c1] && dp->d_table[c1][c2] & XLAT16_HAS_UPPER_CASE) { out = dp->d_table[c1][c2] & 0xffff; if ((out & 0xff) == 0) out = (out >> 8) & 0xff; return (out); } else return (c); } static kobj_method_t iconv_xlat16_methods[] = { KOBJMETHOD(iconv_converter_open, iconv_xlat16_open), KOBJMETHOD(iconv_converter_close, iconv_xlat16_close), KOBJMETHOD(iconv_converter_conv, iconv_xlat16_conv), #if 0 KOBJMETHOD(iconv_converter_init, iconv_xlat16_init), KOBJMETHOD(iconv_converter_done, iconv_xlat16_done), #endif KOBJMETHOD(iconv_converter_name, iconv_xlat16_name), KOBJMETHOD(iconv_converter_tolower, iconv_xlat16_tolower), KOBJMETHOD(iconv_converter_toupper, iconv_xlat16_toupper), {0, 0} }; KICONV_CONVERTER(xlat16, sizeof(struct iconv_xlat16)); Index: head/sys/libkern/inet_aton.c =================================================================== --- head/sys/libkern/inet_aton.c (revision 326270) +++ head/sys/libkern/inet_aton.c (revision 326271) @@ -1,136 +1,138 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2001 Charles Mott * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include int inet_aton(const char *cp, struct in_addr *addr) { u_long parts[4]; in_addr_t val; const char *c; char *endptr; int gotend, n; c = (const char *)cp; n = 0; /* * Run through the string, grabbing numbers until * the end of the string, or some error */ gotend = 0; while (!gotend) { unsigned long l; l = strtoul(c, &endptr, 0); if (l == ULONG_MAX || (l == 0 && endptr == c)) return (0); val = (in_addr_t)l; /* * If the whole string is invalid, endptr will equal * c.. this way we can make sure someone hasn't * gone '.12' or something which would get past * the next check. */ if (endptr == c) return (0); parts[n] = val; c = endptr; /* Check the next character past the previous number's end */ switch (*c) { case '.' : /* Make sure we only do 3 dots .. */ if (n == 3) /* Whoops. Quit. */ return (0); n++; c++; break; case '\0': gotend = 1; break; default: if (isspace((unsigned char)*c)) { gotend = 1; break; } else { /* Invalid character, then fail. */ return (0); } } } /* Concoct the address according to the number of parts specified. */ switch (n) { case 0: /* a -- 32 bits */ /* * Nothing is necessary here. Overflow checking was * already done in strtoul(). */ break; case 1: /* a.b -- 8.24 bits */ if (val > 0xffffff || parts[0] > 0xff) return (0); val |= parts[0] << 24; break; case 2: /* a.b.c -- 8.8.16 bits */ if (val > 0xffff || parts[0] > 0xff || parts[1] > 0xff) return (0); val |= (parts[0] << 24) | (parts[1] << 16); break; case 3: /* a.b.c.d -- 8.8.8.8 bits */ if (val > 0xff || parts[0] > 0xff || parts[1] > 0xff || parts[2] > 0xff) return (0); val |= (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8); break; } if (addr != NULL) addr->s_addr = htonl(val); return (1); } Index: head/sys/libkern/memcchr.c =================================================================== --- head/sys/libkern/memcchr.c (revision 326270) +++ head/sys/libkern/memcchr.c (revision 326271) @@ -1,115 +1,117 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 Ed Schouten * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include /* * memcchr(): find first character in buffer not matching `c'. * * This function performs the complement of memchr(). To provide decent * performance, this function compares data from the buffer one word at * a time. * * This code is inspired by libc's strlen(), written by Xin Li. */ #if LONG_BIT != 32 && LONG_BIT != 64 #error Unsupported word size #endif #define LONGPTR_MASK (sizeof(long) - 1) #define TESTBYTE \ do { \ if (*p != (unsigned char)c) \ goto done; \ p++; \ } while (0) void * memcchr(const void *begin, int c, size_t n) { const unsigned long *lp; const unsigned char *p, *end; unsigned long word; /* Four or eight repetitions of `c'. */ word = (unsigned char)c; word |= word << 8; word |= word << 16; #if LONG_BIT >= 64 word |= word << 32; #endif /* Don't perform memory I/O when passing a zero-length buffer. */ if (n == 0) return (NULL); /* * First determine whether there is a character unequal to `c' * in the first word. As this word may contain bytes before * `begin', we may execute this loop spuriously. */ lp = (const unsigned long *)((uintptr_t)begin & ~LONGPTR_MASK); end = (const unsigned char *)begin + n; if (*lp++ != word) for (p = begin; p < (const unsigned char *)lp;) TESTBYTE; /* Now compare the data one word at a time. */ for (; (const unsigned char *)lp < end; lp++) { if (*lp != word) { p = (const unsigned char *)lp; TESTBYTE; TESTBYTE; TESTBYTE; #if LONG_BIT >= 64 TESTBYTE; TESTBYTE; TESTBYTE; TESTBYTE; #endif goto done; } } return (NULL); done: /* * If the end of the buffer is not word aligned, the previous * loops may obtain an address that's beyond the end of the * buffer. */ if (p < end) return (__DECONST(void *, p)); return (NULL); } Index: head/sys/libkern/memmove.c =================================================================== --- head/sys/libkern/memmove.c (revision 326270) +++ head/sys/libkern/memmove.c (revision 326271) @@ -1,38 +1,40 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2009 Roman Divacky * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
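
[Editorial aside, not part of the commit: the byte-broadcast comparison memcchr() builds on, in a deliberately simplified userland sketch — the search byte repeated across a word lets whole words equal to it be skipped with one comparison. The careful unaligned-head/tail handling of the real function is reduced to an alignment check here; LONG_BIT is assumed to come from <limits.h> as on FreeBSD.]

#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <string.h>

static const void *
memcchr_sketch(const void *begin, int c, size_t n)
{
	unsigned long word = (unsigned char)c;
	const unsigned char *p = begin;

	word |= word << 8;
	word |= word << 16;
#if LONG_BIT >= 64
	word |= word << 32;
#endif
	/* Whole aligned words equal to `word' contain only `c'. */
	while (n >= sizeof(long) && ((uintptr_t)p & (sizeof(long) - 1)) == 0 &&
	    *(const unsigned long *)p == word) {
		p += sizeof(long);
		n -= sizeof(long);
	}
	for (; n > 0; p++, n--)		/* finish byte by byte */
		if (*p != (unsigned char)c)
			return (p);
	return (NULL);
}

int
main(void)
{
	char buf[] = "aaaaaaaaaaaaaaaab";

	assert(memcchr_sketch(buf, 'a', strlen(buf)) == buf + 16);
	assert(memcchr_sketch(buf, 'a', 16) == NULL);
	return (0);
}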
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include void * memmove(void *dest, const void *src, size_t n) { bcopy(src, dest, n); return (dest); } Index: head/sys/libkern/memset.c =================================================================== --- head/sys/libkern/memset.c (revision 326270) +++ head/sys/libkern/memset.c (revision 326271) @@ -1,44 +1,46 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (C) 1992-2007 The FreeBSD Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #define LIBKERN_INLINE #include #include void * memset(void *b, int c, size_t len) { char *bb; if (c == 0) bzero(b, len); else for (bb = (char *)b; len--; ) *bb++ = c; return (b); } Index: head/sys/libkern/strcspn.c =================================================================== --- head/sys/libkern/strcspn.c (revision 326270) +++ head/sys/libkern/strcspn.c (revision 326271) @@ -1,72 +1,74 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 David Schultz * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #define IDX(c) ((u_char)(c) / LONG_BIT) #define BIT(c) ((u_long)1 << ((u_char)(c) % LONG_BIT)) size_t strcspn(const char * __restrict s, const char * __restrict charset) { /* * NB: idx and bit are temporaries whose use causes gcc 3.4.2 to * generate better code. Without them, gcc gets a little confused. */ const char *s1; u_long bit; u_long tbl[(UCHAR_MAX + 1) / LONG_BIT]; int idx; if(*s == '\0') return (0); #if LONG_BIT == 64 /* always better to unroll on 64-bit architectures */ tbl[0] = 1; tbl[3] = tbl[2] = tbl[1] = 0; #else for (tbl[0] = idx = 1; idx < sizeof(tbl) / sizeof(tbl[0]); idx++) tbl[idx] = 0; #endif for (; *charset != '\0'; charset++) { idx = IDX(*charset); bit = BIT(*charset); tbl[idx] |= bit; } for(s1 = s; ; s1++) { idx = IDX(*s1); bit = BIT(*s1); if ((tbl[idx] & bit) != 0) break; } return (s1 - s); } Index: head/sys/libkern/strdup.c =================================================================== --- head/sys/libkern/strdup.c (revision 326270) +++ head/sys/libkern/strdup.c (revision 326271) @@ -1,50 +1,52 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 * ("CBOSS"), as part of the DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
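
[Editorial aside, not part of the commit: the bitmap technique behind strcspn()/strspn() shown on its own — one bit per possible byte value, so membership in the charset costs a shift and a mask per input byte instead of a nested scan. Unlike the kernel version, this sketch checks for NUL explicitly rather than pre-setting bit zero.]

#include <assert.h>
#include <limits.h>
#include <string.h>

#define	IDX(c)	((unsigned char)(c) / LONG_BIT)
#define	BIT(c)	((unsigned long)1 << ((unsigned char)(c) % LONG_BIT))

int
main(void)
{
	unsigned long tbl[(UCHAR_MAX + 1) / LONG_BIT];
	const char *charset = ":/?#";
	const char *s = "host/path?q";
	const char *p;

	memset(tbl, 0, sizeof(tbl));
	for (; *charset != '\0'; charset++)
		tbl[IDX(*charset)] |= BIT(*charset);

	for (p = s; *p != '\0'; p++)	/* strcspn()'s inner loop */
		if (tbl[IDX(*p)] & BIT(*p))
			break;
	assert(p - s == 4 && p - s == (long)strcspn(s, ":/?#"));
	return (0);
}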
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include char * strdup(const char *string, struct malloc_type *type) { size_t len; char *copy; len = strlen(string) + 1; copy = malloc(len, type, M_WAITOK); bcopy(string, copy, len); return (copy); } Index: head/sys/libkern/strlcat.c =================================================================== --- head/sys/libkern/strlcat.c (revision 326270) +++ head/sys/libkern/strlcat.c (revision 326271) @@ -1,72 +1,74 @@ /* $OpenBSD: strlcat.c,v 1.2 1999/06/17 16:28:58 millert Exp $ */ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 1998 Todd C. Miller * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #if defined(LIBC_SCCS) && !defined(lint) static char *rcsid = "$OpenBSD: strlcat.c,v 1.2 1999/06/17 16:28:58 millert Exp $"; #endif /* LIBC_SCCS and not lint */ #include __FBSDID("$FreeBSD$"); #include #include /* * Appends src to string dst of size siz (unlike strncat, siz is the * full size of dst, not space left). At most siz-1 characters * will be copied. Always NUL terminates (unless siz <= strlen(dst)). * Returns strlen(src) + MIN(siz, strlen(initial dst)). * If retval >= siz, truncation occurred. */ size_t strlcat(char *dst, const char *src, size_t siz) { char *d = dst; const char *s = src; size_t n = siz; size_t dlen; /* Find the end of dst and adjust bytes left but don't go past end */ while (n-- != 0 && *d != '\0') d++; dlen = d - dst; n = siz - dlen; if (n == 0) return(dlen + strlen(s)); while (*s != '\0') { if (n != 1) { *d++ = *s; n--; } s++; } *d = '\0'; return(dlen + (s - src)); /* count does not include NUL */ } Index: head/sys/libkern/strlen.c =================================================================== --- head/sys/libkern/strlen.c (revision 326270) +++ head/sys/libkern/strlen.c (revision 326271) @@ -1,129 +1,131 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2009, 2010 Xin LI * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include /* * Portable strlen() for 32-bit and 64-bit systems. * * Rationale: it is generally much more efficient to do word length * operations and avoid branches on modern computer systems, as * compared to byte-length operations with a lot of branches. * * The expression: * * ((x - 0x01....01) & ~x & 0x80....80) * * would evaluate to a non-zero value iff any of the bytes in the * original word is zero. * * On multi-issue processors, we can divide the above expression into: * a) (x - 0x01....01) * b) (~x & 0x80....80) * c) a & b * * Where, a) and b) can be partially computed in parallel. * * The algorithm above is found on "Hacker's Delight" by * Henry S. Warren, Jr. */ /* Magic numbers for the algorithm */ #if LONG_BIT == 32 static const unsigned long mask01 = 0x01010101; static const unsigned long mask80 = 0x80808080; #elif LONG_BIT == 64 static const unsigned long mask01 = 0x0101010101010101; static const unsigned long mask80 = 0x8080808080808080; #else #error Unsupported word size #endif #define LONGPTR_MASK (sizeof(long) - 1) /* * Helper macro to return string length if we caught the zero * byte. */ #define testbyte(x) \ do { \ if (p[x] == '\0') \ return (p - str + x); \ } while (0) size_t strlen(const char *str) { const char *p; const unsigned long *lp; long va, vb; /* * Before trying the hard (unaligned byte-by-byte access) way * to figure out whether there is a nul character, try to see * if there is a nul character is within this accessible word * first. * * p and (p & ~LONGPTR_MASK) must be equally accessible since * they always fall in the same memory page, as long as page * boundaries is integral multiple of word size. 
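
[Editorial aside, not part of the commit: the zero-byte test from the strlen() comment above, in isolation — (x - 0x01..01) & ~x & 0x80..80 is non-zero iff some byte of x is zero, which is what lets the word loop below branch once per word instead of once per byte.]

#include <assert.h>
#include <stdint.h>

static int
has_zero_byte(uint64_t x)
{
	return (((x - 0x0101010101010101ULL) & ~x &
	    0x8080808080808080ULL) != 0);
}

int
main(void)
{
	assert(!has_zero_byte(0x6162636465666768ULL));	/* "abcdefgh" */
	assert(has_zero_byte(0x6162630065666768ULL));	/* NUL inside */
	assert(has_zero_byte(0));
	return (0);
}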
*/ lp = (const unsigned long *)((uintptr_t)str & ~LONGPTR_MASK); va = (*lp - mask01); vb = ((~*lp) & mask80); lp++; if (va & vb) /* Check if we have \0 in the first part */ for (p = str; p < (const char *)lp; p++) if (*p == '\0') return (p - str); /* Scan the rest of the string using word-sized operations */ for (; ; lp++) { va = (*lp - mask01); vb = ((~*lp) & mask80); if (va & vb) { p = (const char *)(lp); testbyte(0); testbyte(1); testbyte(2); testbyte(3); #if (LONG_BIT >= 64) testbyte(4); testbyte(5); testbyte(6); testbyte(7); #endif } } /* NOTREACHED */ return (0); }
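For readers who want to see the zero-byte test in isolation: the following userland sketch (not part of this commit; haszero() and the sample words are invented for illustration) evaluates the same ((x - 0x01....01) & ~x & 0x80....80) expression on 64-bit words.

#include <stdio.h>
#include <stdint.h>

#define MASK01	0x0101010101010101ULL	/* 0x01 in every byte */
#define MASK80	0x8080808080808080ULL	/* 0x80 in every byte */

/* Nonzero iff some byte of x is zero (Hacker's Delight). */
static int
haszero(uint64_t x)
{
	return (((x - MASK01) & ~x & MASK80) != 0);
}

int
main(void)
{
	printf("%d\n", haszero(0x6162636465666768ULL));	/* "abcdefgh" -> 0 */
	printf("%d\n", haszero(0x6162630065666768ULL));	/* embedded NUL -> 1 */
	return (0);
}

The subtraction borrows through a zero byte and leaves its high bit set, while ~x masks out bytes whose high bit was already set in the input; ANDing with MASK80 therefore flags a word exactly when it contains a zero byte.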
Index: head/sys/libkern/strnlen.c =================================================================== --- head/sys/libkern/strnlen.c (revision 326270) +++ head/sys/libkern/strnlen.c (revision 326271) @@ -1,42 +1,44 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2009 David Schultz * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include size_t strnlen(const char *s, size_t maxlen) { size_t len; for (len = 0; len < maxlen; len++, s++) { if (!*s) break; } return (len); } Index: head/sys/libkern/strspn.c =================================================================== --- head/sys/libkern/strspn.c (revision 326270) +++ head/sys/libkern/strspn.c (revision 326271) @@ -1,71 +1,73 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 David Schultz * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #define IDX(c) ((u_char)(c) / LONG_BIT) #define BIT(c) ((u_long)1 << ((u_char)(c) % LONG_BIT)) size_t strspn(const char *s, const char *charset) { /* * NB: idx and bit are temporaries whose use causes gcc 3.4.2 to * generate better code. Without them, gcc gets a little confused. */ const char *s1; u_long bit; u_long tbl[(UCHAR_MAX + 1) / LONG_BIT]; int idx; if (*s == '\0') return (0); #if LONG_BIT == 64 /* always better to unroll on 64-bit architectures */ tbl[3] = tbl[2] = tbl[1] = tbl[0] = 0; #else for (idx = 0; idx < sizeof(tbl) / sizeof(tbl[0]); idx++) tbl[idx] = 0; #endif for (; *charset != '\0'; charset++) { idx = IDX(*charset); bit = BIT(*charset); tbl[idx] |= bit; } for (s1 = s; ; s1++) { idx = IDX(*s1); bit = BIT(*s1); if ((tbl[idx] & bit) == 0) break; } return (s1 - s); }
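The IDX()/BIT() macros above build a 256-bit membership bitmap: one pass over charset sets a bit per byte value, after which each test inside the scan loop is an index, a shift, and an AND against a four-word table on LP64. The standalone sketch below (not part of this commit; the character set and a main() driver are invented for illustration, with userland headers standing in for the kernel ones) shows the same table:

#include <stdio.h>
#include <limits.h>

#define IDX(c)	((unsigned char)(c) / LONG_BIT)
#define BIT(c)	((unsigned long)1 << ((unsigned char)(c) % LONG_BIT))

int
main(void)
{
	unsigned long tbl[(UCHAR_MAX + 1) / LONG_BIT] = { 0 };
	const char *charset = "0123456789abcdef";

	/* One bit per member byte value. */
	for (; *charset != '\0'; charset++)
		tbl[IDX(*charset)] |= BIT(*charset);

	/* Constant-time membership tests. */
	printf("'a' in set: %d\n", (tbl[IDX('a')] & BIT('a')) != 0);
	printf("'z' in set: %d\n", (tbl[IDX('z')] & BIT('z')) != 0);
	return (0);
}

This is why strspn() costs O(strlen(charset) + strlen(s)) rather than the naive O(strlen(charset) * strlen(s)) of testing every input byte against every charset byte.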
Index: head/sys/libkern/strvalid.c =================================================================== --- head/sys/libkern/strvalid.c (revision 326270) +++ head/sys/libkern/strvalid.c (revision 326271) @@ -1,53 +1,55 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by NAI Labs, * the Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include /* * Return (1) if the buffer pointed to by the kernel pointer 'buffer', * of length 'bufferlen', contains a valid NUL-terminated string. */ int strvalid(const char *buffer, size_t bufferlen) { size_t i; /* Must be NUL-terminated. */ for (i = 0; i < bufferlen; i++) if (buffer[i] == '\0') return (1); return (0); }
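strvalid() reports only whether a NUL appears within the stated bounds, which makes it a cheap guard for fixed-size buffers that arrive across a trust boundary before they are treated as strings. A hypothetical sketch follows; the structure, the check_label() helper, and the choice of EINVAL are all invented for this example and are not part of this commit.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/libkern.h>

/* Invented example structure: 'name' may arrive unterminated. */
struct label_args {
	char	name[64];
};

static int
check_label(const struct label_args *la)
{
	/* Refuse to treat the buffer as a string unless it really is one. */
	if (!strvalid(la->name, sizeof(la->name)))
		return (EINVAL);
	return (0);
}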